Skip to content

Commit d2ce932

Browse files
committed
add debug for hanging segments
* add one more repeated check for JMX connection
1 parent 84ed30a commit d2ce932

File tree

1 file changed

+27
-16
lines changed

1 file changed

+27
-16
lines changed

src/main/java/com/spotify/reaper/service/RepairRunner.java

+27 -16
Original file line number | Diff line number | Diff line change
@@ -160,7 +160,7 @@ public void run() {
160160
context.repairManager.scheduleRetry(this);
161161
break;
162162
}
163-
} catch (RuntimeException e) {
163+
} catch (RuntimeException | ReaperException e) {
164164
LOG.error("RepairRun FAILURE, scheduling retry");
165165
LOG.error(e.toString());
166166
LOG.error(Arrays.toString(e.getStackTrace()));
@@ -173,7 +173,7 @@ public void run() {
173173
/**
174174
* Starts the repair run.
175175
*/
176-
private void start() {
176+
private void start() throws ReaperException {
177177
LOG.info("Repairs for repair run #{} starting", repairRunId);
178178
synchronized (this) {
179179
RepairRun repairRun = context.storage.getRepairRun(repairRunId).get();
@@ -198,13 +198,24 @@ private void endRepairRun() {
198198
}
199199
}
200200

201+
/**
 * Makes sure this runner holds a usable JMX connection before continuing.
 *
 * If {@code jmxConnection} is null or reports that it is no longer alive, the cluster
 * named by {@code this.clusterName} is loaded from storage and a new connection is
 * requested from the JMX connection factory via {@code connectAny}; the result replaces
 * {@code jmxConnection}. If the existing connection is alive, nothing is done.
 *
 * @throws ReaperException presumably when {@code connectAny} cannot reach the cluster;
 *         the throwing call is not visible here, so confirm against the factory
 */
private void confirmJMXConnectionIsOpen() throws ReaperException {
202+
if (jmxConnection == null || !jmxConnection.isConnectionAlive()) {
203+
LOG.debug("connecting JMX proxy for repair runner on run id: {}", repairRunId);
204+
// NOTE(review): get() is called without a presence check; if storage has no cluster
// under this name this will likely fail with an unchecked exception - confirm.
Cluster cluster = context.storage.getCluster(this.clusterName).get();
205+
jmxConnection = context.jmxConnectionFactory.connectAny(cluster);
206+
LOG.debug("successfully reestablished JMX proxy for repair runner");
207+
}
208+
}
209+
201210
/**
202211
* Get the next segment and repair it. If there is none, we're done.
203212
*/
204-
private void startNextSegment() {
213+
private void startNextSegment() throws ReaperException {
205214
boolean scheduleRetry = true;
206215
boolean anythingRunningStill = false;
207216

217+
confirmJMXConnectionIsOpen();
218+
208219
// We want to know whether a repair was started,
209220
// so that a rescheduling of this runner will happen.
210221
boolean repairStarted = false;
@@ -221,6 +232,12 @@ private void startNextSegment() {
221232
if (startTime != null && startTime.isBefore(DateTime.now().minusDays(1))) {
222233
LOG.warn("Looks like segment #{} has been running more than a day. Start time: {}",
223234
supposedlyRunningSegment.getId(), supposedlyRunningSegment.getStartTime());
235+
} else if (startTime != null && startTime.isBefore(DateTime.now().minusHours(1))) {
236+
LOG.info("Looks like segment #{} has been running more than an hour. Start time: {}",
237+
supposedlyRunningSegment.getId(), supposedlyRunningSegment.getStartTime());
238+
} else if (startTime != null && startTime.isBefore(DateTime.now().minusMinutes(2))) {
239+
LOG.debug("Looks like segment #{} has been running more than two minutes. Start time: {}",
240+
supposedlyRunningSegment.getId(), supposedlyRunningSegment.getStartTime());
224241
}
225242
// No need to try starting new repair for already active slot.
226243
continue;
@@ -287,19 +304,13 @@ private boolean repairSegment(final int rangeIndex, final long segmentId, RingRa
287304
String keyspace = repairUnit.getKeyspaceName();
288305
LOG.debug("preparing to repair segment {} on run with id {}", segmentId, repairRunId);
289306

290-
if (jmxConnection == null || !jmxConnection.isConnectionAlive()) {
291-
try {
292-
LOG.debug("connecting JMX proxy for repair runner on run id: {}", repairRunId);
293-
Cluster cluster = context.storage.getCluster(repairUnit.getClusterName()).get();
294-
jmxConnection = context.jmxConnectionFactory.connectAny(cluster);
295-
} catch (ReaperException e) {
296-
e.printStackTrace();
297-
LOG.warn("Failed to reestablish JMX connection in runner #{}, retrying", repairRunId);
298-
currentlyRunningSegments.set(rangeIndex, -1);
299-
return true;
300-
}
301-
LOG.debug("successfully reestablished JMX proxy for repair runner on run id: {}",
302-
repairRunId);
307+
try {
308+
confirmJMXConnectionIsOpen();
309+
} catch (ReaperException e) {
310+
e.printStackTrace();
311+
LOG.warn("Failed to reestablish JMX connection in runner #{}, retrying", repairRunId);
312+
currentlyRunningSegments.set(rangeIndex, -1);
313+
return true;
303314
}
304315

305316
List<String> potentialCoordinators;

0 commit comments

Comments
 (0)