From 35694c3dae4211a116a757d9d216098beea93923 Mon Sep 17 00:00:00 2001 From: Alexander Dejanovski Date: Tue, 6 Feb 2018 09:15:16 +0100 Subject: [PATCH] Handle cases where a segment reaches timeout without being in either RUNNING or DONE state. So far, it would hang the repair until Reaper was restarted. Now, segments will be aborted and rescheduled. --- .../io/cassandrareaper/service/SegmentRunner.java | 14 +++++++++++++- .../cassandrareaper/service/SegmentRunnerTest.java | 5 +---- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/src/server/src/main/java/io/cassandrareaper/service/SegmentRunner.java b/src/server/src/main/java/io/cassandrareaper/service/SegmentRunner.java index 6212fc390..dad46aa47 100644 --- a/src/server/src/main/java/io/cassandrareaper/service/SegmentRunner.java +++ b/src/server/src/main/java/io/cassandrareaper/service/SegmentRunner.java @@ -361,6 +361,16 @@ private void processTriggeredSegment(final RepairSegment segment, final JmxProxy Seconds.secondsBetween(resultingSegment.getStartTime(), resultingSegment.getEndTime()).getSeconds()); SEGMENT_RUNNERS.remove(resultingSegment.getId()); + } else { + // Something went wrong on the coordinator node and we never got the RUNNING notification + // or we are in an undetermined state. + // Let's just abort and reschedule the segment. + LOG.info( + "Repair command {} on segment {} never managed to start within timeout.", + commandId, + segmentId); + segmentFailed.set(true); + abort(resultingSegment, coordinator); } // Repair is still running, we'll renew lead on the segment when using Cassandra as storage backend renewLead(); @@ -835,7 +845,9 @@ private boolean handleJmxNotificationForCassandra21( // This gets called through the JMX proxy at the end // regardless of succeeded or failed sessions. LOG.debug( - "repair session finished for segment with id '{}' and repair number '{}'", segmentId, repairNumber); + "repair session finished for segment with id '{}' and repair number '{}'", + segmentId, + repairNumber); condition.signalAll(); break; default: diff --git a/src/server/src/test/java/io/cassandrareaper/service/SegmentRunnerTest.java b/src/server/src/test/java/io/cassandrareaper/service/SegmentRunnerTest.java index 25a598399..bb0b08f5c 100644 --- a/src/server/src/test/java/io/cassandrareaper/service/SegmentRunnerTest.java +++ b/src/server/src/test/java/io/cassandrareaper/service/SegmentRunnerTest.java @@ -24,9 +24,6 @@ import io.cassandrareaper.jmx.JmxConnectionFactory; import io.cassandrareaper.jmx.JmxProxy; import io.cassandrareaper.jmx.RepairStatusHandler; -import io.cassandrareaper.service.RepairRunner; -import io.cassandrareaper.service.RingRange; -import io.cassandrareaper.service.SegmentRunner; import io.cassandrareaper.storage.IStorage; import io.cassandrareaper.storage.MemoryStorage; @@ -388,7 +385,7 @@ protected JmxProxy connect(final Optional handler, String h executor.shutdown(); assertEquals(RepairSegment.State.NOT_STARTED, storage.getRepairSegment(runId, segmentId).get().getState()); - assertEquals(1, storage.getRepairSegment(runId, segmentId).get().getFailCount()); + assertEquals(2, storage.getRepairSegment(runId, segmentId).get().getFailCount()); } @Test