From 64b786cbd81ad25b65e56872862ba75cf7672663 Mon Sep 17 00:00:00 2001 From: Sebastian Marsching Date: Sat, 15 Feb 2020 13:33:23 +0100 Subject: [PATCH] Treat started segments as running when aborting. On startup (or periodically in case of the Cassandra storage), the RepairManager aborts segments that do not have a leader or that belong to paused repair runs. However, this code would only abort segments in the RUNNING state. This is a problem because segments might also be left in the STARTED state when a Cassandra Reaper instance is restarted. Such segments would block other segments from running (because when checking for busy hosts, STARTED segments are treated like RUNNING segments), but would never finish or be restarted, thus effectively blocking the repair process. This patch fixes this problem by treating RUNNING and STARTED segments in the same way when aborting segments that do not have a leader or that belong to paused repair runs. --- .../io/cassandrareaper/service/RepairManager.java | 13 ++++++++++--- .../cassandrareaper/service/RepairManagerTest.java | 6 +++--- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/src/server/src/main/java/io/cassandrareaper/service/RepairManager.java b/src/server/src/main/java/io/cassandrareaper/service/RepairManager.java index 7abdd145f..06bf0ebc1 100644 --- a/src/server/src/main/java/io/cassandrareaper/service/RepairManager.java +++ b/src/server/src/main/java/io/cassandrareaper/service/RepairManager.java @@ -157,8 +157,11 @@ private void abortAllRunningSegmentsWithNoLeader(Collection runningRe .forEach((repairRun) -> { Collection runningSegments = context.storage.getSegmentsWithState(repairRun.getId(), RepairSegment.State.RUNNING); + Collection startedSegments + = context.storage.getSegmentsWithState(repairRun.getId(), RepairSegment.State.STARTED); abortSegmentsWithNoLeader(repairRun, runningSegments); + abortSegmentsWithNoLeader(repairRun, startedSegments); }); } @@ -187,11 +190,14 @@ private void abortAllRunningSegmentsInKnownPausedRepairRuns(Collection repairRunners.containsKey(pausedRepairRun.getId())) .forEach((pausedRepairRun) -> { - // Abort all running segments for paused repair runs + // Abort all running and started segments for paused repair runs Collection runningSegments = context.storage.getSegmentsWithState(pausedRepairRun.getId(), RepairSegment.State.RUNNING); + Collection startedSegments + = context.storage.getSegmentsWithState(pausedRepairRun.getId(), RepairSegment.State.STARTED); abortSegments(runningSegments, pausedRepairRun); + abortSegments(startedSegments, pausedRepairRun); }); } finally { repairRunnersLock.unlock(); @@ -223,7 +229,7 @@ private void abortSegmentsWithNoLeader(RepairRun repairRun, Collection leaders = context.storage instanceof IDistributedStorage ? ((IDistributedStorage) context.storage).getLeaders() : Collections.emptyList(); @@ -276,7 +282,8 @@ void abortSegments(Collection runningSegments, RepairRun repairRu try { // refresh segment once we're inside leader-election segment = context.storage.getRepairSegment(repairRun.getId(), segment.getId()).get(); - if (RepairSegment.State.RUNNING == segment.getState()) { + if (RepairSegment.State.RUNNING == segment.getState() + || RepairSegment.State.STARTED == segment.getState()) { JmxProxy jmxProxy = ClusterFacade.create(context).connect( context.storage.getCluster(repairRun.getClusterName()), Arrays.asList(segment.getCoordinatorHost())); diff --git a/src/server/src/test/java/io/cassandrareaper/service/RepairManagerTest.java b/src/server/src/test/java/io/cassandrareaper/service/RepairManagerTest.java index 7b16399cd..d93fde320 100644 --- a/src/server/src/test/java/io/cassandrareaper/service/RepairManagerTest.java +++ b/src/server/src/test/java/io/cassandrareaper/service/RepairManagerTest.java @@ -125,7 +125,7 @@ public void abortRunningSegmentWithNoLeader() throws ReaperException, Interrupte context.repairManager.resumeRunningRepairRuns(); // Check that abortSegments was invoked is at least one segment, meaning abortion occurs - Mockito.verify(context.repairManager, Mockito.times(1)).abortSegments(Mockito.argThat(new NotEmptyList()), any()); + Mockito.verify(context.repairManager, Mockito.times(2)).abortSegments(Mockito.argThat(new NotEmptyList()), any()); } /** @@ -202,7 +202,7 @@ public void doNotAbortRunningSegmentWithLeader() throws ReaperException, Interru context.repairManager.resumeRunningRepairRuns(); // Check that abortSegments was invoked with an empty list, meaning no abortion occurs - Mockito.verify(context.repairManager, Mockito.times(1)).abortSegments(Mockito.argThat(new EmptyList()), any()); + Mockito.verify(context.repairManager, Mockito.times(2)).abortSegments(Mockito.argThat(new EmptyList()), any()); } /** @@ -349,7 +349,7 @@ public void abortRunningSegmentWithNoRepairRunnerAndNoDistributedStorage() context.repairManager.resumeRunningRepairRuns(); // Check that abortSegments was invoked with an non empty list, meaning abortion occurs - Mockito.verify(context.repairManager, Mockito.times(1)).abortSegments(Mockito.argThat(new NotEmptyList()), any()); + Mockito.verify(context.repairManager, Mockito.times(2)).abortSegments(Mockito.argThat(new NotEmptyList()), any()); } @Test