From 02330cec696ed2636942b22837c146263f09e552 Mon Sep 17 00:00:00 2001 From: Armin Braun Date: Fri, 11 Jan 2019 12:25:34 +0100 Subject: [PATCH] Fix PrimaryAllocationIT Race Condition * Forcing a stale primary allocation on a green index was tripping the assertion that was removed * Added a test that this case still errors out correctly * Made the ability to wipe stopped datanode's data public on the internal test cluster and used it to ensure correct behaviour on the fixed test * Previously it simply passed because the test finished before the index went green and would NPE when the index was green at the time of the shard store status request, that would then come up empty * Closes #37345 --- .../TransportClusterRerouteAction.java | 6 ++++- .../cluster/routing/PrimaryAllocationIT.java | 25 +++++++++++++++++-- .../test/InternalTestCluster.java | 3 +-- 3 files changed, 29 insertions(+), 5 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/action/admin/cluster/reroute/TransportClusterRerouteAction.java b/server/src/main/java/org/elasticsearch/action/admin/cluster/reroute/TransportClusterRerouteAction.java index d5cc35b2205ac..7f29f0bb6db8b 100644 --- a/server/src/main/java/org/elasticsearch/action/admin/cluster/reroute/TransportClusterRerouteAction.java +++ b/server/src/main/java/org/elasticsearch/action/admin/cluster/reroute/TransportClusterRerouteAction.java @@ -113,7 +113,11 @@ private void verifyThenSubmitUpdate(ClusterRerouteRequest request, ActionListene for (Map.Entry> entry : stalePrimaryAllocations.entrySet()) { final String index = entry.getKey(); final ImmutableOpenIntMap> indexStatus = status.get(index); - assert indexStatus != null; + if (indexStatus == null) { + // The index in the stale primary allocation request was green and hence filtered out by the store status + // request. We ignore it here since the relevant exception will be thrown by the reroute action later on. + continue; + } for (AbstractAllocateAllocationCommand command : entry.getValue()) { final List shardStatus = indexStatus.get(command.shardId()); diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/PrimaryAllocationIT.java b/server/src/test/java/org/elasticsearch/cluster/routing/PrimaryAllocationIT.java index 3f826c587e683..aa86780cb31e8 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/PrimaryAllocationIT.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/PrimaryAllocationIT.java @@ -265,7 +265,6 @@ public void testForceStaleReplicaToBePromotedToPrimary() throws Exception { assertThat(newHistoryUUIds, hasSize(1)); } - @AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/37345") public void testForceStaleReplicaToBePromotedToPrimaryOnWrongNode() throws Exception { String master = internalCluster().startMasterOnlyNode(Settings.EMPTY); internalCluster().startDataOnlyNodes(2); @@ -275,7 +274,10 @@ public void testForceStaleReplicaToBePromotedToPrimaryOnWrongNode() throws Excep .put("index.number_of_replicas", 1)).get()); ensureGreen(); createStaleReplicaScenario(master); - internalCluster().startDataOnlyNodes(2); + // Ensure the stopped primary's data is deleted so that it doesn't get picked up by the next datanode we start + internalCluster().wipePendingDataDirectories(); + internalCluster().startDataOnlyNodes(1); + ensureStableCluster(3, master); final int shardId = 0; final List nodeNames = new ArrayList<>(Arrays.asList(internalCluster().getNodeNames())); nodeNames.remove(master); @@ -292,6 +294,25 @@ public void testForceStaleReplicaToBePromotedToPrimaryOnWrongNode() throws Excep equalTo("No data for shard [" + shardId + "] of index [" + idxName + "] found on node [" + nodeWithoutData + ']')); } + public void testForceStaleReplicaToBePromotedForGreenIndex() { + internalCluster().startMasterOnlyNode(Settings.EMPTY); + final List dataNodes = internalCluster().startDataOnlyNodes(2); + final String idxName = "test"; + assertAcked(client().admin().indices().prepareCreate(idxName) + .setSettings(Settings.builder().put("index.number_of_shards", 1) + .put("index.number_of_replicas", 1)).get()); + ensureGreen(); + final String nodeWithoutData = randomFrom(dataNodes); + final int shardId = 0; + Throwable iae = expectThrows( + IllegalArgumentException.class, + () -> client().admin().cluster().prepareReroute() + .add(new AllocateStalePrimaryAllocationCommand(idxName, shardId, nodeWithoutData, true)).get()); + assertThat( + iae.getMessage(), + equalTo("[allocate_stale_primary] primary [" + idxName+ "][" + shardId + "] is already assigned")); + } + public void testForceStaleReplicaToBePromotedForMissingIndex() { internalCluster().startMasterOnlyNode(Settings.EMPTY); final String dataNode = internalCluster().startDataOnlyNode(); diff --git a/test/framework/src/main/java/org/elasticsearch/test/InternalTestCluster.java b/test/framework/src/main/java/org/elasticsearch/test/InternalTestCluster.java index e6e11dacb749f..e4a11ad414ffe 100644 --- a/test/framework/src/main/java/org/elasticsearch/test/InternalTestCluster.java +++ b/test/framework/src/main/java/org/elasticsearch/test/InternalTestCluster.java @@ -1398,8 +1398,7 @@ private void randomlyResetClients() { } } - private void wipePendingDataDirectories() { - assert Thread.holdsLock(this); + public synchronized void wipePendingDataDirectories() { if (!dataDirToClean.isEmpty()) { try { for (Path path : dataDirToClean) {