[ML] Reduce persistent tasks periodic reassignment interval in ...

... MlDistributedFailureIT.testLoseDedicatedMasterNode. An intermittent failure has been observed in `MlDistributedFailureIT. testLoseDedicatedMasterNode`. The test launches a cluster comprised by a dedicated master node and a data and ML node. It creates a job and datafeed and starts them. It then shuts down and restarts the master node. Finally, the test asserts that the two tasks have been reassigned within 10s. The intermittent failure is due to the assertions that the tasks have been reassigned failing. Investigating the failure revealed that the `assertBusy` that performs that assertion times out. Furthermore, it appears that the job task is not reassigned because the memory tracking info is stale. Memory tracking info is refreshed asynchronously when a job is attempted to be reassigned. Tasks are attempted to be reassigned either due to a relevant cluster state change or periodically. The periodic interval is controlled by a cluster setting called `cluster.persistent_tasks.allocation.recheck_interval` and defaults to 30s. What seems to be happening in this test is that if all cluster state changes after the master node is restarted come through before the async memory info refresh completes, then the job might take up to 30s until it is attempted to reassigned. Thus the `assertBusy` times out. This commit changes the test to set `cluster.persistent_tasks.allocation.recheck_interval` to 1s. If the above theory is correct, this should eradicate those failures. Closes elastic#36760
dimitris-athanasiou · Dec 19, 2018 · 74a8187 · 74a8187
1 parent d31eaf7
commit 74a8187
Show file tree

Hide file tree

Showing 2 changed files with 4 additions and 2 deletions.
diff --git a/server/src/main/java/org/elasticsearch/persistent/PersistentTasksClusterService.java b/server/src/main/java/org/elasticsearch/persistent/PersistentTasksClusterService.java
@@ -51,7 +51,7 @@ public class PersistentTasksClusterService implements ClusterStateListener, Clos
 
     public static final Setting<TimeValue> CLUSTER_TASKS_ALLOCATION_RECHECK_INTERVAL_SETTING =
         Setting.timeSetting("cluster.persistent_tasks.allocation.recheck_interval", TimeValue.timeValueSeconds(30),
-            TimeValue.timeValueSeconds(10), Setting.Property.Dynamic, Setting.Property.NodeScope);
+            TimeValue.timeValueSeconds(1), Setting.Property.Dynamic, Setting.Property.NodeScope);
 
     private static final Logger logger = LogManager.getLogger(PersistentTasksClusterService.class);
 

diff --git a/...lugin/ml/src/test/java/org/elasticsearch/xpack/ml/integration/MlDistributedFailureIT.java b/...lugin/ml/src/test/java/org/elasticsearch/xpack/ml/integration/MlDistributedFailureIT.java
@@ -15,12 +15,14 @@
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.unit.ByteSizeUnit;
 import org.elasticsearch.common.unit.ByteSizeValue;
+import org.elasticsearch.common.unit.TimeValue;
 import org.elasticsearch.common.xcontent.DeprecationHandler;
 import org.elasticsearch.common.xcontent.NamedXContentRegistry;
 import org.elasticsearch.common.xcontent.XContentHelper;
 import org.elasticsearch.common.xcontent.XContentParser;
 import org.elasticsearch.common.xcontent.XContentType;
 import org.elasticsearch.index.query.QueryBuilders;
+import org.elasticsearch.persistent.PersistentTasksClusterService;
 import org.elasticsearch.persistent.PersistentTasksCustomMetaData;
 import org.elasticsearch.persistent.PersistentTasksCustomMetaData.PersistentTask;
 import org.elasticsearch.test.junit.annotations.TestLogging;
@@ -57,6 +59,7 @@ public class MlDistributedFailureIT extends BaseMlIntegTestCase {
     protected Settings nodeSettings(int nodeOrdinal) {
         return Settings.builder().put(super.nodeSettings(nodeOrdinal))
             .put(MachineLearning.CONCURRENT_JOB_ALLOCATIONS.getKey(), 4)
+            .put(PersistentTasksClusterService.CLUSTER_TASKS_ALLOCATION_RECHECK_INTERVAL_SETTING.getKey(), TimeValue.timeValueSeconds(1))
             .build();
     }
 
@@ -72,7 +75,6 @@ public void testFailOver() throws Exception {
         });
     }
 
-    @AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/32905")
     public void testLoseDedicatedMasterNode() throws Exception {
         internalCluster().ensureAtMostNumDataNodes(0);
         logger.info("Starting dedicated master node...");