[ML] JIindex: Limit the size of bulk migrations (#36481)
davidkyle committed Dec 17, 2018
1 parent cbe9099 commit a842453
Showing 2 changed files with 155 additions and 8 deletions.
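The heart of the change is the new limitWrites method in MlConfigMigrator (shown in the diff below), which caps each bulk write at MAX_BULK_WRITE_SIZE configs and keeps every selected datafeed together with its job. The following standalone sketch mirrors that selection policy; the class and method names (BatchLimitSketch, pickBatch) and the use of plain strings instead of the Job and DatafeedConfig types are purely illustrative.

import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

// Standalone illustration of the pair-first, capped selection introduced by this commit.
// Datafeed configs are keyed by the id of the job they attach to; job configs by job id.
public class BatchLimitSketch {

    static final int MAX_BULK_WRITE_SIZE = 100; // same cap as MlConfigMigrator

    static List<String> pickBatch(Map<String, String> datafeedsByJobId, Map<String, String> jobsById) {
        List<String> batch = new ArrayList<>();
        int count = 0;

        // take datafeed + job pairs first, so a chosen datafeed never loses its job;
        // paired jobs are consumed from the jobs map as they are taken
        for (Map.Entry<String, String> datafeed : datafeedsByJobId.entrySet()) {
            if (count < MAX_BULK_WRITE_SIZE) {
                batch.add(datafeed.getValue());
                count++;
                String pairedJob = jobsById.remove(datafeed.getKey());
                if (pairedJob != null) {
                    batch.add(pairedJob);
                    count++;
                }
            }
        }

        // fill any remaining room with jobs that have no datafeed
        Iterator<String> leftoverJobs = jobsById.values().iterator();
        while (leftoverJobs.hasNext() && count < MAX_BULK_WRITE_SIZE) {
            batch.add(leftoverJobs.next());
            count++;
        }
        return batch;
    }

    public static void main(String[] args) {
        Map<String, String> datafeeds = new LinkedHashMap<>();
        Map<String, String> jobs = new LinkedHashMap<>();
        for (int i = 0; i < 60; i++) {          // 60 datafeed/job pairs = 120 candidate configs
            jobs.put("job" + i, "job-config-" + i);
            datafeeds.put("job" + i, "datafeed-config-" + i);
        }
        System.out.println(pickBatch(datafeeds, jobs).size()); // prints 100: 50 whole pairs are taken
    }
}

Configs that do not make it into a batch are simply left in the cluster state, to be picked up by a later migration attempt.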
MlConfigMigrator.java
@@ -38,11 +38,13 @@
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Function;
import java.util.stream.Collectors;

import static org.elasticsearch.xpack.core.ClientHelper.ML_ORIGIN;
@@ -68,14 +70,19 @@
* If there was an error in step 3 and the config is in both the clusterstate and
* index then when the migrator retries it must not overwrite an existing job config
* document as once the index document is present all update operations will function
* on that rather than the clusterstate
* on that rather than the clusterstate.
*
* The number of configs indexed in each bulk operation is limited by {@link #MAX_BULK_WRITE_SIZE}.
* Pairs of datafeeds and jobs are migrated together.
*/
public class MlConfigMigrator {

private static final Logger logger = LogManager.getLogger(MlConfigMigrator.class);

public static final String MIGRATED_FROM_VERSION = "migrated from version";

static final int MAX_BULK_WRITE_SIZE = 100;

private final Client client;
private final ClusterService clusterService;

@@ -111,10 +118,12 @@ public void migrateConfigsWithoutTasks(ClusterState clusterState, ActionListener
return;
}

Collection<DatafeedConfig> datafeedsToMigrate = stoppedDatafeedConfigs(clusterState);
List<Job> jobsToMigrate = nonDeletingJobs(closedJobConfigs(clusterState)).stream()
Collection<DatafeedConfig> stoppedDatafeeds = stoppedDatafeedConfigs(clusterState);
Map<String, Job> eligibleJobs = nonDeletingJobs(closedJobConfigs(clusterState)).stream()
.map(MlConfigMigrator::updateJobForMigration)
.collect(Collectors.toList());
.collect(Collectors.toMap(Job::getId, Function.identity(), (a, b) -> a));

JobsAndDatafeeds jobsAndDatafeedsToMigrate = limitWrites(stoppedDatafeeds, eligibleJobs);

ActionListener<Boolean> unMarkMigrationInProgress = ActionListener.wrap(
response -> {
Expand All @@ -127,16 +136,18 @@ public void migrateConfigsWithoutTasks(ClusterState clusterState, ActionListener
}
);

if (datafeedsToMigrate.isEmpty() && jobsToMigrate.isEmpty()) {
if (jobsAndDatafeedsToMigrate.totalCount() == 0) {
unMarkMigrationInProgress.onResponse(Boolean.FALSE);
return;
}

writeConfigToIndex(datafeedsToMigrate, jobsToMigrate, ActionListener.wrap(
logger.debug("migrating ml configurations");

writeConfigToIndex(jobsAndDatafeedsToMigrate.datafeedConfigs, jobsAndDatafeedsToMigrate.jobs, ActionListener.wrap(
failedDocumentIds -> {
List<String> successfulJobWrites = filterFailedJobConfigWrites(failedDocumentIds, jobsToMigrate);
List<String> successfulJobWrites = filterFailedJobConfigWrites(failedDocumentIds, jobsAndDatafeedsToMigrate.jobs);
List<String> successfulDatafeedWrites =
filterFailedDatafeedConfigWrites(failedDocumentIds, datafeedsToMigrate);
filterFailedDatafeedConfigWrites(failedDocumentIds, jobsAndDatafeedsToMigrate.datafeedConfigs);
removeFromClusterState(successfulJobWrites, successfulDatafeedWrites, unMarkMigrationInProgress);
},
unMarkMigrationInProgress::onFailure
@@ -341,6 +352,62 @@ public static List<DatafeedConfig> stoppedDatafeedConfigs(ClusterState clusterSt
.collect(Collectors.toList());
}

public static class JobsAndDatafeeds {
List<Job> jobs;
List<DatafeedConfig> datafeedConfigs;

private JobsAndDatafeeds() {
jobs = new ArrayList<>();
datafeedConfigs = new ArrayList<>();
}

public int totalCount() {
return jobs.size() + datafeedConfigs.size();
}
}

/**
* Return at most {@link #MAX_BULK_WRITE_SIZE} configs, favouring
* datafeed and job pairs so that if a datafeed is chosen its job is chosen too.
*
* @param datafeedsToMigrate Datafeed configs
* @param jobsToMigrate Job configs
* @return Job and datafeed configs
*/
public static JobsAndDatafeeds limitWrites(Collection<DatafeedConfig> datafeedsToMigrate, Map<String, Job> jobsToMigrate) {
JobsAndDatafeeds jobsAndDatafeeds = new JobsAndDatafeeds();

if (datafeedsToMigrate.size() + jobsToMigrate.size() <= MAX_BULK_WRITE_SIZE) {
jobsAndDatafeeds.jobs.addAll(jobsToMigrate.values());
jobsAndDatafeeds.datafeedConfigs.addAll(datafeedsToMigrate);
return jobsAndDatafeeds;
}

int count = 0;

// prioritise datafeed and job pairs
for (DatafeedConfig datafeedConfig : datafeedsToMigrate) {
if (count < MAX_BULK_WRITE_SIZE) {
jobsAndDatafeeds.datafeedConfigs.add(datafeedConfig);
count++;
Job datafeedsJob = jobsToMigrate.remove(datafeedConfig.getJobId());
if (datafeedsJob != null) {
jobsAndDatafeeds.jobs.add(datafeedsJob);
count++;
}
}
}

// are there jobs without datafeeds left to migrate?
Iterator<Job> iter = jobsToMigrate.values().iterator();
while (iter.hasNext() && count < MAX_BULK_WRITE_SIZE) {
jobsAndDatafeeds.jobs.add(iter.next());
count++;
}

return jobsAndDatafeeds;
}

/**
* Check for failures in the bulk response and return the
* Ids of any documents not written to the index
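The remainder of MlConfigMigrator.java is collapsed here. The comment above introduces the helper that inspects the bulk response and collects the ids of documents that were not written; its body is not shown in this diff, but a minimal sketch against the standard BulkResponse API could look like the following (the method name failedDocumentIds is illustrative, not necessarily the name used in the commit):

import java.util.HashSet;
import java.util.Set;

import org.elasticsearch.action.bulk.BulkItemResponse;
import org.elasticsearch.action.bulk.BulkResponse;

// Collect the ids of documents the bulk request failed to index, so that only the
// successfully written configs are removed from the cluster state afterwards.
static Set<String> failedDocumentIds(BulkResponse bulkResponse) {
    Set<String> failedIds = new HashSet<>();
    if (bulkResponse.hasFailures()) {
        for (BulkItemResponse item : bulkResponse.getItems()) {
            if (item.isFailed()) {
                failedIds.add(item.getId());
            }
        }
    }
    return failedIds;
}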
MlConfigMigratorTests.java
@@ -24,8 +24,11 @@
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;

import static org.hamcrest.Matchers.contains;
import static org.hamcrest.Matchers.containsInAnyOrder;
@@ -212,6 +215,83 @@ public void testRemoveJobsAndDatafeeds_removeSome() {
}


public void testLimitWrites_GivenBelowLimit() {
MlConfigMigrator.JobsAndDatafeeds jobsAndDatafeeds = MlConfigMigrator.limitWrites(Collections.emptyList(), Collections.emptyMap());
assertThat(jobsAndDatafeeds.datafeedConfigs, empty());
assertThat(jobsAndDatafeeds.jobs, empty());

List<DatafeedConfig> datafeeds = new ArrayList<>();
Map<String, Job> jobs = new HashMap<>();

int numDatafeeds = MlConfigMigrator.MAX_BULK_WRITE_SIZE / 2;
for (int i=0; i<numDatafeeds; i++) {
String jobId = "job" + i;
jobs.put(jobId, JobTests.buildJobBuilder(jobId).build());
datafeeds.add(createCompatibleDatafeed(jobId));
}

jobsAndDatafeeds = MlConfigMigrator.limitWrites(datafeeds, jobs);
assertThat(jobsAndDatafeeds.datafeedConfigs, hasSize(numDatafeeds));
assertThat(jobsAndDatafeeds.jobs, hasSize(numDatafeeds));
}

public void testLimitWrites_GivenAboveLimit() {
List<DatafeedConfig> datafeeds = new ArrayList<>();
Map<String, Job> jobs = new HashMap<>();

int numDatafeeds = MlConfigMigrator.MAX_BULK_WRITE_SIZE / 2 + 10;
for (int i=0; i<numDatafeeds; i++) {
String jobId = "job" + i;
jobs.put(jobId, JobTests.buildJobBuilder(jobId).build());
datafeeds.add(createCompatibleDatafeed(jobId));
}

MlConfigMigrator.JobsAndDatafeeds jobsAndDatafeeds = MlConfigMigrator.limitWrites(datafeeds, jobs);
assertEquals(MlConfigMigrator.MAX_BULK_WRITE_SIZE, jobsAndDatafeeds.totalCount());
assertThat(jobsAndDatafeeds.datafeedConfigs, hasSize(MlConfigMigrator.MAX_BULK_WRITE_SIZE / 2));
assertThat(jobsAndDatafeeds.jobs, hasSize(MlConfigMigrator.MAX_BULK_WRITE_SIZE / 2));

// assert that for each datafeed its corresponding job is selected
Set<String> selectedJobIds = jobsAndDatafeeds.jobs.stream().map(Job::getId).collect(Collectors.toSet());
Set<String> datafeedJobIds = jobsAndDatafeeds.datafeedConfigs.stream().map(DatafeedConfig::getJobId).collect(Collectors.toSet());
assertEquals(selectedJobIds, datafeedJobIds);
}

public void testLimitWrites_GivenMoreJobsThanDatafeeds() {
List<DatafeedConfig> datafeeds = new ArrayList<>();
Map<String, Job> jobs = new HashMap<>();

int numDatafeeds = MlConfigMigrator.MAX_BULK_WRITE_SIZE / 2 - 10;
for (int i=0; i<numDatafeeds; i++) {
String jobId = "job" + i;
jobs.put(jobId, JobTests.buildJobBuilder(jobId).build());
datafeeds.add(createCompatibleDatafeed(jobId));
}

for (int i=numDatafeeds; i<numDatafeeds + 40; i++) {
String jobId = "job" + i;
jobs.put(jobId, JobTests.buildJobBuilder(jobId).build());
}

MlConfigMigrator.JobsAndDatafeeds jobsAndDatafeeds = MlConfigMigrator.limitWrites(datafeeds, jobs);
assertEquals(MlConfigMigrator.MAX_BULK_WRITE_SIZE, jobsAndDatafeeds.totalCount());
assertThat(jobsAndDatafeeds.datafeedConfigs, hasSize(numDatafeeds));
assertThat(jobsAndDatafeeds.jobs, hasSize(MlConfigMigrator.MAX_BULK_WRITE_SIZE - numDatafeeds));

// assert that for each datafeed its corresponding job is selected
Set<String> selectedJobIds = jobsAndDatafeeds.jobs.stream().map(Job::getId).collect(Collectors.toSet());
Set<String> datafeedJobIds = jobsAndDatafeeds.datafeedConfigs.stream().map(DatafeedConfig::getJobId).collect(Collectors.toSet());
assertTrue(selectedJobIds.containsAll(datafeedJobIds));
}

public void testLimitWrites_GivenNullJob() {
List<DatafeedConfig> datafeeds = Collections.singletonList(createCompatibleDatafeed("no-job-for-this-datafeed"));
MlConfigMigrator.JobsAndDatafeeds jobsAndDatafeeds = MlConfigMigrator.limitWrites(datafeeds, Collections.emptyMap());

assertThat(jobsAndDatafeeds.datafeedConfigs, hasSize(1));
assertThat(jobsAndDatafeeds.jobs, empty());
}

private DatafeedConfig createCompatibleDatafeed(String jobId) {
// create a datafeed without aggregations or anything
// else that may cause validation errors
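The body of the createCompatibleDatafeed helper is collapsed here. Assuming the usual DatafeedConfig.Builder(id, jobId) constructor and setIndices setter, a minimal version that avoids anything which could trip validation might look like this (the datafeed id prefix and index name are made up for the sketch):

private DatafeedConfig createCompatibleDatafeed(String jobId) {
    // keep the config minimal: no aggregations, scripts or other settings
    // that could fail validation against the job
    DatafeedConfig.Builder builder = new DatafeedConfig.Builder("df-" + jobId, jobId);
    builder.setIndices(Collections.singletonList("my_index"));
    return builder.build();
}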
