Rewrite indexFieldHasDuplicateData
Rewrite how duplicate data is detected in the index to use fewer iterations.
On every iteration, check whether the number of documents in the candidate half
of the value range is more than half of the documents in the segment.
mayya-sharipova committed Nov 12, 2019
1 parent 738c785 commit 5870fd7
Showing 2 changed files with 46 additions and 90 deletions.
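
For orientation, here is a minimal sketch of the per-segment check introduced below, written as a standalone method. It assumes a single-dimension long point field (org.apache.lucene.document.LongPoint, org.apache.lucene.index.PointValues) and reuses the estimatePointCount helper from the QueryPhase.java hunks that follow; the method name segmentHasDuplicateData is illustrative and does not appear in the commit.

    // Illustrative sketch, not the committed code: binary-search the segment's value range,
    // always descending into the half that holds more points. The segment "has duplicate data"
    // only if that dominant half keeps covering more than half of the segment's documents.
    static boolean segmentHasDuplicateData(PointValues pointValues, int docCount) throws IOException {
        int duplicateDocCount = docCount / 2; // threshold: half of the segment's documents
        long minValue = LongPoint.decodeDimension(pointValues.getMinPackedValue(), 0);
        long maxValue = LongPoint.decodeDimension(pointValues.getMaxPackedValue(), 0);
        while (minValue < maxValue) {
            // overflow-safe midpoint: divide each bound by 2 before adding
            long avgValue = Math.floorDiv(minValue, 2) + Math.floorDiv(maxValue, 2);
            long countLeft = estimatePointCount(pointValues, minValue, avgValue);
            long countRight = estimatePointCount(pointValues, avgValue + 1, maxValue);
            if (countLeft >= countRight && countLeft > duplicateDocCount) {
                maxValue = avgValue;        // a dominant value, if any, lies in the left half
            } else if (countRight > countLeft && countRight > duplicateDocCount) {
                minValue = avgValue + 1;    // a dominant value, if any, lies in the right half
            } else {
                return false;               // neither half covers more than half the documents
            }
        }
        return true; // narrowed to a single value that covers more than docCount/2 documents
    }

indexFieldHasDuplicateData then tallies the docCount of segments that pass this check (docsNoOpt) against those that do not (docsOpt), and reports duplicate data when the former outweighs the latter.
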
46 changes: 24 additions & 22 deletions server/src/main/java/org/elasticsearch/search/query/QueryPhase.java
@@ -556,14 +556,14 @@ private static boolean canEarlyTerminate(IndexReader reader, SortAndFormats sort
return true;
}


/**
* Returns true if more than 50% of data in the index have the same value
* The evaluation is approximation based on finding the median value and estimating its count
* Returns true if the total count of median values is greater or equal to half of the total count of documents
*/
static boolean indexFieldHasDuplicateData(IndexReader reader, String field) throws IOException {
long globalDocCount = 0;
long globalMedianCount = 0;
long docsOpt = 0; // number of docs in segments that would benefit optimization
long docsNoOpt = 0; // number of docs in segments that would NOT benefit optimization, e.g. docs in segments with duplicate data
for (LeafReaderContext lrc : reader.leaves()) {
PointValues pointValues = lrc.reader().getPointValues(field);
if (pointValues == null) continue;
@@ -572,31 +572,33 @@ static boolean indexFieldHasDuplicateData(IndexReader reader, String field) thro
continue;
}
assert(pointValues.size() == docCount); // TODO: modify the code to handle multiple values
globalDocCount += docCount;
long medianValue = estimateMedianValue(pointValues);
long medianCount = estimatePointCount(pointValues, medianValue, medianValue);
globalMedianCount += medianCount;
}
return (globalMedianCount >= globalDocCount/2);
}

static long estimateMedianValue(PointValues pointValues) throws IOException {
long minValue = LongPoint.decodeDimension(pointValues.getMinPackedValue(), 0);
long maxValue = LongPoint.decodeDimension(pointValues.getMaxPackedValue(), 0);
while (minValue < maxValue) {
long avgValue = Math.floorDiv(minValue + maxValue, 2);
long countLeft = estimatePointCount(pointValues, minValue, avgValue);
long countRight = estimatePointCount(pointValues, avgValue + 1, maxValue);
if (countLeft >= countRight) {
maxValue = avgValue;
int duplicateDocCount = docCount/2; // expected doc count of duplicate data
long minValue = LongPoint.decodeDimension(pointValues.getMinPackedValue(), 0);
long maxValue = LongPoint.decodeDimension(pointValues.getMaxPackedValue(), 0);
boolean hasDuplicateData = true;
while ((minValue < maxValue) && hasDuplicateData) {
long avgValue = Math.floorDiv(minValue, 2) + Math.floorDiv(maxValue, 2); // to avoid overflow first divide each value by 2
long countLeft = estimatePointCount(pointValues, minValue, avgValue);
long countRight = estimatePointCount(pointValues, avgValue + 1, maxValue);
if ((countLeft >= countRight) && (countLeft > duplicateDocCount) ) {
maxValue = avgValue;
} else if ((countRight > countLeft) && (countRight > duplicateDocCount)) {
minValue = avgValue + 1;
} else {
hasDuplicateData = false;
}
}
if (hasDuplicateData) {
docsNoOpt += docCount;
} else {
minValue = avgValue + 1;
docsOpt += docCount;
}
}
return maxValue;
return (docsNoOpt > docsOpt);
}

static long estimatePointCount(PointValues pointValues, long minValue, long maxValue) {
private static long estimatePointCount(PointValues pointValues, long minValue, long maxValue) {
final byte[] minValueAsBytes = new byte[Long.BYTES];
LongPoint.encodeDimension(minValue, minValueAsBytes, 0);
final byte[] maxValueAsBytes = new byte[Long.BYTES];
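
One detail in the hunk above: the midpoint is now computed as Math.floorDiv(minValue, 2) + Math.floorDiv(maxValue, 2) rather than Math.floorDiv(minValue + maxValue, 2), because the intermediate sum of two longs can overflow. A tiny illustration with hypothetical bounds (not values from the commit); the result may be off by one from the exact midpoint, which is harmless for an estimate-driven split:

    long min = Long.MAX_VALUE - 10;                             // bounds near Long.MAX_VALUE are legal for a long field
    long max = Long.MAX_VALUE;
    long naive = Math.floorDiv(min + max, 2);                   // min + max wraps around, yielding -6
    long safe  = Math.floorDiv(min, 2) + Math.floorDiv(max, 2); // Long.MAX_VALUE - 6, within 1 of the true midpoint
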
90 changes: 22 additions & 68 deletions server/src/test/java/org/elasticsearch/search/query/QueryPhaseTests.java
@@ -67,13 +67,8 @@
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.bkd.BKDReader;
import org.apache.lucene.util.bkd.BKDWriter;
import org.elasticsearch.action.search.SearchTask;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.mapper.DateFieldMapper;
@@ -96,15 +91,12 @@
import java.util.Collections;
import java.util.List;

import static org.elasticsearch.search.query.QueryPhase.estimateMedianValue;
import static org.elasticsearch.search.query.QueryPhase.estimatePointCount;
import static org.elasticsearch.search.query.QueryPhase.indexFieldHasDuplicateData;
import static org.elasticsearch.search.query.TopDocsCollectorContext.hasInfMaxScore;
import static org.hamcrest.Matchers.anyOf;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.greaterThanOrEqualTo;
import static org.hamcrest.Matchers.instanceOf;
import static org.hamcrest.Matchers.lessThan;
import static org.hamcrest.Matchers.lessThanOrEqualTo;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
import static org.mockito.Mockito.spy;
@@ -712,66 +704,28 @@ public void testNumericLongOrDateSortOptimization() throws Exception {
}

public void testIndexHasDuplicateData() throws IOException {
int valuesCount = 5000;
int maxPointsInLeafNode = 40;
long expectedMedianCount = (long)(valuesCount * 0.6);
long expectedMedianValue = randomLongBetween(-10000000L, 10000000L);

try (Directory dir = newDirectory()) {
BKDWriter w = new BKDWriter(valuesCount, dir, "tmp", 1, 1, 8, maxPointsInLeafNode, 1, valuesCount);
byte[] longBytes = new byte[8];
for (int docId = 0; docId < valuesCount; docId++) {
long value = docId < expectedMedianCount ? expectedMedianValue : randomLongBetween(-10000000L, 10000000L);
LongPoint.encodeDimension(value, longBytes, 0);
w.add(longBytes, docId);
}
long indexFP;
try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
indexFP = w.finish(out);
}
try (IndexInput in = dir.openInput("bkd", IOContext.DEFAULT)) {
in.seek(indexFP);
BKDReader r = new BKDReader(in);
long medianValue = estimateMedianValue(r);
long medianCount = estimatePointCount(r, medianValue, medianValue);

assertEquals(expectedMedianValue, medianValue);
assertThat(medianCount, greaterThanOrEqualTo((long) (valuesCount/2))); //assert that Index has duplicate data
assertThat(medianCount, greaterThanOrEqualTo((long) (0.75 * expectedMedianCount)));
assertThat(medianCount, lessThanOrEqualTo((long) (1.25 * expectedMedianCount)));
}
}
}

public void testIndexHasNotDuplicateData() throws IOException {
int valuesCount = 5000;
int maxPointsInLeafNode = 40;
long expectedMedianCount = (long)(valuesCount * 0.35);
long expectedMedianValue = randomLongBetween(-10000000L, 10000000L);

try (Directory dir = newDirectory()) {
BKDWriter w = new BKDWriter(valuesCount, dir, "tmp", 1, 1, 8, maxPointsInLeafNode, 1, valuesCount);
byte[] longBytes = new byte[8];
for (int docId = 0; docId < valuesCount; docId++) {
long value = docId < expectedMedianCount ? expectedMedianValue : randomLongBetween(-10000000L, 10000000L);
LongPoint.encodeDimension(value, longBytes, 0);
w.add(longBytes, docId);
}
long indexFP;
try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
indexFP = w.finish(out);
}
try (IndexInput in = dir.openInput("bkd", IOContext.DEFAULT)) {
in.seek(indexFP);
BKDReader r = new BKDReader(in);
long medianValue = estimateMedianValue(r);
long medianCount = estimatePointCount(r, medianValue, medianValue);

// can't make any assertion about the values of medianValue and medianCount
// as BKDReader::estimatePointCount can be really off for non-duplicate data
assertThat(medianCount, lessThan((long) (valuesCount/2))); //assert that Index does NOT have duplicate data
}
int docsCount = 7000;
float duplicateRatio1 = 0.6f;
float duplicateRatio2 = 0.35f;
long duplicateValue = randomLongBetween(-10000000L, 10000000L);
Directory dir = newDirectory();
IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(null));
for (int docId = 0; docId < docsCount; docId++) {
Document doc = new Document();
long value = (randomFloat() < duplicateRatio1) ? duplicateValue : randomLongBetween(-10000000L, 10000000L);
long value2 = (randomFloat() < duplicateRatio2) ? duplicateValue : randomLongBetween(-10000000L, 10000000L);
doc.add(new LongPoint("duplicateField", value));
doc.add(new LongPoint("notDuplicateField", value2));
writer.addDocument(doc);
}
writer.close();
final IndexReader reader = DirectoryReader.open(dir);
boolean hasDuplicateData = indexFieldHasDuplicateData(reader, "duplicateField");
boolean hasDuplicateData2 = indexFieldHasDuplicateData(reader, "notDuplicateField");
reader.close();
dir.close();
assertTrue(hasDuplicateData);
assertFalse(hasDuplicateData2);
}

public void testMaxScoreQueryVisitor() {
