Rewrite indexFieldHasDuplicateData
Rewrite how duplicate data is detected in the index to use fewer iterations.
On every iteration, check whether the number of documents in the candidate half
of the value range is more than half of the documents in the segment.
mayya-sharipova committed Nov 12, 2019
1 parent 738c785 commit 5870fd7
Showing 2 changed files with 46 additions and 90 deletions.
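
For orientation, here is a minimal sketch of the per-segment check introduced below, written as a standalone method. It assumes a single-dimension long point field (org.apache.lucene.document.LongPoint, org.apache.lucene.index.PointValues) and reuses the estimatePointCount helper from the QueryPhase.java hunks that follow; the method name segmentHasDuplicateData is illustrative and does not appear in the commit.

    // Illustrative sketch, not the committed code: binary-search the segment's value range,
    // always descending into the half that holds more points. The segment "has duplicate data"
    // only if that dominant half keeps covering more than half of the segment's documents.
    static boolean segmentHasDuplicateData(PointValues pointValues, int docCount) throws IOException {
        int duplicateDocCount = docCount / 2; // threshold: half of the segment's documents
        long minValue = LongPoint.decodeDimension(pointValues.getMinPackedValue(), 0);
        long maxValue = LongPoint.decodeDimension(pointValues.getMaxPackedValue(), 0);
        while (minValue < maxValue) {
            // overflow-safe midpoint: divide each bound by 2 before adding
            long avgValue = Math.floorDiv(minValue, 2) + Math.floorDiv(maxValue, 2);
            long countLeft = estimatePointCount(pointValues, minValue, avgValue);
            long countRight = estimatePointCount(pointValues, avgValue + 1, maxValue);
            if (countLeft >= countRight && countLeft > duplicateDocCount) {
                maxValue = avgValue;        // a dominant value, if any, lies in the left half
            } else if (countRight > countLeft && countRight > duplicateDocCount) {
                minValue = avgValue + 1;    // a dominant value, if any, lies in the right half
            } else {
                return false;               // neither half covers more than half the documents
            }
        }
        return true; // narrowed to a single value that covers more than docCount/2 documents
    }

indexFieldHasDuplicateData then tallies the docCount of segments that pass this check (docsNoOpt) against those that do not (docsOpt), and reports duplicate data when the former outweighs the latter.
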
46 changes: 24 additions & 22 deletions server/src/main/java/org/elasticsearch/search/query/QueryPhase.java
@@ -556,14 +556,14 @@ private static boolean canEarlyTerminate(IndexReader reader, SortAndFormats sort
return true;
}


/**
* Returns true if more than 50% of data in the index have the same value
* The evaluation is approximation based on finding the median value and estimating its count
* Returns true if the total count of median values is greater or equal to half of the total count of documents
*/
static boolean indexFieldHasDuplicateData(IndexReader reader, String field) throws IOException {
long globalDocCount = 0;
long globalMedianCount = 0;
long docsOpt = 0; // number of docs in segments that would benefit optimization
long docsNoOpt = 0; // number of docs in segments that would NOT benefit optimization, e.g. docs in segments with duplicate data
for (LeafReaderContext lrc : reader.leaves()) {
PointValues pointValues = lrc.reader().getPointValues(field);
if (pointValues == null) continue;
@@ -572,31 +572,33 @@ static boolean indexFieldHasDuplicateData(IndexReader reader, String field) thro
continue;
}
assert(pointValues.size() == docCount); // TODO: modify the code to handle multiple values
globalDocCount += docCount;
long medianValue = estimateMedianValue(pointValues);
long medianCount = estimatePointCount(pointValues, medianValue, medianValue);
globalMedianCount += medianCount;
}
return (globalMedianCount >= globalDocCount/2);
}

static long estimateMedianValue(PointValues pointValues) throws IOException {
long minValue = LongPoint.decodeDimension(pointValues.getMinPackedValue(), 0);
long maxValue = LongPoint.decodeDimension(pointValues.getMaxPackedValue(), 0);
while (minValue < maxValue) {
long avgValue = Math.floorDiv(minValue + maxValue, 2);
long countLeft = estimatePointCount(pointValues, minValue, avgValue);
long countRight = estimatePointCount(pointValues, avgValue + 1, maxValue);
if (countLeft >= countRight) {
maxValue = avgValue;
int duplicateDocCount = docCount/2; // expected doc count of duplicate data
long minValue = LongPoint.decodeDimension(pointValues.getMinPackedValue(), 0);
long maxValue = LongPoint.decodeDimension(pointValues.getMaxPackedValue(), 0);
boolean hasDuplicateData = true;
while ((minValue < maxValue) && hasDuplicateData) {
long avgValue = Math.floorDiv(minValue, 2) + Math.floorDiv(maxValue, 2); // to avoid overflow first divide each value by 2
long countLeft = estimatePointCount(pointValues, minValue, avgValue);
long countRight = estimatePointCount(pointValues, avgValue + 1, maxValue);
if ((countLeft >= countRight) && (countLeft > duplicateDocCount) ) {
maxValue = avgValue;
} else if ((countRight > countLeft) && (countRight > duplicateDocCount)) {
minValue = avgValue + 1;
} else {
hasDuplicateData = false;
}
}
if (hasDuplicateData) {
docsNoOpt += docCount;
} else {
minValue = avgValue + 1;
docsOpt += docCount;
}
}
return maxValue;
return (docsNoOpt > docsOpt);
}

static long estimatePointCount(PointValues pointValues, long minValue, long maxValue) {
private static long estimatePointCount(PointValues pointValues, long minValue, long maxValue) {
final byte[] minValueAsBytes = new byte[Long.BYTES];
LongPoint.encodeDimension(minValue, minValueAsBytes, 0);
final byte[] maxValueAsBytes = new byte[Long.BYTES];
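
One detail in the hunk above: the midpoint is now computed as Math.floorDiv(minValue, 2) + Math.floorDiv(maxValue, 2) rather than Math.floorDiv(minValue + maxValue, 2), because the intermediate sum of two longs can overflow. A tiny illustration with hypothetical bounds (not values from the commit); the result may be off by one from the exact midpoint, which is harmless for an estimate-driven split:

    long min = Long.MAX_VALUE - 10;                             // bounds near Long.MAX_VALUE are legal for a long field
    long max = Long.MAX_VALUE;
    long naive = Math.floorDiv(min + max, 2);                   // min + max wraps around, yielding -6
    long safe  = Math.floorDiv(min, 2) + Math.floorDiv(max, 2); // Long.MAX_VALUE - 6, within 1 of the true midpoint
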
90 changes: 22 additions & 68 deletions server/src/test/java/org/elasticsearch/search/query/QueryPhaseTests.java
@@ -67,13 +67,8 @@
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.bkd.BKDReader;
import org.apache.lucene.util.bkd.BKDWriter;
import org.elasticsearch.action.search.SearchTask;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.mapper.DateFieldMapper;
@@ -96,15 +91,12 @@
import java.util.Collections;
import java.util.List;

import static org.elasticsearch.search.query.QueryPhase.estimateMedianValue;
import static org.elasticsearch.search.query.QueryPhase.estimatePointCount;
import static org.elasticsearch.search.query.QueryPhase.indexFieldHasDuplicateData;
import static org.elasticsearch.search.query.TopDocsCollectorContext.hasInfMaxScore;
import static org.hamcrest.Matchers.anyOf;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.greaterThanOrEqualTo;
import static org.hamcrest.Matchers.instanceOf;
import static org.hamcrest.Matchers.lessThan;
import static org.hamcrest.Matchers.lessThanOrEqualTo;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
import static org.mockito.Mockito.spy;
@@ -712,66 +704,28 @@ public void testNumericLongOrDateSortOptimization() throws Exception {
}

public void testIndexHasDuplicateData() throws IOException {
int valuesCount = 5000;
int maxPointsInLeafNode = 40;
long expectedMedianCount = (long)(valuesCount * 0.6);
long expectedMedianValue = randomLongBetween(-10000000L, 10000000L);

try (Directory dir = newDirectory()) {
BKDWriter w = new BKDWriter(valuesCount, dir, "tmp", 1, 1, 8, maxPointsInLeafNode, 1, valuesCount);
byte[] longBytes = new byte[8];
for (int docId = 0; docId < valuesCount; docId++) {
long value = docId < expectedMedianCount ? expectedMedianValue : randomLongBetween(-10000000L, 10000000L);
LongPoint.encodeDimension(value, longBytes, 0);
w.add(longBytes, docId);
}
long indexFP;
try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
indexFP = w.finish(out);
}
try (IndexInput in = dir.openInput("bkd", IOContext.DEFAULT)) {
in.seek(indexFP);
BKDReader r = new BKDReader(in);
long medianValue = estimateMedianValue(r);
long medianCount = estimatePointCount(r, medianValue, medianValue);

assertEquals(expectedMedianValue, medianValue);
assertThat(medianCount, greaterThanOrEqualTo((long) (valuesCount/2))); //assert that Index has duplicate data
assertThat(medianCount, greaterThanOrEqualTo((long) (0.75 * expectedMedianCount)));
assertThat(medianCount, lessThanOrEqualTo((long) (1.25 * expectedMedianCount)));
}
}
}

public void testIndexHasNotDuplicateData() throws IOException {
int valuesCount = 5000;
int maxPointsInLeafNode = 40;
long expectedMedianCount = (long)(valuesCount * 0.35);
long expectedMedianValue = randomLongBetween(-10000000L, 10000000L);

try (Directory dir = newDirectory()) {
BKDWriter w = new BKDWriter(valuesCount, dir, "tmp", 1, 1, 8, maxPointsInLeafNode, 1, valuesCount);
byte[] longBytes = new byte[8];
for (int docId = 0; docId < valuesCount; docId++) {
long value = docId < expectedMedianCount ? expectedMedianValue : randomLongBetween(-10000000L, 10000000L);
LongPoint.encodeDimension(value, longBytes, 0);
w.add(longBytes, docId);
}
long indexFP;
try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
indexFP = w.finish(out);
}
try (IndexInput in = dir.openInput("bkd", IOContext.DEFAULT)) {
in.seek(indexFP);
BKDReader r = new BKDReader(in);
long medianValue = estimateMedianValue(r);
long medianCount = estimatePointCount(r, medianValue, medianValue);

// can't make any assertion about the values of medianValue and medianCount
// as BKDReader::estimatePointCount can be really off for non-duplicate data
assertThat(medianCount, lessThan((long) (valuesCount/2))); //assert that Index does NOT have duplicate data
}
int docsCount = 7000;
float duplicateRatio1 = 0.6f;
float duplicateRatio2 = 0.35f;
long duplicateValue = randomLongBetween(-10000000L, 10000000L);
Directory dir = newDirectory();
IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(null));
for (int docId = 0; docId < docsCount; docId++) {
Document doc = new Document();
long value = (randomFloat() < duplicateRatio1) ? duplicateValue : randomLongBetween(-10000000L, 10000000L);
long value2 = (randomFloat() < duplicateRatio2) ? duplicateValue : randomLongBetween(-10000000L, 10000000L);
doc.add(new LongPoint("duplicateField", value));
doc.add(new LongPoint("notDuplicateField", value2));
writer.addDocument(doc);
}
writer.close();
final IndexReader reader = DirectoryReader.open(dir);
boolean hasDuplicateData = indexFieldHasDuplicateData(reader, "duplicateField");
boolean hasDuplicateData2 = indexFieldHasDuplicateData(reader, "notDuplicateField");
reader.close();
dir.close();
assertTrue(hasDuplicateData);
assertFalse(hasDuplicateData2);
}

public void testMaxScoreQueryVisitor() {
