Remove filter rewrite optimization for range aggregations when segment is not effective match all (opensearch-project#15194)

---------

Signed-off-by: Finn Carroll <[email protected]>
wdongyu · Aug 22, 2024 · 00bf0d8 · 00bf0d8
1 parent 3788330
commit 00bf0d8
Show file tree

Hide file tree

Showing 8 changed files with 320 additions and 22 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -49,6 +49,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 - Fix delete index template failed when the index template matches a data stream but is unused ([#15080](https://github.com/opensearch-project/OpenSearch/pull/15080))
 - Fix array_index_out_of_bounds_exception when indexing documents with field name containing only dot ([#15126](https://github.com/opensearch-project/OpenSearch/pull/15126))
 - Fixed array field name omission in flat_object function for nested JSON ([#13620](https://github.com/opensearch-project/OpenSearch/pull/13620))
+- Fix range aggregation optimization ignoring top level queries ([#15194](https://github.com/opensearch-project/OpenSearch/pull/15194))
 
 ### Security
 

diff --git a/...-api-spec/src/main/resources/rest-api-spec/test/search.aggregation/360_date_histogram.yml b/...-api-spec/src/main/resources/rest-api-spec/test/search.aggregation/360_date_histogram.yml
@@ -61,3 +61,94 @@ setup:
   - match: { aggregations.histo.buckets.8.doc_count: 1 }
   - match: { aggregations.histo.buckets.12.key_as_string: "2016-06-01T00:00:00.000Z" }
   - match: { aggregations.histo.buckets.12.doc_count: 1 }
+
+---
+"Date histogram aggregation w/ filter query test":
+  - skip:
+      version: " - 2.99.99"
+      reason: Backport fix to 2.16
+
+  - do:
+      bulk:
+        refresh: true
+        index: dhisto-agg-w-query
+        body:
+          - '{"index": {}}'
+          - '{"routing": "route1", "date": "2024-08-12", "dow": "monday"}'
+          - '{"index": {}}'
+          - '{"routing": "route1", "date": "2024-08-14", "dow": "wednesday"}'
+          - '{"index": {}}'
+          - '{"routing": "route1", "date": "2024-08-19", "dow": "monday"}'
+          - '{"index": {}}'
+          - '{"routing": "route2", "date": "2024-08-13", "dow": "tuesday"}'
+          - '{"index": {}}'
+          - '{"routing": "route2", "date": "2024-08-15", "dow": "thursday"}'
+
+  - do:
+      search:
+        index: dhisto-agg-w-query
+        body:
+          query:
+            bool:
+              must:
+                match_all: {}
+              filter:
+                - terms:
+                    routing:
+                      - "route1"
+          aggregations:
+            weekHisto:
+              date_histogram:
+                field: date
+                calendar_interval: week
+          _source: false
+
+  - match: { hits.total.value: 3 }
+  - match: { aggregations.weekHisto.buckets.0.doc_count: 2 }
+  - match: { aggregations.weekHisto.buckets.1.doc_count: 1 }
+
+---
+"Date histogram aggregation w/ shared field range test":
+  - do:
+      bulk:
+        refresh: true
+        index: dhisto-agg-w-query
+        body:
+          - '{"index": {}}'
+          - '{"date": "2024-10-31"}'
+          - '{"index": {}}'
+          - '{"date": "2024-11-11"}'
+          - '{"index": {}}'
+          - '{"date": "2024-11-28"}'
+          - '{"index": {}}'
+          - '{"date": "2024-12-25"}'
+          - '{"index": {}}'
+          - '{"date": "2025-01-01"}'
+          - '{"index": {}}'
+          - '{"date": "2025-02-14"}'
+
+  - do:
+      search:
+        index: dhisto-agg-w-query
+        body:
+          profile: true
+          query:
+            range:
+              date:
+                gte: "2024-01-01"
+                lt: "2025-01-01"
+          aggregations:
+            monthHisto:
+              date_histogram:
+                field: date
+                calendar_interval: month
+          _source: false
+
+  - match: { hits.total.value: 4 }
+  - match: { aggregations.monthHisto.buckets.0.doc_count: 1 }
+  - match: { aggregations.monthHisto.buckets.1.doc_count: 2 }
+  - match: { aggregations.monthHisto.buckets.2.doc_count: 1 }
+  - match: { profile.shards.0.aggregations.0.debug.optimized_segments: 1 }
+  - match: { profile.shards.0.aggregations.0.debug.unoptimized_segments: 0 }
+  - match: { profile.shards.0.aggregations.0.debug.leaf_visited: 0 }
+  - match: { profile.shards.0.aggregations.0.debug.inner_visited: 0 }
diff --git a/rest-api-spec/src/main/resources/rest-api-spec/test/search.aggregation/40_range.yml b/rest-api-spec/src/main/resources/rest-api-spec/test/search.aggregation/40_range.yml
@@ -673,3 +673,82 @@ setup:
   - match: { aggregations.my_range.buckets.3.from: 1.5 }
   - is_false:  aggregations.my_range.buckets.3.to
   - match: { aggregations.my_range.buckets.3.doc_count: 2 }
+
+---
+"Filter query w/ aggregation test":
+  - skip:
+      version: " - 2.99.99"
+      reason: Backport fix to 2.16
+
+  - do:
+      bulk:
+        refresh: true
+        index: range-agg-w-query
+        body:
+          - '{"index": {}}'
+          - '{"routing": "route1", "v": -10, "date": "2024-10-29"}'
+          - '{"index": {}}'
+          - '{"routing": "route1", "v": -5, "date": "2024-10-30"}'
+          - '{"index": {}}'
+          - '{"routing": "route1", "v": 10, "date": "2024-10-31"}'
+          - '{"index": {}}'
+          - '{"routing": "route2", "v": 15, "date": "2024-11-01"}'
+          - '{"index": {}}'
+          - '{"routing": "route2", "v": 20, "date": "2024-11-02"}'
+
+  - do:
+      search:
+        index: range-agg-w-query
+        body:
+          query:
+            bool:
+              must:
+                match_all: {}
+              filter:
+                - terms:
+                    routing:
+                      - "route1"
+          aggregations:
+            NegPosAgg:
+              range:
+                field: v
+                keyed: true
+                ranges:
+                  - to: 0
+                    key: "0"
+                  - from: 0
+                    key: "1"
+          _source: false
+
+  - match: { hits.total.value: 3 }
+  - match: { aggregations.NegPosAgg.buckets.0.doc_count: 2 }
+  - match: { aggregations.NegPosAgg.buckets.1.doc_count: 1 }
+
+  - do:
+      search:
+        index: range-agg-w-query
+        body:
+          query:
+            bool:
+              must:
+                match_all: {}
+              filter:
+                - terms:
+                    routing:
+                      - "route1"
+          aggregations:
+            HalloweenAgg:
+              date_range:
+                field: date
+                format: "yyyy-MM-dd"
+                keyed: true
+                ranges:
+                  - to: "2024-11-01"
+                    key: "to-october"
+                  - from: "2024-11-01"
+                    key: "from-november"
+          _source: false
+
+  - match: { hits.total.value: 3 }
+  - match: { aggregations.HalloweenAgg.buckets.to-october.doc_count: 3 }
+  - match: { aggregations.HalloweenAgg.buckets.from-november.doc_count: 0 }
diff --git a/...c/main/java/org/opensearch/search/aggregations/bucket/filterrewrite/AggregatorBridge.java b/...c/main/java/org/opensearch/search/aggregations/bucket/filterrewrite/AggregatorBridge.java
@@ -10,7 +10,10 @@
 
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.PointValues;
+import org.apache.lucene.search.ScoreMode;
+import org.apache.lucene.search.Weight;
 import org.opensearch.index.mapper.MappedFieldType;
+import org.opensearch.search.internal.SearchContext;
 
 import java.io.IOException;
 import java.util.function.BiConsumer;
@@ -81,4 +84,19 @@ abstract FilterRewriteOptimizationContext.DebugInfo tryOptimize(
         BiConsumer<Long, Long> incrementDocCount,
         Ranges ranges
     ) throws IOException;
+
+    /**
+     * Checks whether the top level query matches all documents on the segment.
+     * <p>This method rewrites the search context's query, creates a non-scoring weight from it, and checks
+     * whether the weight's document count for the segment equals {@code leafCtx.reader().numDocs()}.
+     *
+     * @param ctx      the search context providing the query and the searcher
+     * @param leafCtx  the leaf reader context for the segment
+     * @return {@code true} if the weight is non-null and its count equals the segment's document count
+     * @throws IOException propagated from rewriting the query, creating the weight, or counting
+     */
+    public static boolean segmentMatchAll(SearchContext ctx, LeafReaderContext leafCtx) throws IOException {
+        Weight weight = ctx.query().rewrite(ctx.searcher()).createWeight(ctx.searcher(), ScoreMode.COMPLETE_NO_SCORES, 1f);
+        return weight != null && weight.count(leafCtx) == leafCtx.reader().numDocs();
+    }
 }
diff --git a/...rg/opensearch/search/aggregations/bucket/filterrewrite/DateHistogramAggregatorBridge.java b/...rg/opensearch/search/aggregations/bucket/filterrewrite/DateHistogramAggregatorBridge.java
@@ -11,8 +11,6 @@
 import org.apache.lucene.document.LongPoint;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.PointValues;
-import org.apache.lucene.search.ScoreMode;
-import org.apache.lucene.search.Weight;
 import org.opensearch.common.Rounding;
 import org.opensearch.index.mapper.DateFieldMapper;
 import org.opensearch.index.mapper.MappedFieldType;
@@ -156,19 +154,4 @@ private static long getBucketOrd(long bucketOrd) {
     * Provides a function to produce bucket ordinals from the lower bound of the range
     */
     protected abstract Function<Long, Long> bucketOrdProducer();
-
-    /**
-     * Checks whether the top level query matches all documents on the segment
-     *
-     * <p>This method creates a weight from the search context's query and checks whether the weight's
-     * document count matches the total number of documents in the leaf reader context.
-     *
-     * @param ctx      the search context
-     * @param leafCtx  the leaf reader context for the segment
-     * @return {@code true} if the segment matches all documents, {@code false} otherwise
-     */
-    public static boolean segmentMatchAll(SearchContext ctx, LeafReaderContext leafCtx) throws IOException {
-        Weight weight = ctx.query().rewrite(ctx.searcher()).createWeight(ctx.searcher(), ScoreMode.COMPLETE_NO_SCORES, 1f);
-        return weight != null && weight.count(leafCtx) == leafCtx.reader().numDocs();
-    }
 }
diff --git a/...n/java/org/opensearch/search/aggregations/bucket/filterrewrite/RangeAggregatorBridge.java b/...n/java/org/opensearch/search/aggregations/bucket/filterrewrite/RangeAggregatorBridge.java
@@ -80,7 +80,6 @@ final FilterRewriteOptimizationContext.DebugInfo tryOptimize(
         Ranges ranges
     ) throws IOException {
         int size = Integer.MAX_VALUE;
-
         BiConsumer<Integer, Integer> incrementFunc = (activeIndex, docCount) -> {
             long bucketOrd = bucketOrdProducer().apply(activeIndex);
             incrementDocCount.accept(bucketOrd, (long) docCount);

diff --git a/server/src/main/java/org/opensearch/search/aggregations/bucket/range/RangeAggregator.java b/server/src/main/java/org/opensearch/search/aggregations/bucket/range/RangeAggregator.java
@@ -70,6 +70,7 @@
 import java.util.function.Function;
 
 import static org.opensearch.core.xcontent.ConstructingObjectParser.optionalConstructorArg;
+import static org.opensearch.search.aggregations.bucket.filterrewrite.AggregatorBridge.segmentMatchAll;
 
 /**
  * Aggregate all docs that match given ranges.
@@ -310,8 +311,9 @@ public ScoreMode scoreMode() {
 
     @Override
     public LeafBucketCollector getLeafCollector(LeafReaderContext ctx, final LeafBucketCollector sub) throws IOException {
-        boolean optimized = filterRewriteOptimizationContext.tryOptimize(ctx, this::incrementBucketDocCount, false);
-        if (optimized) throw new CollectionTerminatedException();
+        if (segmentMatchAll(context, ctx) && filterRewriteOptimizationContext.tryOptimize(ctx, this::incrementBucketDocCount, false)) {
+            throw new CollectionTerminatedException();
+        }
 
         final SortedNumericDoubleValues values = valuesSource.doubleValues(ctx);
         return new LeafBucketCollectorBase(sub, values) {