feat(aggregators/metric): Add a top_hits aggregator (#2198)
* feat(aggregators/metric): Implement a top_hits aggregator

* fix: Expose get_fields

* fix: Serializer for top_hits request

Also removes the extraneous third-party serialization helper.

* chore: Avert panic on parsing invalid top_hits query

* refactor: Allow multiple field names from aggregations

* perf: Replace binary heap with TopNComputer

* fix: Avoid comparator inversion by ComparableDoc

* fix: Rank missing field values lower than present values

* refactor: Make KeyOrder a struct

* feat: Rough attempt at docvalue_fields

* feat: Complete stab at docvalue_fields

- Rename "SearchResult*" => "Retrieval*"
- Revert Vec => HashMap for aggregation accessors.
- Split accessors for core aggregation and field retrieval.
- Resolve globbed field names in docvalue_fields retrieval.
- Handle strings/bytes and other column types with DynamicColumn

* test(unit): Add tests for top_hits aggregator

* fix: docfield_value field globbing

* test(unit): Include dynamic fields

* fix: Value -> OwnedValue

* fix: Use OwnedValue's native Null variant

* chore: Improve readability of test asserts

* chore: Remove DocAddress from top_hits result

* docs: Update aggregator doc

* revert: accidental doc test

* chore: enable time macros only for tests

* chore: Apply suggestions from review

* chore: Apply suggestions from review

* fix: Retrieve all values for fields

* test(unit): Update for multi-value retrieval

* chore: Assert term existence

* feat: Include all columns for a column name

Since a (name, type) pair constitutes a unique column.

* fix: Resolve json fields

Introduces a translation step to bridge the gap between ColumnarReader's
null (`\0`) separated JSON field keys and the common `.`-separated keys
used by SegmentReader (see the key-translation sketch after the
agg_req_with_accessor.rs diff below). This should perhaps be the default
behavior of ColumnarReader's public API.

* chore: Address review on mutability

* chore: Rename segment_id to segment_ordinal for instances of SegmentOrdinal

* chore: Revert erroneous grammar change
ditsuke authored Jan 26, 2024
1 parent 9b7f3a5 commit 0e04ec3
Showing 17 changed files with 1,134 additions and 148 deletions.
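For context, here is a minimal sketch of the kind of request the new aggregator accepts, written as the JSON that the aggregation request type deserializes. The aggregation name `recent_docs` and the field names `timestamp`, `id`, and `attributes.*` are hypothetical, and the key shape ("sort", "docvalue_fields") is assumed to follow the Elasticsearch-style convention this PR targets rather than quoted from it.

```rust
// Hedged sketch: deserialize a top_hits request from JSON.
// Names and keys are illustrative, not taken from the PR's tests.
use tantivy::aggregation::agg_req::Aggregations;

fn example_top_hits_request() -> serde_json::Result<Aggregations> {
    serde_json::from_value(serde_json::json!({
        "recent_docs": {
            "top_hits": {
                "size": 2,
                "sort": [{ "timestamp": "desc" }],
                "docvalue_fields": ["id", "attributes.*"]
            }
        }
    }))
}
```

Each sort key becomes a fast-field accessor (hence the plural `get_fast_field_names` in the agg_req.rs diff below), and each `docvalue_fields` entry resolves to one or more dynamic columns, with globs like `attributes.*` expanded against the segment's fields.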
1 change: 1 addition & 0 deletions Cargo.toml
@@ -77,6 +77,7 @@ futures = "0.3.21"
paste = "1.0.11"
more-asserts = "0.3.1"
rand_distr = "0.4.3"
time = { version = "0.3.10", features = ["serde-well-known", "macros"] }

[target.'cfg(not(windows))'.dev-dependencies]
criterion = { version = "0.5", default-features = false }
39 changes: 24 additions & 15 deletions src/aggregation/agg_req.rs
@@ -35,7 +35,7 @@ use super::bucket::{
};
use super::metric::{
AverageAggregation, CountAggregation, MaxAggregation, MinAggregation,
PercentilesAggregationReq, StatsAggregation, SumAggregation,
PercentilesAggregationReq, StatsAggregation, SumAggregation, TopHitsAggregation,
};

/// The top-level aggregation request structure, which contains [`Aggregation`] and their user
@@ -93,7 +93,12 @@ impl Aggregation {
}

fn get_fast_field_names(&self, fast_field_names: &mut HashSet<String>) {
fast_field_names.insert(self.agg.get_fast_field_name().to_string());
fast_field_names.extend(
self.agg
.get_fast_field_names()
.iter()
.map(|s| s.to_string()),
);
fast_field_names.extend(get_fast_field_names(&self.sub_aggregation));
}
}
@@ -147,23 +152,27 @@ pub enum AggregationVariants {
/// Computes the sum of the extracted values.
#[serde(rename = "percentiles")]
Percentiles(PercentilesAggregationReq),
/// Finds the top k values matching some order
#[serde(rename = "top_hits")]
TopHits(TopHitsAggregation),
}

impl AggregationVariants {
/// Returns the name of the field used by the aggregation.
pub fn get_fast_field_name(&self) -> &str {
/// Returns the names of the fields used by the aggregation.
pub fn get_fast_field_names(&self) -> Vec<&str> {
match self {
AggregationVariants::Terms(terms) => terms.field.as_str(),
AggregationVariants::Range(range) => range.field.as_str(),
AggregationVariants::Histogram(histogram) => histogram.field.as_str(),
AggregationVariants::DateHistogram(histogram) => histogram.field.as_str(),
AggregationVariants::Average(avg) => avg.field_name(),
AggregationVariants::Count(count) => count.field_name(),
AggregationVariants::Max(max) => max.field_name(),
AggregationVariants::Min(min) => min.field_name(),
AggregationVariants::Stats(stats) => stats.field_name(),
AggregationVariants::Sum(sum) => sum.field_name(),
AggregationVariants::Percentiles(per) => per.field_name(),
AggregationVariants::Terms(terms) => vec![terms.field.as_str()],
AggregationVariants::Range(range) => vec![range.field.as_str()],
AggregationVariants::Histogram(histogram) => vec![histogram.field.as_str()],
AggregationVariants::DateHistogram(histogram) => vec![histogram.field.as_str()],
AggregationVariants::Average(avg) => vec![avg.field_name()],
AggregationVariants::Count(count) => vec![count.field_name()],
AggregationVariants::Max(max) => vec![max.field_name()],
AggregationVariants::Min(min) => vec![min.field_name()],
AggregationVariants::Stats(stats) => vec![stats.field_name()],
AggregationVariants::Sum(sum) => vec![sum.field_name()],
AggregationVariants::Percentiles(per) => vec![per.field_name()],
AggregationVariants::TopHits(top_hits) => top_hits.field_names(),
}
}

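The switch from `get_fast_field_name` to `get_fast_field_names` above exists because a single `top_hits` aggregation can reference several fast fields at once (its sort keys plus any `docvalue_fields`). A minimal sketch of how a caller might gather every referenced field, mirroring the `extend` loop in the hunk above; the helper name is hypothetical and sub-aggregations are ignored for brevity:

```rust
use std::collections::HashSet;

use tantivy::aggregation::agg_req::Aggregations;

// Hedged sketch: collect the fast-field names referenced by the top level of
// an aggregation request. Sub-aggregations are skipped to keep it short.
fn referenced_fast_fields(aggs: &Aggregations) -> HashSet<String> {
    aggs.values()
        .flat_map(|agg| agg.agg.get_fast_field_names())
        .map(str::to_string)
        .collect()
}
```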
181 changes: 133 additions & 48 deletions src/aggregation/agg_req_with_accessor.rs
@@ -1,6 +1,9 @@
//! This will enhance the request tree with access to the fastfield and metadata.

use columnar::{Column, ColumnBlockAccessor, ColumnType, StrColumn};
use std::collections::HashMap;
use std::io;

use columnar::{Column, ColumnBlockAccessor, ColumnType, DynamicColumn, StrColumn};

use super::agg_limits::ResourceLimitGuard;
use super::agg_req::{Aggregation, AggregationVariants, Aggregations};
@@ -14,7 +17,7 @@ use super::metric::{
use super::segment_agg_result::AggregationLimits;
use super::VecWithNames;
use crate::aggregation::{f64_to_fastfield_u64, Key};
use crate::SegmentReader;
use crate::{SegmentOrdinal, SegmentReader};

#[derive(Default)]
pub(crate) struct AggregationsWithAccessor {
@@ -32,6 +35,7 @@ impl AggregationsWithAccessor {
}

pub struct AggregationWithAccessor {
pub(crate) segment_ordinal: SegmentOrdinal,
/// In general there can be buckets without fast field access, e.g. buckets that are created
/// based on search terms. That is not the case currently, but eventually this needs to be
/// Option or moved.
@@ -44,10 +48,16 @@ pub struct AggregationWithAccessor {
pub(crate) limits: ResourceLimitGuard,
pub(crate) column_block_accessor: ColumnBlockAccessor<u64>,
/// Used for missing term aggregation, which checks all columns for existence.
/// And also for `top_hits` aggregation, which may sort on multiple fields.
/// By convention the missing aggregation is chosen when this property is set
/// (instead of being set in `agg`).
/// If this needs to be used by other aggregations, we need to refactor this.
pub(crate) accessors: Vec<Column<u64>>,
// NOTE: we can make all other aggregations use this instead of the `accessor` and `field_type`
// (making them obsolete), but will it have a performance impact?
pub(crate) accessors: Vec<(Column<u64>, ColumnType)>,
/// Map field names to all associated column accessors.
/// This field is used for `docvalue_fields`, which is currently only supported for `top_hits`.
pub(crate) value_accessors: HashMap<String, Vec<DynamicColumn>>,
pub(crate) agg: Aggregation,
}

@@ -57,19 +67,55 @@ impl AggregationWithAccessor {
agg: &Aggregation,
sub_aggregation: &Aggregations,
reader: &SegmentReader,
segment_ordinal: SegmentOrdinal,
limits: AggregationLimits,
) -> crate::Result<Vec<AggregationWithAccessor>> {
let add_agg_with_accessor = |accessor: Column<u64>,
let mut agg = agg.clone();

let add_agg_with_accessor = |agg: &Aggregation,
accessor: Column<u64>,
column_type: ColumnType,
aggs: &mut Vec<AggregationWithAccessor>|
-> crate::Result<()> {
let res = AggregationWithAccessor {
segment_ordinal,
accessor,
accessors: Vec::new(),
accessors: Default::default(),
value_accessors: Default::default(),
field_type: column_type,
sub_aggregation: get_aggs_with_segment_accessor_and_validate(
sub_aggregation,
reader,
segment_ordinal,
&limits,
)?,
agg: agg.clone(),
limits: limits.new_guard(),
missing_value_for_accessor: None,
str_dict_column: None,
column_block_accessor: Default::default(),
};
aggs.push(res);
Ok(())
};

let add_agg_with_accessors = |agg: &Aggregation,
accessors: Vec<(Column<u64>, ColumnType)>,
aggs: &mut Vec<AggregationWithAccessor>,
value_accessors: HashMap<String, Vec<DynamicColumn>>|
-> crate::Result<()> {
let (accessor, field_type) = accessors.first().expect("at least one accessor");
let res = AggregationWithAccessor {
segment_ordinal,
// TODO: We should do away with the `accessor` field altogether
accessor: accessor.clone(),
value_accessors,
field_type: *field_type,
accessors,
sub_aggregation: get_aggs_with_segment_accessor_and_validate(
sub_aggregation,
reader,
segment_ordinal,
&limits,
)?,
agg: agg.clone(),
@@ -84,32 +130,36 @@ impl AggregationWithAccessor {

let mut res: Vec<AggregationWithAccessor> = Vec::new();
use AggregationVariants::*;
match &agg.agg {

match agg.agg {
Range(RangeAggregation {
field: field_name, ..
field: ref field_name,
..
}) => {
let (accessor, column_type) =
get_ff_reader(reader, field_name, Some(get_numeric_or_date_column_types()))?;
add_agg_with_accessor(accessor, column_type, &mut res)?;
add_agg_with_accessor(&agg, accessor, column_type, &mut res)?;
}
Histogram(HistogramAggregation {
field: field_name, ..
field: ref field_name,
..
}) => {
let (accessor, column_type) =
get_ff_reader(reader, field_name, Some(get_numeric_or_date_column_types()))?;
add_agg_with_accessor(accessor, column_type, &mut res)?;
add_agg_with_accessor(&agg, accessor, column_type, &mut res)?;
}
DateHistogram(DateHistogramAggregationReq {
field: field_name, ..
field: ref field_name,
..
}) => {
let (accessor, column_type) =
// Only DateTime is supported for DateHistogram
get_ff_reader(reader, field_name, Some(&[ColumnType::DateTime]))?;
add_agg_with_accessor(accessor, column_type, &mut res)?;
add_agg_with_accessor(&agg, accessor, column_type, &mut res)?;
}
Terms(TermsAggregation {
field: field_name,
missing,
field: ref field_name,
ref missing,
..
}) => {
let str_dict_column = reader.fast_fields().str(field_name)?;
@@ -162,24 +212,11 @@ impl AggregationWithAccessor {
let column_and_types =
get_all_ff_reader_or_empty(reader, field_name, None, fallback_type)?;

let accessors: Vec<Column> =
column_and_types.iter().map(|(a, _)| a.clone()).collect();
let agg_wit_acc = AggregationWithAccessor {
missing_value_for_accessor: None,
accessor: accessors[0].clone(),
accessors,
field_type: ColumnType::U64,
sub_aggregation: get_aggs_with_segment_accessor_and_validate(
sub_aggregation,
reader,
&limits,
)?,
agg: agg.clone(),
str_dict_column: str_dict_column.clone(),
limits: limits.new_guard(),
column_block_accessor: Default::default(),
};
res.push(agg_wit_acc);
let accessors = column_and_types
.iter()
.map(|c_t| (c_t.0.clone(), c_t.1))
.collect();
add_agg_with_accessors(&agg, accessors, &mut res, Default::default())?;
}

for (accessor, column_type) in column_and_types {
@@ -189,21 +226,25 @@ impl AggregationWithAccessor {
missing.clone()
};

let missing_value_for_accessor =
if let Some(missing) = missing_value_term_agg.as_ref() {
get_missing_val(column_type, missing, agg.agg.get_fast_field_name())?
} else {
None
};
let missing_value_for_accessor = if let Some(missing) =
missing_value_term_agg.as_ref()
{
get_missing_val(column_type, missing, agg.agg.get_fast_field_names()[0])?
} else {
None
};

let agg = AggregationWithAccessor {
segment_ordinal,
missing_value_for_accessor,
accessor,
accessors: Vec::new(),
accessors: Default::default(),
value_accessors: Default::default(),
field_type: column_type,
sub_aggregation: get_aggs_with_segment_accessor_and_validate(
sub_aggregation,
reader,
segment_ordinal,
&limits,
)?,
agg: agg.clone(),
@@ -215,34 +256,63 @@ impl AggregationWithAccessor {
}
}
Average(AverageAggregation {
field: field_name, ..
field: ref field_name,
..
})
| Count(CountAggregation {
field: field_name, ..
field: ref field_name,
..
})
| Max(MaxAggregation {
field: field_name, ..
field: ref field_name,
..
})
| Min(MinAggregation {
field: field_name, ..
field: ref field_name,
..
})
| Stats(StatsAggregation {
field: field_name, ..
field: ref field_name,
..
})
| Sum(SumAggregation {
field: field_name, ..
field: ref field_name,
..
}) => {
let (accessor, column_type) =
get_ff_reader(reader, field_name, Some(get_numeric_or_date_column_types()))?;
add_agg_with_accessor(accessor, column_type, &mut res)?;
add_agg_with_accessor(&agg, accessor, column_type, &mut res)?;
}
Percentiles(percentiles) => {
Percentiles(ref percentiles) => {
let (accessor, column_type) = get_ff_reader(
reader,
percentiles.field_name(),
Some(get_numeric_or_date_column_types()),
)?;
add_agg_with_accessor(accessor, column_type, &mut res)?;
add_agg_with_accessor(&agg, accessor, column_type, &mut res)?;
}
TopHits(ref mut top_hits) => {
top_hits.validate_and_resolve(reader.fast_fields().columnar())?;
let accessors: Vec<(Column<u64>, ColumnType)> = top_hits
.field_names()
.iter()
.map(|field| {
get_ff_reader(reader, field, Some(get_numeric_or_date_column_types()))
})
.collect::<crate::Result<_>>()?;

let value_accessors = top_hits
.value_field_names()
.iter()
.map(|field_name| {
Ok((
field_name.to_string(),
get_dynamic_columns(reader, field_name)?,
))
})
.collect::<crate::Result<_>>()?;

add_agg_with_accessors(&agg, accessors, &mut res, value_accessors)?;
}
};

@@ -284,6 +354,7 @@ fn get_numeric_or_date_column_types() -> &'static [ColumnType] {
pub(crate) fn get_aggs_with_segment_accessor_and_validate(
aggs: &Aggregations,
reader: &SegmentReader,
segment_ordinal: SegmentOrdinal,
limits: &AggregationLimits,
) -> crate::Result<AggregationsWithAccessor> {
let mut aggss = Vec::new();
Expand All @@ -292,6 +363,7 @@ pub(crate) fn get_aggs_with_segment_accessor_and_validate(
agg,
agg.sub_aggregation(),
reader,
segment_ordinal,
limits.clone(),
)?;
for agg in aggs {
@@ -321,6 +393,19 @@ fn get_ff_reader(
Ok(ff_field_with_type)
}

fn get_dynamic_columns(
reader: &SegmentReader,
field_name: &str,
) -> crate::Result<Vec<columnar::DynamicColumn>> {
let ff_fields = reader.fast_fields().dynamic_column_handles(field_name)?;
let cols = ff_fields
.iter()
.map(|h| h.open())
.collect::<io::Result<_>>()?;
assert!(!ff_fields.is_empty(), "field {} not found", field_name);
Ok(cols)
}

/// Get all fast field readers or empty as default.
///
/// Is guaranteed to return at least one column.
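Two notes on the retrieval plumbing above. First, `get_dynamic_columns` collects every column handle for a name because a (name, type) pair, not the name alone, identifies a unique column. Second, the commit message's point about JSON field keys is that the columnar storage separates nested JSON path segments with a null byte while user-facing field names use `.`; below is a minimal, hypothetical sketch of that translation (the constant and helper are illustrative, not the code this PR adds):

```rust
// Hedged sketch of the key translation described in the commit message:
// a columnar JSON key like "attributes\0color" maps to the user-facing
// "attributes.color". The separator value follows the commit message's
// description; the helper itself is illustrative.
const COLUMNAR_JSON_KEY_SEP: char = '\0';

fn columnar_key_to_user_key(columnar_key: &str) -> String {
    columnar_key.replace(COLUMNAR_JSON_KEY_SEP, ".")
}

fn main() {
    assert_eq!(columnar_key_to_user_key("attributes\0color"), "attributes.color");
    // Non-JSON field names contain no separator and pass through unchanged.
    assert_eq!(columnar_key_to_user_key("timestamp"), "timestamp");
}
```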