Added support for exists query, as defined in Elasticsearch

Field exists does not consider types, only field names. Field capability will have to be handled differently unfortunately. This works by introducing an internal (but normal) "u64" field that stores the postings lists for field existence. For performance/RAM reasons, the field's full path is not stored as a string; instead, we compute a u64 FNV hash of the path from root to leaf. If the hash behaves ideally, then even accounting for the birthday attack, collisions are very unlikely. When dealing with complex JSON with the raw tokenizer, this feature can double the number of tokens we deal with, and has an impact on performance. For this reason, it is added as an opt-in option in the DocMapper. Like Elasticsearch, we only store field existence of indexed fields. Also, in order to handle refinements like expand_dots, we work over the built tantivy Document and reuse the existing resolution logic. On 1.4GB of gharchive (which is close to a worst-case scenario), we see the following performance/index size change: With field_exists enabled - Indexing Throughput: 41 MB/s - Index size: 701M With field_exists disabled - Indexing Throughput: 46 MB/s - Index size: 698M
quickwit-oss · Aug 4, 2023 · 04cc9d3 · 04cc9d3
1 parent 8c2caf5
commit 04cc9d3
Show file tree

Hide file tree

Showing 28 changed files with 480 additions and 35 deletions.
diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock
diff --git a/quickwit/quickwit-common/Cargo.toml b/quickwit/quickwit-common/Cargo.toml
@@ -16,6 +16,7 @@ async-trait = { workspace = true }
 byte-unit = { workspace = true }
 dyn-clone = { workspace = true }
 env_logger = { workspace = true }
+fnv = { workspace = true }
 futures = { workspace = true }
 home = { workspace = true }
 hostname = { workspace = true }

diff --git a/quickwit/quickwit-common/src/lib.rs b/quickwit/quickwit-common/src/lib.rs
@@ -27,6 +27,7 @@ pub mod io;
 mod kill_switch;
 pub mod metrics;
 pub mod net;
+mod path_hasher;
 mod progress;
 pub mod pubsub;
 pub mod rand;
@@ -49,6 +50,7 @@ use std::str::FromStr;
 
 pub use coolid::new_coolid;
 pub use kill_switch::KillSwitch;
+pub use path_hasher::PathHasher;
 pub use progress::{Progress, ProtectedZoneGuard};
 pub use stream_utils::{BoxStream, ServiceStream};
 use tracing::{error, info};

diff --git a/quickwit/quickwit-common/src/path_hasher.rs b/quickwit/quickwit-common/src/path_hasher.rs
@@ -0,0 +1,59 @@
+// Copyright (C) 2023 Quickwit, Inc.
+//
+// Quickwit is offered under the AGPL v3.0 and as commercial software.
+// For commercial licensing, contact us at hello@quickwit.io.
+//
+// AGPL:
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as
+// published by the Free Software Foundation, either version 3 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+use std::hash::Hasher;
+
+/// Incremental hasher for a path made of several byte segments.
+///
+/// Thin wrapper around an `fnv::FnvHasher`: segments are fed one at a time
+/// with [`PathHasher::append`], and the resulting `u64` is read with
+/// [`PathHasher::harvest`]. [`PathHasher::hash_path`] does both in one call.
+#[derive(Default)]
+pub struct PathHasher {
+    hasher: fnv::FnvHasher,
+}
+
+// Hand-written `Clone`: the copy is built by seeding a fresh `FnvHasher` with
+// the value returned by `finish()` on the current one.
+//
+// NOTE(review): presumably `fnv::FnvHasher` does not implement `Clone`, and
+// `finish()` exposes its whole internal state, so that the copy keeps hashing
+// exactly like the original would -- confirm against the `fnv` crate.
+impl Clone for PathHasher {
+    #[inline(always)]
+    fn clone(&self) -> PathHasher {
+        PathHasher {
+            hasher: fnv::FnvHasher::with_key(self.hasher.finish()),
+        }
+    }
+}
+
+impl PathHasher {
+    /// Hashes a whole path in one call.
+    ///
+    /// Starts from a default hasher, appends every segment of `segments` in
+    /// order, and returns the harvested hash. This yields the same value as
+    /// calling [`PathHasher::append`] for each segment on a default
+    /// `PathHasher` and then calling [`PathHasher::harvest`].
+    pub fn hash_path(segments: &[&[u8]]) -> u64 {
+        let mut hasher = Self::default();
+        for segment in segments {
+            hasher.append(segment);
+        }
+        hasher.harvest()
+    }
+
+    /// Appends a new segment to our path.
+    ///
+    /// In order to avoid natural collisions, (e.g. &["ab", "c"] and &["a", "bc"]),
+    /// we write a null byte after each segment as a separator.
+    ///
+    /// Segments that themselves contain a null byte are not protected by this
+    /// scheme: e.g. &["a\0b"] and &["a", "b"] feed the same bytes to the hasher.
+    #[inline]
+    pub fn append(&mut self, payload: &[u8]) {
+        self.hasher.write(payload);
+        self.hasher.write(&[0u8]);
+    }
+
+    /// Returns the hash of everything appended so far.
+    ///
+    /// Takes `&self`, so the hasher is not consumed and further segments can
+    /// still be appended afterwards.
+    #[inline]
+    pub fn harvest(&self) -> u64 {
+        self.hasher.finish()
+    }
+}
diff --git a/quickwit/quickwit-config/src/index_config/mod.rs b/quickwit/quickwit-config/src/index_config/mod.rs
@@ -66,6 +66,8 @@ pub struct DocMapping {
     #[serde(default)]
     pub store_source: bool,
     #[serde(default)]
+    pub field_exists: bool,
+    #[serde(default)]
     pub timestamp_field: Option<String>,
     #[serde_multikey(
         deserializer = Mode::from_parts,
@@ -433,6 +435,7 @@ impl TestableForRegression for IndexConfig {
         )
         .unwrap();
         let doc_mapping = DocMapping {
+            field_exists: true,
             field_mappings: vec![
                 tenant_id_mapping,
                 timestamp_mapping,
@@ -517,6 +520,7 @@ pub fn build_doc_mapper(
 ) -> anyhow::Result<Arc<dyn DocMapper>> {
     let builder = DefaultDocMapperBuilder {
         store_source: doc_mapping.store_source,
+        field_exists: doc_mapping.field_exists,
         default_search_fields: search_settings.default_search_fields.clone(),
         timestamp_field: doc_mapping.timestamp_field.clone(),
         field_mappings: doc_mapping.field_mappings.clone(),

diff --git a/quickwit/quickwit-doc-mapper/Cargo.toml b/quickwit/quickwit-doc-mapper/Cargo.toml
@@ -32,6 +32,7 @@ utoipa = { workspace = true }
 
 quickwit-datetime = { workspace = true }
 quickwit-macros = { workspace = true }
+quickwit-common = { workspace = true }
 quickwit-query = { workspace = true }
 
 [dev-dependencies]