Skip to content

Commit

Permalink
Add support for custom tokenizers ngram and regex. (#3575)
Browse files Browse the repository at this point in the history
* Add support for custom tokenizers ngram and regex.

* Update backward compatibility tests.

* Update tantivy version and tokenizers.

* Improve tests and add endpoint to analyze text.

* Better error on analyze endpoint.

* Fix from reviews.
  • Loading branch information
fmassot committed Jul 4, 2023
1 parent 25c30d8 commit 50a6e71
Show file tree
Hide file tree
Showing 43 changed files with 1,147 additions and 209 deletions.
29 changes: 14 additions & 15 deletions quickwit/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion quickwit/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,7 @@ quickwit-serve = { version = "0.6.1", path = "./quickwit-serve" }
quickwit-storage = { version = "0.6.1", path = "./quickwit-storage" }
quickwit-telemetry = { version = "0.6.1", path = "./quickwit-telemetry" }

tantivy = { git = "https://github.com/quickwit-oss/tantivy/", rev = "924fc70", default-features = false, features = [
tantivy = { git = "https://github.com/quickwit-oss/tantivy/", rev = "3c30066", default-features = false, features = [
"mmap",
"lz4-compression",
"zstd-compression",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,13 @@
"index_id": "hdfs-logs",
"index_uri": "s3://quickwit-indexes/hdfs-logs",
"doc_mapping": {
"tokenizers": [
{
"name": "service_regex",
"type": "regex",
"pattern": "\\w*"
}
],
"field_mappings": [
{
"name": "tenant_id",
Expand Down Expand Up @@ -33,7 +40,7 @@
{
"name": "service",
"type": "text",
"tokenizer": "raw"
"tokenizer": "service_regex"
}
]
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,15 @@ index_id = "hdfs-logs"
index_uri = "s3://quickwit-indexes/hdfs-logs"

[doc_mapping]
tokenizers = [
{ name = "service_regex", type = "regex", pattern = "\\w*" },
]
field_mappings = [
{ name = "tenant_id", type = "u64", fast = true },
{ name = "timestamp", type = "datetime", fast = true },
{ name = "severity_text", type = "text", tokenizer = "raw" },
{ name = "body", type = "text", tokenizer = "default", record = "position" },
{ name = "resource", type = "object", field_mappings = [ { name = "service", type = "text", tokenizer = "raw" } ] },
{ name = "resource", type = "object", field_mappings = [ { name = "service", type = "text", tokenizer = "service_regex" } ] },
]
tag_fields = [ "tenant_id" ]
store_source = true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@ index_id: hdfs-logs
index_uri: s3://quickwit-indexes/hdfs-logs

doc_mapping:
tokenizers:
- name: service_regex
type: regex
pattern: "\\w*"
field_mappings:
- name: tenant_id
type: u64
Expand All @@ -22,7 +26,7 @@ doc_mapping:
field_mappings:
- name: service
type: text
tokenizer: raw
tokenizer: service_regex
tag_fields: [tenant_id]
timestamp_field: timestamp
store_source: true
Expand Down
16 changes: 15 additions & 1 deletion quickwit/quickwit-config/src/index_config/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ use humantime::parse_duration;
use quickwit_common::uri::Uri;
use quickwit_doc_mapper::{
DefaultDocMapper, DefaultDocMapperBuilder, DocMapper, FieldMappingEntry, ModeType,
QuickwitJsonOptions,
QuickwitJsonOptions, TokenizerEntry,
};
use serde::{Deserialize, Serialize};
pub use serialize::load_index_config_from_user_config;
Expand Down Expand Up @@ -76,6 +76,8 @@ pub struct DocMapping {
#[schema(value_type = u32)]
#[serde(default = "DefaultDocMapper::default_max_num_partitions")]
pub max_num_partitions: NonZeroU32,
#[serde(default)]
pub tokenizers: Vec<TokenizerEntry>,
}

#[derive(Clone, Debug, Serialize, Deserialize, utoipa::ToSchema)]
Expand Down Expand Up @@ -414,6 +416,14 @@ impl TestableForRegression for IndexConfig {
}"#,
)
.unwrap();
let tokenizer = serde_json::from_str(
r#"{
"name": "custom_tokenizer",
"type": "regex",
"pattern": "[^\\p{L}\\p{N}]+"
}"#,
)
.unwrap();
let doc_mapping = DocMapping {
field_mappings: vec![
tenant_id_mapping,
Expand All @@ -431,6 +441,7 @@ impl TestableForRegression for IndexConfig {
partition_key: Some("tenant_id".to_string()),
max_num_partitions: NonZeroU32::new(100).unwrap(),
timestamp_field: Some("timestamp".to_string()),
tokenizers: vec![tokenizer],
};
let retention_policy = Some(RetentionPolicy::new(
"90 days".to_string(),
Expand Down Expand Up @@ -507,6 +518,7 @@ pub fn build_doc_mapper(
dynamic_mapping: doc_mapping.dynamic_mapping.clone(),
partition_key: doc_mapping.partition_key.clone(),
max_num_partitions: doc_mapping.max_num_partitions,
tokenizers: doc_mapping.tokenizers.clone(),
};
Ok(Arc::new(builder.try_build()?))
}
Expand Down Expand Up @@ -539,6 +551,8 @@ mod tests {
&Uri::from_well_formed("s3://defaultbucket/"),
)
.unwrap();
assert_eq!(index_config.doc_mapping.tokenizers.len(), 1);
assert_eq!(index_config.doc_mapping.tokenizers[0].name, "service_regex");
assert_eq!(index_config.doc_mapping.field_mappings.len(), 5);
assert_eq!(index_config.doc_mapping.field_mappings[0].name, "tenant_id");
assert_eq!(index_config.doc_mapping.field_mappings[1].name, "timestamp");
Expand Down
Loading

0 comments on commit 50a6e71

Please sign in to comment.