Skip to content

Commit

Permalink
Add support for custom tokenizers ngram and regex. (#3575)
Browse files Browse the repository at this point in the history
* Add support for custom tokenizers ngram and regex.

* Update backward compatibility tests.

* Update tantivy version and tokenizers.

* Improve tests and add endpoint to analyze text.

* Better error on analyze endpoint.

* Fix from reviews.
  • Loading branch information
fmassot committed Jul 4, 2023
1 parent 25c30d8 commit 50a6e71
Show file tree
Hide file tree
Showing 43 changed files with 1,147 additions and 209 deletions.
29 changes: 14 additions & 15 deletions quickwit/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion quickwit/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,7 @@ quickwit-serve = { version = "0.6.1", path = "./quickwit-serve" }
quickwit-storage = { version = "0.6.1", path = "./quickwit-storage" }
quickwit-telemetry = { version = "0.6.1", path = "./quickwit-telemetry" }

tantivy = { git = "https://github.com/quickwit-oss/tantivy/", rev = "924fc70", default-features = false, features = [
tantivy = { git = "https://github.com/quickwit-oss/tantivy/", rev = "3c30066", default-features = false, features = [
"mmap",
"lz4-compression",
"zstd-compression",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,13 @@
"index_id": "hdfs-logs",
"index_uri": "s3://quickwit-indexes/hdfs-logs",
"doc_mapping": {
"tokenizers": [
{
"name": "service_regex",
"type": "regex",
"pattern": "\\w*"
}
],
"field_mappings": [
{
"name": "tenant_id",
Expand Down Expand Up @@ -33,7 +40,7 @@
{
"name": "service",
"type": "text",
"tokenizer": "raw"
"tokenizer": "service_regex"
}
]
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,15 @@ index_id = "hdfs-logs"
index_uri = "s3://quickwit-indexes/hdfs-logs"

[doc_mapping]
tokenizers = [
{ name = "service_regex", type = "regex", pattern = "\\w*" },
]
field_mappings = [
{ name = "tenant_id", type = "u64", fast = true },
{ name = "timestamp", type = "datetime", fast = true },
{ name = "severity_text", type = "text", tokenizer = "raw" },
{ name = "body", type = "text", tokenizer = "default", record = "position" },
{ name = "resource", type = "object", field_mappings = [ { name = "service", type = "text", tokenizer = "raw" } ] },
{ name = "resource", type = "object", field_mappings = [ { name = "service", type = "text", tokenizer = "service_regex" } ] },
]
tag_fields = [ "tenant_id" ]
store_source = true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@ index_id: hdfs-logs
index_uri: s3://quickwit-indexes/hdfs-logs

doc_mapping:
tokenizers:
- name: service_regex
type: regex
pattern: "\\w*"
field_mappings:
- name: tenant_id
type: u64
Expand All @@ -22,7 +26,7 @@ doc_mapping:
field_mappings:
- name: service
type: text
tokenizer: raw
tokenizer: service_regex
tag_fields: [tenant_id]
timestamp_field: timestamp
store_source: true
Expand Down
16 changes: 15 additions & 1 deletion quickwit/quickwit-config/src/index_config/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ use humantime::parse_duration;
use quickwit_common::uri::Uri;
use quickwit_doc_mapper::{
DefaultDocMapper, DefaultDocMapperBuilder, DocMapper, FieldMappingEntry, ModeType,
QuickwitJsonOptions,
QuickwitJsonOptions, TokenizerEntry,
};
use serde::{Deserialize, Serialize};
pub use serialize::load_index_config_from_user_config;
Expand Down Expand Up @@ -76,6 +76,8 @@ pub struct DocMapping {
#[schema(value_type = u32)]
#[serde(default = "DefaultDocMapper::default_max_num_partitions")]
pub max_num_partitions: NonZeroU32,
#[serde(default)]
pub tokenizers: Vec<TokenizerEntry>,
}

#[derive(Clone, Debug, Serialize, Deserialize, utoipa::ToSchema)]
Expand Down Expand Up @@ -414,6 +416,14 @@ impl TestableForRegression for IndexConfig {
}"#,
)
.unwrap();
let tokenizer = serde_json::from_str(
r#"{
"name": "custom_tokenizer",
"type": "regex",
"pattern": "[^\\p{L}\\p{N}]+"
}"#,
)
.unwrap();
let doc_mapping = DocMapping {
field_mappings: vec![
tenant_id_mapping,
Expand All @@ -431,6 +441,7 @@ impl TestableForRegression for IndexConfig {
partition_key: Some("tenant_id".to_string()),
max_num_partitions: NonZeroU32::new(100).unwrap(),
timestamp_field: Some("timestamp".to_string()),
tokenizers: vec![tokenizer],
};
let retention_policy = Some(RetentionPolicy::new(
"90 days".to_string(),
Expand Down Expand Up @@ -507,6 +518,7 @@ pub fn build_doc_mapper(
dynamic_mapping: doc_mapping.dynamic_mapping.clone(),
partition_key: doc_mapping.partition_key.clone(),
max_num_partitions: doc_mapping.max_num_partitions,
tokenizers: doc_mapping.tokenizers.clone(),
};
Ok(Arc::new(builder.try_build()?))
}
Expand Down Expand Up @@ -539,6 +551,8 @@ mod tests {
&Uri::from_well_formed("s3://defaultbucket/"),
)
.unwrap();
assert_eq!(index_config.doc_mapping.tokenizers.len(), 1);
assert_eq!(index_config.doc_mapping.tokenizers[0].name, "service_regex");
assert_eq!(index_config.doc_mapping.field_mappings.len(), 5);
assert_eq!(index_config.doc_mapping.field_mappings[0].name, "tenant_id");
assert_eq!(index_config.doc_mapping.field_mappings[1].name, "timestamp");
Expand Down
Loading

0 comments on commit 50a6e71

Please sign in to comment.