diff --git a/docs/configuration/index-config.md b/docs/configuration/index-config.md index 3996f09a6f2..8f8e6db2b6e 100644 --- a/docs/configuration/index-config.md +++ b/docs/configuration/index-config.md @@ -127,6 +127,7 @@ fast: | ------------- | ------------- | ------------- | | `description` | Optional description for the field. | `None` | | `stored` | Whether value is stored in the document store | `true` | +| `indexed` | Whether value should be indexed so it can be searched | `true` | | `tokenizer` | Name of the `Tokenizer`. ([See tokenizers](#description-of-available-tokenizers)) for a list of available tokenizers. | `default` | | `record` | Describes the amount of information indexed, choices between `basic`, `freq` and `position` | `basic` | | `fieldnorms` | Whether to store fieldnorms for the field. Fieldnorms are required to calculate the BM25 Score of the document. | `false` | diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock index 96a991585be..6317057ef36 100644 --- a/quickwit/Cargo.lock +++ b/quickwit/Cargo.lock @@ -4962,6 +4962,7 @@ dependencies = [ "once_cell", "quickwit-common", "quickwit-doc-mapper", + "quickwit-macros", "regex", "serde", "serde_json", @@ -5064,6 +5065,7 @@ dependencies = [ "once_cell", "proptest", "quickwit-datetime", + "quickwit-macros", "quickwit-proto", "quickwit-query", "regex", @@ -5313,6 +5315,8 @@ version = "0.6.3" dependencies = [ "proc-macro2", "quickwit-macros-impl", + "quote", + "syn 2.0.26", ] [[package]] diff --git a/quickwit/Cargo.toml b/quickwit/Cargo.toml index 5f50810e09d..f75fcb43992 100644 --- a/quickwit/Cargo.toml +++ b/quickwit/Cargo.toml @@ -149,7 +149,7 @@ sqlx = { version = "0.7", features = [ "migrate", "time", ] } -syn = "2.0.11" +syn = { version = "2.0.11", features = [ "extra-traits", "full", "parsing" ]} sync_wrapper = "0.1.2" tabled = { version = "0.8", features = ["color"] } tempfile = "3" diff --git a/quickwit/quickwit-config/Cargo.toml b/quickwit/quickwit-config/Cargo.toml index 
03d6b4e9c0a..bf565b8089d 100644 --- a/quickwit/quickwit-config/Cargo.toml +++ b/quickwit/quickwit-config/Cargo.toml @@ -34,6 +34,7 @@ vrl-stdlib = { workspace = true, optional=true } quickwit-common = { workspace = true } quickwit-doc-mapper = { workspace = true } +quickwit-macros = { workspace = true } [dev-dependencies] tokio = { workspace = true } diff --git a/quickwit/quickwit-config/src/index_config/mod.rs b/quickwit/quickwit-config/src/index_config/mod.rs index ac290688431..547781a3ff0 100644 --- a/quickwit/quickwit-config/src/index_config/mod.rs +++ b/quickwit/quickwit-config/src/index_config/mod.rs @@ -32,7 +32,7 @@ use cron::Schedule; use humantime::parse_duration; use quickwit_common::uri::Uri; use quickwit_doc_mapper::{ - DefaultDocMapper, DefaultDocMapperBuilder, DocMapper, FieldMappingEntry, ModeType, + DefaultDocMapper, DefaultDocMapperBuilder, DocMapper, FieldMappingEntry, Mode, ModeType, QuickwitJsonOptions, TokenizerEntry, }; use serde::{Deserialize, Serialize}; @@ -44,9 +44,10 @@ use crate::TestableForRegression; // Note(fmassot): `DocMapping` is a struct only used for // serialization/deserialization of `DocMapper` parameters. -// This is partly a duplicate of the `DocMapper` and can -// be viewed as a temporary hack for 0.2 release before +// This is partly a duplicate of the `DefaultDocMapper` and +// can be viewed as a temporary hack for 0.2 release before // refactoring. 
+#[quickwit_macros::serde_multikey] #[derive(Clone, Debug, Serialize, Deserialize, PartialEq, utoipa::ToSchema)] #[serde(deny_unknown_fields)] pub struct DocMapping { @@ -66,10 +67,17 @@ pub struct DocMapping { pub store_source: bool, #[serde(default)] pub timestamp_field: Option, - #[serde(default)] - pub mode: ModeType, - #[serde(skip_serializing_if = "Option::is_none")] - pub dynamic_mapping: Option, + #[serde_multikey( + deserializer = Mode::from_parts, + serializer = Mode::into_parts, + fields = ( + #[serde(default)] + mode: ModeType, + #[serde(skip_serializing_if = "Option::is_none")] + dynamic_mapping: Option + ), + )] + pub mode: Mode, #[serde(default)] #[serde(skip_serializing_if = "Option::is_none")] pub partition_key: Option, @@ -436,8 +444,7 @@ impl TestableForRegression for IndexConfig { .map(|tag_field| tag_field.to_string()) .collect::>(), store_source: true, - mode: ModeType::Dynamic, - dynamic_mapping: None, + mode: Mode::default(), partition_key: Some("tenant_id".to_string()), max_num_partitions: NonZeroU32::new(100).unwrap(), timestamp_field: Some("timestamp".to_string()), @@ -514,8 +521,7 @@ pub fn build_doc_mapper( timestamp_field: doc_mapping.timestamp_field.clone(), field_mappings: doc_mapping.field_mappings.clone(), tag_fields: doc_mapping.tag_fields.iter().cloned().collect(), - mode: doc_mapping.mode, - dynamic_mapping: doc_mapping.dynamic_mapping.clone(), + mode: doc_mapping.mode.clone(), partition_key: doc_mapping.partition_key.clone(), max_num_partitions: doc_mapping.max_num_partitions, tokenizers: doc_mapping.tokenizers.clone(), @@ -713,7 +719,10 @@ mod tests { &Uri::from_well_formed("s3://my-index"), ) .unwrap(); - assert_eq!(minimal_config.doc_mapping.mode, ModeType::Dynamic); + assert_eq!( + minimal_config.doc_mapping.mode.mode_type(), + ModeType::Dynamic + ); } #[test] diff --git a/quickwit/quickwit-config/src/source_config/mod.rs b/quickwit/quickwit-config/src/source_config/mod.rs index 41a34a14d2f..fa736fa94b3 100644 --- 
a/quickwit/quickwit-config/src/source_config/mod.rs +++ b/quickwit/quickwit-config/src/source_config/mod.rs @@ -165,7 +165,7 @@ impl TestableForRegression for SourceConfig { }), transform_config: Some(TransformConfig { vrl_script: ".message = downcase(string!(.message))".to_string(), - timezone_opt: None, + timezone: default_timezone(), }), input_format: SourceInputFormat::Json, } @@ -413,9 +413,12 @@ pub struct TransformConfig { /// Timezone used in the VRL [`Program`](vrl::compiler::Program) for date and time /// manipulations. Defaults to `UTC` if not timezone is specified. - #[serde(skip_serializing_if = "Option::is_none")] - #[serde(rename = "timezone")] - timezone_opt: Option, + #[serde(default = "default_timezone")] + timezone: String, +} + +fn default_timezone() -> String { + "UTC".to_string() } impl TransformConfig { @@ -424,7 +427,7 @@ impl TransformConfig { pub fn new(vrl_script: String, timezone_opt: Option) -> Self { Self { vrl_script, - timezone_opt, + timezone: timezone_opt.unwrap_or_else(default_timezone), } } @@ -450,11 +453,11 @@ impl TransformConfig { &self, ) -> anyhow::Result<(vrl::compiler::Program, vrl::compiler::TimeZone)> { use anyhow::Context; - let timezone_str = self.timezone_opt.as_deref().unwrap_or("UTC"); - let timezone = vrl::compiler::TimeZone::parse(timezone_str).with_context(|| { + let timezone = vrl::compiler::TimeZone::parse(&self.timezone).with_context(|| { format!( - "Failed to parse timezone: `{timezone_str}`. Timezone must be a valid name \ - in the TZ database: https://en.wikipedia.org/wiki/List_of_tz_database_time_zones" + "Failed to parse timezone: `{}`. Timezone must be a valid name \ + in the TZ database: https://en.wikipedia.org/wiki/List_of_tz_database_time_zones", + self.timezone, ) })?; // Append "\n." 
to the script to return the entire document and not only the modified @@ -487,7 +490,7 @@ impl TransformConfig { pub fn for_test(vrl_script: &str) -> Self { Self { vrl_script: vrl_script.to_string(), - timezone_opt: None, + timezone: default_timezone(), } } } @@ -532,7 +535,7 @@ mod tests { }), transform_config: Some(TransformConfig { vrl_script: ".message = downcase(string!(.message))".to_string(), - timezone_opt: Some("local".to_string()), + timezone: "local".to_string(), }), input_format: SourceInputFormat::Json, }; @@ -628,7 +631,7 @@ mod tests { }), transform_config: Some(TransformConfig { vrl_script: ".message = downcase(string!(.message))".to_string(), - timezone_opt: Some("local".to_string()), + timezone: "local".to_string(), }), input_format: SourceInputFormat::Json, }; @@ -1032,7 +1035,7 @@ mod tests { source_params: SourceParams::IngestApi, transform_config: Some(TransformConfig { vrl_script: ".message = downcase(string!(.message))".to_string(), - timezone_opt: None, + timezone: default_timezone(), }), input_format: SourceInputFormat::Json, }; @@ -1045,7 +1048,7 @@ mod tests { { let transform_config = TransformConfig { vrl_script: ".message = downcase(string!(.message))".to_string(), - timezone_opt: Some("local".to_string()), + timezone: "local".to_string(), }; let transform_config_yaml = serde_yaml::to_string(&transform_config).unwrap(); assert_eq!( @@ -1056,7 +1059,7 @@ mod tests { { let transform_config = TransformConfig { vrl_script: ".message = downcase(string!(.message))".to_string(), - timezone_opt: None, + timezone: default_timezone(), }; let transform_config_yaml = serde_yaml::to_string(&transform_config).unwrap(); assert_eq!( @@ -1077,7 +1080,7 @@ mod tests { let expected_transform_config = TransformConfig { vrl_script: ".message = downcase(string!(.message))".to_string(), - timezone_opt: None, + timezone: default_timezone(), }; assert_eq!(transform_config, expected_transform_config); } @@ -1091,7 +1094,7 @@ mod tests { let 
expected_transform_config = TransformConfig { vrl_script: ".message = downcase(string!(.message))".to_string(), - timezone_opt: Some("Turkey".to_string()), + timezone: "Turkey".to_string(), }; assert_eq!(transform_config, expected_transform_config); } @@ -1103,7 +1106,7 @@ mod tests { { let transform_config = TransformConfig { vrl_script: ".message = downcase(string!(.message))".to_string(), - timezone_opt: Some("Turkey".to_string()), + timezone: "Turkey".to_string(), }; transform_config.compile_vrl_script().unwrap(); } @@ -1116,14 +1119,14 @@ mod tests { .message = downcase(string!(.message)) "# .to_string(), - timezone_opt: None, + timezone: default_timezone(), }; transform_config.compile_vrl_script().unwrap(); } { let transform_config = TransformConfig { vrl_script: ".message = downcase(string!(.message))".to_string(), - timezone_opt: Some("foo".to_string()), + timezone: "foo".to_string(), }; let error = transform_config.compile_vrl_script().unwrap_err(); assert!(error.to_string().starts_with("Failed to parse timezone")); @@ -1131,7 +1134,7 @@ mod tests { { let transform_config = TransformConfig { vrl_script: "foo".to_string(), - timezone_opt: Some("Turkey".to_string()), + timezone: "Turkey".to_string(), }; let error = transform_config.compile_vrl_script().unwrap_err(); assert!(error.to_string().starts_with("Failed to compile")); diff --git a/quickwit/quickwit-doc-mapper/Cargo.toml b/quickwit/quickwit-doc-mapper/Cargo.toml index b1f7ced9811..85900ecd303 100644 --- a/quickwit/quickwit-doc-mapper/Cargo.toml +++ b/quickwit/quickwit-doc-mapper/Cargo.toml @@ -31,6 +31,7 @@ typetag = { workspace = true } utoipa = { workspace = true } quickwit-datetime = { workspace = true } +quickwit-macros = { workspace = true } quickwit-query = { workspace = true } [dev-dependencies] diff --git a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper.rs b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper.rs index b6b2add821d..514f1cd4a06 100644 --- 
a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper.rs +++ b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper.rs @@ -39,29 +39,10 @@ use crate::doc_mapper::{JsonObject, Partition}; use crate::query_builder::build_query; use crate::routing_expression::RoutingExpr; use crate::{ - Cardinality, DocMapper, DocParsingError, ModeType, QueryParserError, TokenizerEntry, - WarmupInfo, DYNAMIC_FIELD_NAME, SOURCE_FIELD_NAME, + Cardinality, DocMapper, DocParsingError, Mode, QueryParserError, TokenizerEntry, WarmupInfo, + DYNAMIC_FIELD_NAME, SOURCE_FIELD_NAME, }; -/// Defines how an unmapped field should be handled. -#[derive(Clone, Debug, Serialize, Deserialize, Default)] -pub(crate) enum Mode { - #[default] - Lenient, - Strict, - Dynamic(QuickwitJsonOptions), -} - -impl Mode { - pub fn mode_type(&self) -> ModeType { - match self { - Mode::Lenient => ModeType::Lenient, - Mode::Strict => ModeType::Strict, - Mode::Dynamic(_) => ModeType::Dynamic, - } - } -} - /// Default [`DocMapper`] implementation /// which defines a set of rules to map json fields /// to tantivy index fields. 
@@ -149,7 +130,6 @@ impl TryFrom for DefaultDocMapper { type Error = anyhow::Error; fn try_from(builder: DefaultDocMapperBuilder) -> anyhow::Result { - let mode = builder.mode()?; let mut schema_builder = Schema::builder(); let field_mappings = build_mapping_tree(&builder.field_mappings, &mut schema_builder)?; let source_field = if builder.store_source { @@ -162,7 +142,7 @@ impl TryFrom for DefaultDocMapper { validate_timestamp_field(timestamp_field_path, &field_mappings)?; }; - let dynamic_field = if let Mode::Dynamic(json_options) = &mode { + let dynamic_field = if let Mode::Dynamic(json_options) = &builder.mode { Some(schema_builder.add_json_field(DYNAMIC_FIELD_NAME, json_options.clone())) } else { None @@ -255,7 +235,7 @@ impl TryFrom for DefaultDocMapper { required_fields, partition_key, max_num_partitions: builder.max_num_partitions, - mode, + mode: builder.mode, tokenizer_entries: builder.tokenizers, tokenizer_manager, }) @@ -338,11 +318,6 @@ fn validate_fields_tokenizers( impl From for DefaultDocMapperBuilder { fn from(default_doc_mapper: DefaultDocMapper) -> Self { - let mode = default_doc_mapper.mode.mode_type(); - let dynamic_mapping: Option = match &default_doc_mapper.mode { - Mode::Dynamic(mapping_options) => Some(mapping_options.clone()), - _ => None, - }; let partition_key_str = default_doc_mapper.partition_key.to_string(); let partition_key_opt: Option = if partition_key_str.is_empty() { None @@ -357,8 +332,7 @@ impl From for DefaultDocMapperBuilder { field_mappings: default_doc_mapper.field_mappings.into(), tag_fields: default_doc_mapper.tag_field_names.into_iter().collect(), default_search_fields: default_doc_mapper.default_search_field_names, - mode, - dynamic_mapping, + mode: default_doc_mapper.mode, partition_key: partition_key_opt, max_num_partitions: default_doc_mapper.max_num_partitions, tokenizers: default_doc_mapper.tokenizer_entries, @@ -1545,8 +1519,8 @@ mod tests { .unwrap(); match &field_mapping_type { 
super::FieldMappingType::Text(options, _) => { - assert!(options.tokenizer.is_some()); - let tokenizer = options.tokenizer.as_ref().unwrap(); + assert!(options.indexing_options.is_some()); + let tokenizer = &options.indexing_options.as_ref().unwrap().tokenizer; assert_eq!(tokenizer.name(), "my_tokenizer"); } _ => panic!("Expected a text field"), diff --git a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper_builder.rs b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper_builder.rs index 409307d4680..98821da874d 100644 --- a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper_builder.rs +++ b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/default_mapper_builder.rs @@ -19,12 +19,10 @@ use std::num::NonZeroU32; -use anyhow::bail; use serde::{Deserialize, Serialize}; use super::tokenizer_entry::TokenizerEntry; use super::FieldMappingEntry; -use crate::default_doc_mapper::default_mapper::Mode; use crate::default_doc_mapper::QuickwitJsonOptions; use crate::DefaultDocMapper; @@ -34,6 +32,7 @@ use crate::DefaultDocMapper; /// It is also used to serialize/deserialize a DocMapper. /// note that this is not the way is the DocMapping is deserialized /// from the configuration. +#[quickwit_macros::serde_multikey] #[derive(Serialize, Deserialize, Clone)] #[serde(deny_unknown_fields)] pub struct DefaultDocMapperBuilder { @@ -60,18 +59,81 @@ pub struct DefaultDocMapperBuilder { /// Maximum number of partitions. #[serde(default = "DefaultDocMapper::default_max_num_partitions")] pub max_num_partitions: NonZeroU32, - /// Defines the indexing mode. - #[serde(default)] - pub mode: ModeType, - /// If mode is set to dynamic, `dynamic_mapping` defines - /// how the unmapped fields should be handled. - #[serde(default)] - pub dynamic_mapping: Option, + #[serde_multikey( + deserializer = Mode::from_parts, + serializer = Mode::into_parts, + fields = ( + /// Defines the indexing mode. 
+ #[serde(default)] + mode: ModeType, + /// If mode is set to dynamic, `dynamic_mapping` defines + /// how the unmapped fields should be handled. + #[serde(default)] + dynamic_mapping: Option, + ), + )] + /// Defines how the unmapped fields should be handled. + pub mode: Mode, /// User-defined tokenizers. #[serde(default)] pub tokenizers: Vec, } +/// Defines how an unmapped field should be handled. +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] +pub enum Mode { + /// Lenient mode: unmapped fields are just ignored. + Lenient, + /// Strict mode: when parsing a document with an unmapped field, an error is yielded. + Strict, + /// Dynamic mode: unmapped fields are captured and handled according to the provided + /// configuration. + Dynamic(QuickwitJsonOptions), +} + +impl Mode { + /// Extract the `ModeType` of this `Mode` + pub fn mode_type(&self) -> ModeType { + match self { + Mode::Lenient => ModeType::Lenient, + Mode::Strict => ModeType::Strict, + Mode::Dynamic(_) => ModeType::Dynamic, + } + } + + /// Build a Mode from its type and optional dynamic mapping options + pub fn from_parts( + mode: ModeType, + dynamic_mapping: Option, + ) -> anyhow::Result { + Ok(match (mode, dynamic_mapping) { + (ModeType::Lenient, None) => Mode::Lenient, + (ModeType::Strict, None) => Mode::Strict, + (ModeType::Dynamic, Some(dynamic_mapping)) => Mode::Dynamic(dynamic_mapping), + (ModeType::Dynamic, None) => Mode::default(), // Dynamic with default options + (_, Some(_)) => anyhow::bail!( + "`dynamic_mapping` is only allowed with mode=dynamic. 
(Here mode=`{:?}`)", + mode + ), + }) + } + + /// Obtain the mode type and dynamic options from a Mode + pub fn into_parts(self) -> (ModeType, Option) { + match self { + Mode::Lenient => (ModeType::Lenient, None), + Mode::Strict => (ModeType::Strict, None), + Mode::Dynamic(json_options) => (ModeType::Dynamic, Some(json_options)), + } + } +} + +impl Default for Mode { + fn default() -> Self { + Mode::Dynamic(QuickwitJsonOptions::default_dynamic()) + } +} + /// `Mode` describing how the unmapped field should be handled. #[derive(Clone, Copy, Default, Debug, Eq, PartialEq, Serialize, Deserialize, utoipa::ToSchema)] #[serde(rename_all = "lowercase")] @@ -93,35 +155,7 @@ impl Default for DefaultDocMapperBuilder { } } -// By default, in dynamic mode, all fields are fast fields. -fn default_dynamic_mapping() -> QuickwitJsonOptions { - QuickwitJsonOptions { - fast: super::FastFieldOptions::EnabledWithNormalizer { - normalizer: super::QuickwitTextNormalizer::Raw, - }, - ..Default::default() - } -} - impl DefaultDocMapperBuilder { - pub(crate) fn mode(&self) -> anyhow::Result { - if self.mode != ModeType::Dynamic && self.dynamic_mapping.is_some() { - bail!( - "`dynamic_mapping` is only allowed with mode=dynamic. (Here mode=`{:?}`)", - self.mode - ); - } - Ok(match self.mode { - ModeType::Lenient => Mode::Lenient, - ModeType::Strict => Mode::Strict, - ModeType::Dynamic => Mode::Dynamic( - self.dynamic_mapping - .clone() - .unwrap_or_else(default_dynamic_mapping), - ), - }) - } - /// Build a valid `DefaultDocMapper`. /// This will consume your `DefaultDocMapperBuilder`. 
pub fn try_build(self) -> anyhow::Result { @@ -140,8 +174,7 @@ mod tests { assert!(default_mapper_builder.default_search_fields.is_empty()); assert!(default_mapper_builder.field_mappings.is_empty()); assert!(default_mapper_builder.tag_fields.is_empty()); - assert_eq!(default_mapper_builder.mode, ModeType::Dynamic); - assert!(default_mapper_builder.dynamic_mapping.is_none()); + assert_eq!(default_mapper_builder.mode.mode_type(), ModeType::Dynamic); assert_eq!(default_mapper_builder.store_source, false); assert!(default_mapper_builder.timestamp_field.is_none()); } diff --git a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/field_mapping_entry.rs b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/field_mapping_entry.rs index d1f4866fdb8..b342997be51 100644 --- a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/field_mapping_entry.rs +++ b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/field_mapping_entry.rs @@ -253,6 +253,106 @@ impl QuickwitTextNormalizer { } } +#[derive(Clone, PartialEq, Debug)] +pub struct TextIndexingOptions { + pub tokenizer: QuickwitTextTokenizer, + pub record: IndexRecordOption, + pub fieldnorms: bool, +} + +impl TextIndexingOptions { + fn from_parts_text( + indexed: bool, + tokenizer: Option, + record: Option, + fieldnorms: bool, + ) -> anyhow::Result> { + if indexed { + Ok(Some(TextIndexingOptions { + tokenizer: tokenizer.unwrap_or_default(), + record: record.unwrap_or(IndexRecordOption::Basic), + fieldnorms, + })) + } else { + if tokenizer.is_some() || record.is_some() || fieldnorms { + bail!( + "`record`, `tokenizer`, and `fieldnorms` parameters are allowed only if \ + indexed is true." 
+ ) + } + Ok(None) + } + } + + fn from_parts_json( + indexed: bool, + tokenizer: Option, + record: Option, + ) -> anyhow::Result> { + if indexed { + Ok(Some(TextIndexingOptions { + tokenizer: tokenizer.unwrap_or_else(QuickwitTextTokenizer::raw), + record: record.unwrap_or(IndexRecordOption::Basic), + fieldnorms: false, + })) + } else { + if tokenizer.is_some() || record.is_some() { + bail!("`record` and `tokenizer` parameters are allowed only if indexed is true.") + } + Ok(None) + } + } + + fn to_parts_text( + this: Option, + ) -> ( + bool, // indexed + Option, + Option, + bool, // fieldnorms + ) { + match this { + Some(this) => ( + true, + Some(this.tokenizer), + Some(this.record), + this.fieldnorms, + ), + None => (false, None, None, false), + } + } + + fn to_parts_json( + this: Option, + ) -> ( + bool, // indexed + Option, + Option, + ) { + let (indexed, tokenizer, record, _fieldorm) = TextIndexingOptions::to_parts_text(this); + (indexed, tokenizer, record) + } + + fn default_json() -> Self { + TextIndexingOptions { + tokenizer: QuickwitTextTokenizer::raw(), + record: IndexRecordOption::Basic, + fieldnorms: false, + } + } +} + +impl Default for TextIndexingOptions { + fn default() -> Self { + TextIndexingOptions { + tokenizer: QuickwitTextTokenizer::default(), + record: IndexRecordOption::Basic, + fieldnorms: false, + } + } +} + +#[quickwit_macros::serde_multikey] #[derive(Clone, PartialEq, Serialize, Deserialize, Debug, utoipa::ToSchema)] #[serde(deny_unknown_fields)] pub struct QuickwitTextOptions { @@ -260,17 +360,24 @@ pub struct QuickwitTextOptions { #[serde(default)] #[serde(skip_serializing_if = "Option::is_none")] pub description: Option, - #[serde(default = "default_as_true")] - pub indexed: bool, - #[serde(default)] - #[serde(skip_serializing_if = "Option::is_none")] - pub tokenizer: Option, - #[schema(value_type = IndexRecordOptionSchema)] - #[serde(default)] - #[serde(skip_serializing_if = "Option::is_none")] - pub record: Option, - #[serde(default)] 
- pub fieldnorms: bool, + #[serde_multikey( + deserializer = TextIndexingOptions::from_parts_text, + serializer = TextIndexingOptions::to_parts_text, + fields = ( + #[serde(default = "default_as_true")] + pub indexed: bool, + #[serde(default)] + #[serde(skip_serializing_if = "Option::is_none")] + pub tokenizer: Option, + #[schema(value_type = IndexRecordOptionSchema)] + #[serde(default)] + #[serde(skip_serializing_if = "Option::is_none")] + pub record: Option, + #[serde(default)] + pub fieldnorms: bool, + ), + )] + pub indexing_options: Option, #[serde(default = "default_as_true")] pub stored: bool, #[serde(default)] @@ -290,6 +397,14 @@ pub enum FastFieldOptions { }, } +impl FastFieldOptions { + pub fn default_enabled() -> Self { + FastFieldOptions::EnabledWithNormalizer { + normalizer: QuickwitTextNormalizer::Raw, + } + } +} + #[derive(Serialize, Deserialize)] #[serde(untagged)] enum FastFieldOptionsForSerialization { @@ -302,9 +417,7 @@ impl From for FastFieldOptions { match fast_field_options { FastFieldOptionsForSerialization::IsEnabled(is_enabled) => { if is_enabled { - FastFieldOptions::EnabledWithNormalizer { - normalizer: QuickwitTextNormalizer::Raw, - } + FastFieldOptions::default_enabled() } else { FastFieldOptions::Disabled } @@ -331,10 +444,7 @@ impl Default for QuickwitTextOptions { fn default() -> Self { Self { description: None, - indexed: true, - tokenizer: None, - record: None, - fieldnorms: false, + indexing_options: Some(TextIndexingOptions::default()), stored: true, fast: FastFieldOptions::default(), } @@ -353,17 +463,11 @@ impl From for TextOptions { } FastFieldOptions::Disabled => {} } - if quickwit_text_options.indexed { - let index_record_option = quickwit_text_options - .record - .unwrap_or(IndexRecordOption::Basic); - let tokenizer = quickwit_text_options - .tokenizer - .unwrap_or(QuickwitTextTokenizer::default()); + if let Some(indexing_options) = quickwit_text_options.indexing_options { let text_field_indexing = 
TextFieldIndexing::default() - .set_index_option(index_record_option) - .set_fieldnorms(quickwit_text_options.fieldnorms) - .set_tokenizer(tokenizer.name()); + .set_index_option(indexing_options.record) + .set_fieldnorms(indexing_options.fieldnorms) + .set_tokenizer(indexing_options.tokenizer.name()); text_options = text_options.set_indexing_options(text_field_indexing); } @@ -391,6 +495,7 @@ pub enum IndexRecordOptionSchema { /// /// `QuickwitJsonOptions` is also used to configure /// the dynamic mapping. +#[quickwit_macros::serde_multikey] #[derive(Clone, Debug, PartialEq, Serialize, Deserialize, utoipa::ToSchema)] #[serde(deny_unknown_fields)] pub struct QuickwitJsonOptions { @@ -398,22 +503,30 @@ pub struct QuickwitJsonOptions { #[serde(default)] #[serde(skip_serializing_if = "Option::is_none")] pub description: Option, - /// If true, all of the element in the json object will be indexed. - #[serde(default = "default_as_true")] - pub indexed: bool, - /// Sets the tokenize that should be used with the text fields in the - /// json object. - #[serde(default)] - #[serde(skip_serializing_if = "Option::is_none")] - pub tokenizer: Option, - /// Sets how much information should be added in the index - /// with each token. - /// - /// Setting `record` is only allowed if indexed == true. - #[schema(value_type = IndexRecordOptionSchema)] - #[serde(default)] - #[serde(skip_serializing_if = "Option::is_none")] - pub record: Option, + #[serde_multikey( + deserializer = TextIndexingOptions::from_parts_json, + serializer = TextIndexingOptions::to_parts_json, + fields = ( + /// If true, all of the element in the json object will be indexed. + #[serde(default = "default_as_true")] + pub indexed: bool, + /// Sets the tokenizer that should be used with the text fields in the + /// json object. + #[serde(default)] + #[serde(skip_serializing_if = "Option::is_none")] + pub tokenizer: Option, + /// Sets how much information should be added in the index + /// with each token. 
+ /// + /// Setting `record` is only allowed if indexed == true. + #[schema(value_type = IndexRecordOptionSchema)] + #[serde(default)] + #[serde(skip_serializing_if = "Option::is_none")] + pub record: Option, + ), + )] + /// Options for indexing text in a Json field. + pub indexing_options: Option, /// If true, the field will be stored in the doc store. #[serde(default = "default_as_true")] pub stored: bool, @@ -425,13 +538,21 @@ pub struct QuickwitJsonOptions { pub fast: FastFieldOptions, } +impl QuickwitJsonOptions { + /// Build a default QuickwitJsonOptions for dynamic fields. + pub fn default_dynamic() -> Self { + QuickwitJsonOptions { + fast: FastFieldOptions::default_enabled(), + ..Default::default() + } + } +} + impl Default for QuickwitJsonOptions { fn default() -> Self { QuickwitJsonOptions { description: None, - indexed: true, - tokenizer: None, - record: None, + indexing_options: Some(TextIndexingOptions::default_json()), stored: true, expand_dots: true, fast: FastFieldOptions::default(), @@ -445,16 +566,10 @@ impl From for JsonObjectOptions { if quickwit_json_options.stored { json_options = json_options.set_stored(); } - if quickwit_json_options.indexed { - let index_record_option = quickwit_json_options - .record - .unwrap_or(IndexRecordOption::Basic); - let tokenizer = quickwit_json_options - .tokenizer - .unwrap_or(QuickwitTextTokenizer::raw()); + if let Some(indexing_options) = quickwit_json_options.indexing_options { let text_field_indexing = TextFieldIndexing::default() - .set_tokenizer(tokenizer.name()) - .set_index_option(index_record_option); + .set_tokenizer(indexing_options.tokenizer.name()) + .set_index_option(indexing_options.record); json_options = json_options.set_indexing_options(text_field_indexing); } if quickwit_json_options.expand_dots { @@ -488,18 +603,6 @@ fn deserialize_mapping_type( match typ { Type::Str => { let text_options: QuickwitTextOptions = serde_json::from_value(json)?; - #[allow(clippy::collapsible_if)] - if 
!text_options.indexed { - if text_options.tokenizer.is_some() - || text_options.record.is_some() - || text_options.fieldnorms - { - bail!( - "`record`, `tokenizer`, and `fieldnorms` parameters are allowed only if \ - indexed is true." - ); - } - } Ok(FieldMappingType::Text(text_options, cardinality)) } Type::U64 => { @@ -536,14 +639,6 @@ fn deserialize_mapping_type( } Type::Json => { let json_options: QuickwitJsonOptions = serde_json::from_value(json)?; - #[allow(clippy::collapsible_if)] - if !json_options.indexed { - if json_options.tokenizer.is_some() || json_options.record.is_some() { - bail!( - "`record` and `tokenizer` parameters are allowed only if indexed is true." - ); - } - } Ok(FieldMappingType::Json(json_options, cardinality)) } } @@ -625,7 +720,7 @@ mod tests { use super::FieldMappingEntry; use crate::default_doc_mapper::field_mapping_entry::{ - QuickwitJsonOptions, QuickwitTextOptions, QuickwitTextTokenizer, + QuickwitJsonOptions, QuickwitTextOptions, TextIndexingOptions, }; use crate::default_doc_mapper::{FastFieldOptions, FieldMappingType}; use crate::Cardinality; @@ -681,8 +776,7 @@ mod tests { match mapping_entry.mapping_type { FieldMappingType::Text(options, _) => { assert_eq!(options.stored, true); - assert_eq!(options.indexed, false); - assert_eq!(options.record.is_some(), false); + assert!(options.indexing_options.is_none()); } _ => panic!("wrong property type"), } @@ -725,8 +819,7 @@ mod tests { match mapping_entry.mapping_type { FieldMappingType::Json(options, _) => { assert_eq!(options.stored, true); - assert_eq!(options.indexed, false); - assert_eq!(options.record.is_some(), false); + assert!(options.indexing_options.is_none()); } _ => panic!("wrong property type"), } @@ -812,9 +905,9 @@ mod tests { match mapping_entry.mapping_type { FieldMappingType::Text(options, _) => { assert_eq!(options.stored, true); - assert_eq!(options.indexed, true); - assert_eq!(options.tokenizer.unwrap().name(), "en_stem"); - assert_eq!(options.record.unwrap(), 
IndexRecordOption::Basic); + let indexing_options = options.indexing_options.unwrap(); + assert_eq!(indexing_options.tokenizer.name(), "en_stem"); + assert_eq!(indexing_options.record, IndexRecordOption::Basic); } _ => panic!("wrong property type"), } @@ -838,8 +931,8 @@ mod tests { match result.unwrap().mapping_type { FieldMappingType::Text(options, _) => { assert_eq!(options.stored, true); - assert_eq!(options.indexed, true); - assert_eq!(options.fieldnorms, true); + let indexing_options = options.indexing_options.unwrap(); + assert_eq!(indexing_options.fieldnorms, true); } _ => panic!("wrong property type"), } @@ -1237,6 +1330,8 @@ mod tests { "fast": false, "stored": true, "indexed": true, + "record": "basic", + "tokenizer": "default", "fieldnorms": false, }) ); @@ -1263,6 +1358,8 @@ mod tests { "fast": {"normalizer": "lowercase"}, "stored": true, "indexed": true, + "record": "basic", + "tokenizer": "default", "fieldnorms": false, }) ); @@ -1287,6 +1384,8 @@ mod tests { "type": "array", "stored": true, "indexed": true, + "record": "basic", + "tokenizer": "default", "fieldnorms": false, "fast": false, }) @@ -1436,9 +1535,7 @@ mod tests { .unwrap(); let expected_json_options = QuickwitJsonOptions { description: None, - indexed: true, - tokenizer: None, - record: None, + indexing_options: Some(TextIndexingOptions::default_json()), stored: true, fast: FastFieldOptions::Disabled, expand_dots: true, @@ -1453,7 +1550,14 @@ mod tests { #[test] fn test_quickwit_json_options_default_tokenizer_is_default() { let quickwit_json_options = QuickwitJsonOptions::default(); - assert_eq!(quickwit_json_options.tokenizer, None); + assert_eq!( + quickwit_json_options + .indexing_options + .unwrap() + .tokenizer + .name(), + "raw" + ); } #[test] @@ -1478,9 +1582,7 @@ mod tests { .unwrap(); let expected_json_options = QuickwitJsonOptions { description: None, - indexed: true, - tokenizer: Some(QuickwitTextTokenizer::raw()), - record: None, + indexing_options: 
Some(TextIndexingOptions::default_json()), stored: false, expand_dots: true, fast: FastFieldOptions::Disabled, @@ -1540,6 +1642,8 @@ mod tests { "fast": false, "stored": true, "indexed": true, + "record": "basic", + "tokenizer": "default", "fieldnorms": false, }) ); @@ -1565,6 +1669,8 @@ mod tests { "type": "json", "stored": true, "indexed": true, + "tokenizer": "raw", + "record": "basic", "fast": false, "expand_dots": true, }) diff --git a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/mod.rs b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/mod.rs index 77aa2a508ba..23b96effbb1 100644 --- a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/mod.rs +++ b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/mod.rs @@ -30,10 +30,10 @@ use once_cell::sync::Lazy; use regex::Regex; pub use self::default_mapper::DefaultDocMapper; -pub use self::default_mapper_builder::{DefaultDocMapperBuilder, ModeType}; +pub use self::default_mapper_builder::{DefaultDocMapperBuilder, Mode, ModeType}; pub use self::field_mapping_entry::{ FastFieldOptions, FieldMappingEntry, QuickwitBytesOptions, QuickwitJsonOptions, - QuickwitNumericOptions, QuickwitTextNormalizer, QuickwitTextOptions, + QuickwitNumericOptions, QuickwitTextNormalizer, QuickwitTextOptions, TextIndexingOptions, }; pub(crate) use self::field_mapping_entry::{ FieldMappingEntryForSerialization, IndexRecordOptionSchema, QuickwitTextTokenizer, diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper.rs index b867d1fdab5..8df827b0e9c 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapper.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper.rs @@ -230,12 +230,12 @@ mod tests { use quickwit_query::query_ast::{query_ast_from_user_text, UserInputQuery}; use quickwit_query::BooleanOperand; - use tantivy::schema::{Field, FieldType, Term}; + use tantivy::schema::{Field, FieldType, IndexRecordOption, Term}; - use crate::default_doc_mapper::{FieldMappingType, 
QuickwitJsonOptions}; + use crate::default_doc_mapper::{FieldMappingType, QuickwitJsonOptions, TextIndexingOptions}; use crate::{ Cardinality, DefaultDocMapper, DefaultDocMapperBuilder, DocMapper, DocParsingError, - FieldMappingEntry, ModeType, TermRange, WarmupInfo, DYNAMIC_FIELD_NAME, + FieldMappingEntry, Mode, TermRange, WarmupInfo, DYNAMIC_FIELD_NAME, }; const JSON_DEFAULT_DOC_MAPPER: &str = r#" @@ -366,7 +366,7 @@ mod tests { #[test] fn test_doc_mapper_query_with_json_field_default_search_fields() { let doc_mapper: DefaultDocMapper = DefaultDocMapperBuilder { - mode: ModeType::Dynamic, + mode: Mode::default(), ..Default::default() } .try_build() @@ -385,7 +385,7 @@ mod tests { #[test] fn test_doc_mapper_query_with_json_field_ambiguous_term() { let doc_mapper: DefaultDocMapper = DefaultDocMapperBuilder { - mode: ModeType::Dynamic, + mode: Mode::default(), ..Default::default() } .try_build() @@ -540,7 +540,11 @@ mod tests { name: "multilang".to_string(), mapping_type: FieldMappingType::Text( QuickwitTextOptions { - tokenizer: Some(QuickwitTextTokenizer::from_static("multilang")), + indexing_options: Some(TextIndexingOptions { + tokenizer: QuickwitTextTokenizer::from_static("multilang"), + record: IndexRecordOption::Basic, + fieldnorms: false, + }), ..Default::default() }, Cardinality::SingleValue, diff --git a/quickwit/quickwit-doc-mapper/src/lib.rs b/quickwit/quickwit-doc-mapper/src/lib.rs index de7410c3435..ec6573a239b 100644 --- a/quickwit/quickwit-doc-mapper/src/lib.rs +++ b/quickwit/quickwit-doc-mapper/src/lib.rs @@ -35,7 +35,7 @@ mod routing_expression; pub mod tag_pruning; pub use default_doc_mapper::{ - analyze_text, DefaultDocMapper, DefaultDocMapperBuilder, FieldMappingEntry, ModeType, + analyze_text, DefaultDocMapper, DefaultDocMapperBuilder, FieldMappingEntry, Mode, ModeType, QuickwitJsonOptions, TokenizerConfig, TokenizerEntry, }; use default_doc_mapper::{ diff --git a/quickwit/quickwit-indexing/src/source/kafka_source.rs 
b/quickwit/quickwit-indexing/src/source/kafka_source.rs index f21d88847f5..63a558e552e 100644 --- a/quickwit/quickwit-indexing/src/source/kafka_source.rs +++ b/quickwit/quickwit-indexing/src/source/kafka_source.rs @@ -703,6 +703,9 @@ fn parse_client_log_level(client_log_level: Option) -> anyhow::Result RDKafkaLogLevel::Info, Some("warn") | Some("warning") => RDKafkaLogLevel::Warning, Some("error") => RDKafkaLogLevel::Error, + Some("critical") => RDKafkaLogLevel::Critical, + Some("alert") => RDKafkaLogLevel::Alert, + Some("emerg") => RDKafkaLogLevel::Emerg, Some(level) => bail!( "Failed to parse Kafka client log level. Value `{}` is not supported.", level diff --git a/quickwit/quickwit-macros/Cargo.toml b/quickwit/quickwit-macros/Cargo.toml index 28ad213b609..69e5eeb142a 100644 --- a/quickwit/quickwit-macros/Cargo.toml +++ b/quickwit/quickwit-macros/Cargo.toml @@ -14,5 +14,7 @@ proc-macro = true [dependencies] proc-macro2 = { workspace = true } +quote = { workspace = true } +syn = { workspace = true } quickwit-macros-impl = { workspace = true } diff --git a/quickwit/quickwit-macros/src/lib.rs b/quickwit/quickwit-macros/src/lib.rs index 257bbdda1b9..8870aacbfed 100644 --- a/quickwit/quickwit-macros/src/lib.rs +++ b/quickwit/quickwit-macros/src/lib.rs @@ -17,10 +17,478 @@ // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see . 
+use std::mem; + use proc_macro::TokenStream; +use proc_macro2::{Span, TokenStream as TokenStream2}; use quickwit_macros_impl::derive_prometheus_labels_impl; +use quote::quote; +use syn::parse::{Parse, ParseStream, Parser}; +use syn::punctuated::Punctuated; +use syn::{ + parenthesized, Attribute, Error, Field, Fields, FieldsNamed, Ident, ItemStruct, Meta, Path, + Token, Visibility, +}; #[proc_macro_derive(PrometheusLabels, attributes(prometheus_label))] pub fn derive_prometheus_labels(input: TokenStream) -> TokenStream { derive_prometheus_labels_impl(input.into()).into() } + +#[proc_macro_attribute] +pub fn serde_multikey(attr: TokenStream, item: TokenStream) -> TokenStream { + match serde_multikey_inner(attr, item) { + Ok(ts) => ts, + Err(e) => e.to_compile_error().into(), + } +} + +fn serde_multikey_inner(_attr: TokenStream, item: TokenStream) -> Result { + let Ok(input) = syn::parse::(item) else { + return Err(Error::new( + Span::call_site(), + "The attribute can only be applied to struct.", + )); + }; + + let main_struct = generate_main_struct(input.clone())?; + + let proxy_struct = generate_proxy_struct(input)?; + + Ok(quote!( + #main_struct + #proxy_struct + ) + .into()) +} + +/// Generate the main struct. It's a copy of the original struct, but with most +/// ser/de attributes removed, and serde try_from/into `__MultiKey{}` added. 
+fn generate_main_struct(mut input: ItemStruct) -> Result { + let (serialize, deserialize) = get_ser_de(&input.attrs)?; + let has_utoipa_schema = get_and_remove_utoipa_schema(&mut input.attrs)?; + + if !deserialize && !serialize { + return Err(Error::new( + Span::call_site(), + "`serde_multikey` was applied to a non Serialize/Deserialize struct.", + )); + } + + // remove serde and utoipa attributes from fields + for field in input.fields.iter_mut() { + let attrs = mem::take(&mut field.attrs); + field.attrs = attrs + .into_iter() + .filter(|attr| { + !(attr.path().is_ident("serde_multikey") + || attr.path().is_ident("serde") + || attr.path().is_ident("serde_as") + || attr.path().is_ident("schema")) + }) + .collect(); + } + + // remove serde attributes from struct + let attrs = mem::take(&mut input.attrs); + input.attrs = attrs + .into_iter() + .filter(|attr| !(attr.path().is_ident("serde") || attr.path().is_ident("serde_as"))) + .collect(); + + if deserialize { + let mut attr = Attribute::parse_outer + .parse_str(&format!( + r#"#[serde(try_from = "__MultiKey{}")]"#, + input.ident + )) + .unwrap(); + input.attrs.append(&mut attr); + } + + if serialize { + let mut attr = Attribute::parse_outer + .parse_str(&format!(r#"#[serde(into = "__MultiKey{}")]"#, input.ident)) + .unwrap(); + input.attrs.append(&mut attr); + } + + let utoipa = if has_utoipa_schema { + let main_ident = input.ident.clone(); + let main_ident_str = main_ident.to_string(); + let proxy_ident = Ident::new(&format!("__MultiKey{}", input.ident), input.ident.span()); + + Some(quote!( + impl<'__s> utoipa::ToSchema<'__s> for #main_ident { + fn schema() -> ( + &'__s str, + utoipa::openapi::RefOr, + ) { + ( + #main_ident_str, + <#proxy_ident as utoipa::ToSchema>::schema().1, + ) + } + } + )) + } else { + None + }; + + Ok(quote!( + #input + + #utoipa + )) +} + +/// Generate the proxy struct. 
It is a copy of the original struct, but fields marked +/// with `serde_multikey` have been replaced with the fields the correspond to. +/// Also generate TryFrom/Into as required. +fn generate_proxy_struct(mut input: ItemStruct) -> Result { + let main_ident = input.ident.clone(); + let proxy_ident = Ident::new(&format!("__MultiKey{}", input.ident), input.ident.span()); + + input.ident = proxy_ident.clone(); + input.vis = Visibility::Inherited; + // TODO wait for https://github.com/juhaku/utoipa/issues/704 to re-enable + // input.attrs.append(&mut Attribute::parse_outer + // .parse_str(&"#[doc(hidden)]") + // .unwrap()); + + let (ser, de) = get_ser_de(&input.attrs)?; + + let mut pass_through = Vec::::new(); + let mut final_fields = Punctuated::::new(); + let mut try_from_conv = Vec::::new(); + let mut into_pre_conv = Vec::::new(); + let mut into_in_conv = Vec::::new(); + + let Fields::Named(FieldsNamed { brace_token, named }) = input.fields else { + return Err(Error::new( + Span::call_site(), + "`serde_multikey` was applied to a tuple-struct or an empty struct.", + )); + }; + for pair in named.into_pairs() { + let (mut field, ponct) = pair.into_tuple(); + // we are in a "normal" struct, not a tuple-struct, unwrap is fine. 
+ let field_name = field.ident.clone().unwrap(); + + let (field_config, attrs) = parse_attributes(field.attrs, &field_name)?; + field.attrs = attrs; + + if let Some(field_config) = field_config { + let value = Ident::new("value", Span::call_site()); + for field in &field_config.proxy_fields { + final_fields.push(field.clone()); + } + match (ser, field_config.get_into(&value)) { + (true, Some((pre_conv, in_conv))) => { + into_pre_conv.push(pre_conv); + into_in_conv.push(in_conv); + } + (false, None) => (), + (true, None) => { + return Err(Error::new( + field_name.span(), + "Structure implement Serialize but no serializer defined", + )); + } + (false, Some(_)) => { + return Err(Error::new( + field_name.span(), + "Structure doesn't implement Serialize but a serializer is defined", + )); + } + } + match (de, field_config.get_try_from(&value)) { + (true, Some(conv)) => { + try_from_conv.push(conv); + } + (false, None) => (), + (true, None) => { + return Err(Error::new( + field_name.span(), + "Structure implement Deserialize but no deserializer defined", + )); + } + (false, Some(_)) => { + return Err(Error::new( + field_name.span(), + "Structure doesn't implement Deserialize but a deserializer is defined", + )); + } + } + } else { + pass_through.push(field_name); + final_fields.push(field); + if let Some(ponct) = ponct { + final_fields.push_punct(ponct); + } + } + } + input.fields = Fields::Named(FieldsNamed { + brace_token, + named: final_fields, + }); + + let into = if ser { + Some(quote!( + impl From<#main_ident> for #proxy_ident { + fn from(value: #main_ident) -> #proxy_ident { + #(#into_pre_conv)* + #proxy_ident { + #(#pass_through: value.#pass_through,)* + #(#into_in_conv)* + } + } + } + )) + } else { + None + }; + let try_from = if de { + Some(quote!( + impl TryFrom<#proxy_ident> for #main_ident { + type Error = String; + + fn try_from(value: #proxy_ident) -> Result { + Ok(#main_ident { + #(#pass_through: value.#pass_through,)* + #(#try_from_conv)* + }) + } + } + 
)) + } else { + None + }; + Ok(quote!( + #input + + #into + #try_from + )) +} + +fn get_ser_de(attributes: &[Attribute]) -> Result<(bool, bool), Error> { + let mut ser = false; + let mut de = false; + + for attr in attributes { + if !attr.path().is_ident("derive") { + continue; + } + let Meta::List(ref derives) = attr.meta else { + continue; + }; + let derives = + Punctuated::::parse_terminated.parse2(derives.tokens.clone())?; + + for path in derives.iter() { + ser |= path_equiv(path, &["serde", "Serialize"]); + de |= path_equiv(path, &["serde", "Deserialize"]); + } + } + Ok((ser, de)) +} + +fn get_and_remove_utoipa_schema(attributes: &mut [Attribute]) -> Result { + let mut has_schema = false; + for attr in attributes { + if !attr.path().is_ident("derive") { + continue; + } + let Meta::List(ref mut derives) = attr.meta else { + continue; + }; + + let derive_list = + Punctuated::::parse_terminated.parse2(derives.tokens.clone())?; + let mut new_derives = Punctuated::::new(); + for path in derive_list { + if path_equiv(&path, &["utoipa", "ToSchema"]) { + has_schema = true; + } else { + new_derives.push(path); + } + } + derives.tokens = quote!(#new_derives); + } + + Ok(has_schema) +} + +fn path_equiv(path: &Path, reference: &[&str]) -> bool { + if path.segments.is_empty() || reference.is_empty() { + return false; + } + + path.segments + .iter() + .rev() + .zip(reference.iter().rev()) + .fold(true, |equal, (path_part, ref_part)| { + equal && path_part.ident == ref_part + }) +} + +#[derive(Debug)] +struct MultiKeyOptions { + main_field_name: Ident, + deserializer: Option, + serializer: Option, + proxy_fields: Vec, +} + +impl MultiKeyOptions { + fn get_into(&self, this: &Ident) -> Option<(TokenStream2, TokenStream2)> { + if let Some(ref serializer) = self.serializer { + let field_names: Vec<_> = self + .proxy_fields + .iter() + .map(|field| field.ident.clone().unwrap()) + .collect(); + let main_field_name = &self.main_field_name; + + let pre = quote!( + let 
(#(#field_names,)*) = #serializer(#this.#main_field_name); + ); + let in_struct = quote!( + #( + #field_names, + )* + ); + Some((pre, in_struct)) + } else { + None + } + } + + fn get_try_from(&self, this: &Ident) -> Option { + if let Some(ref deserializer) = self.deserializer { + let field_names: Vec<_> = self + .proxy_fields + .iter() + .map(|field| field.ident.clone().unwrap()) + .collect(); + let main_field_name = &self.main_field_name; + + Some(quote!( + #main_field_name: match #deserializer( #(#this.#field_names,)* ) { + Ok(val) => val, + Err(e) => return Err(e.to_string()), + }, + )) + } else { + None + } + } +} + +enum MultiKeyOption { + Deserializer(Path), + Serializer(Path), + Fields(Vec), +} + +impl Parse for MultiKeyOption { + fn parse(input: ParseStream) -> Result { + let ident: Ident = input.parse()?; + match ident.to_string().as_str() { + "serializer" => { + input.parse::()?; + Ok(MultiKeyOption::Serializer(input.parse::()?)) + } + "deserializer" => { + input.parse::()?; + Ok(MultiKeyOption::Deserializer(input.parse::()?)) + } + "fields" => { + input.parse::()?; + let content; + parenthesized!(content in input); + let fields = content.parse_terminated(Field::parse_named, Token![,])?; + Ok(MultiKeyOption::Fields(fields.into_iter().collect())) + } + _ => Err(Error::new(ident.span(), "unknown field")), + } + } +} + +impl Parse for MultiKeyOptions { + fn parse(input: ParseStream) -> Result { + let mut res = MultiKeyOptions { + main_field_name: Ident::new("tmp_name", Span::call_site()), + deserializer: None, + serializer: None, + proxy_fields: Vec::new(), + }; + + let options = Punctuated::::parse_terminated(input)?; + for option in options { + match option { + MultiKeyOption::Deserializer(path) => { + if res.deserializer.is_none() { + res.deserializer = Some(path); + } else { + todo!("throw error"); + } + } + MultiKeyOption::Serializer(path) => { + if res.serializer.is_none() { + res.serializer = Some(path); + } else { + todo!("throw error"); + } + } + 
MultiKeyOption::Fields(fields) => { + if res.proxy_fields.is_empty() { + res.proxy_fields = fields; + } else { + todo!("throw error"); + } + } + } + } + + if res.proxy_fields.is_empty() { + todo!("throw error") + } + + Ok(res) + } +} + +fn parse_attributes( + attributes: Vec, + field_name: &Ident, +) -> Result<(Option, Vec), Error> { + let (mut multikey_attributes, normal_attributes): (Vec<_>, _) = attributes + .into_iter() + .partition(|attr| attr.path().is_ident("serde_multikey")); + + if multikey_attributes.len() > 1 { + let last = multikey_attributes.last().unwrap(); + return Err(Error::new( + last.pound_token.spans[0], + "`serde_multikey` was applied multiple time to the same field.", + )); + } + let options = if let Some(multikey_attribute) = multikey_attributes.pop() { + let Meta::List(meta_list) = multikey_attribute.meta else { + return Err(Error::new( + multikey_attribute.pound_token.spans[0], + "`serde_multikey` require list-style arguments.", + )); + }; + let mut options: MultiKeyOptions = syn::parse2(meta_list.tokens)?; + options.main_field_name = field_name.clone(); + Some(options) + } else { + None + }; + + Ok((options, normal_attributes)) +} diff --git a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.4.expected.json b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.4.expected.json index 3b7f0e570ae..9227720ae7e 100644 --- a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.4.expected.json +++ b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.4.expected.json @@ -18,6 +18,16 @@ "create_timestamp": 1789, "index_config": { "doc_mapping": { + "dynamic_mapping": { + "expand_dots": true, + "fast": { + "normalizer": "raw" + }, + "indexed": true, + "record": "basic", + "stored": true, + "tokenizer": "raw" + }, "field_mappings": [ { "fast": true, @@ -44,6 +54,7 @@ "fieldnorms": false, "indexed": true, "name": "log_level", + "record": "basic", "stored": true, "tokenizer": "raw", "type": "text" diff --git 
a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.5.expected.json b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.5.expected.json index 76b7d5a7cc3..3184db35284 100644 --- a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.5.expected.json +++ b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.5.expected.json @@ -18,6 +18,16 @@ "create_timestamp": 1789, "index_config": { "doc_mapping": { + "dynamic_mapping": { + "expand_dots": true, + "fast": { + "normalizer": "raw" + }, + "indexed": true, + "record": "basic", + "stored": true, + "tokenizer": "raw" + }, "field_mappings": [ { "fast": true, @@ -44,6 +54,7 @@ "fieldnorms": false, "indexed": true, "name": "log_level", + "record": "basic", "stored": true, "tokenizer": "raw", "type": "text" @@ -113,7 +124,8 @@ "source_id": "kafka-source", "source_type": "kafka", "transform": { - "script": ".message = downcase(string!(.message))" + "script": ".message = downcase(string!(.message))", + "timezone": "UTC" }, "version": "0.6" } diff --git a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.6.expected.json b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.6.expected.json index cde98b8990e..b6206ec38d7 100644 --- a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.6.expected.json +++ b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.6.expected.json @@ -18,6 +18,16 @@ "create_timestamp": 1789, "index_config": { "doc_mapping": { + "dynamic_mapping": { + "expand_dots": true, + "fast": { + "normalizer": "raw" + }, + "indexed": true, + "record": "basic", + "stored": true, + "tokenizer": "raw" + }, "field_mappings": [ { "fast": true, @@ -44,6 +54,7 @@ "fieldnorms": false, "indexed": true, "name": "log_level", + "record": "basic", "stored": true, "tokenizer": "raw", "type": "text" @@ -120,7 +131,8 @@ "source_id": "kafka-source", "source_type": "kafka", "transform": { - "script": ".message = downcase(string!(.message))" + "script": ".message = 
downcase(string!(.message))", + "timezone": "UTC" }, "version": "0.6" } diff --git a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.6.json b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.6.json index cde98b8990e..b6206ec38d7 100644 --- a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.6.json +++ b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.6.json @@ -18,6 +18,16 @@ "create_timestamp": 1789, "index_config": { "doc_mapping": { + "dynamic_mapping": { + "expand_dots": true, + "fast": { + "normalizer": "raw" + }, + "indexed": true, + "record": "basic", + "stored": true, + "tokenizer": "raw" + }, "field_mappings": [ { "fast": true, @@ -44,6 +54,7 @@ "fieldnorms": false, "indexed": true, "name": "log_level", + "record": "basic", "stored": true, "tokenizer": "raw", "type": "text" @@ -120,7 +131,8 @@ "source_id": "kafka-source", "source_type": "kafka", "transform": { - "script": ".message = downcase(string!(.message))" + "script": ".message = downcase(string!(.message))", + "timezone": "UTC" }, "version": "0.6" } diff --git a/quickwit/quickwit-metastore/test-data/index-metadata/v0.4.expected.json b/quickwit/quickwit-metastore/test-data/index-metadata/v0.4.expected.json index c4bd9f937e9..f3620dd8787 100644 --- a/quickwit/quickwit-metastore/test-data/index-metadata/v0.4.expected.json +++ b/quickwit/quickwit-metastore/test-data/index-metadata/v0.4.expected.json @@ -7,6 +7,16 @@ "create_timestamp": 1789, "index_config": { "doc_mapping": { + "dynamic_mapping": { + "expand_dots": true, + "fast": { + "normalizer": "raw" + }, + "indexed": true, + "record": "basic", + "stored": true, + "tokenizer": "raw" + }, "field_mappings": [ { "fast": true, @@ -33,6 +43,7 @@ "fieldnorms": false, "indexed": true, "name": "log_level", + "record": "basic", "stored": true, "tokenizer": "raw", "type": "text" @@ -102,7 +113,8 @@ "source_id": "kafka-source", "source_type": "kafka", "transform": { - "script": ".message = downcase(string!(.message))" + 
"script": ".message = downcase(string!(.message))", + "timezone": "UTC" }, "version": "0.6" } diff --git a/quickwit/quickwit-metastore/test-data/index-metadata/v0.5.expected.json b/quickwit/quickwit-metastore/test-data/index-metadata/v0.5.expected.json index c4bd9f937e9..f3620dd8787 100644 --- a/quickwit/quickwit-metastore/test-data/index-metadata/v0.5.expected.json +++ b/quickwit/quickwit-metastore/test-data/index-metadata/v0.5.expected.json @@ -7,6 +7,16 @@ "create_timestamp": 1789, "index_config": { "doc_mapping": { + "dynamic_mapping": { + "expand_dots": true, + "fast": { + "normalizer": "raw" + }, + "indexed": true, + "record": "basic", + "stored": true, + "tokenizer": "raw" + }, "field_mappings": [ { "fast": true, @@ -33,6 +43,7 @@ "fieldnorms": false, "indexed": true, "name": "log_level", + "record": "basic", "stored": true, "tokenizer": "raw", "type": "text" @@ -102,7 +113,8 @@ "source_id": "kafka-source", "source_type": "kafka", "transform": { - "script": ".message = downcase(string!(.message))" + "script": ".message = downcase(string!(.message))", + "timezone": "UTC" }, "version": "0.6" } diff --git a/quickwit/quickwit-metastore/test-data/index-metadata/v0.6.expected.json b/quickwit/quickwit-metastore/test-data/index-metadata/v0.6.expected.json index b865fcb421f..db773b05b33 100644 --- a/quickwit/quickwit-metastore/test-data/index-metadata/v0.6.expected.json +++ b/quickwit/quickwit-metastore/test-data/index-metadata/v0.6.expected.json @@ -7,6 +7,16 @@ "create_timestamp": 1789, "index_config": { "doc_mapping": { + "dynamic_mapping": { + "expand_dots": true, + "fast": { + "normalizer": "raw" + }, + "indexed": true, + "record": "basic", + "stored": true, + "tokenizer": "raw" + }, "field_mappings": [ { "fast": true, @@ -33,6 +43,7 @@ "fieldnorms": false, "indexed": true, "name": "log_level", + "record": "basic", "stored": true, "tokenizer": "raw", "type": "text" @@ -109,7 +120,8 @@ "source_id": "kafka-source", "source_type": "kafka", "transform": { - 
"script": ".message = downcase(string!(.message))" + "script": ".message = downcase(string!(.message))", + "timezone": "UTC" }, "version": "0.6" } diff --git a/quickwit/quickwit-metastore/test-data/index-metadata/v0.6.json b/quickwit/quickwit-metastore/test-data/index-metadata/v0.6.json index b865fcb421f..db773b05b33 100644 --- a/quickwit/quickwit-metastore/test-data/index-metadata/v0.6.json +++ b/quickwit/quickwit-metastore/test-data/index-metadata/v0.6.json @@ -7,6 +7,16 @@ "create_timestamp": 1789, "index_config": { "doc_mapping": { + "dynamic_mapping": { + "expand_dots": true, + "fast": { + "normalizer": "raw" + }, + "indexed": true, + "record": "basic", + "stored": true, + "tokenizer": "raw" + }, "field_mappings": [ { "fast": true, @@ -33,6 +43,7 @@ "fieldnorms": false, "indexed": true, "name": "log_level", + "record": "basic", "stored": true, "tokenizer": "raw", "type": "text" @@ -109,7 +120,8 @@ "source_id": "kafka-source", "source_type": "kafka", "transform": { - "script": ".message = downcase(string!(.message))" + "script": ".message = downcase(string!(.message))", + "timezone": "UTC" }, "version": "0.6" }