From 854646d5f616e432e48dcaba8a57ea2504d77cb3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Tue, 26 Sep 2023 11:51:42 +0200 Subject: [PATCH] chore: change config schema --- app/config.py | 50 ++++--- app/indexing.py | 4 +- app/postprocessing.py | 2 +- app/query.py | 4 +- config_schema.json | 239 ++++++++++++++++++++++++++++++++++ data/config/openfoodfacts.yml | 232 ++++++++++++++++----------------- 6 files changed, 384 insertions(+), 147 deletions(-) create mode 100644 config_schema.json diff --git a/app/config.py b/app/config.py index d03b8e6e..f8b01095 100644 --- a/app/config.py +++ b/app/config.py @@ -2,7 +2,7 @@ from pathlib import Path import yaml -from pydantic import BaseModel, Field, HttpUrl, model_validator +from pydantic import BaseModel, Field, HttpUrl, field_validator, model_validator from pydantic_settings import BaseSettings @@ -76,8 +76,8 @@ def is_numeric(self): class FieldConfig(BaseModel): - # name of the field, must be unique across the config - name: str + # name of the field (internal field), it's added here for convenience + _name: str = "" # type of the field, see `FieldType` for possible values type: FieldType # if required=True, the field is required in the input data @@ -93,6 +93,10 @@ class FieldConfig(BaseModel): # can the keyword field contain multiple value (keyword type only) multi: bool = False + @property + def name(self): + return self._name + @model_validator(mode="after") def multi_should_be_used_for_selected_type_only(self): """Validator that checks that `multi` flag is only True for fields @@ -138,8 +142,9 @@ class IndexConfig(BaseModel): class Config(BaseModel): # configuration of the index index: IndexConfig - # configuration of all fields in the index - fields: list[FieldConfig] + # configuration of all fields in the index, keys are field names and values + # contain the field configuration + fields: dict[str, FieldConfig] split_separator: str = "," # for `text_lang` FieldType, the separator 
between the name of the field # and the language code, ex: product_name_it if lang_separator="_" @@ -165,7 +170,7 @@ def taxonomy_name_should_be_defined(self): """Validator that checks that for if `taxonomy_type` is defined for a field, it refers to a taxonomy defined in `taxonomy.sources`.""" defined_taxonomies = [source.name for source in self.taxonomy.sources] - for field in self.fields: + for field in self.fields.values(): if ( field.taxonomy_name is not None and field.taxonomy_name not in defined_taxonomies @@ -175,36 +180,21 @@ def taxonomy_name_should_be_defined(self): ) return self - @model_validator(mode="after") - def field_name_should_be_unique(self): - """Validator that checks that all fields have unique names.""" - seen: set[str] = set() - for field in self.fields: - if field.name in seen: - raise ValueError( - f"each field name should be unique, duplicate found: '{field.name}'" - ) - seen.add(field.name) - return self - @model_validator(mode="after") def field_references_must_exist_and_be_valid(self): """Validator that checks that every field reference in IndexConfig refers to an existing field and is valid.""" - - fields_by_name = {f.name: f for f in self.fields} - - if self.index.id_field_name not in fields_by_name: + if self.index.id_field_name not in self.fields: raise ValueError( f"id_field_name={self.index.id_field_name} but field was not declared" ) - if self.index.last_modified_field_name not in fields_by_name: + if self.index.last_modified_field_name not in self.fields: raise ValueError( f"last_modified_field_name={self.index.last_modified_field_name} but field was not declared" ) - last_modified_field = fields_by_name[self.index.last_modified_field_name] + last_modified_field = self.fields[self.index.last_modified_field_name] if last_modified_field.type != FieldType.date: raise ValueError( @@ -216,13 +206,20 @@ def field_references_must_exist_and_be_valid(self): @model_validator(mode="after") def if_split_should_be_multi(self): """Validator 
that checks that multi=True if split=True..""" - for field in self.fields: + for field in self.fields.values(): if field.split and not field.multi: raise ValueError("multi should be True if split=True") return self + @field_validator("fields") + @classmethod + def add_field_name_to_each_field(cls, fields): + for field_name, field_item in fields.items(): + field_item._name = field_name + return fields + def get_input_fields(self) -> set[str]: - return {field.name for field in self.fields} | { - field.input_field for field in self.fields if field.input_field is not None + return set(self.fields) | { + field.input_field for field in self.fields.values() if field.input_field is not None } @@ -231,6 +228,7 @@ def get_supported_langs(self) -> set[str]: @classmethod def from_yaml(cls, path: Path) -> "Config": + """Create a Config from a yaml configuration file.""" with path.open("r") as f: data = yaml.safe_load(f) return cls(**data) diff --git a/app/indexing.py b/app/indexing.py index 052aefe0..80e1388b 100644 --- a/app/indexing.py +++ b/app/indexing.py @@ -202,7 +202,7 @@ def from_dict(self, d: JSONType) -> JSONType | None: if d is None: return None - for field in self.config.fields: + for field in self.config.fields.values(): input_field = field.get_input_field() if field.type == FieldType.text_lang: @@ -234,7 +234,7 @@ def from_dict(self, d: JSONType) -> JSONType | None: def generate_mapping_object(config: Config) -> Mapping: mapping = Mapping() supported_langs = config.get_supported_langs() - for field in config.fields: + for field in config.fields.values(): mapping.field( field.name, generate_dsl_field(field, supported_langs=supported_langs) ) diff --git a/app/postprocessing.py b/app/postprocessing.py index fbf88654..711f976b 100644 --- a/app/postprocessing.py +++ b/app/postprocessing.py @@ -21,7 +21,7 @@ def process(self, response: Response, projection: set[str] | None) -> JSONType: result = hit.to_dict() result["_score"] = hit.meta.score - for field in self.config.fields: + for field in self.config.fields.values(): if field.name not in result: continue 
diff --git a/app/query.py b/app/query.py index 2aabe7cc..0314b9dc 100644 --- a/app/query.py +++ b/app/query.py @@ -45,7 +45,7 @@ def build_query_clause(query: str, langs: set[str], config: Config) -> Query: supported_langs = config.get_supported_langs() match_phrase_boost_queries = [] - for field in config.fields: + for field in config.fields.values(): # We don't include all fields in the multi-match clause, only a subset # of them if field.include_multi_match: @@ -139,7 +139,7 @@ def parse_sort_by_parameter(sort_by: str | None, config: Config) -> str | None: if negative_operator := sort_by.startswith("-"): sort_by = sort_by[1:] - for field in config.fields: + for field in config.fields.values(): if field.name == sort_by: if field.type is FieldType.text_lang: # use 'main' language subfield for sorting diff --git a/config_schema.json b/config_schema.json new file mode 100644 index 00000000..4eb9906f --- /dev/null +++ b/config_schema.json @@ -0,0 +1,239 @@ +{ + "$defs": { + "FieldConfig": { + "properties": { + "type": { + "$ref": "#/$defs/FieldType" + }, + "required": { + "default": false, + "title": "Required", + "type": "boolean" + }, + "input_field": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Input Field" + }, + "split": { + "default": false, + "title": "Split", + "type": "boolean" + }, + "include_multi_match": { + "default": false, + "title": "Include Multi Match", + "type": "boolean" + }, + "taxonomy_name": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Taxonomy Name" + }, + "multi": { + "default": false, + "title": "Multi", + "type": "boolean" + } + }, + "required": [ + "type" + ], + "title": "FieldConfig", + "type": "object" + }, + "FieldType": { + "enum": [ + "keyword", + "date", + "double", + "float", + "integer", + "bool", + "text", + "text_lang", + "taxonomy", + "disabled", + "object" + ], + "title": "FieldType", + "type": "string" + }, + 
"IndexConfig": { + "properties": { + "name": { + "title": "Name", + "type": "string" + }, + "id_field_name": { + "title": "Id Field Name", + "type": "string" + }, + "last_modified_field_name": { + "title": "Last Modified Field Name", + "type": "string" + }, + "number_of_shards": { + "default": 4, + "title": "Number Of Shards", + "type": "integer" + }, + "number_of_replicas": { + "default": 1, + "title": "Number Of Replicas", + "type": "integer" + } + }, + "required": [ + "name", + "id_field_name", + "last_modified_field_name" + ], + "title": "IndexConfig", + "type": "object" + }, + "TaxonomyConfig": { + "properties": { + "sources": { + "items": { + "$ref": "#/$defs/TaxonomySourceConfig" + }, + "title": "Sources", + "type": "array" + }, + "supported_langs": { + "items": { + "type": "string" + }, + "title": "Supported Langs", + "type": "array" + } + }, + "required": [ + "sources", + "supported_langs" + ], + "title": "TaxonomyConfig", + "type": "object" + }, + "TaxonomySourceConfig": { + "properties": { + "name": { + "title": "Name", + "type": "string" + }, + "url": { + "format": "uri", + "maxLength": 2083, + "minLength": 1, + "title": "Url", + "type": "string" + } + }, + "required": [ + "name", + "url" + ], + "title": "TaxonomySourceConfig", + "type": "object" + } + }, + "properties": { + "index": { + "$ref": "#/$defs/IndexConfig" + }, + "fields": { + "additionalProperties": { + "$ref": "#/$defs/FieldConfig" + }, + "title": "Fields", + "type": "object" + }, + "split_separator": { + "default": ",", + "title": "Split Separator", + "type": "string" + }, + "lang_separator": { + "default": "_", + "title": "Lang Separator", + "type": "string" + }, + "taxonomy": { + "$ref": "#/$defs/TaxonomyConfig" + }, + "preprocessor": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Preprocessor" + }, + "result_processor": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Result 
Processor" + }, + "supported_langs": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Supported Langs" + }, + "match_phrase_boost": { + "default": 2.0, + "title": "Match Phrase Boost", + "type": "number" + }, + "document_denylist": { + "items": { + "type": "string" + }, + "title": "Document Denylist", + "type": "array", + "uniqueItems": true + } + }, + "required": [ + "index", + "fields", + "taxonomy" + ], + "title": "Config", + "type": "object" +} \ No newline at end of file diff --git a/data/config/openfoodfacts.yml b/data/config/openfoodfacts.yml index 698f9da4..a09c743f 100644 --- a/data/config/openfoodfacts.yml +++ b/data/config/openfoodfacts.yml @@ -5,122 +5,122 @@ index: number_of_replicas: 1 number_of_shards: 4 fields: -- name: code - required: true - type: keyword -- name: obsolete - required: true - type: bool -- include_multi_match: true - name: product_name - type: text_lang -- include_multi_match: true - name: generic_name - type: text_lang -- name: abbreviated_product_name - type: text_lang -- include_multi_match: true - input_field: categories_tags - name: categories - taxonomy_name: category - type: taxonomy -- include_multi_match: true - input_field: labels_tags - name: labels - taxonomy_name: label - type: taxonomy -- include_multi_match: true - multi: true - name: brands - split: true - type: text -- multi: true - name: stores - split: true - type: text -- multi: true - name: emb_codes - split: true - type: text -- name: lang - type: keyword -- name: lc - type: keyword -- name: owner - type: keyword -- name: quantity - type: text -- multi: true - name: categories_tags - type: keyword -- multi: true - name: labels_tags - type: keyword -- multi: true - name: countries_tags - type: keyword -- multi: true - name: states_tags - type: keyword -- multi: true - name: origins_tags - type: keyword -- multi: true - name: ingredients_tags - type: keyword -- name: 
unique_scans_n - type: integer -- name: scans_n - type: integer -- name: nutrition_grades - type: keyword -- name: ecoscore_grade - type: keyword -- name: nova_groups - type: keyword -- name: last_modified_t - type: date -- name: created_t - type: date -- name: images - type: disabled -- name: additives_n - type: integer -- multi: true - name: allergens_tags - type: keyword -- name: ecoscore_data - type: disabled -- name: ecoscore_score - type: integer -- name: forest_footprint_data - type: disabled -- multi: true - name: ingredients_analysis_tags - type: keyword -- name: ingredients_n - type: integer -- name: nova_group - type: integer -- name: nutrient_levels - type: disabled -- name: nutriments - type: object -- name: nutriscore_data - type: disabled -- name: nutriscore_grade - type: keyword -- multi: true - name: traces_tags - type: keyword -- name: unknown_ingredients_n - type: integer -- name: popularity_key - type: integer -- name: nutriscore_score - type: integer -- name: completeness - type: float + code: + required: true + type: keyword + obsolete: + required: true + type: bool + product_name: + include_multi_match: true + type: text_lang + generic_name: + include_multi_match: true + type: text_lang + abbreviated_product_name: + type: text_lang + categories: + include_multi_match: true + input_field: categories_tags + taxonomy_name: category + type: taxonomy + labels: + include_multi_match: true + input_field: labels_tags + taxonomy_name: label + type: taxonomy + brands: + include_multi_match: true + multi: true + split: true + type: text + stores: + multi: true + split: true + type: text + emb_codes: + multi: true + split: true + type: text + lang: + type: keyword + lc: + type: keyword + owner: + type: keyword + quantity: + type: text + categories_tags: + multi: true + type: keyword + labels_tags: + multi: true + type: keyword + countries_tags: + multi: true + type: keyword + states_tags: + multi: true + type: keyword + origins_tags: + multi: true + 
type: keyword + ingredients_tags: + multi: true + type: keyword + unique_scans_n: + type: integer + scans_n: + type: integer + nutrition_grades: + type: keyword + ecoscore_grade: + type: keyword + nova_groups: + type: keyword + last_modified_t: + type: date + created_t: + type: date + images: + type: disabled + additives_n: + type: integer + allergens_tags: + multi: true + type: keyword + ecoscore_data: + type: disabled + ecoscore_score: + type: integer + forest_footprint_data: + type: disabled + ingredients_analysis_tags: + multi: true + type: keyword + ingredients_n: + type: integer + nova_group: + type: integer + nutrient_levels: + type: disabled + nutriments: + type: object + nutriscore_data: + type: disabled + nutriscore_grade: + type: keyword + traces_tags: + multi: true + type: keyword + unknown_ingredients_n: + type: integer + popularity_key: + type: integer + nutriscore_score: + type: integer + completeness: + type: float document_denylist: - '8901552007122' lang_separator: _