From a8f4fb3d9423fc3c1e2021e67d9209fdc3d8b9db Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 31 Jul 2024 18:16:56 +0100 Subject: [PATCH 01/77] Added a skeleton for AbstractDataCatalog and KedroDataCatalog Signed-off-by: Elena Khaustova --- kedro/io/data_catalog_redesign.py | 272 ++++++++++++++++++++++++++++++ 1 file changed, 272 insertions(+) create mode 100644 kedro/io/data_catalog_redesign.py diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py new file mode 100644 index 0000000000..ce5e3269d1 --- /dev/null +++ b/kedro/io/data_catalog_redesign.py @@ -0,0 +1,272 @@ +from __future__ import annotations + +import abc +import copy +import re +from typing import Any + +from parse import parse + +from kedro.io.core import AbstractDataset, DatasetError, DatasetNotFoundError, Version + +Patterns = dict[str, dict[str, Any]] + +CREDENTIALS_KEY = "credentials" + + +def _get_credentials(credentials_name: str, credentials: dict[str, Any]) -> Any: + """Return a set of credentials from the provided credentials dict. + + Args: + credentials_name: Credentials name. + credentials: A dictionary with all credentials. + + Returns: + The set of requested credentials. + + Raises: + KeyError: When a data set with the given name has not yet been + registered. + + """ + try: + return credentials[credentials_name] + except KeyError as exc: + raise KeyError( + f"Unable to find credentials '{credentials_name}': check your data " + "catalog and credentials configuration. See " + "https://kedro.readthedocs.io/en/stable/kedro.io.DataCatalog.html " + "for an example." + ) from exc + + +def _resolve_credentials( + config: dict[str, Any], credentials: dict[str, Any] +) -> dict[str, Any]: + """Return the dataset configuration where credentials are resolved using + credentials dictionary provided. + + Args: + config: Original dataset config, which may contain unresolved credentials. + credentials: A dictionary with all credentials. + + Returns: + The dataset config, where all the credentials are successfully resolved. + """ + config = copy.deepcopy(config) + + def _map_value(key: str, value: Any) -> Any: + if key == CREDENTIALS_KEY and isinstance(value, str): + return _get_credentials(value, credentials) + if isinstance(value, dict): + return {k: _map_value(k, v) for k, v in value.items()} + return value + + return {k: _map_value(k, v) for k, v in config.items()} + + +class AbstractDataCatalog: + datasets = None + + def __init__( + self, + datasets: dict[str, AbstractDataset] | None = None, + config: dict[str, dict[str, Any]] | None = None, + credentials: dict[str, dict[str, Any]] | None = None, + ) -> None: + self.config = config or {} + self.resolved_ds_configs = {} + self.datasets = datasets or {} + + self._dataset_patterns, self._default_pattern = self._get_patterns( + config, credentials + ) + + def __iter__(self): + yield from self.datasets.values() + + @staticmethod + def _is_pattern(pattern: str) -> bool: + """Check if a given string is a pattern. Assume that any name with '{' is a pattern.""" + return "{" in pattern + + @staticmethod + def _match_pattern(dataset_patterns: Patterns, dataset_name: str) -> str | None: + """Match a dataset name against patterns in a dictionary.""" + matches = ( + pattern + for pattern in dataset_patterns.keys() + if parse(pattern, dataset_name) + ) + return next(matches, None) + + @staticmethod + def _specificity(pattern: str) -> int: + """Helper function to check the length of exactly matched characters not inside brackets. 
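+        Specificity is used when sorting dataset patterns, so that more
+        specific patterns take precedence over generic catch-alls.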
+ + Example: + :: + + >>> specificity("{namespace}.companies") = 10 + >>> specificity("{namespace}.{dataset}") = 1 + >>> specificity("france.companies") = 16 + """ + # Remove all the placeholders from the pattern and count the number of remaining chars + result = re.sub(r"\{.*?\}", "", pattern) + return len(result) + + @classmethod + def _sort_patterns(cls, dataset_patterns: Patterns) -> dict[str, dict[str, Any]]: + """Sort a dictionary of dataset patterns according to parsing rules. + + In order: + + 1. Decreasing specificity (number of characters outside the curly brackets) + 2. Decreasing number of placeholders (number of curly bracket pairs) + 3. Alphabetically + """ + sorted_keys = sorted( + dataset_patterns, + key=lambda pattern: ( + -(cls._specificity(pattern)), + -pattern.count("{"), + pattern, + ), + ) + catch_all = [ + pattern for pattern in sorted_keys if cls._specificity(pattern) == 0 + ] + if len(catch_all) > 1: + raise DatasetError( + f"Multiple catch-all patterns found in the catalog: {', '.join(catch_all)}. Only one catch-all pattern is allowed, remove the extras." + ) + return {key: dataset_patterns[key] for key in sorted_keys} + + @classmethod + def _get_patterns( + cls, + config: dict[str, dict[str, Any]] | None, + credentials: dict[str, dict[str, Any]] | None, + load_versions: dict[str, str] | None = None, + save_version: str | None = None, + ) -> tuple[Patterns, Patterns]: + dataset_patterns = {} + config = copy.deepcopy(config) or {} + credentials = copy.deepcopy(credentials) or {} + user_default = {} + + for ds_name, ds_config in config.items(): + if not isinstance(ds_config, dict): + raise DatasetError( + f"Catalog entry '{ds_name}' is not a valid dataset configuration. " + "\nHint: If this catalog entry is intended for variable interpolation, " + "make sure that the key is preceded by an underscore." + ) + + resolved_ds_config = _resolve_credentials( # noqa: PLW2901 + ds_config, credentials + ) + if cls._is_pattern(ds_name): + dataset_patterns[ds_name] = resolved_ds_config + else: + cls.datasets[ds_name] = AbstractDataset.from_config( + ds_name, + resolved_ds_config, + load_versions.get(ds_name), + save_version, + ) + + sorted_patterns = cls._sort_patterns(dataset_patterns) + if sorted_patterns: + # If the last pattern is a catch-all pattern, pop it and set it as the default + if cls._specificity(list(sorted_patterns.keys())[-1]) == 0: + last_pattern = sorted_patterns.popitem() + user_default = {last_pattern[0]: last_pattern[1]} + + missing_keys = [ + key + for key in load_versions.keys() + if not (key in config or cls._match_pattern(sorted_patterns, key)) + ] + if missing_keys: + raise DatasetNotFoundError( + f"'load_versions' keys [{', '.join(sorted(missing_keys))}] " + f"are not found in the catalog." + ) + + return sorted_patterns, user_default + + @classmethod + def _resolve_config( + cls, dataset_name: str, matched_pattern: str, config: dict + ) -> dict[str, Any]: + # get resolved dataset config + pass + + def resolve_patterns(self, datasets: str | list[str], **kwargs): + # Logic to resolve patterns and extend self.datasets with resolved names + # and self.resolved_config with resolved config + pass + + @classmethod + def from_config( + cls, + config: dict[str, dict[str, Any]] | None, + credentials: dict[str, dict[str, Any]] | None = None, + **kwargs, + ) -> AbstractDataCatalog: + # Create a data catalog from configuration. 
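+        # Expected to mirror DataCatalog.from_config: resolve credentials, split
+        # entries into concrete datasets and factory patterns, then build the catalog.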
+ pass + + @abc.abstractmethod + def get_dataset(self, dataset_name: str, **kwargs) -> Any: + self.resolve_patterns(dataset_name, **kwargs) + # Specific dataset type logic + + @abc.abstractmethod + def get_dataset_config(self, dataset_name: str) -> dict: + # Logic to get dataset config from self.config and self._dataset_patterns, self._default_patterns + pass + + +class KedroDataCatalog(AbstractDataCatalog): + def __init__( + self, + datasets: dict[str, AbstractDataset] | None = None, + config: dict[str, dict[str, Any]] | None = None, + load_versions: dict[str, str] | None = None, + save_version: str | None = None, + ) -> None: + super().__init__(datasets, config) + + self._load_versions = load_versions or {} + self._save_version = save_version + + @classmethod + def from_config( + cls, + config: dict[str, dict[str, Any]] | None, + credentials: dict[str, dict[str, Any]] | None = None, + load_versions: dict[str, str] | None = None, + save_version: str | None = None, + ) -> KedroDataCatalog: + pass + + def resolve_patterns( + self, + datasets: str | list[str], + version: Version | None = None, + suggest: bool = True, + ) -> None: + super().resolve_patterns(datasets) + # KedroDataCatalog related logic + + def get_dataset(self, dataset_name: str, **kwargs) -> AbstractDataset: + super().get_dataset(dataset_name, **kwargs) + dataset = self.datasets[dataset_name] + # Version related logic + return dataset + + def get_dataset_config(self, dataset_name: str) -> dict: + # Logic to get dataset config from self.config and self._dataset_patterns, self._default_patterns + pass From 7d5681840e271199466c3bb57be56609c5ceb14d Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 31 Jul 2024 19:20:58 +0100 Subject: [PATCH 02/77] Removed from_config method Signed-off-by: Elena Khaustova --- kedro/io/data_catalog_redesign.py | 47 ++++++++++--------------------- 1 file changed, 15 insertions(+), 32 deletions(-) diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index ce5e3269d1..290ca0796c 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -81,6 +81,7 @@ def __init__( self._dataset_patterns, self._default_pattern = self._get_patterns( config, credentials ) + # Add datasets to catalog def __iter__(self): yield from self.datasets.values() @@ -169,6 +170,7 @@ def _get_patterns( if cls._is_pattern(ds_name): dataset_patterns[ds_name] = resolved_ds_config else: + # TODO: Move to another method - see __init__ - add datasets to catalog cls.datasets[ds_name] = AbstractDataset.from_config( ds_name, resolved_ds_config, @@ -183,17 +185,6 @@ def _get_patterns( last_pattern = sorted_patterns.popitem() user_default = {last_pattern[0]: last_pattern[1]} - missing_keys = [ - key - for key in load_versions.keys() - if not (key in config or cls._match_pattern(sorted_patterns, key)) - ] - if missing_keys: - raise DatasetNotFoundError( - f"'load_versions' keys [{', '.join(sorted(missing_keys))}] " - f"are not found in the catalog." - ) - return sorted_patterns, user_default @classmethod @@ -208,16 +199,6 @@ def resolve_patterns(self, datasets: str | list[str], **kwargs): # and self.resolved_config with resolved config pass - @classmethod - def from_config( - cls, - config: dict[str, dict[str, Any]] | None, - credentials: dict[str, dict[str, Any]] | None = None, - **kwargs, - ) -> AbstractDataCatalog: - # Create a data catalog from configuration. 
- pass - @abc.abstractmethod def get_dataset(self, dataset_name: str, **kwargs) -> Any: self.resolve_patterns(dataset_name, **kwargs) @@ -230,27 +211,29 @@ def get_dataset_config(self, dataset_name: str) -> dict: class KedroDataCatalog(AbstractDataCatalog): - def __init__( + def __init__( # noqa: PLR0913 self, datasets: dict[str, AbstractDataset] | None = None, config: dict[str, dict[str, Any]] | None = None, + credentials: dict[str, dict[str, Any]] | None = None, load_versions: dict[str, str] | None = None, save_version: str | None = None, ) -> None: - super().__init__(datasets, config) + super().__init__(datasets, config, credentials) self._load_versions = load_versions or {} self._save_version = save_version - @classmethod - def from_config( - cls, - config: dict[str, dict[str, Any]] | None, - credentials: dict[str, dict[str, Any]] | None = None, - load_versions: dict[str, str] | None = None, - save_version: str | None = None, - ) -> KedroDataCatalog: - pass + missing_keys = [ + key + for key in load_versions.keys() + if not (key in config or self._match_pattern(self._dataset_patterns, key)) + ] + if missing_keys: + raise DatasetNotFoundError( + f"'load_versions' keys [{', '.join(sorted(missing_keys))}] " + f"are not found in the catalog." + ) def resolve_patterns( self, From 0b80f23ae4534daec9524895f0b95a8218437f61 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 2 Aug 2024 16:43:00 +0100 Subject: [PATCH 03/77] Implemented _init_datasets method Signed-off-by: Elena Khaustova --- kedro/io/data_catalog_redesign.py | 75 ++++++++++++++++++++++--------- 1 file changed, 54 insertions(+), 21 deletions(-) diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index 290ca0796c..0db539b3b1 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -65,6 +65,15 @@ def _map_value(key: str, value: Any) -> Any: return {k: _map_value(k, v) for k, v in config.items()} +def validate_dataset_config(ds_name: str, ds_config: Any) -> None: + if not isinstance(ds_config, dict): + raise DatasetError( + f"Catalog entry '{ds_name}' is not a valid dataset configuration. " + "\nHint: If this catalog entry is intended for variable interpolation, " + "make sure that the key is preceded by an underscore." 
+ ) + + class AbstractDataCatalog: datasets = None @@ -81,7 +90,6 @@ def __init__( self._dataset_patterns, self._default_pattern = self._get_patterns( config, credentials ) - # Add datasets to catalog def __iter__(self): yield from self.datasets.values() @@ -143,13 +151,28 @@ def _sort_patterns(cls, dataset_patterns: Patterns) -> dict[str, dict[str, Any]] ) return {key: dataset_patterns[key] for key in sorted_keys} + @classmethod + def _init_datasets( + cls, + config: dict[str, dict[str, Any]] | None, + credentials: dict[str, dict[str, Any]] | None, + ) -> None: + for ds_name, ds_config in config.items(): + if not cls._is_pattern(ds_name): + validate_dataset_config(ds_name, ds_config) + resolved_ds_config = _resolve_credentials( # noqa: PLW2901 + ds_config, credentials + ) + cls.datasets[ds_name] = AbstractDataset.from_config( + ds_name, + resolved_ds_config, + ) + @classmethod def _get_patterns( cls, config: dict[str, dict[str, Any]] | None, credentials: dict[str, dict[str, Any]] | None, - load_versions: dict[str, str] | None = None, - save_version: str | None = None, ) -> tuple[Patterns, Patterns]: dataset_patterns = {} config = copy.deepcopy(config) or {} @@ -157,26 +180,12 @@ def _get_patterns( user_default = {} for ds_name, ds_config in config.items(): - if not isinstance(ds_config, dict): - raise DatasetError( - f"Catalog entry '{ds_name}' is not a valid dataset configuration. " - "\nHint: If this catalog entry is intended for variable interpolation, " - "make sure that the key is preceded by an underscore." - ) - - resolved_ds_config = _resolve_credentials( # noqa: PLW2901 - ds_config, credentials - ) if cls._is_pattern(ds_name): - dataset_patterns[ds_name] = resolved_ds_config - else: - # TODO: Move to another method - see __init__ - add datasets to catalog - cls.datasets[ds_name] = AbstractDataset.from_config( - ds_name, - resolved_ds_config, - load_versions.get(ds_name), - save_version, + validate_dataset_config(ds_name, ds_config) + resolved_ds_config = _resolve_credentials( # noqa: PLW2901 + ds_config, credentials ) + dataset_patterns[ds_name] = resolved_ds_config sorted_patterns = cls._sort_patterns(dataset_patterns) if sorted_patterns: @@ -211,6 +220,9 @@ def get_dataset_config(self, dataset_name: str) -> dict: class KedroDataCatalog(AbstractDataCatalog): + _save_version = None + _load_versions = None + def __init__( # noqa: PLR0913 self, datasets: dict[str, AbstractDataset] | None = None, @@ -224,6 +236,8 @@ def __init__( # noqa: PLR0913 self._load_versions = load_versions or {} self._save_version = save_version + self._init_datasets(config, credentials) + missing_keys = [ key for key in load_versions.keys() @@ -235,6 +249,25 @@ def __init__( # noqa: PLR0913 f"are not found in the catalog." 
) + @classmethod + def _init_datasets( + self, + config: dict[str, dict[str, Any]] | None, + credentials: dict[str, dict[str, Any]] | None, + ) -> None: + for ds_name, ds_config in config.items(): + if not self._is_pattern(ds_name): + validate_dataset_config(ds_name, ds_config) + resolved_ds_config = _resolve_credentials( # noqa: PLW2901 + ds_config, credentials + ) + self.datasets[ds_name] = AbstractDataset.from_config( + ds_name, + resolved_ds_config, + self._load_versions.get(ds_name), + self._save_version, + ) + def resolve_patterns( self, datasets: str | list[str], From 5c727dfd8ccb79464b486f306762511ef436329d Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 2 Aug 2024 17:12:18 +0100 Subject: [PATCH 04/77] Implemented get dataset Signed-off-by: Elena Khaustova --- kedro/io/data_catalog_redesign.py | 76 +++++++++++++++++++++++++------ 1 file changed, 61 insertions(+), 15 deletions(-) diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index 0db539b3b1..eab64793af 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -2,12 +2,19 @@ import abc import copy +import difflib import re from typing import Any from parse import parse -from kedro.io.core import AbstractDataset, DatasetError, DatasetNotFoundError, Version +from kedro.io.core import ( + AbstractDataset, + AbstractVersionedDataset, + DatasetError, + DatasetNotFoundError, + Version, +) Patterns = dict[str, dict[str, Any]] @@ -151,19 +158,18 @@ def _sort_patterns(cls, dataset_patterns: Patterns) -> dict[str, dict[str, Any]] ) return {key: dataset_patterns[key] for key in sorted_keys} - @classmethod def _init_datasets( - cls, + self, config: dict[str, dict[str, Any]] | None, credentials: dict[str, dict[str, Any]] | None, ) -> None: for ds_name, ds_config in config.items(): - if not cls._is_pattern(ds_name): + if not self._is_pattern(ds_name): validate_dataset_config(ds_name, ds_config) resolved_ds_config = _resolve_credentials( # noqa: PLW2901 ds_config, credentials ) - cls.datasets[ds_name] = AbstractDataset.from_config( + self.datasets[ds_name] = AbstractDataset.from_config( ds_name, resolved_ds_config, ) @@ -198,20 +204,55 @@ def _get_patterns( @classmethod def _resolve_config( - cls, dataset_name: str, matched_pattern: str, config: dict + cls, + dataset_name: str, + matched_pattern: str, + config: dict, ) -> dict[str, Any]: - # get resolved dataset config - pass + """Get resolved AbstractDataset from a factory config""" + result = parse(matched_pattern, dataset_name) + # Resolve the factory config for the dataset + if isinstance(config, dict): + for key, value in config.items(): + config[key] = cls._resolve_config(dataset_name, matched_pattern, value) + elif isinstance(config, (list, tuple)): + config = [ + cls._resolve_config(dataset_name, matched_pattern, value) + for value in config + ] + elif isinstance(config, str) and "}" in config: + try: + config = str(config).format_map(result.named) + except KeyError as exc: + raise DatasetError( + f"Unable to resolve '{config}' from the pattern '{matched_pattern}'. Keys used in the configuration " + f"should be present in the dataset factory pattern." 
+ ) from exc + return config + @abc.abstractmethod def resolve_patterns(self, datasets: str | list[str], **kwargs): # Logic to resolve patterns and extend self.datasets with resolved names # and self.resolved_config with resolved config pass @abc.abstractmethod - def get_dataset(self, dataset_name: str, **kwargs) -> Any: + def get_dataset(self, dataset_name: str, suggest: bool = True, **kwargs) -> Any: self.resolve_patterns(dataset_name, **kwargs) - # Specific dataset type logic + + if dataset_name not in self.datasets: + error_msg = f"Dataset '{dataset_name}' not found in the catalog" + + # Flag to turn on/off fuzzy-matching which can be time consuming and + # slow down plugins like `kedro-viz` + if suggest: + matches = difflib.get_close_matches(dataset_name, self.datasets.keys()) + if matches: + suggestions = ", ".join(matches) + error_msg += f" - did you mean one of these instead: {suggestions}" + raise DatasetNotFoundError(error_msg) + + return self.datasets[dataset_name] @abc.abstractmethod def get_dataset_config(self, dataset_name: str) -> dict: @@ -249,7 +290,6 @@ def __init__( # noqa: PLR0913 f"are not found in the catalog." ) - @classmethod def _init_datasets( self, config: dict[str, dict[str, Any]] | None, @@ -277,10 +317,16 @@ def resolve_patterns( super().resolve_patterns(datasets) # KedroDataCatalog related logic - def get_dataset(self, dataset_name: str, **kwargs) -> AbstractDataset: - super().get_dataset(dataset_name, **kwargs) - dataset = self.datasets[dataset_name] - # Version related logic + def get_dataset( + self, dataset_name: str, suggest: bool = True, version: Version | None = None + ) -> AbstractDataset: + dataset = super().get_dataset(dataset_name, suggest) + + if version and isinstance(dataset, AbstractVersionedDataset): + # we only want to return a similar-looking dataset, + # not modify the one stored in the current catalog + dataset = dataset._copy(_version=version) + return dataset def get_dataset_config(self, dataset_name: str) -> dict: From 05c9171f313b1a5918fde6d437a97bac9d297059 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 2 Aug 2024 18:02:35 +0100 Subject: [PATCH 05/77] Started resolve_patterns implementation Signed-off-by: Elena Khaustova --- kedro/io/data_catalog_redesign.py | 48 +++++++++++++++++++++++++------ 1 file changed, 39 insertions(+), 9 deletions(-) diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index eab64793af..ab37dba061 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -48,7 +48,7 @@ def _get_credentials(credentials_name: str, credentials: dict[str, Any]) -> Any: def _resolve_credentials( - config: dict[str, Any], credentials: dict[str, Any] + config: dict[str, Any], credentials: dict[str, Any] | None ) -> dict[str, Any]: """Return the dataset configuration where credentials are resolved using credentials dictionary provided. 
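+    Only string values stored under a ``credentials`` key are resolved by name;
+    nested dictionaries are traversed recursively.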
@@ -93,10 +93,20 @@ def __init__( self.config = config or {} self.resolved_ds_configs = {} self.datasets = datasets or {} + self._dataset_patterns = {} + self._default_pattern = {} - self._dataset_patterns, self._default_pattern = self._get_patterns( - config, credentials - ) + # TODO: save resolved configs for two cases + + if config: + self._dataset_patterns, self._default_pattern = self._get_patterns( + config, credentials + ) + # Init datasets + + # TODO: resolve patterns - old init from constructor + if datasets: + pass def __iter__(self): yield from self.datasets.values() @@ -160,7 +170,7 @@ def _sort_patterns(cls, dataset_patterns: Patterns) -> dict[str, dict[str, Any]] def _init_datasets( self, - config: dict[str, dict[str, Any]] | None, + config: dict[str, dict[str, Any]], credentials: dict[str, dict[str, Any]] | None, ) -> None: for ds_name, ds_config in config.items(): @@ -231,10 +241,30 @@ def _resolve_config( return config @abc.abstractmethod - def resolve_patterns(self, datasets: str | list[str], **kwargs): - # Logic to resolve patterns and extend self.datasets with resolved names - # and self.resolved_config with resolved config - pass + def resolve_patterns( + self, datasets: str | list[str], **kwargs + ) -> dict[str, Any] | list[dict[str, Any]]: + if isinstance(datasets, str): + datasets = [datasets] + + # resolved_configs = [] + # + # for dataset_name in datasets: + # matched_pattern = self._match_pattern(self._dataset_patterns, dataset_name) + # + # if dataset_name not in self.datasets and matched_pattern: + # # If the dataset is a patterned dataset, materialise it and add it to + # # the catalog + # # TODO: Check how to save all resolved datasets configurations + # config_copy = copy.deepcopy( + # self._dataset_patterns.get(matched_pattern) + # or self._default_pattern.get(matched_pattern) + # or {} + # ) + # + # dataset_config = self._resolve_config( + # dataset_name, matched_pattern, config_copy + # ) @abc.abstractmethod def get_dataset(self, dataset_name: str, suggest: bool = True, **kwargs) -> Any: From 5c804d6ca911dc9c21b045ee4441eb25358000c4 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Mon, 5 Aug 2024 18:07:45 +0100 Subject: [PATCH 06/77] Implemented resolve_patterns Signed-off-by: Elena Khaustova --- kedro/io/data_catalog_redesign.py | 219 +++++++++++++++--------------- 1 file changed, 113 insertions(+), 106 deletions(-) diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index ab37dba061..6268d0842c 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -81,7 +81,33 @@ def validate_dataset_config(ds_name: str, ds_config: Any) -> None: ) -class AbstractDataCatalog: +def _resolve_config( + dataset_name: str, + matched_pattern: str, + config: dict, +) -> dict[str, Any]: + """Get resolved AbstractDataset from a factory config""" + result = parse(matched_pattern, dataset_name) + # Resolve the factory config for the dataset + if isinstance(config, dict): + for key, value in config.items(): + config[key] = _resolve_config(dataset_name, matched_pattern, value) + elif isinstance(config, (list, tuple)): + config = [ + _resolve_config(dataset_name, matched_pattern, value) for value in config + ] + elif isinstance(config, str) and "}" in config: + try: + config = str(config).format_map(result.named) + except KeyError as exc: + raise DatasetError( + f"Unable to resolve '{config}' from the pattern '{matched_pattern}'. 
Keys used in the configuration " + f"should be present in the dataset factory pattern." + ) from exc + return config + + +class AbstractDataCatalog(abc.ABC): datasets = None def __init__( @@ -90,27 +116,37 @@ def __init__( config: dict[str, dict[str, Any]] | None = None, credentials: dict[str, dict[str, Any]] | None = None, ) -> None: - self.config = config or {} + self.config = {} self.resolved_ds_configs = {} self.datasets = datasets or {} self._dataset_patterns = {} self._default_pattern = {} - # TODO: save resolved configs for two cases + if datasets: + for ds_name in datasets: + self.resolved_ds_configs[ds_name] = {} if config: self._dataset_patterns, self._default_pattern = self._get_patterns( config, credentials ) - # Init datasets - - # TODO: resolve patterns - old init from constructor - if datasets: - pass + self._update_ds_configs(config) + self._init_datasets(config, credentials) def __iter__(self): yield from self.datasets.values() + def _update_ds_configs(self, config: dict[str, dict[str, Any]]) -> None: + for ds_name, ds_config in config.items(): + if ds_name in self._dataset_patterns: + self.resolved_ds_configs[ds_name] = _resolve_config( + ds_name, ds_name, self._dataset_patterns[ds_name] + ) + else: + self.resolved_ds_configs[ds_name] = _resolve_config( + ds_name, ds_name, ds_config + ) + @staticmethod def _is_pattern(pattern: str) -> bool: """Check if a given string is a pattern. Assume that any name with '{' is a pattern.""" @@ -168,9 +204,16 @@ def _sort_patterns(cls, dataset_patterns: Patterns) -> dict[str, dict[str, Any]] ) return {key: dataset_patterns[key] for key in sorted_keys} + @abc.abstractmethod + def _init_dataset(self, ds_name: str, config: dict[str, Any]) -> None: + raise NotImplementedError( + f"'{self.__class__.__name__}' is a subclass of AbstractDataCatalog and " + f"it must implement the '_init_dataset' method" + ) + def _init_datasets( self, - config: dict[str, dict[str, Any]], + config: dict[str, dict[str, Any]] | None, credentials: dict[str, dict[str, Any]] | None, ) -> None: for ds_name, ds_config in config.items(): @@ -179,10 +222,7 @@ def _init_datasets( resolved_ds_config = _resolve_credentials( # noqa: PLW2901 ds_config, credentials ) - self.datasets[ds_name] = AbstractDataset.from_config( - ds_name, - resolved_ds_config, - ) + self._init_dataset(ds_name, resolved_ds_config) @classmethod def _get_patterns( @@ -212,82 +252,74 @@ def _get_patterns( return sorted_patterns, user_default - @classmethod - def _resolve_config( - cls, - dataset_name: str, - matched_pattern: str, - config: dict, - ) -> dict[str, Any]: - """Get resolved AbstractDataset from a factory config""" - result = parse(matched_pattern, dataset_name) - # Resolve the factory config for the dataset - if isinstance(config, dict): - for key, value in config.items(): - config[key] = cls._resolve_config(dataset_name, matched_pattern, value) - elif isinstance(config, (list, tuple)): - config = [ - cls._resolve_config(dataset_name, matched_pattern, value) - for value in config - ] - elif isinstance(config, str) and "}" in config: - try: - config = str(config).format_map(result.named) - except KeyError as exc: - raise DatasetError( - f"Unable to resolve '{config}' from the pattern '{matched_pattern}'. Keys used in the configuration " - f"should be present in the dataset factory pattern." 
- ) from exc - return config - - @abc.abstractmethod def resolve_patterns( - self, datasets: str | list[str], **kwargs + self, datasets: str | list[str], suggest: bool = True ) -> dict[str, Any] | list[dict[str, Any]]: if isinstance(datasets, str): - datasets = [datasets] - - # resolved_configs = [] - # - # for dataset_name in datasets: - # matched_pattern = self._match_pattern(self._dataset_patterns, dataset_name) - # - # if dataset_name not in self.datasets and matched_pattern: - # # If the dataset is a patterned dataset, materialise it and add it to - # # the catalog - # # TODO: Check how to save all resolved datasets configurations - # config_copy = copy.deepcopy( - # self._dataset_patterns.get(matched_pattern) - # or self._default_pattern.get(matched_pattern) - # or {} - # ) - # - # dataset_config = self._resolve_config( - # dataset_name, matched_pattern, config_copy - # ) + datasets_lst = [datasets] + else: + datasets_lst = datasets + + resolved_configs = [] + + for ds_name in datasets_lst: + matched_pattern = self._match_pattern(self._dataset_patterns, ds_name) + if matched_pattern: + if ds_name not in self.datasets: + # If the dataset is a patterned dataset, materialise it and add it to + # the catalog + config_copy = copy.deepcopy( + self._dataset_patterns.get(matched_pattern) + or self._default_pattern.get(matched_pattern) + or {} + ) + ds_config = _resolve_config(ds_name, matched_pattern, config_copy) + + if ( + self._specificity(matched_pattern) == 0 + and matched_pattern in self._default_pattern + ): + self._logger.warning( + "Config from the dataset factory pattern '%s' in the catalog will be used to " + "override the default dataset creation for '%s'", + matched_pattern, + ds_name, + ) + resolved_configs.append(ds_config) + else: + resolved_configs.append(self.resolved_ds_configs.get(ds_name, {})) + else: + resolved_configs.append(None) - @abc.abstractmethod - def get_dataset(self, dataset_name: str, suggest: bool = True, **kwargs) -> Any: - self.resolve_patterns(dataset_name, **kwargs) + if isinstance(datasets, str): + return resolved_configs[0] + else: + return resolved_configs + + def get_dataset(self, ds_name: str, suggest: bool = True) -> Any: + ds_config = self.resolve_patterns(ds_name) - if dataset_name not in self.datasets: - error_msg = f"Dataset '{dataset_name}' not found in the catalog" + if ds_config is None: + error_msg = f"Dataset '{ds_name}' not found in the catalog" # Flag to turn on/off fuzzy-matching which can be time consuming and # slow down plugins like `kedro-viz` if suggest: - matches = difflib.get_close_matches(dataset_name, self.datasets.keys()) + matches = difflib.get_close_matches(ds_name, self.datasets.keys()) if matches: suggestions = ", ".join(matches) error_msg += f" - did you mean one of these instead: {suggestions}" raise DatasetNotFoundError(error_msg) + elif ds_name not in self.datasets: + self._init_dataset(ds_name, ds_config) + self.resolved_ds_configs[ds_name] = ds_config - return self.datasets[dataset_name] + return self.datasets[ds_name] - @abc.abstractmethod - def get_dataset_config(self, dataset_name: str) -> dict: - # Logic to get dataset config from self.config and self._dataset_patterns, self._default_patterns - pass + def get_dataset_config(self, ds_name: str) -> dict | None: + if ds_name in self.resolved_ds_configs: + return self.resolved_ds_configs[ds_name] + return None class KedroDataCatalog(AbstractDataCatalog): @@ -307,8 +339,6 @@ def __init__( # noqa: PLR0913 self._load_versions = load_versions or {} self._save_version = 
save_version - self._init_datasets(config, credentials) - missing_keys = [ key for key in load_versions.keys() @@ -320,32 +350,13 @@ def __init__( # noqa: PLR0913 f"are not found in the catalog." ) - def _init_datasets( - self, - config: dict[str, dict[str, Any]] | None, - credentials: dict[str, dict[str, Any]] | None, - ) -> None: - for ds_name, ds_config in config.items(): - if not self._is_pattern(ds_name): - validate_dataset_config(ds_name, ds_config) - resolved_ds_config = _resolve_credentials( # noqa: PLW2901 - ds_config, credentials - ) - self.datasets[ds_name] = AbstractDataset.from_config( - ds_name, - resolved_ds_config, - self._load_versions.get(ds_name), - self._save_version, - ) - - def resolve_patterns( - self, - datasets: str | list[str], - version: Version | None = None, - suggest: bool = True, - ) -> None: - super().resolve_patterns(datasets) - # KedroDataCatalog related logic + def _init_dataset(self, ds_name: str, config: dict[str, Any]): + self.datasets[ds_name] = AbstractDataset.from_config( + ds_name, + config, + self._load_versions.get(ds_name), + self._save_version, + ) def get_dataset( self, dataset_name: str, suggest: bool = True, version: Version | None = None @@ -358,7 +369,3 @@ def get_dataset( dataset = dataset._copy(_version=version) return dataset - - def get_dataset_config(self, dataset_name: str) -> dict: - # Logic to get dataset config from self.config and self._dataset_patterns, self._default_patterns - pass From 530f7d60a3d312688063a2f7b94886ed821451c3 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Mon, 5 Aug 2024 20:21:10 +0100 Subject: [PATCH 07/77] Fixed credentials resolving Signed-off-by: Elena Khaustova --- kedro/io/data_catalog_redesign.py | 68 +++++++++++++++++-------------- 1 file changed, 38 insertions(+), 30 deletions(-) diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index 6268d0842c..346e422822 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -130,13 +130,19 @@ def __init__( self._dataset_patterns, self._default_pattern = self._get_patterns( config, credentials ) - self._update_ds_configs(config) + self._update_ds_configs(config, credentials) self._init_datasets(config, credentials) def __iter__(self): yield from self.datasets.values() - def _update_ds_configs(self, config: dict[str, dict[str, Any]]) -> None: + def _update_ds_configs( + self, + config: dict[str, dict[str, Any]], + credentials: dict[str, dict[str, Any]] | None, + ) -> None: + config = copy.deepcopy(config) or {} + credentials = copy.deepcopy(credentials) or {} for ds_name, ds_config in config.items(): if ds_name in self._dataset_patterns: self.resolved_ds_configs[ds_name] = _resolve_config( @@ -144,7 +150,7 @@ def _update_ds_configs(self, config: dict[str, dict[str, Any]]) -> None: ) else: self.resolved_ds_configs[ds_name] = _resolve_config( - ds_name, ds_name, ds_config + ds_name, ds_name, _resolve_credentials(ds_config, credentials) ) @staticmethod @@ -253,7 +259,7 @@ def _get_patterns( return sorted_patterns, user_default def resolve_patterns( - self, datasets: str | list[str], suggest: bool = True + self, datasets: str | list[str] ) -> dict[str, Any] | list[dict[str, Any]]: if isinstance(datasets, str): datasets_lst = [datasets] @@ -264,30 +270,29 @@ def resolve_patterns( for ds_name in datasets_lst: matched_pattern = self._match_pattern(self._dataset_patterns, ds_name) - if matched_pattern: - if ds_name not in self.datasets: - # If the dataset is a patterned dataset, materialise it and add 
it to - # the catalog - config_copy = copy.deepcopy( - self._dataset_patterns.get(matched_pattern) - or self._default_pattern.get(matched_pattern) - or {} + if matched_pattern and ds_name not in self.datasets: + # If the dataset is a patterned dataset, materialise it and add it to + # the catalog + config_copy = copy.deepcopy( + self._dataset_patterns.get(matched_pattern) + or self._default_pattern.get(matched_pattern) + or {} + ) + ds_config = _resolve_config(ds_name, matched_pattern, config_copy) + + if ( + self._specificity(matched_pattern) == 0 + and matched_pattern in self._default_pattern + ): + self._logger.warning( + "Config from the dataset factory pattern '%s' in the catalog will be used to " + "override the default dataset creation for '%s'", + matched_pattern, + ds_name, ) - ds_config = _resolve_config(ds_name, matched_pattern, config_copy) - - if ( - self._specificity(matched_pattern) == 0 - and matched_pattern in self._default_pattern - ): - self._logger.warning( - "Config from the dataset factory pattern '%s' in the catalog will be used to " - "override the default dataset creation for '%s'", - matched_pattern, - ds_name, - ) - resolved_configs.append(ds_config) - else: - resolved_configs.append(self.resolved_ds_configs.get(ds_name, {})) + resolved_configs.append(ds_config) + elif ds_name in self.datasets: + resolved_configs.append(self.resolved_ds_configs.get(ds_name, {})) else: resolved_configs.append(None) @@ -334,14 +339,17 @@ def __init__( # noqa: PLR0913 load_versions: dict[str, str] | None = None, save_version: str | None = None, ) -> None: - super().__init__(datasets, config, credentials) - self._load_versions = load_versions or {} self._save_version = save_version + super().__init__(datasets, config, credentials) + + # print(self.datasets) + # print(self.resolved_ds_configs) + missing_keys = [ key - for key in load_versions.keys() + for key in self._load_versions.keys() if not (key in config or self._match_pattern(self._dataset_patterns, key)) ] if missing_keys: From 64be83cab898099b8299a9588c741e4711c798ac Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 6 Aug 2024 11:57:43 +0100 Subject: [PATCH 08/77] Updated match pattern Signed-off-by: Elena Khaustova --- kedro/io/data_catalog_redesign.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index 346e422822..37150d3b1c 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -158,12 +158,11 @@ def _is_pattern(pattern: str) -> bool: """Check if a given string is a pattern. 
Assume that any name with '{' is a pattern.""" return "{" in pattern - @staticmethod - def _match_pattern(dataset_patterns: Patterns, dataset_name: str) -> str | None: + def match_pattern(self, dataset_name: str) -> str | None: """Match a dataset name against patterns in a dictionary.""" matches = ( pattern - for pattern in dataset_patterns.keys() + for pattern in self._dataset_patterns.keys() if parse(pattern, dataset_name) ) return next(matches, None) @@ -269,7 +268,7 @@ def resolve_patterns( resolved_configs = [] for ds_name in datasets_lst: - matched_pattern = self._match_pattern(self._dataset_patterns, ds_name) + matched_pattern = self.match_pattern(ds_name) if matched_pattern and ds_name not in self.datasets: # If the dataset is a patterned dataset, materialise it and add it to # the catalog @@ -345,12 +344,17 @@ def __init__( # noqa: PLR0913 super().__init__(datasets, config, credentials) # print(self.datasets) + # print("-") # print(self.resolved_ds_configs) + # print("-") + # print(self._dataset_patterns) + # print("-") + # print(self._default_pattern) missing_keys = [ key for key in self._load_versions.keys() - if not (key in config or self._match_pattern(self._dataset_patterns, key)) + if not (key in config or self.match_pattern(key)) ] if missing_keys: raise DatasetNotFoundError( From c29828a00bb04cd8e16649c7c57fa5eaf33bb933 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 6 Aug 2024 13:41:21 +0100 Subject: [PATCH 09/77] Implemented add from dict method Signed-off-by: Elena Khaustova --- kedro/io/data_catalog_redesign.py | 61 ++++++++++++++++++++++++------- 1 file changed, 47 insertions(+), 14 deletions(-) diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index 37150d3b1c..6bd33a6658 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -3,6 +3,7 @@ import abc import copy import difflib +import logging import re from typing import Any @@ -11,10 +12,12 @@ from kedro.io.core import ( AbstractDataset, AbstractVersionedDataset, + DatasetAlreadyExistsError, DatasetError, DatasetNotFoundError, Version, ) +from kedro.io.memory_dataset import MemoryDataset Patterns = dict[str, dict[str, Any]] @@ -112,7 +115,7 @@ class AbstractDataCatalog(abc.ABC): def __init__( self, - datasets: dict[str, AbstractDataset] | None = None, + datasets: dict[str, Any] | None = None, config: dict[str, dict[str, Any]] | None = None, credentials: dict[str, dict[str, Any]] | None = None, ) -> None: @@ -169,15 +172,7 @@ def match_pattern(self, dataset_name: str) -> str | None: @staticmethod def _specificity(pattern: str) -> int: - """Helper function to check the length of exactly matched characters not inside brackets. 
- - Example: - :: - - >>> specificity("{namespace}.companies") = 10 - >>> specificity("{namespace}.{dataset}") = 1 - >>> specificity("france.companies") = 16 - """ + """Helper function to check the length of exactly matched characters not inside brackets.""" # Remove all the placeholders from the pattern and count the number of remaining chars result = re.sub(r"\{.*?\}", "", pattern) return len(result) @@ -320,10 +315,25 @@ def get_dataset(self, ds_name: str, suggest: bool = True) -> Any: return self.datasets[ds_name] - def get_dataset_config(self, ds_name: str) -> dict | None: - if ds_name in self.resolved_ds_configs: - return self.resolved_ds_configs[ds_name] - return None + @abc.abstractmethod + def add_from_dict(self, datasets: dict[str, Any], **kwargs) -> None: + raise NotImplementedError( + f"'{self.__class__.__name__}' is a subclass of AbstractDataCatalog and " + f"it must implement the '_init_dataset' method" + ) + + def add(self, dataset_name: str, dataset: Any, **kwargs) -> None: + """Adds a new ``AbstractDataset`` object to the ``DataCatalog``.""" + if dataset_name in self.datasets: + raise DatasetAlreadyExistsError( + f"Dataset '{dataset_name}' has already been registered" + ) + self.datasets[dataset_name] = dataset + self.resolved_ds_configs[dataset_name] = {} + + @property + def _logger(self) -> logging.Logger: + return logging.getLogger(__name__) class KedroDataCatalog(AbstractDataCatalog): @@ -381,3 +391,26 @@ def get_dataset( dataset = dataset._copy(_version=version) return dataset + + def add( + self, dataset_name: str, dataset: AbstractDataset, replace: bool = False + ) -> None: + """Adds a new ``AbstractDataset`` object to the ``DataCatalog``.""" + if dataset_name in self.datasets: + if replace: + self._logger.warning("Replacing dataset '%s'", dataset_name) + else: + raise DatasetAlreadyExistsError( + f"Dataset '{dataset_name}' has already been registered" + ) + self.datasets[dataset_name] = dataset + self.resolved_ds_configs[dataset_name] = {} + + def add_from_dict(self, datasets: dict[str, Any], replace: bool = False) -> None: + for ds_name in datasets: + if isinstance(datasets[ds_name], AbstractDataset): + dataset = datasets[ds_name] + else: + dataset = MemoryDataset(data=datasets[ds_name]) # type: ignore[abstract] + + self.add(ds_name, dataset, replace) From 957403a080cae4d063c2ac636800a09e380e2021 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 6 Aug 2024 15:31:13 +0100 Subject: [PATCH 10/77] Updated io __init__ Signed-off-by: Elena Khaustova --- kedro/io/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kedro/io/__init__.py b/kedro/io/__init__.py index 7902f866bd..c907a66136 100644 --- a/kedro/io/__init__.py +++ b/kedro/io/__init__.py @@ -13,6 +13,7 @@ Version, ) from .data_catalog import DataCatalog +from .data_catalog_redesign import AbstractDataCatalog, KedroDataCatalog from .lambda_dataset import LambdaDataset from .memory_dataset import MemoryDataset from .shared_memory_dataset import SharedMemoryDataset @@ -29,4 +30,6 @@ "MemoryDataset", "SharedMemoryDataset", "Version", + "AbstractDataCatalog", + "KedroDataCatalog", ] From 14908ff9d44bba0aaa6e7fa1b0ded13f4ad3d0b1 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 6 Aug 2024 15:34:21 +0100 Subject: [PATCH 11/77] Added list method Signed-off-by: Elena Khaustova --- kedro/io/data_catalog_redesign.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index 6bd33a6658..45520e8c51 100644 
--- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -335,6 +335,29 @@ def add(self, dataset_name: str, dataset: Any, **kwargs) -> None: def _logger(self) -> logging.Logger: return logging.getLogger(__name__) + def list(self, regex_search: str | None = None) -> list[str]: + """ + List of all dataset names registered in the catalog. + This can be filtered by providing an optional regular expression + which will only return matching keys. + """ + + if regex_search is None: + return list(self.datasets.keys()) + + if not regex_search.strip(): + self._logger.warning("The empty string will not match any data sets") + return [] + + try: + pattern = re.compile(regex_search, flags=re.IGNORECASE) + + except re.error as exc: + raise SyntaxError( + f"Invalid regular expression provided: '{regex_search}'" + ) from exc + return [ds_name for ds_name in self.datasets if pattern.search(ds_name)] + class KedroDataCatalog(AbstractDataCatalog): _save_version = None From c5e925bad2cce4e8fa537068c4c6d79a39460070 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 6 Aug 2024 23:00:39 +0100 Subject: [PATCH 12/77] Implemented _validate_missing_keys Signed-off-by: Elena Khaustova --- kedro/io/data_catalog_redesign.py | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index 45520e8c51..7c79ef7b69 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -111,15 +111,13 @@ def _resolve_config( class AbstractDataCatalog(abc.ABC): - datasets = None - def __init__( self, datasets: dict[str, Any] | None = None, config: dict[str, dict[str, Any]] | None = None, credentials: dict[str, dict[str, Any]] | None = None, ) -> None: - self.config = {} + self.config = config or {} self.resolved_ds_configs = {} self.datasets = datasets or {} self._dataset_patterns = {} @@ -360,9 +358,6 @@ def list(self, regex_search: str | None = None) -> list[str]: class KedroDataCatalog(AbstractDataCatalog): - _save_version = None - _load_versions = None - def __init__( # noqa: PLR0913 self, datasets: dict[str, AbstractDataset] | None = None, @@ -376,18 +371,13 @@ def __init__( # noqa: PLR0913 super().__init__(datasets, config, credentials) - # print(self.datasets) - # print("-") - # print(self.resolved_ds_configs) - # print("-") - # print(self._dataset_patterns) - # print("-") - # print(self._default_pattern) + self._validate_missing_keys() + def _validate_missing_keys(self) -> None: missing_keys = [ key for key in self._load_versions.keys() - if not (key in config or self.match_pattern(key)) + if not (key in self.config or self.match_pattern(key)) ] if missing_keys: raise DatasetNotFoundError( From b9a92b0173ea934975e5a289df112960520b6332 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 7 Aug 2024 15:05:58 +0100 Subject: [PATCH 13/77] Added datasets access logic Signed-off-by: Elena Khaustova --- kedro/io/data_catalog_redesign.py | 80 +++++++++++++++++++++---------- 1 file changed, 56 insertions(+), 24 deletions(-) diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index 7c79ef7b69..6e15cec62c 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -117,15 +117,15 @@ def __init__( config: dict[str, dict[str, Any]] | None = None, credentials: dict[str, dict[str, Any]] | None = None, ) -> None: - self.config = config or {} - self.resolved_ds_configs = {} - self.datasets = datasets or {} + self._config = 
config or {} + self._resolved_ds_configs = {} + self._datasets = datasets or {} self._dataset_patterns = {} self._default_pattern = {} if datasets: for ds_name in datasets: - self.resolved_ds_configs[ds_name] = {} + self._resolved_ds_configs[ds_name] = {} if config: self._dataset_patterns, self._default_pattern = self._get_patterns( @@ -134,8 +134,40 @@ def __init__( self._update_ds_configs(config, credentials) self._init_datasets(config, credentials) + @property + def datasets(self): + return copy.deepcopy(self._datasets) + + @datasets.setter + def datasets(self, value: Any): + msg = "Operation not allowed! Please change datasets through configuration." + raise AttributeError(msg) + + @property + def resolved_ds_configs(self): + return copy.deepcopy(self._resolved_ds_configs) + + @resolved_ds_configs.setter + def resolved_ds_configs(self, value: Any): + msg = "Operation not allowed! Please change datasets through configuration." + raise AttributeError(msg) + + @property + def dataset_patterns(self): + return self._dataset_patterns + + @property + def default_pattern(self): + return self._default_pattern + def __iter__(self): - yield from self.datasets.values() + yield from self._datasets.values() + + def __getitem__(self, ds_name: str) -> Any: + return self.get_dataset(ds_name) + + def _ipython_key_completions_(self) -> list[str]: + return list(self._datasets.keys()) def _update_ds_configs( self, @@ -146,11 +178,11 @@ def _update_ds_configs( credentials = copy.deepcopy(credentials) or {} for ds_name, ds_config in config.items(): if ds_name in self._dataset_patterns: - self.resolved_ds_configs[ds_name] = _resolve_config( + self._resolved_ds_configs[ds_name] = _resolve_config( ds_name, ds_name, self._dataset_patterns[ds_name] ) else: - self.resolved_ds_configs[ds_name] = _resolve_config( + self._resolved_ds_configs[ds_name] = _resolve_config( ds_name, ds_name, _resolve_credentials(ds_config, credentials) ) @@ -262,7 +294,7 @@ def resolve_patterns( for ds_name in datasets_lst: matched_pattern = self.match_pattern(ds_name) - if matched_pattern and ds_name not in self.datasets: + if matched_pattern and ds_name not in self._datasets: # If the dataset is a patterned dataset, materialise it and add it to # the catalog config_copy = copy.deepcopy( @@ -283,8 +315,8 @@ def resolve_patterns( ds_name, ) resolved_configs.append(ds_config) - elif ds_name in self.datasets: - resolved_configs.append(self.resolved_ds_configs.get(ds_name, {})) + elif ds_name in self._datasets: + resolved_configs.append(self._resolved_ds_configs.get(ds_name, {})) else: resolved_configs.append(None) @@ -302,16 +334,16 @@ def get_dataset(self, ds_name: str, suggest: bool = True) -> Any: # Flag to turn on/off fuzzy-matching which can be time consuming and # slow down plugins like `kedro-viz` if suggest: - matches = difflib.get_close_matches(ds_name, self.datasets.keys()) + matches = difflib.get_close_matches(ds_name, self._datasets.keys()) if matches: suggestions = ", ".join(matches) error_msg += f" - did you mean one of these instead: {suggestions}" raise DatasetNotFoundError(error_msg) - elif ds_name not in self.datasets: + elif ds_name not in self._datasets: self._init_dataset(ds_name, ds_config) - self.resolved_ds_configs[ds_name] = ds_config + self._resolved_ds_configs[ds_name] = ds_config - return self.datasets[ds_name] + return self._datasets[ds_name] @abc.abstractmethod def add_from_dict(self, datasets: dict[str, Any], **kwargs) -> None: @@ -322,12 +354,12 @@ def add_from_dict(self, datasets: dict[str, Any], **kwargs) 
-> None: def add(self, dataset_name: str, dataset: Any, **kwargs) -> None: """Adds a new ``AbstractDataset`` object to the ``DataCatalog``.""" - if dataset_name in self.datasets: + if dataset_name in self._datasets: raise DatasetAlreadyExistsError( f"Dataset '{dataset_name}' has already been registered" ) - self.datasets[dataset_name] = dataset - self.resolved_ds_configs[dataset_name] = {} + self._datasets[dataset_name] = dataset + self._resolved_ds_configs[dataset_name] = {} @property def _logger(self) -> logging.Logger: @@ -341,7 +373,7 @@ def list(self, regex_search: str | None = None) -> list[str]: """ if regex_search is None: - return list(self.datasets.keys()) + return list(self._datasets.keys()) if not regex_search.strip(): self._logger.warning("The empty string will not match any data sets") @@ -354,7 +386,7 @@ def list(self, regex_search: str | None = None) -> list[str]: raise SyntaxError( f"Invalid regular expression provided: '{regex_search}'" ) from exc - return [ds_name for ds_name in self.datasets if pattern.search(ds_name)] + return [ds_name for ds_name in self._datasets if pattern.search(ds_name)] class KedroDataCatalog(AbstractDataCatalog): @@ -377,7 +409,7 @@ def _validate_missing_keys(self) -> None: missing_keys = [ key for key in self._load_versions.keys() - if not (key in self.config or self.match_pattern(key)) + if not (key in self._config or self.match_pattern(key)) ] if missing_keys: raise DatasetNotFoundError( @@ -386,7 +418,7 @@ def _validate_missing_keys(self) -> None: ) def _init_dataset(self, ds_name: str, config: dict[str, Any]): - self.datasets[ds_name] = AbstractDataset.from_config( + self._datasets[ds_name] = AbstractDataset.from_config( ds_name, config, self._load_versions.get(ds_name), @@ -409,15 +441,15 @@ def add( self, dataset_name: str, dataset: AbstractDataset, replace: bool = False ) -> None: """Adds a new ``AbstractDataset`` object to the ``DataCatalog``.""" - if dataset_name in self.datasets: + if dataset_name in self._datasets: if replace: self._logger.warning("Replacing dataset '%s'", dataset_name) else: raise DatasetAlreadyExistsError( f"Dataset '{dataset_name}' has already been registered" ) - self.datasets[dataset_name] = dataset - self.resolved_ds_configs[dataset_name] = {} + self._datasets[dataset_name] = dataset + self._resolved_ds_configs[dataset_name] = {} def add_from_dict(self, datasets: dict[str, Any], replace: bool = False) -> None: for ds_name in datasets: From 2f3259378e92bc10b023c82b4ce9ceb1387046f5 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 7 Aug 2024 19:10:43 +0100 Subject: [PATCH 14/77] Added __contains__ and comments on lazy loading Signed-off-by: Elena Khaustova --- kedro/io/data_catalog_redesign.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index 6e15cec62c..775264115f 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -166,6 +166,13 @@ def __iter__(self): def __getitem__(self, ds_name: str) -> Any: return self.get_dataset(ds_name) + def __contains__(self, dataset_name: str) -> bool: + """Check if an item is in the catalog as a materialised dataset or pattern""" + matched_pattern = self.match_pattern(dataset_name) + if dataset_name in self._datasets or matched_pattern: + return True + return False + def _ipython_key_completions_(self) -> list[str]: return list(self._datasets.keys()) @@ -249,9 +256,7 @@ def _init_datasets( for ds_name, ds_config in 
config.items(): if not self._is_pattern(ds_name): validate_dataset_config(ds_name, ds_config) - resolved_ds_config = _resolve_credentials( # noqa: PLW2901 - ds_config, credentials - ) + resolved_ds_config = _resolve_credentials(ds_config, credentials) self._init_dataset(ds_name, resolved_ds_config) @classmethod @@ -268,9 +273,7 @@ def _get_patterns( for ds_name, ds_config in config.items(): if cls._is_pattern(ds_name): validate_dataset_config(ds_name, ds_config) - resolved_ds_config = _resolve_credentials( # noqa: PLW2901 - ds_config, credentials - ) + resolved_ds_config = _resolve_credentials(ds_config, credentials) dataset_patterns[ds_name] = resolved_ds_config sorted_patterns = cls._sort_patterns(dataset_patterns) @@ -418,6 +421,9 @@ def _validate_missing_keys(self) -> None: ) def _init_dataset(self, ds_name: str, config: dict[str, Any]): + # Add LazyAbstractDataset to store the configuration but not to init actual dataset + # Initialise actual dataset when load or save + # Add is_ds_init property self._datasets[ds_name] = AbstractDataset.from_config( ds_name, config, From d1ea64ec59a71a0907e92b1f53e83fcce39813c8 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 8 Aug 2024 11:57:23 +0100 Subject: [PATCH 15/77] Renamed dataset_name to ds_name Signed-off-by: Elena Khaustova --- kedro/io/data_catalog_redesign.py | 46 +++++++++++++++---------------- 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index 775264115f..82e40cb214 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -85,20 +85,18 @@ def validate_dataset_config(ds_name: str, ds_config: Any) -> None: def _resolve_config( - dataset_name: str, + ds_name: str, matched_pattern: str, config: dict, ) -> dict[str, Any]: """Get resolved AbstractDataset from a factory config""" - result = parse(matched_pattern, dataset_name) + result = parse(matched_pattern, ds_name) # Resolve the factory config for the dataset if isinstance(config, dict): for key, value in config.items(): - config[key] = _resolve_config(dataset_name, matched_pattern, value) + config[key] = _resolve_config(ds_name, matched_pattern, value) elif isinstance(config, (list, tuple)): - config = [ - _resolve_config(dataset_name, matched_pattern, value) for value in config - ] + config = [_resolve_config(ds_name, matched_pattern, value) for value in config] elif isinstance(config, str) and "}" in config: try: config = str(config).format_map(result.named) @@ -166,10 +164,10 @@ def __iter__(self): def __getitem__(self, ds_name: str) -> Any: return self.get_dataset(ds_name) - def __contains__(self, dataset_name: str) -> bool: + def __contains__(self, ds_name: str) -> bool: """Check if an item is in the catalog as a materialised dataset or pattern""" - matched_pattern = self.match_pattern(dataset_name) - if dataset_name in self._datasets or matched_pattern: + matched_pattern = self.match_pattern(ds_name) + if ds_name in self._datasets or matched_pattern: return True return False @@ -198,12 +196,12 @@ def _is_pattern(pattern: str) -> bool: """Check if a given string is a pattern. 
Assume that any name with '{' is a pattern.""" return "{" in pattern - def match_pattern(self, dataset_name: str) -> str | None: + def match_pattern(self, ds_name: str) -> str | None: """Match a dataset name against patterns in a dictionary.""" matches = ( pattern for pattern in self._dataset_patterns.keys() - if parse(pattern, dataset_name) + if parse(pattern, ds_name) ) return next(matches, None) @@ -355,14 +353,14 @@ def add_from_dict(self, datasets: dict[str, Any], **kwargs) -> None: f"it must implement the '_init_dataset' method" ) - def add(self, dataset_name: str, dataset: Any, **kwargs) -> None: + def add(self, ds_name: str, dataset: Any, **kwargs) -> None: """Adds a new ``AbstractDataset`` object to the ``DataCatalog``.""" - if dataset_name in self._datasets: + if ds_name in self._datasets: raise DatasetAlreadyExistsError( - f"Dataset '{dataset_name}' has already been registered" + f"Dataset '{ds_name}' has already been registered" ) - self._datasets[dataset_name] = dataset - self._resolved_ds_configs[dataset_name] = {} + self._datasets[ds_name] = dataset + self._resolved_ds_configs[ds_name] = {} @property def _logger(self) -> logging.Logger: @@ -432,9 +430,9 @@ def _init_dataset(self, ds_name: str, config: dict[str, Any]): ) def get_dataset( - self, dataset_name: str, suggest: bool = True, version: Version | None = None + self, ds_name: str, suggest: bool = True, version: Version | None = None ) -> AbstractDataset: - dataset = super().get_dataset(dataset_name, suggest) + dataset = super().get_dataset(ds_name, suggest) if version and isinstance(dataset, AbstractVersionedDataset): # we only want to return a similar-looking dataset, @@ -444,18 +442,18 @@ def get_dataset( return dataset def add( - self, dataset_name: str, dataset: AbstractDataset, replace: bool = False + self, ds_name: str, dataset: AbstractDataset, replace: bool = False ) -> None: """Adds a new ``AbstractDataset`` object to the ``DataCatalog``.""" - if dataset_name in self._datasets: + if ds_name in self._datasets: if replace: - self._logger.warning("Replacing dataset '%s'", dataset_name) + self._logger.warning("Replacing dataset '%s'", ds_name) else: raise DatasetAlreadyExistsError( - f"Dataset '{dataset_name}' has already been registered" + f"Dataset '{ds_name}' has already been registered" ) - self._datasets[dataset_name] = dataset - self._resolved_ds_configs[dataset_name] = {} + self._datasets[ds_name] = dataset + self._resolved_ds_configs[ds_name] = {} def add_from_dict(self, datasets: dict[str, Any], replace: bool = False) -> None: for ds_name in datasets: From fb89fca9041cc0aff46ce6073ca570da553652c8 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 8 Aug 2024 12:00:13 +0100 Subject: [PATCH 16/77] Updated some docstrings Signed-off-by: Elena Khaustova --- kedro/io/data_catalog_redesign.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index 82e40cb214..3bef34bd53 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -354,7 +354,7 @@ def add_from_dict(self, datasets: dict[str, Any], **kwargs) -> None: ) def add(self, ds_name: str, dataset: Any, **kwargs) -> None: - """Adds a new ``AbstractDataset`` object to the ``DataCatalog``.""" + """Adds a new dataset object to the ``AbstractDataCatalog``.""" if ds_name in self._datasets: raise DatasetAlreadyExistsError( f"Dataset '{ds_name}' has already been registered" @@ -444,7 +444,7 @@ def get_dataset( def add( self, ds_name: str, 
dataset: AbstractDataset, replace: bool = False ) -> None: - """Adds a new ``AbstractDataset`` object to the ``DataCatalog``.""" + """Adds a new ``AbstractDataset`` object to the ``KedroDataCatalog``.""" if ds_name in self._datasets: if replace: self._logger.warning("Replacing dataset '%s'", ds_name) From c6676459338507468dd4deba4066d71eff0ee9ef Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Mon, 12 Aug 2024 17:32:20 +0100 Subject: [PATCH 17/77] Fixed _update_ds_configs Signed-off-by: Elena Khaustova --- kedro/io/data_catalog_redesign.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index 3bef34bd53..a8d929a8e5 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -182,13 +182,9 @@ def _update_ds_configs( config = copy.deepcopy(config) or {} credentials = copy.deepcopy(credentials) or {} for ds_name, ds_config in config.items(): - if ds_name in self._dataset_patterns: - self._resolved_ds_configs[ds_name] = _resolve_config( - ds_name, ds_name, self._dataset_patterns[ds_name] - ) - else: - self._resolved_ds_configs[ds_name] = _resolve_config( - ds_name, ds_name, _resolve_credentials(ds_config, credentials) + if not self._is_pattern(ds_name): + self._resolved_ds_configs[ds_name] = _resolve_credentials( + ds_config, credentials ) @staticmethod From be8e929f2553a9b7d127034b45be6b3b63ee6d04 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Mon, 12 Aug 2024 18:47:21 +0100 Subject: [PATCH 18/77] Fixed _init_datasets Signed-off-by: Elena Khaustova --- kedro/io/data_catalog_redesign.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index a8d929a8e5..fc37e2f3ec 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -24,6 +24,14 @@ CREDENTIALS_KEY = "credentials" +class DatasetConfigurationNotFoundError(DatasetError): + """``DatasetConfigurationNotFoundError`` raised by ``DataCatalog`` class in case of + trying to get non-existing dataset configuration. + """ + + pass + + def _get_credentials(credentials_name: str, credentials: dict[str, Any]) -> Any: """Return a set of credentials from the provided credentials dict. @@ -130,7 +138,7 @@ def __init__( config, credentials ) self._update_ds_configs(config, credentials) - self._init_datasets(config, credentials) + self._init_datasets(config) @property def datasets(self): @@ -183,6 +191,7 @@ def _update_ds_configs( credentials = copy.deepcopy(credentials) or {} for ds_name, ds_config in config.items(): if not self._is_pattern(ds_name): + validate_dataset_config(ds_name, ds_config) self._resolved_ds_configs[ds_name] = _resolve_credentials( ds_config, credentials ) @@ -245,13 +254,15 @@ def _init_dataset(self, ds_name: str, config: dict[str, Any]) -> None: def _init_datasets( self, config: dict[str, dict[str, Any]] | None, - credentials: dict[str, dict[str, Any]] | None, ) -> None: - for ds_name, ds_config in config.items(): + for ds_name in config: if not self._is_pattern(ds_name): - validate_dataset_config(ds_name, ds_config) - resolved_ds_config = _resolve_credentials(ds_config, credentials) - self._init_dataset(ds_name, resolved_ds_config) + ds_resolved_config = self._resolved_ds_configs.get(ds_name, None) + if not ds_resolved_config: + raise DatasetConfigurationNotFoundError( + f"Dataset '{ds_name}' configuration is missing." 
+ ) + self._init_dataset(ds_name, ds_resolved_config) @classmethod def _get_patterns( @@ -346,7 +357,7 @@ def get_dataset(self, ds_name: str, suggest: bool = True) -> Any: def add_from_dict(self, datasets: dict[str, Any], **kwargs) -> None: raise NotImplementedError( f"'{self.__class__.__name__}' is a subclass of AbstractDataCatalog and " - f"it must implement the '_init_dataset' method" + f"it must implement the 'add_from_dict' method" ) def add(self, ds_name: str, dataset: Any, **kwargs) -> None: From ec7ac394f5291c870f18549b70669be08f72d959 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Mon, 12 Aug 2024 19:15:12 +0100 Subject: [PATCH 19/77] Implemented add_runtime_patterns Signed-off-by: Elena Khaustova --- kedro/io/data_catalog_redesign.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index fc37e2f3ec..1fbce495fe 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -128,6 +128,7 @@ def __init__( self._datasets = datasets or {} self._dataset_patterns = {} self._default_pattern = {} + self._runtime_patterns = {} if datasets: for ds_name in datasets: @@ -203,11 +204,10 @@ def _is_pattern(pattern: str) -> bool: def match_pattern(self, ds_name: str) -> str | None: """Match a dataset name against patterns in a dictionary.""" - matches = ( - pattern - for pattern in self._dataset_patterns.keys() - if parse(pattern, ds_name) - ) + all_patterns = list(self._dataset_patterns.keys()) + all_patterns.extend(list(self._default_pattern.keys())) + all_patterns.extend(list(self._runtime_patterns.keys())) + matches = (pattern for pattern in all_patterns if parse(pattern, ds_name)) return next(matches, None) @staticmethod @@ -348,8 +348,8 @@ def get_dataset(self, ds_name: str, suggest: bool = True) -> Any: error_msg += f" - did you mean one of these instead: {suggestions}" raise DatasetNotFoundError(error_msg) elif ds_name not in self._datasets: - self._init_dataset(ds_name, ds_config) self._resolved_ds_configs[ds_name] = ds_config + self._init_dataset(ds_name, ds_config) return self._datasets[ds_name] @@ -396,6 +396,10 @@ def list(self, regex_search: str | None = None) -> list[str]: ) from exc return [ds_name for ds_name in self._datasets if pattern.search(ds_name)] + def add_runtime_patterns(self, dataset_patterns: Patterns) -> None: + self._runtime_patterns = {**self._runtime_patterns, **dataset_patterns} + self._runtime_patterns = self._sort_patterns(self._runtime_patterns) + class KedroDataCatalog(AbstractDataCatalog): def __init__( # noqa: PLR0913 From 8e234507ec305d126294b773c1c5849f86b5d6e7 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 13 Aug 2024 12:10:12 +0100 Subject: [PATCH 20/77] Fixed runtime patterns usage Signed-off-by: Elena Khaustova --- kedro/io/data_catalog_redesign.py | 131 ++++++++++++++++++++++++++++++ 1 file changed, 131 insertions(+) diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index 1fbce495fe..c43f221984 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -18,6 +18,7 @@ Version, ) from kedro.io.memory_dataset import MemoryDataset +from kedro.logging import _format_rich, _has_rich_handler Patterns = dict[str, dict[str, Any]] @@ -308,6 +309,7 @@ def resolve_patterns( config_copy = copy.deepcopy( self._dataset_patterns.get(matched_pattern) or self._default_pattern.get(matched_pattern) + or self._runtime_patterns.get(matched_pattern) or {} ) 
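# For illustration: a name like 'france.companies' matched against the
# pattern '{namespace}.companies' parses to {'namespace': 'france'}, so a
# templated value such as 'data/{namespace}/companies.csv' in the copied
# config resolves to 'data/france/companies.csv' in the call below.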
ds_config = _resolve_config(ds_name, matched_pattern, config_copy) @@ -396,6 +398,61 @@ def list(self, regex_search: str | None = None) -> list[str]: ) from exc return [ds_name for ds_name in self._datasets if pattern.search(ds_name)] + @abc.abstractmethod + def load(self, name: str, **kwargs) -> Any: + raise NotImplementedError( + f"'{self.__class__.__name__}' is a subclass of AbstractDataCatalog and " + f"it must implement the 'load' method" + ) + + def save(self, name: str, data: Any) -> None: + """Save data to a registered data set. + + Args: + name: A data set to be saved to. + data: A data object to be saved as configured in the registered + data set. + + Raises: + DatasetNotFoundError: When a data set with the given name + has not yet been registered. + + Example: + :: + + >>> import pandas as pd + >>> + >>> from kedro_datasets.pandas import CSVDataset + >>> + >>> cars = CSVDataset(filepath="cars.csv", + >>> load_args=None, + >>> save_args={"index": False}) + >>> catalog = DataCatalog(datasets={'cars': cars}) + >>> + >>> df = pd.DataFrame({'col1': [1, 2], + >>> 'col2': [4, 5], + >>> 'col3': [5, 6]}) + >>> catalog.save("cars", df) + """ + dataset = self.get_dataset(name) + + self._logger.info( + "Saving data to %s (%s)...", + _format_rich(name, "dark_orange") + if _has_rich_handler(self._logger) + else name, + type(dataset).__name__, + extra={"markup": True}, + ) + + dataset.save(data) + + def release(self, name: str) -> None: + pass + + def confirm(self, name: str) -> None: + pass + def add_runtime_patterns(self, dataset_patterns: Patterns) -> None: self._runtime_patterns = {**self._runtime_patterns, **dataset_patterns} self._runtime_patterns = self._sort_patterns(self._runtime_patterns) @@ -474,3 +531,77 @@ def add_from_dict(self, datasets: dict[str, Any], replace: bool = False) -> None dataset = MemoryDataset(data=datasets[ds_name]) # type: ignore[abstract] self.add(ds_name, dataset, replace) + + def load(self, name: str, version: str | None = None) -> Any: + """Loads a registered data set. + + Args: + name: A data set to be loaded. + version: Optional argument for concrete data version to be loaded. + Works only with versioned datasets. + + Returns: + The loaded data as configured. + + Raises: + DatasetNotFoundError: When a data set with the given name + has not yet been registered. + + Example: + :: + + >>> from kedro.io import DataCatalog + >>> from kedro_datasets.pandas import CSVDataset + >>> + >>> cars = CSVDataset(filepath="cars.csv", + >>> load_args=None, + >>> save_args={"index": False}) + >>> catalog = DataCatalog(datasets={'cars': cars}) + >>> + >>> df = catalog.load("cars") + """ + load_version = Version(version, None) if version else None + dataset = self.get_dataset(name, version=load_version) + + self._logger.info( + "Loading data from %s (%s)...", + _format_rich(name, "dark_orange") + if _has_rich_handler(self._logger) + else name, + type(dataset).__name__, + extra={"markup": True}, + ) + + result = dataset.load() + + return result + + def release(self, name: str) -> None: + """Release any cached data associated with a data set + + Args: + name: A data set to be checked. + + Raises: + DatasetNotFoundError: When a data set with the given name + has not yet been registered. + """ + dataset = self.get_dataset(name) + dataset.release() + + def confirm(self, name: str) -> None: + """Confirm a dataset by its name. + + Args: + name: Name of the dataset. + Raises: + DatasetError: When the dataset does not have `confirm` method. 
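+
+        Example:
+        ::
+
+            >>> # Illustrative call; assumes 'motorbikes' is backed by a dataset
+            >>> # type that implements `confirm` (e.g. an incremental dataset
+            >>> # with a checkpoint).
+            >>> catalog.confirm("motorbikes")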
+ + """ + self._logger.info("Confirming dataset '%s'", name) + dataset = self.get_dataset(name) + + if hasattr(dataset, "confirm"): + dataset.confirm() + else: + raise DatasetError(f"Dataset '{name}' does not have 'confirm' method") From 50bc8165297030890dd05d5c0da73c237fc604b8 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 21 Aug 2024 16:20:13 +0100 Subject: [PATCH 21/77] Moved pattern logic out of data catalog, implemented KedroDataCatalog Signed-off-by: Elena Khaustova --- kedro/config/__init__.py | 2 + kedro/config/config_resolver.py | 237 ++++++++++++++++ kedro/io/__init__.py | 3 +- kedro/io/data_catalog_redesign.py | 455 ++++++------------------------ 4 files changed, 324 insertions(+), 373 deletions(-) create mode 100644 kedro/config/config_resolver.py diff --git a/kedro/config/__init__.py b/kedro/config/__init__.py index 500cd62615..9b47743bfe 100644 --- a/kedro/config/__init__.py +++ b/kedro/config/__init__.py @@ -7,11 +7,13 @@ BadConfigException, MissingConfigException, ) +from .config_resolver import ConfigResolver from .omegaconf_config import OmegaConfigLoader __all__ = [ "AbstractConfigLoader", "BadConfigException", + "ConfigResolver", "MissingConfigException", "OmegaConfigLoader", ] diff --git a/kedro/config/config_resolver.py b/kedro/config/config_resolver.py new file mode 100644 index 0000000000..31fec7a339 --- /dev/null +++ b/kedro/config/config_resolver.py @@ -0,0 +1,237 @@ +import copy +import logging +import re +from typing import Any + +from parse import parse + +Patterns = dict[str, dict[str, Any]] + +CREDENTIALS_KEY = "credentials" + + +def _get_credentials(credentials_name: str, credentials: dict[str, Any]) -> Any: + """Return a set of credentials from the provided credentials dict. + + Args: + credentials_name: Credentials name. + credentials: A dictionary with all credentials. + + Returns: + The set of requested credentials. + + Raises: + KeyError: When a data set with the given name has not yet been + registered. + + """ + try: + return credentials[credentials_name] + except KeyError as exc: + raise KeyError( + f"Unable to find credentials '{credentials_name}': check your data " + "catalog and credentials configuration. See " + "https://kedro.readthedocs.io/en/stable/kedro.io.DataCatalog.html " + "for an example." + ) from exc + + +def _resolve_credentials( + config: dict[str, Any], credentials: dict[str, Any] | None +) -> dict[str, Any]: + """Return the dataset configuration where credentials are resolved using + credentials dictionary provided. + + Args: + config: Original dataset config, which may contain unresolved credentials. + credentials: A dictionary with all credentials. + + Returns: + The dataset config, where all the credentials are successfully resolved. 
+ """ + config = copy.deepcopy(config) + + def _map_value(key: str, value: Any) -> Any: + if key == CREDENTIALS_KEY and isinstance(value, str): + return _get_credentials(value, credentials) + if isinstance(value, dict): + return {k: _map_value(k, v) for k, v in value.items()} + return value + + return {k: _map_value(k, v) for k, v in config.items()} + + +def _resolve_config( + ds_name: str, + matched_pattern: str, + config: dict, +) -> dict[str, Any]: + """Get resolved AbstractDataset from a factory config""" + result = parse(matched_pattern, ds_name) + # Resolve the factory config for the dataset + if isinstance(config, dict): + for key, value in config.items(): + config[key] = _resolve_config(ds_name, matched_pattern, value) + elif isinstance(config, (list, tuple)): + config = [_resolve_config(ds_name, matched_pattern, value) for value in config] + elif isinstance(config, str) and "}" in config: + try: + config = str(config).format_map(result.named) + except KeyError as exc: + raise KeyError( + f"Unable to resolve '{config}' from the pattern '{matched_pattern}'. Keys used in the configuration " + f"should be present in the dataset factory pattern." + ) from exc + return config + + +class ConfigResolver: + def __init__( + self, + config: dict[str, dict[str, Any]], + credentials: dict[str, dict[str, Any]] | None = None, + ): + self._runtime_patterns = {} + self._dataset_patterns, self._default_pattern = self._get_patterns( + config, credentials + ) + + self._ds_configs = self._get_ds_configs(config, credentials) + + @property + def _logger(self) -> logging.Logger: + return logging.getLogger(__name__) + + @staticmethod + def _is_pattern(pattern: str) -> bool: + """Check if a given string is a pattern. Assume that any name with '{' is a pattern.""" + return "{" in pattern + + @staticmethod + def _specificity(pattern: str) -> int: + """Helper function to check the length of exactly matched characters not inside brackets.""" + # Remove all the placeholders from the pattern and count the number of remaining chars + result = re.sub(r"\{.*?\}", "", pattern) + return len(result) + + @classmethod + def _sort_patterns(cls, dataset_patterns: Patterns) -> dict[str, dict[str, Any]]: + """Sort a dictionary of dataset patterns according to parsing rules. + + In order: + + 1. Decreasing specificity (number of characters outside the curly brackets) + 2. Decreasing number of placeholders (number of curly bracket pairs) + 3. Alphabetically + """ + sorted_keys = sorted( + dataset_patterns, + key=lambda pattern: ( + -(cls._specificity(pattern)), + -pattern.count("{"), + pattern, + ), + ) + catch_all = [ + pattern for pattern in sorted_keys if cls._specificity(pattern) == 0 + ] + if len(catch_all) > 1: + raise ValueError( + f"Multiple catch-all patterns found in the catalog: {', '.join(catch_all)}. Only one catch-all pattern is allowed, remove the extras." 
+ ) + return {key: dataset_patterns[key] for key in sorted_keys} + + def match_pattern(self, ds_name: str) -> str | None: + """Match a dataset name against patterns in a dictionary.""" + all_patterns = list(self._dataset_patterns.keys()) + all_patterns.extend(list(self._default_pattern.keys())) + all_patterns.extend(list(self._runtime_patterns.keys())) + matches = (pattern for pattern in all_patterns if parse(pattern, ds_name)) + return next(matches, None) + + @classmethod + def _get_patterns( + cls, + config: dict[str, dict[str, Any]] | None, + credentials: dict[str, dict[str, Any]] | None, + ) -> tuple[Patterns, Patterns]: + dataset_patterns = {} + config = copy.deepcopy(config) or {} + credentials = copy.deepcopy(credentials) or {} + user_default = {} + + for ds_name, ds_config in config.items(): + if cls._is_pattern(ds_name): + resolved_ds_config = _resolve_credentials(ds_config, credentials) + dataset_patterns[ds_name] = resolved_ds_config + + sorted_patterns = cls._sort_patterns(dataset_patterns) + if sorted_patterns: + # If the last pattern is a catch-all pattern, pop it and set it as the default + if cls._specificity(list(sorted_patterns.keys())[-1]) == 0: + last_pattern = sorted_patterns.popitem() + user_default = {last_pattern[0]: last_pattern[1]} + + return sorted_patterns, user_default + + def _get_ds_configs( + self, + config: dict[str, dict[str, Any]], + credentials: dict[str, dict[str, Any]] | None, + ) -> dict[str, dict[str, Any]]: + config = copy.deepcopy(config) or {} + credentials = copy.deepcopy(credentials) or {} + ds_configs = {} + for ds_name, ds_config in config.items(): + if not self._is_pattern(ds_name): + ds_configs[ds_name] = _resolve_credentials(ds_config, credentials) + + return ds_configs + + def resolve_patterns( + self, datasets: str | list[str] + ) -> dict[str, Any] | list[dict[str, Any]]: + if isinstance(datasets, str): + datasets_lst = [datasets] + else: + datasets_lst = datasets + + resolved_configs = [] + + for ds_name in datasets_lst: + matched_pattern = self.match_pattern(ds_name) + if matched_pattern and ds_name not in self._ds_configs: + # If the dataset is a patterned dataset, materialise it and add it to + # the catalog + config_copy = copy.deepcopy( + self._dataset_patterns.get(matched_pattern) + or self._default_pattern.get(matched_pattern) + or self._runtime_patterns.get(matched_pattern) + or {} + ) + ds_config = _resolve_config(ds_name, matched_pattern, config_copy) + + if ( + self._specificity(matched_pattern) == 0 + and matched_pattern in self._default_pattern + ): + self._logger.warning( + "Config from the dataset factory pattern '%s' in the catalog will be used to " + "override the default dataset creation for '%s'", + matched_pattern, + ds_name, + ) + resolved_configs.append(ds_config) + elif ds_name in self._ds_configs: + resolved_configs.append(self._ds_configs.get(ds_name)) + else: + resolved_configs.append(None) + + if isinstance(datasets, str): + return resolved_configs[0] + else: + return resolved_configs + + def add_runtime_patterns(self, dataset_patterns: Patterns) -> None: + self._runtime_patterns = {**self._runtime_patterns, **dataset_patterns} + self._runtime_patterns = self._sort_patterns(self._runtime_patterns) diff --git a/kedro/io/__init__.py b/kedro/io/__init__.py index 1bc285067f..df7880557e 100644 --- a/kedro/io/__init__.py +++ b/kedro/io/__init__.py @@ -14,7 +14,7 @@ Version, ) from .data_catalog import DataCatalog -from .data_catalog_redesign import AbstractDataCatalog, KedroDataCatalog +from .data_catalog_redesign 
import KedroDataCatalog from .lambda_dataset import LambdaDataset from .memory_dataset import MemoryDataset from .shared_memory_dataset import SharedMemoryDataset @@ -31,6 +31,5 @@ "MemoryDataset", "SharedMemoryDataset", "Version", - "AbstractDataCatalog", "KedroDataCatalog", ] diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index c43f221984..cad4950c56 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -1,14 +1,11 @@ from __future__ import annotations -import abc import copy import difflib import logging import re from typing import Any -from parse import parse - from kedro.io.core import ( AbstractDataset, AbstractVersionedDataset, @@ -18,7 +15,7 @@ Version, ) from kedro.io.memory_dataset import MemoryDataset -from kedro.logging import _format_rich, _has_rich_handler +from kedro.utils import _format_rich, _has_rich_handler Patterns = dict[str, dict[str, Any]] @@ -33,57 +30,6 @@ class DatasetConfigurationNotFoundError(DatasetError): pass -def _get_credentials(credentials_name: str, credentials: dict[str, Any]) -> Any: - """Return a set of credentials from the provided credentials dict. - - Args: - credentials_name: Credentials name. - credentials: A dictionary with all credentials. - - Returns: - The set of requested credentials. - - Raises: - KeyError: When a data set with the given name has not yet been - registered. - - """ - try: - return credentials[credentials_name] - except KeyError as exc: - raise KeyError( - f"Unable to find credentials '{credentials_name}': check your data " - "catalog and credentials configuration. See " - "https://kedro.readthedocs.io/en/stable/kedro.io.DataCatalog.html " - "for an example." - ) from exc - - -def _resolve_credentials( - config: dict[str, Any], credentials: dict[str, Any] | None -) -> dict[str, Any]: - """Return the dataset configuration where credentials are resolved using - credentials dictionary provided. - - Args: - config: Original dataset config, which may contain unresolved credentials. - credentials: A dictionary with all credentials. - - Returns: - The dataset config, where all the credentials are successfully resolved. - """ - config = copy.deepcopy(config) - - def _map_value(key: str, value: Any) -> Any: - if key == CREDENTIALS_KEY and isinstance(value, str): - return _get_credentials(value, credentials) - if isinstance(value, dict): - return {k: _map_value(k, v) for k, v in value.items()} - return value - - return {k: _map_value(k, v) for k, v in config.items()} - - def validate_dataset_config(ds_name: str, ds_config: Any) -> None: if not isinstance(ds_config, dict): raise DatasetError( @@ -93,55 +39,30 @@ def validate_dataset_config(ds_name: str, ds_config: Any) -> None: ) -def _resolve_config( - ds_name: str, - matched_pattern: str, - config: dict, -) -> dict[str, Any]: - """Get resolved AbstractDataset from a factory config""" - result = parse(matched_pattern, ds_name) - # Resolve the factory config for the dataset - if isinstance(config, dict): - for key, value in config.items(): - config[key] = _resolve_config(ds_name, matched_pattern, value) - elif isinstance(config, (list, tuple)): - config = [_resolve_config(ds_name, matched_pattern, value) for value in config] - elif isinstance(config, str) and "}" in config: - try: - config = str(config).format_map(result.named) - except KeyError as exc: - raise DatasetError( - f"Unable to resolve '{config}' from the pattern '{matched_pattern}'. 
Keys used in the configuration " - f"should be present in the dataset factory pattern." - ) from exc - return config - - -class AbstractDataCatalog(abc.ABC): +class KedroDataCatalog: def __init__( self, datasets: dict[str, Any] | None = None, config: dict[str, dict[str, Any]] | None = None, - credentials: dict[str, dict[str, Any]] | None = None, + load_versions: dict[str, str] | None = None, + save_version: str | None = None, ) -> None: self._config = config or {} - self._resolved_ds_configs = {} self._datasets = datasets or {} - self._dataset_patterns = {} - self._default_pattern = {} self._runtime_patterns = {} + self._load_versions = load_versions or {} + self._save_version = save_version if datasets: for ds_name in datasets: - self._resolved_ds_configs[ds_name] = {} + # TODO: API to get configuration from dataset + self._config[ds_name] = {} if config: - self._dataset_patterns, self._default_pattern = self._get_patterns( - config, credentials - ) - self._update_ds_configs(config, credentials) self._init_datasets(config) + self._validate_missing_keys() + @property def datasets(self): return copy.deepcopy(self._datasets) @@ -152,22 +73,14 @@ def datasets(self, value: Any): raise AttributeError(msg) @property - def resolved_ds_configs(self): - return copy.deepcopy(self._resolved_ds_configs) + def config(self): + return copy.deepcopy(self._config) - @resolved_ds_configs.setter - def resolved_ds_configs(self, value: Any): + @config.setter + def config(self, value: Any): msg = "Operation not allowed! Please change datasets through configuration." raise AttributeError(msg) - @property - def dataset_patterns(self): - return self._dataset_patterns - - @property - def default_pattern(self): - return self._default_pattern - def __iter__(self): yield from self._datasets.values() @@ -175,201 +88,74 @@ def __getitem__(self, ds_name: str) -> Any: return self.get_dataset(ds_name) def __contains__(self, ds_name: str) -> bool: - """Check if an item is in the catalog as a materialised dataset or pattern""" - matched_pattern = self.match_pattern(ds_name) - if ds_name in self._datasets or matched_pattern: + """Check if an item is in the catalog""" + if ds_name in self._datasets: return True return False def _ipython_key_completions_(self) -> list[str]: return list(self._datasets.keys()) - def _update_ds_configs( - self, - config: dict[str, dict[str, Any]], - credentials: dict[str, dict[str, Any]] | None, - ) -> None: - config = copy.deepcopy(config) or {} - credentials = copy.deepcopy(credentials) or {} - for ds_name, ds_config in config.items(): - if not self._is_pattern(ds_name): - validate_dataset_config(ds_name, ds_config) - self._resolved_ds_configs[ds_name] = _resolve_credentials( - ds_config, credentials - ) - - @staticmethod - def _is_pattern(pattern: str) -> bool: - """Check if a given string is a pattern. 
Assume that any name with '{' is a pattern.""" - return "{" in pattern - - def match_pattern(self, ds_name: str) -> str | None: - """Match a dataset name against patterns in a dictionary.""" - all_patterns = list(self._dataset_patterns.keys()) - all_patterns.extend(list(self._default_pattern.keys())) - all_patterns.extend(list(self._runtime_patterns.keys())) - matches = (pattern for pattern in all_patterns if parse(pattern, ds_name)) - return next(matches, None) - - @staticmethod - def _specificity(pattern: str) -> int: - """Helper function to check the length of exactly matched characters not inside brackets.""" - # Remove all the placeholders from the pattern and count the number of remaining chars - result = re.sub(r"\{.*?\}", "", pattern) - return len(result) - - @classmethod - def _sort_patterns(cls, dataset_patterns: Patterns) -> dict[str, dict[str, Any]]: - """Sort a dictionary of dataset patterns according to parsing rules. - - In order: - - 1. Decreasing specificity (number of characters outside the curly brackets) - 2. Decreasing number of placeholders (number of curly bracket pairs) - 3. Alphabetically - """ - sorted_keys = sorted( - dataset_patterns, - key=lambda pattern: ( - -(cls._specificity(pattern)), - -pattern.count("{"), - pattern, - ), - ) - catch_all = [ - pattern for pattern in sorted_keys if cls._specificity(pattern) == 0 - ] - if len(catch_all) > 1: - raise DatasetError( - f"Multiple catch-all patterns found in the catalog: {', '.join(catch_all)}. Only one catch-all pattern is allowed, remove the extras." - ) - return {key: dataset_patterns[key] for key in sorted_keys} - - @abc.abstractmethod - def _init_dataset(self, ds_name: str, config: dict[str, Any]) -> None: - raise NotImplementedError( - f"'{self.__class__.__name__}' is a subclass of AbstractDataCatalog and " - f"it must implement the '_init_dataset' method" - ) - def _init_datasets( self, config: dict[str, dict[str, Any]] | None, ) -> None: - for ds_name in config: - if not self._is_pattern(ds_name): - ds_resolved_config = self._resolved_ds_configs.get(ds_name, None) - if not ds_resolved_config: - raise DatasetConfigurationNotFoundError( - f"Dataset '{ds_name}' configuration is missing." 
- ) - self._init_dataset(ds_name, ds_resolved_config) - - @classmethod - def _get_patterns( - cls, - config: dict[str, dict[str, Any]] | None, - credentials: dict[str, dict[str, Any]] | None, - ) -> tuple[Patterns, Patterns]: - dataset_patterns = {} - config = copy.deepcopy(config) or {} - credentials = copy.deepcopy(credentials) or {} - user_default = {} - for ds_name, ds_config in config.items(): - if cls._is_pattern(ds_name): - validate_dataset_config(ds_name, ds_config) - resolved_ds_config = _resolve_credentials(ds_config, credentials) - dataset_patterns[ds_name] = resolved_ds_config - - sorted_patterns = cls._sort_patterns(dataset_patterns) - if sorted_patterns: - # If the last pattern is a catch-all pattern, pop it and set it as the default - if cls._specificity(list(sorted_patterns.keys())[-1]) == 0: - last_pattern = sorted_patterns.popitem() - user_default = {last_pattern[0]: last_pattern[1]} - - return sorted_patterns, user_default - - def resolve_patterns( - self, datasets: str | list[str] - ) -> dict[str, Any] | list[dict[str, Any]]: - if isinstance(datasets, str): - datasets_lst = [datasets] - else: - datasets_lst = datasets - - resolved_configs = [] - - for ds_name in datasets_lst: - matched_pattern = self.match_pattern(ds_name) - if matched_pattern and ds_name not in self._datasets: - # If the dataset is a patterned dataset, materialise it and add it to - # the catalog - config_copy = copy.deepcopy( - self._dataset_patterns.get(matched_pattern) - or self._default_pattern.get(matched_pattern) - or self._runtime_patterns.get(matched_pattern) - or {} - ) - ds_config = _resolve_config(ds_name, matched_pattern, config_copy) - - if ( - self._specificity(matched_pattern) == 0 - and matched_pattern in self._default_pattern - ): - self._logger.warning( - "Config from the dataset factory pattern '%s' in the catalog will be used to " - "override the default dataset creation for '%s'", - matched_pattern, - ds_name, - ) - resolved_configs.append(ds_config) - elif ds_name in self._datasets: - resolved_configs.append(self._resolved_ds_configs.get(ds_name, {})) - else: - resolved_configs.append(None) + validate_dataset_config(ds_name, ds_config) + self._init_dataset(ds_name, ds_config) - if isinstance(datasets, str): - return resolved_configs[0] - else: - return resolved_configs + def _init_dataset(self, ds_name: str, config: dict[str, Any]): + # Add LazyAbstractDataset to store the configuration but not to init actual dataset + # Initialise actual dataset when load or save + # Add is_ds_init property + if ds_name in self._datasets: + raise DatasetAlreadyExistsError( + f"Dataset '{ds_name}' has already been registered" + ) + self._datasets[ds_name] = AbstractDataset.from_config( + ds_name, + config, + self._load_versions.get(ds_name), + self._save_version, + ) - def get_dataset(self, ds_name: str, suggest: bool = True) -> Any: - ds_config = self.resolve_patterns(ds_name) + def get_dataset( + self, ds_name: str, suggest: bool = True, version: Version | None = None + ) -> AbstractDataset: + dataset = self._datasets.get(ds_name, None) - if ds_config is None: + if dataset is None: error_msg = f"Dataset '{ds_name}' not found in the catalog" # Flag to turn on/off fuzzy-matching which can be time consuming and # slow down plugins like `kedro-viz` if suggest: - matches = difflib.get_close_matches(ds_name, self._datasets.keys()) + matches = difflib.get_close_matches(ds_name, self._config.keys()) if matches: suggestions = ", ".join(matches) error_msg += f" - did you mean one of these instead: 
{suggestions}" raise DatasetNotFoundError(error_msg) - elif ds_name not in self._datasets: - self._resolved_ds_configs[ds_name] = ds_config - self._init_dataset(ds_name, ds_config) - return self._datasets[ds_name] + if version and isinstance(dataset, AbstractVersionedDataset): + # we only want to return a similar-looking dataset, + # not modify the one stored in the current catalog + dataset = dataset._copy(_version=version) - @abc.abstractmethod - def add_from_dict(self, datasets: dict[str, Any], **kwargs) -> None: - raise NotImplementedError( - f"'{self.__class__.__name__}' is a subclass of AbstractDataCatalog and " - f"it must implement the 'add_from_dict' method" - ) + return dataset - def add(self, ds_name: str, dataset: Any, **kwargs) -> None: - """Adds a new dataset object to the ``AbstractDataCatalog``.""" + def add( + self, ds_name: str, dataset: AbstractDataset, replace: bool = False + ) -> None: + """Adds a new ``AbstractDataset`` object to the ``KedroDataCatalog``.""" if ds_name in self._datasets: - raise DatasetAlreadyExistsError( - f"Dataset '{ds_name}' has already been registered" - ) + if replace: + self._logger.warning("Replacing dataset '%s'", ds_name) + else: + raise DatasetAlreadyExistsError( + f"Dataset '{ds_name}' has already been registered" + ) self._datasets[ds_name] = dataset - self._resolved_ds_configs[ds_name] = {} + self._config[ds_name] = {} @property def _logger(self) -> logging.Logger: @@ -398,13 +184,6 @@ def list(self, regex_search: str | None = None) -> list[str]: ) from exc return [ds_name for ds_name in self._datasets if pattern.search(ds_name)] - @abc.abstractmethod - def load(self, name: str, **kwargs) -> Any: - raise NotImplementedError( - f"'{self.__class__.__name__}' is a subclass of AbstractDataCatalog and " - f"it must implement the 'load' method" - ) - def save(self, name: str, data: Any) -> None: """Save data to a registered data set. @@ -448,37 +227,38 @@ def save(self, name: str, data: Any) -> None: dataset.save(data) def release(self, name: str) -> None: - pass + """Release any cached data associated with a data set - def confirm(self, name: str) -> None: - pass + Args: + name: A data set to be checked. - def add_runtime_patterns(self, dataset_patterns: Patterns) -> None: - self._runtime_patterns = {**self._runtime_patterns, **dataset_patterns} - self._runtime_patterns = self._sort_patterns(self._runtime_patterns) + Raises: + DatasetNotFoundError: When a data set with the given name + has not yet been registered. + """ + dataset = self.get_dataset(name) + dataset.release() + def confirm(self, name: str) -> None: + """Confirm a dataset by its name. -class KedroDataCatalog(AbstractDataCatalog): - def __init__( # noqa: PLR0913 - self, - datasets: dict[str, AbstractDataset] | None = None, - config: dict[str, dict[str, Any]] | None = None, - credentials: dict[str, dict[str, Any]] | None = None, - load_versions: dict[str, str] | None = None, - save_version: str | None = None, - ) -> None: - self._load_versions = load_versions or {} - self._save_version = save_version + Args: + name: Name of the dataset. + Raises: + DatasetError: When the dataset does not have `confirm` method. 
- super().__init__(datasets, config, credentials) + """ + self._logger.info("Confirming dataset '%s'", name) + dataset = self.get_dataset(name) - self._validate_missing_keys() + if hasattr(dataset, "confirm"): + dataset.confirm() + else: + raise DatasetError(f"Dataset '{name}' does not have 'confirm' method") def _validate_missing_keys(self) -> None: missing_keys = [ - key - for key in self._load_versions.keys() - if not (key in self._config or self.match_pattern(key)) + key for key in self._load_versions.keys() if key not in self._config ] if missing_keys: raise DatasetNotFoundError( @@ -486,52 +266,6 @@ def _validate_missing_keys(self) -> None: f"are not found in the catalog." ) - def _init_dataset(self, ds_name: str, config: dict[str, Any]): - # Add LazyAbstractDataset to store the configuration but not to init actual dataset - # Initialise actual dataset when load or save - # Add is_ds_init property - self._datasets[ds_name] = AbstractDataset.from_config( - ds_name, - config, - self._load_versions.get(ds_name), - self._save_version, - ) - - def get_dataset( - self, ds_name: str, suggest: bool = True, version: Version | None = None - ) -> AbstractDataset: - dataset = super().get_dataset(ds_name, suggest) - - if version and isinstance(dataset, AbstractVersionedDataset): - # we only want to return a similar-looking dataset, - # not modify the one stored in the current catalog - dataset = dataset._copy(_version=version) - - return dataset - - def add( - self, ds_name: str, dataset: AbstractDataset, replace: bool = False - ) -> None: - """Adds a new ``AbstractDataset`` object to the ``KedroDataCatalog``.""" - if ds_name in self._datasets: - if replace: - self._logger.warning("Replacing dataset '%s'", ds_name) - else: - raise DatasetAlreadyExistsError( - f"Dataset '{ds_name}' has already been registered" - ) - self._datasets[ds_name] = dataset - self._resolved_ds_configs[ds_name] = {} - - def add_from_dict(self, datasets: dict[str, Any], replace: bool = False) -> None: - for ds_name in datasets: - if isinstance(datasets[ds_name], AbstractDataset): - dataset = datasets[ds_name] - else: - dataset = MemoryDataset(data=datasets[ds_name]) # type: ignore[abstract] - - self.add(ds_name, dataset, replace) - def load(self, name: str, version: str | None = None) -> Any: """Loads a registered data set. @@ -576,32 +310,11 @@ def load(self, name: str, version: str | None = None) -> Any: return result - def release(self, name: str) -> None: - """Release any cached data associated with a data set - - Args: - name: A data set to be checked. - - Raises: - DatasetNotFoundError: When a data set with the given name - has not yet been registered. - """ - dataset = self.get_dataset(name) - dataset.release() - - def confirm(self, name: str) -> None: - """Confirm a dataset by its name. - - Args: - name: Name of the dataset. - Raises: - DatasetError: When the dataset does not have `confirm` method. 
- - """ - self._logger.info("Confirming dataset '%s'", name) - dataset = self.get_dataset(name) + def add_from_dict(self, datasets: dict[str, Any], replace: bool = False) -> None: + for ds_name in datasets: + if isinstance(datasets[ds_name], AbstractDataset): + dataset = datasets[ds_name] + else: + dataset = MemoryDataset(data=datasets[ds_name]) # type: ignore[abstract] - if hasattr(dataset, "confirm"): - dataset.confirm() - else: - raise DatasetError(f"Dataset '{name}' does not have 'confirm' method") + self.add(ds_name, dataset, replace) From 9346f081c4a8f898499b7bb140375698a979059f Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 22 Aug 2024 19:02:13 +0100 Subject: [PATCH 22/77] KedroDataCatalog updates Signed-off-by: Elena Khaustova --- kedro/io/data_catalog_redesign.py | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index cad4950c56..59efe7c777 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -52,6 +52,7 @@ def __init__( self._runtime_patterns = {} self._load_versions = load_versions or {} self._save_version = save_version + self._use_rich_markup = _has_rich_handler() if datasets: for ds_name in datasets: @@ -59,7 +60,8 @@ def __init__( self._config[ds_name] = {} if config: - self._init_datasets(config) + for ds_name, ds_config in config.items(): + self.init_dataset(ds_name, ds_config) self._validate_missing_keys() @@ -84,7 +86,7 @@ def config(self, value: Any): def __iter__(self): yield from self._datasets.values() - def __getitem__(self, ds_name: str) -> Any: + def __getitem__(self, ds_name: str) -> AbstractDataset: return self.get_dataset(ds_name) def __contains__(self, ds_name: str) -> bool: @@ -96,25 +98,19 @@ def __contains__(self, ds_name: str) -> bool: def _ipython_key_completions_(self) -> list[str]: return list(self._datasets.keys()) - def _init_datasets( - self, - config: dict[str, dict[str, Any]] | None, - ) -> None: - for ds_name, ds_config in config.items(): - validate_dataset_config(ds_name, ds_config) - self._init_dataset(ds_name, ds_config) - - def _init_dataset(self, ds_name: str, config: dict[str, Any]): + def init_dataset(self, ds_name: str, ds_config: dict[str, Any]): # Add LazyAbstractDataset to store the configuration but not to init actual dataset # Initialise actual dataset when load or save # Add is_ds_init property + validate_dataset_config(ds_name, ds_config) if ds_name in self._datasets: raise DatasetAlreadyExistsError( f"Dataset '{ds_name}' has already been registered" ) + self._config[ds_name] = ds_config self._datasets[ds_name] = AbstractDataset.from_config( ds_name, - config, + ds_config, self._load_versions.get(ds_name), self._save_version, ) @@ -217,9 +213,7 @@ def save(self, name: str, data: Any) -> None: self._logger.info( "Saving data to %s (%s)...", - _format_rich(name, "dark_orange") - if _has_rich_handler(self._logger) - else name, + _format_rich(name, "dark_orange") if self._use_rich_markup else name, type(dataset).__name__, extra={"markup": True}, ) @@ -299,9 +293,7 @@ def load(self, name: str, version: str | None = None) -> Any: self._logger.info( "Loading data from %s (%s)...", - _format_rich(name, "dark_orange") - if _has_rich_handler(self._logger) - else name, + _format_rich(name, "dark_orange") if self._use_rich_markup else name, type(dataset).__name__, extra={"markup": True}, ) From 9568e29d12f70732376115d3e653e70bdac0eabd Mon Sep 17 00:00:00 2001 From: Elena Khaustova 
Date: Wed, 28 Aug 2024 19:07:55 +0100 Subject: [PATCH 23/77] Added property to return config Signed-off-by: Elena Khaustova --- kedro/config/config_resolver.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kedro/config/config_resolver.py b/kedro/config/config_resolver.py index 31fec7a339..9b870db21f 100644 --- a/kedro/config/config_resolver.py +++ b/kedro/config/config_resolver.py @@ -98,6 +98,10 @@ def __init__( self._ds_configs = self._get_ds_configs(config, credentials) + @property + def config(self): + return copy.deepcopy(self._ds_configs) + @property def _logger(self) -> logging.Logger: return logging.getLogger(__name__) From 5e27660678d4301e9e426a951ca4969b65f8c708 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 28 Aug 2024 19:12:05 +0100 Subject: [PATCH 24/77] Added list patterns method Signed-off-by: Elena Khaustova --- kedro/config/config_resolver.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kedro/config/config_resolver.py b/kedro/config/config_resolver.py index 9b870db21f..19c246e6db 100644 --- a/kedro/config/config_resolver.py +++ b/kedro/config/config_resolver.py @@ -145,11 +145,15 @@ def _sort_patterns(cls, dataset_patterns: Patterns) -> dict[str, dict[str, Any]] ) return {key: dataset_patterns[key] for key in sorted_keys} - def match_pattern(self, ds_name: str) -> str | None: - """Match a dataset name against patterns in a dictionary.""" + def list_patterns(self) -> list[str]: all_patterns = list(self._dataset_patterns.keys()) all_patterns.extend(list(self._default_pattern.keys())) all_patterns.extend(list(self._runtime_patterns.keys())) + return all_patterns + + def match_pattern(self, ds_name: str) -> str | None: + """Match a dataset name against patterns in a dictionary.""" + all_patterns = self.list_patterns() matches = (pattern for pattern in all_patterns if parse(pattern, ds_name)) return next(matches, None) From 72b11d00124d5c7b455441cff48929a2535e954c Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 29 Aug 2024 10:35:16 +0100 Subject: [PATCH 25/77] Renamed and moved ConfigResolver Signed-off-by: Elena Khaustova --- kedro/config/__init__.py | 2 -- kedro/io/__init__.py | 4 +++- .../config_resolver.py => io/catalog_config_resolver.py} | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) rename kedro/{config/config_resolver.py => io/catalog_config_resolver.py} (99%) diff --git a/kedro/config/__init__.py b/kedro/config/__init__.py index 9b47743bfe..500cd62615 100644 --- a/kedro/config/__init__.py +++ b/kedro/config/__init__.py @@ -7,13 +7,11 @@ BadConfigException, MissingConfigException, ) -from .config_resolver import ConfigResolver from .omegaconf_config import OmegaConfigLoader __all__ = [ "AbstractConfigLoader", "BadConfigException", - "ConfigResolver", "MissingConfigException", "OmegaConfigLoader", ] diff --git a/kedro/io/__init__.py b/kedro/io/__init__.py index df7880557e..5f51dc8a3b 100644 --- a/kedro/io/__init__.py +++ b/kedro/io/__init__.py @@ -5,6 +5,7 @@ from __future__ import annotations from .cached_dataset import CachedDataset +from .catalog_config_resolver import CatalogConfigResolver from .core import ( AbstractDataset, AbstractVersionedDataset, @@ -23,13 +24,14 @@ "AbstractDataset", "AbstractVersionedDataset", "CachedDataset", + "CatalogConfigResolver", "DataCatalog", "DatasetAlreadyExistsError", "DatasetError", "DatasetNotFoundError", + "KedroDataCatalog", "LambdaDataset", "MemoryDataset", "SharedMemoryDataset", "Version", - "KedroDataCatalog", ] diff --git a/kedro/config/config_resolver.py 
b/kedro/io/catalog_config_resolver.py similarity index 99% rename from kedro/config/config_resolver.py rename to kedro/io/catalog_config_resolver.py index 19c246e6db..88437b2532 100644 --- a/kedro/config/config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -85,7 +85,7 @@ def _resolve_config( return config -class ConfigResolver: +class CatalogConfigResolver: def __init__( self, config: dict[str, dict[str, Any]], From f0a409042ec63c3fd8e88a0906ea59f13416c580 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 29 Aug 2024 11:32:43 +0100 Subject: [PATCH 26/77] Renamed ConfigResolver Signed-off-by: Elena Khaustova --- kedro/io/__init__.py | 4 ++-- kedro/io/catalog_config_resolver.py | 2 +- kedro/io/data_catalog_redesign.py | 5 +++-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/kedro/io/__init__.py b/kedro/io/__init__.py index 5f51dc8a3b..db3c295449 100644 --- a/kedro/io/__init__.py +++ b/kedro/io/__init__.py @@ -5,7 +5,7 @@ from __future__ import annotations from .cached_dataset import CachedDataset -from .catalog_config_resolver import CatalogConfigResolver +from .catalog_config_resolver import DataCatalogConfigResolver from .core import ( AbstractDataset, AbstractVersionedDataset, @@ -24,8 +24,8 @@ "AbstractDataset", "AbstractVersionedDataset", "CachedDataset", - "CatalogConfigResolver", "DataCatalog", + "DataCatalogConfigResolver", "DatasetAlreadyExistsError", "DatasetError", "DatasetNotFoundError", diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py index 88437b2532..2238165037 100644 --- a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -85,7 +85,7 @@ def _resolve_config( return config -class CatalogConfigResolver: +class DataCatalogConfigResolver: def __init__( self, config: dict[str, dict[str, Any]], diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index 59efe7c777..13c57adbba 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -99,9 +99,9 @@ def _ipython_key_completions_(self) -> list[str]: return list(self._datasets.keys()) def init_dataset(self, ds_name: str, ds_config: dict[str, Any]): - # Add LazyAbstractDataset to store the configuration but not to init actual dataset + # Add lazy loading feature to store the configuration but not to init actual dataset # Initialise actual dataset when load or save - # Add is_ds_init property + # Add is_init property validate_dataset_config(ds_name, ds_config) if ds_name in self._datasets: raise DatasetAlreadyExistsError( @@ -151,6 +151,7 @@ def add( f"Dataset '{ds_name}' has already been registered" ) self._datasets[ds_name] = dataset + # TODO: API to get configuration from dataset self._config[ds_name] = {} @property From 7d6227f3d119cbfaf44897bc949fa7552d0e12c3 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 29 Aug 2024 12:17:22 +0100 Subject: [PATCH 27/77] Cleaned KedroDataCatalog Signed-off-by: Elena Khaustova --- kedro/io/data_catalog_redesign.py | 124 +++++++----------------------- 1 file changed, 26 insertions(+), 98 deletions(-) diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py index 13c57adbba..9618198291 100644 --- a/kedro/io/data_catalog_redesign.py +++ b/kedro/io/data_catalog_redesign.py @@ -17,19 +17,9 @@ from kedro.io.memory_dataset import MemoryDataset from kedro.utils import _format_rich, _has_rich_handler -Patterns = dict[str, dict[str, Any]] - CREDENTIALS_KEY = "credentials" -class 
DatasetConfigurationNotFoundError(DatasetError): - """``DatasetConfigurationNotFoundError`` raised by ``DataCatalog`` class in case of - trying to get non-existing dataset configuration. - """ - - pass - - def validate_dataset_config(ds_name: str, ds_config: Any) -> None: if not isinstance(ds_config, dict): raise DatasetError( @@ -49,39 +39,38 @@ def __init__( ) -> None: self._config = config or {} self._datasets = datasets or {} - self._runtime_patterns = {} self._load_versions = load_versions or {} self._save_version = save_version self._use_rich_markup = _has_rich_handler() - if datasets: - for ds_name in datasets: - # TODO: API to get configuration from dataset - self._config[ds_name] = {} + for ds_name in self._datasets: + # TODO: API to get configuration from dataset + self._config[ds_name] = {} - if config: - for ds_name, ds_config in config.items(): - self.init_dataset(ds_name, ds_config) + for ds_name, ds_config in self._config.items(): + self.init_dataset(ds_name, ds_config) self._validate_missing_keys() @property - def datasets(self): + def datasets(self) -> dict[str, Any]: return copy.deepcopy(self._datasets) @datasets.setter def datasets(self, value: Any): - msg = "Operation not allowed! Please change datasets through configuration." - raise AttributeError(msg) + raise AttributeError( + "Operation not allowed! Please change datasets through configuration." + ) @property def config(self): return copy.deepcopy(self._config) @config.setter - def config(self, value: Any): - msg = "Operation not allowed! Please change datasets through configuration." - raise AttributeError(msg) + def config(self, value: Any) -> None: + raise AttributeError( + "Operation not allowed! Please change datasets through configuration." + ) def __iter__(self): yield from self._datasets.values() @@ -91,14 +80,12 @@ def __getitem__(self, ds_name: str) -> AbstractDataset: def __contains__(self, ds_name: str) -> bool: """Check if an item is in the catalog""" - if ds_name in self._datasets: - return True - return False + return ds_name in self._datasets def _ipython_key_completions_(self) -> list[str]: return list(self._datasets.keys()) - def init_dataset(self, ds_name: str, ds_config: dict[str, Any]): + def init_dataset(self, ds_name: str, ds_config: dict[str, Any]) -> None: # Add lazy loading feature to store the configuration but not to init actual dataset # Initialise actual dataset when load or save # Add is_init property validate_dataset_config(ds_name, ds_config) if ds_name in self._datasets: raise DatasetAlreadyExistsError( f"Dataset '{ds_name}' has already been registered" ) @@ -122,11 +109,10 @@ def get_dataset( if dataset is None: error_msg = f"Dataset '{ds_name}' not found in the catalog" - # Flag to turn on/off fuzzy-matching which can be time consuming and # slow down plugins like `kedro-viz` if suggest: - matches = difflib.get_close_matches(ds_name, self._config.keys()) + matches = difflib.get_close_matches(ds_name, self._datasets.keys()) if matches: suggestions = ", ".join(matches) error_msg += f" - did you mean one of these instead: {suggestions}" @@ -174,7 +160,6 @@ def list(self, regex_search: str | None = None) -> list[str]: try: pattern = re.compile(regex_search, flags=re.IGNORECASE) - except re.error as exc: raise SyntaxError( f"Invalid regular expression provided: '{regex_search}'" @@ -182,34 +167,7 @@ return [ds_name for ds_name in self._datasets if pattern.search(ds_name)] def save(self, name: str, data: Any) -> None: - """Save data to a registered data set. - - Args: - name: A data set to be saved to.
- data: A data object to be saved as configured in the registered - data set. - - Raises: - DatasetNotFoundError: When a data set with the given name - has not yet been registered. - - Example: - :: - - >>> import pandas as pd - >>> - >>> from kedro_datasets.pandas import CSVDataset - >>> - >>> cars = CSVDataset(filepath="cars.csv", - >>> load_args=None, - >>> save_args={"index": False}) - >>> catalog = DataCatalog(datasets={'cars': cars}) - >>> - >>> df = pd.DataFrame({'col1': [1, 2], - >>> 'col2': [4, 5], - >>> 'col3': [5, 6]}) - >>> catalog.save("cars", df) - """ + """Save data to a registered data set.""" dataset = self.get_dataset(name) self._logger.info( @@ -252,9 +210,7 @@ def confirm(self, name: str) -> None: raise DatasetError(f"Dataset '{name}' does not have 'confirm' method") def _validate_missing_keys(self) -> None: - missing_keys = [ - key for key in self._load_versions.keys() if key not in self._config - ] + missing_keys = [key for key in self._load_versions if key not in self._config] if missing_keys: raise DatasetNotFoundError( f"'load_versions' keys [{', '.join(sorted(missing_keys))}] " @@ -262,33 +218,7 @@ def _validate_missing_keys(self) -> None: ) def load(self, name: str, version: str | None = None) -> Any: - """Loads a registered data set. - - Args: - name: A data set to be loaded. - version: Optional argument for concrete data version to be loaded. - Works only with versioned datasets. - - Returns: - The loaded data as configured. - - Raises: - DatasetNotFoundError: When a data set with the given name - has not yet been registered. - - Example: - :: - - >>> from kedro.io import DataCatalog - >>> from kedro_datasets.pandas import CSVDataset - >>> - >>> cars = CSVDataset(filepath="cars.csv", - >>> load_args=None, - >>> save_args={"index": False}) - >>> catalog = DataCatalog(datasets={'cars': cars}) - >>> - >>> df = catalog.load("cars") - """ + """Loads a registered data set.""" load_version = Version(version, None) if version else None dataset = self.get_dataset(name, version=load_version) @@ -299,15 +229,13 @@ def load(self, name: str, version: str | None = None) -> Any: extra={"markup": True}, ) - result = dataset.load() - - return result + return dataset.load() def add_from_dict(self, datasets: dict[str, Any], replace: bool = False) -> None: - for ds_name in datasets: - if isinstance(datasets[ds_name], AbstractDataset): - dataset = datasets[ds_name] - else: - dataset = MemoryDataset(data=datasets[ds_name]) # type: ignore[abstract] - + for ds_name, ds_data in datasets.items(): + dataset = ( + ds_data + if isinstance(ds_data, AbstractDataset) + else MemoryDataset(data=ds_data) + ) # type: ignore[abstract] self.add(ds_name, dataset, replace) From 409229164580989fef619c56ffbb9c7c9b803310 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 29 Aug 2024 14:35:13 +0100 Subject: [PATCH 28/77] Cleaned up DataCatalogConfigResolver Signed-off-by: Elena Khaustova --- kedro/io/catalog_config_resolver.py | 110 ++++++++++++++-------------- 1 file changed, 56 insertions(+), 54 deletions(-) diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py index 2238165037..921141fb61 100644 --- a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -10,8 +10,8 @@ CREDENTIALS_KEY = "credentials" -def _get_credentials(credentials_name: str, credentials: dict[str, Any]) -> Any: - """Return a set of credentials from the provided credentials dict. 
+def _fetch_credentials(credentials_name: str, credentials: dict[str, Any]) -> Any: + """Fetch the specified credentials from the provided credentials dictionary. Args: credentials_name: Credentials name. @@ -51,56 +51,57 @@ def _resolve_credentials( """ config = copy.deepcopy(config) - def _map_value(key: str, value: Any) -> Any: + def _resolve_value(key: str, value: Any) -> Any: if key == CREDENTIALS_KEY and isinstance(value, str): - return _get_credentials(value, credentials) + return _fetch_credentials(value, credentials) if isinstance(value, dict): - return {k: _map_value(k, v) for k, v in value.items()} + return {k: _resolve_value(k, v) for k, v in value.items()} return value - return {k: _map_value(k, v) for k, v in config.items()} + return {k: _resolve_value(k, v) for k, v in config.items()} -def _resolve_config( +def _resolve_dataset_config( ds_name: str, - matched_pattern: str, + pattern: str, config: dict, ) -> dict[str, Any]: - """Get resolved AbstractDataset from a factory config""" - result = parse(matched_pattern, ds_name) + """Resolve dataset configuration based on the provided pattern.""" + resolved_vars = parse(pattern, ds_name) # Resolve the factory config for the dataset if isinstance(config, dict): for key, value in config.items(): - config[key] = _resolve_config(ds_name, matched_pattern, value) + config[key] = _resolve_dataset_config(ds_name, pattern, value) elif isinstance(config, (list, tuple)): - config = [_resolve_config(ds_name, matched_pattern, value) for value in config] + config = [_resolve_dataset_config(ds_name, pattern, value) for value in config] elif isinstance(config, str) and "}" in config: try: - config = str(config).format_map(result.named) + config = config.format_map(resolved_vars.named) except KeyError as exc: raise KeyError( - f"Unable to resolve '{config}' from the pattern '{matched_pattern}'. Keys used in the configuration " + f"Unable to resolve '{config}' from the pattern '{pattern}'. Keys used in the configuration " f"should be present in the dataset factory pattern." 
) from exc
     return config


 class DataCatalogConfigResolver:
+    """Resolves dataset configurations based on patterns and credentials."""
+
     def __init__(
         self,
         config: dict[str, dict[str, Any]],
         credentials: dict[str, dict[str, Any]] | None = None,
     ):
-        self._runtime_patterns = {}
-        self._dataset_patterns, self._default_pattern = self._get_patterns(
+        self._runtime_patterns: Patterns = {}
+        self._dataset_patterns, self._default_pattern = self._extract_patterns(
             config, credentials
         )
-
-        self._ds_configs = self._get_ds_configs(config, credentials)
+        self._resolved_configs = self._init_configs(config, credentials)

     @property
-    def config(self):
-        return copy.deepcopy(self._ds_configs)
+    def config(self) -> dict[str, dict[str, Any]]:
+        return copy.deepcopy(self._resolved_configs)

     @property
     def _logger(self) -> logging.Logger:
@@ -112,18 +113,17 @@ def _is_pattern(pattern: str) -> bool:
         return "{" in pattern

     @staticmethod
-    def _specificity(pattern: str) -> int:
-        """Helper function to check the length of exactly matched characters not inside brackets."""
+    def _pattern_specificity(pattern: str) -> int:
+        """Calculate the specificity of a pattern based on characters outside curly brackets."""
         # Remove all the placeholders from the pattern and count the number of remaining chars
         result = re.sub(r"\{.*?\}", "", pattern)
         return len(result)

     @classmethod
-    def _sort_patterns(cls, dataset_patterns: Patterns) -> dict[str, dict[str, Any]]:
+    def _sort_patterns(cls, dataset_patterns: Patterns) -> Patterns:
         """Sort a dictionary of dataset patterns according to parsing rules.

         In order:
-
         1. Decreasing specificity (number of characters outside the curly brackets)
         2. Decreasing number of placeholders (number of curly bracket pairs)
         3. Alphabetically
@@ -131,13 +131,13 @@ def _sort_patterns(cls, dataset_patterns: Patterns) -> dict[str, dict[str, Any]]
         sorted_keys = sorted(
             dataset_patterns,
             key=lambda pattern: (
-                -(cls._specificity(pattern)),
+                -(cls._pattern_specificity(pattern)),
                 -pattern.count("{"),
                 pattern,
             ),
         )
         catch_all = [
-            pattern for pattern in sorted_keys if cls._specificity(pattern) == 0
+            pattern for pattern in sorted_keys if cls._pattern_specificity(pattern) == 0
         ]
         if len(catch_all) > 1:
             raise ValueError(
@@ -146,10 +146,12 @@
         return {key: dataset_patterns[key] for key in sorted_keys}

     def list_patterns(self) -> list[str]:
-        all_patterns = list(self._dataset_patterns.keys())
-        all_patterns.extend(list(self._default_pattern.keys()))
-        all_patterns.extend(list(self._runtime_patterns.keys()))
-        return all_patterns
+        """List all patterns available in the catalog."""
+        return (
+            list(self._dataset_patterns.keys())
+            + list(self._default_pattern.keys())
+            + list(self._runtime_patterns.keys())
+        )

     def match_pattern(self, ds_name: str) -> str | None:
         """Match a dataset name against patterns in a dictionary."""
@@ -158,57 +160,57 @@ def match_pattern(self, ds_name: str) -> str | None:
         return next(matches, None)

     @classmethod
-    def _get_patterns(
+    def _extract_patterns(
         cls,
         config: dict[str, dict[str, Any]] | None,
         credentials: dict[str, dict[str, Any]] | None,
     ) -> tuple[Patterns, Patterns]:
-        dataset_patterns = {}
+        """Extract and sort patterns from the configuration."""
         config = copy.deepcopy(config) or {}
         credentials = copy.deepcopy(credentials) or {}
+        dataset_patterns = {}
         user_default = {}

         for ds_name, ds_config in config.items():
             if cls._is_pattern(ds_name):
-                resolved_ds_config = _resolve_credentials(ds_config,
credentials) - dataset_patterns[ds_name] = resolved_ds_config + resolved_config = _resolve_credentials(ds_config, credentials) + dataset_patterns[ds_name] = resolved_config sorted_patterns = cls._sort_patterns(dataset_patterns) if sorted_patterns: # If the last pattern is a catch-all pattern, pop it and set it as the default - if cls._specificity(list(sorted_patterns.keys())[-1]) == 0: + if cls._pattern_specificity(list(sorted_patterns.keys())[-1]) == 0: last_pattern = sorted_patterns.popitem() user_default = {last_pattern[0]: last_pattern[1]} return sorted_patterns, user_default - def _get_ds_configs( + def _init_configs( self, config: dict[str, dict[str, Any]], credentials: dict[str, dict[str, Any]] | None, ) -> dict[str, dict[str, Any]]: + """Initialize the dataset configuration with resolved credentials.""" config = copy.deepcopy(config) or {} credentials = copy.deepcopy(credentials) or {} - ds_configs = {} + resolved_configs = {} + for ds_name, ds_config in config.items(): if not self._is_pattern(ds_name): - ds_configs[ds_name] = _resolve_credentials(ds_config, credentials) + resolved_configs[ds_name] = _resolve_credentials(ds_config, credentials) - return ds_configs + return resolved_configs - def resolve_patterns( + def resolve_dataset_patterns( self, datasets: str | list[str] ) -> dict[str, Any] | list[dict[str, Any]]: - if isinstance(datasets, str): - datasets_lst = [datasets] - else: - datasets_lst = datasets - + """Resolve dataset patterns and return resolved configurations based on the existing patterns.""" + datasets_lst = [datasets] if isinstance(datasets, str) else datasets resolved_configs = [] for ds_name in datasets_lst: matched_pattern = self.match_pattern(ds_name) - if matched_pattern and ds_name not in self._ds_configs: + if matched_pattern and ds_name not in self._resolved_configs: # If the dataset is a patterned dataset, materialise it and add it to # the catalog config_copy = copy.deepcopy( @@ -217,10 +219,12 @@ def resolve_patterns( or self._runtime_patterns.get(matched_pattern) or {} ) - ds_config = _resolve_config(ds_name, matched_pattern, config_copy) + ds_config = _resolve_dataset_config( + ds_name, matched_pattern, config_copy + ) if ( - self._specificity(matched_pattern) == 0 + self._pattern_specificity(matched_pattern) == 0 and matched_pattern in self._default_pattern ): self._logger.warning( @@ -230,16 +234,14 @@ def resolve_patterns( ds_name, ) resolved_configs.append(ds_config) - elif ds_name in self._ds_configs: - resolved_configs.append(self._ds_configs.get(ds_name)) + elif ds_name in self._resolved_configs: + resolved_configs.append(self._resolved_configs.get(ds_name)) else: resolved_configs.append(None) - if isinstance(datasets, str): - return resolved_configs[0] - else: - return resolved_configs + return resolved_configs[0] if isinstance(datasets, str) else resolved_configs def add_runtime_patterns(self, dataset_patterns: Patterns) -> None: + """Add new runtime patterns and re-sort them.""" self._runtime_patterns = {**self._runtime_patterns, **dataset_patterns} self._runtime_patterns = self._sort_patterns(self._runtime_patterns) From 63e47f98ecddea0305a02c823c317d9300a388a8 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 30 Aug 2024 12:23:10 +0100 Subject: [PATCH 29/77] Docs build fix attempt Signed-off-by: Elena Khaustova --- kedro/io/catalog_config_resolver.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py index 921141fb61..1c7a39cfa9 100644 --- 
a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import copy import logging import re From 68f6527e4c2dc97a0feb48e4a76be21926291b27 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 5 Sep 2024 12:16:06 +0100 Subject: [PATCH 30/77] Removed KedroDataCatalog Signed-off-by: Elena Khaustova --- kedro/io/__init__.py | 2 - kedro/io/data_catalog_redesign.py | 241 ------------------------------ 2 files changed, 243 deletions(-) delete mode 100644 kedro/io/data_catalog_redesign.py diff --git a/kedro/io/__init__.py b/kedro/io/__init__.py index db3c295449..5d17d6f058 100644 --- a/kedro/io/__init__.py +++ b/kedro/io/__init__.py @@ -15,7 +15,6 @@ Version, ) from .data_catalog import DataCatalog -from .data_catalog_redesign import KedroDataCatalog from .lambda_dataset import LambdaDataset from .memory_dataset import MemoryDataset from .shared_memory_dataset import SharedMemoryDataset @@ -29,7 +28,6 @@ "DatasetAlreadyExistsError", "DatasetError", "DatasetNotFoundError", - "KedroDataCatalog", "LambdaDataset", "MemoryDataset", "SharedMemoryDataset", diff --git a/kedro/io/data_catalog_redesign.py b/kedro/io/data_catalog_redesign.py deleted file mode 100644 index 9618198291..0000000000 --- a/kedro/io/data_catalog_redesign.py +++ /dev/null @@ -1,241 +0,0 @@ -from __future__ import annotations - -import copy -import difflib -import logging -import re -from typing import Any - -from kedro.io.core import ( - AbstractDataset, - AbstractVersionedDataset, - DatasetAlreadyExistsError, - DatasetError, - DatasetNotFoundError, - Version, -) -from kedro.io.memory_dataset import MemoryDataset -from kedro.utils import _format_rich, _has_rich_handler - -CREDENTIALS_KEY = "credentials" - - -def validate_dataset_config(ds_name: str, ds_config: Any) -> None: - if not isinstance(ds_config, dict): - raise DatasetError( - f"Catalog entry '{ds_name}' is not a valid dataset configuration. " - "\nHint: If this catalog entry is intended for variable interpolation, " - "make sure that the key is preceded by an underscore." - ) - - -class KedroDataCatalog: - def __init__( - self, - datasets: dict[str, Any] | None = None, - config: dict[str, dict[str, Any]] | None = None, - load_versions: dict[str, str] | None = None, - save_version: str | None = None, - ) -> None: - self._config = config or {} - self._datasets = datasets or {} - self._load_versions = load_versions or {} - self._save_version = save_version - self._use_rich_markup = _has_rich_handler() - - for ds_name in self._datasets: - # TODO: API to get configuration from dataset - self._config[ds_name] = {} - - for ds_name, ds_config in self._config.items(): - self.init_dataset(ds_name, ds_config) - - self._validate_missing_keys() - - @property - def datasets(self) -> dict[str, Any]: - return copy.deepcopy(self._datasets) - - @datasets.setter - def datasets(self, value: Any): - raise AttributeError( - "Operation not allowed! Please change datasets through configuration." - ) - - @property - def config(self): - return copy.deepcopy(self._config) - - @config.setter - def config(self, value: Any) -> dict[str, dict[str, Any]]: - raise AttributeError( - "Operation not allowed! Please change datasets through configuration." 
- ) - - def __iter__(self): - yield from self._datasets.values() - - def __getitem__(self, ds_name: str) -> AbstractDataset: - return self.get_dataset(ds_name) - - def __contains__(self, ds_name: str) -> bool: - """Check if an item is in the catalog""" - return ds_name in self._datasets - - def _ipython_key_completions_(self) -> list[str]: - return list(self._datasets.keys()) - - def init_dataset(self, ds_name: str, ds_config: dict[str, Any]) -> None: - # Add lazy loading feature to store the configuration but not to init actual dataset - # Initialise actual dataset when load or save - # Add is_init property - validate_dataset_config(ds_name, ds_config) - if ds_name in self._datasets: - raise DatasetAlreadyExistsError( - f"Dataset '{ds_name}' has already been registered" - ) - self._config[ds_name] = ds_config - self._datasets[ds_name] = AbstractDataset.from_config( - ds_name, - ds_config, - self._load_versions.get(ds_name), - self._save_version, - ) - - def get_dataset( - self, ds_name: str, suggest: bool = True, version: Version | None = None - ) -> AbstractDataset: - dataset = self._datasets.get(ds_name, None) - - if dataset is None: - error_msg = f"Dataset '{ds_name}' not found in the catalog" - # Flag to turn on/off fuzzy-matching which can be time consuming and - # slow down plugins like `kedro-viz` - if suggest: - matches = difflib.get_close_matches(ds_name, self._datasets.keys()) - if matches: - suggestions = ", ".join(matches) - error_msg += f" - did you mean one of these instead: {suggestions}" - raise DatasetNotFoundError(error_msg) - - if version and isinstance(dataset, AbstractVersionedDataset): - # we only want to return a similar-looking dataset, - # not modify the one stored in the current catalog - dataset = dataset._copy(_version=version) - - return dataset - - def add( - self, ds_name: str, dataset: AbstractDataset, replace: bool = False - ) -> None: - """Adds a new ``AbstractDataset`` object to the ``KedroDataCatalog``.""" - if ds_name in self._datasets: - if replace: - self._logger.warning("Replacing dataset '%s'", ds_name) - else: - raise DatasetAlreadyExistsError( - f"Dataset '{ds_name}' has already been registered" - ) - self._datasets[ds_name] = dataset - # TODO: API to get configuration from dataset - self._config[ds_name] = {} - - @property - def _logger(self) -> logging.Logger: - return logging.getLogger(__name__) - - def list(self, regex_search: str | None = None) -> list[str]: - """ - List of all dataset names registered in the catalog. - This can be filtered by providing an optional regular expression - which will only return matching keys. - """ - - if regex_search is None: - return list(self._datasets.keys()) - - if not regex_search.strip(): - self._logger.warning("The empty string will not match any data sets") - return [] - - try: - pattern = re.compile(regex_search, flags=re.IGNORECASE) - except re.error as exc: - raise SyntaxError( - f"Invalid regular expression provided: '{regex_search}'" - ) from exc - return [ds_name for ds_name in self._datasets if pattern.search(ds_name)] - - def save(self, name: str, data: Any) -> None: - """Save data to a registered data set.""" - dataset = self.get_dataset(name) - - self._logger.info( - "Saving data to %s (%s)...", - _format_rich(name, "dark_orange") if self._use_rich_markup else name, - type(dataset).__name__, - extra={"markup": True}, - ) - - dataset.save(data) - - def release(self, name: str) -> None: - """Release any cached data associated with a data set - - Args: - name: A data set to be checked. 
- - Raises: - DatasetNotFoundError: When a data set with the given name - has not yet been registered. - """ - dataset = self.get_dataset(name) - dataset.release() - - def confirm(self, name: str) -> None: - """Confirm a dataset by its name. - - Args: - name: Name of the dataset. - Raises: - DatasetError: When the dataset does not have `confirm` method. - - """ - self._logger.info("Confirming dataset '%s'", name) - dataset = self.get_dataset(name) - - if hasattr(dataset, "confirm"): - dataset.confirm() - else: - raise DatasetError(f"Dataset '{name}' does not have 'confirm' method") - - def _validate_missing_keys(self) -> None: - missing_keys = [key for key in self._load_versions if key not in self._config] - if missing_keys: - raise DatasetNotFoundError( - f"'load_versions' keys [{', '.join(sorted(missing_keys))}] " - f"are not found in the catalog." - ) - - def load(self, name: str, version: str | None = None) -> Any: - """Loads a registered data set.""" - load_version = Version(version, None) if version else None - dataset = self.get_dataset(name, version=load_version) - - self._logger.info( - "Loading data from %s (%s)...", - _format_rich(name, "dark_orange") if self._use_rich_markup else name, - type(dataset).__name__, - extra={"markup": True}, - ) - - return dataset.load() - - def add_from_dict(self, datasets: dict[str, Any], replace: bool = False) -> None: - for ds_name, ds_data in datasets.items(): - dataset = ( - ds_data - if isinstance(ds_data, AbstractDataset) - else MemoryDataset(data=ds_data) - ) # type: ignore[abstract] - self.add(ds_name, dataset, replace) From 2ac4a2f211ec52c77430bb770d22fa43c345b9c9 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 5 Sep 2024 15:21:48 +0100 Subject: [PATCH 31/77] Updated from_config method Signed-off-by: Elena Khaustova --- kedro/io/catalog_config_resolver.py | 16 +++++-- kedro/io/data_catalog.py | 69 +++++++++++++---------------- 2 files changed, 43 insertions(+), 42 deletions(-) diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py index 1c7a39cfa9..aaa102eddd 100644 --- a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -92,7 +92,7 @@ class DataCatalogConfigResolver: def __init__( self, - config: dict[str, dict[str, Any]], + config: dict[str, dict[str, Any]] | None = None, credentials: dict[str, dict[str, Any]] | None = None, ): self._runtime_patterns: Patterns = {} @@ -103,14 +103,22 @@ def __init__( @property def config(self) -> dict[str, dict[str, Any]]: - return copy.deepcopy(self._resolved_configs) + return self._resolved_configs + + @property + def dataset_patterns(self) -> Patterns: + return self._dataset_patterns + + @property + def default_pattern(self) -> Patterns: + return self._default_pattern @property def _logger(self) -> logging.Logger: return logging.getLogger(__name__) @staticmethod - def _is_pattern(pattern: str) -> bool: + def is_pattern(pattern: str) -> bool: """Check if a given string is a pattern. 
Assume that any name with '{' is a pattern.""" return "{" in pattern @@ -189,7 +197,7 @@ def _extract_patterns( def _init_configs( self, - config: dict[str, dict[str, Any]], + config: dict[str, dict[str, Any]] | None, credentials: dict[str, dict[str, Any]] | None, ) -> dict[str, dict[str, Any]]: """Initialize the dataset configuration with resolved credentials.""" diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index d3fd163230..6be3d2905e 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -16,6 +16,7 @@ from parse import parse +from kedro.io import DataCatalogConfigResolver from kedro.io.core import ( AbstractDataset, AbstractVersionedDataset, @@ -162,9 +163,10 @@ def __init__( # noqa: PLR0913 datasets: dict[str, AbstractDataset] | None = None, feed_dict: dict[str, Any] | None = None, dataset_patterns: Patterns | None = None, + default_pattern: Patterns | None = None, load_versions: dict[str, str] | None = None, save_version: str | None = None, - default_pattern: Patterns | None = None, + config_resolver: DataCatalogConfigResolver = None, ) -> None: """``DataCatalog`` stores instances of ``AbstractDataset`` implementations to provide ``load`` and ``save`` capabilities from @@ -195,6 +197,8 @@ def __init__( # noqa: PLR0913 sorted in lexicographical order. default_pattern: A dictionary of the default catch-all pattern that overrides the default pattern provided through the runners. + config_resolver: + Example: :: @@ -206,14 +210,12 @@ def __init__( # noqa: PLR0913 >>> save_args={"index": False}) >>> catalog = DataCatalog(datasets={'cars': cars}) """ + self._config_resolver = config_resolver or DataCatalogConfigResolver() self._datasets = dict(datasets or {}) + self._datasets_config = self._config_resolver.config self.datasets = _FrozenDatasets(self._datasets) - # Keep a record of all patterns in the catalog. - # {dataset pattern name : dataset pattern body} - self._dataset_patterns = dataset_patterns or {} self._load_versions = load_versions or {} self._save_version = save_version - self._default_pattern = default_pattern or {} self._use_rich_markup = _has_rich_handler() if feed_dict: @@ -304,43 +306,26 @@ class to be loaded is specified with the key ``type`` and their >>> catalog.save("boats", df) """ datasets = {} - dataset_patterns = {} - catalog = copy.deepcopy(catalog) or {} - credentials = copy.deepcopy(credentials) or {} + config_resolver = DataCatalogConfigResolver(catalog, credentials) save_version = save_version or generate_timestamp() load_versions = copy.deepcopy(load_versions) or {} - user_default = {} - - for ds_name, ds_config in catalog.items(): - if not isinstance(ds_config, dict): - raise DatasetError( - f"Catalog entry '{ds_name}' is not a valid dataset configuration. " - "\nHint: If this catalog entry is intended for variable interpolation, " - "make sure that the key is preceded by an underscore." - ) - ds_config = _resolve_credentials( # noqa: PLW2901 - ds_config, credentials - ) - if cls._is_pattern(ds_name): - # Add each factory to the dataset_patterns dict. 
- dataset_patterns[ds_name] = ds_config - - else: + for ds_name in catalog: + if not config_resolver.is_pattern(ds_name): datasets[ds_name] = AbstractDataset.from_config( - ds_name, ds_config, load_versions.get(ds_name), save_version + ds_name, + config_resolver.config[ds_name], + load_versions.get(ds_name), + save_version, ) - sorted_patterns = cls._sort_patterns(dataset_patterns) - if sorted_patterns: - # If the last pattern is a catch-all pattern, pop it and set it as the default - if cls._specificity(list(sorted_patterns.keys())[-1]) == 0: - last_pattern = sorted_patterns.popitem() - user_default = {last_pattern[0]: last_pattern[1]} missing_keys = [ - key - for key in load_versions.keys() - if not (key in catalog or cls._match_pattern(sorted_patterns, key)) + ds_name + for ds_name in load_versions + if not ( + ds_name in config_resolver.config + or config_resolver.match_pattern(ds_name) + ) ] if missing_keys: raise DatasetNotFoundError( @@ -350,10 +335,11 @@ class to be loaded is specified with the key ``type`` and their return cls( datasets=datasets, - dataset_patterns=sorted_patterns, + dataset_patterns=config_resolver.dataset_patterns, + default_pattern=config_resolver.default_pattern, load_versions=load_versions, save_version=save_version, - default_pattern=user_default, + config_resolver=config_resolver, ) @staticmethod @@ -619,7 +605,11 @@ def release(self, name: str) -> None: dataset.release() def add( - self, dataset_name: str, dataset: AbstractDataset, replace: bool = False + self, + dataset_name: str, + dataset: AbstractDataset, + dataset_config: dict[str, Any] | None = None, + replace: bool = False, ) -> None: """Adds a new ``AbstractDataset`` object to the ``DataCatalog``. @@ -628,6 +618,7 @@ def add( registered yet. dataset: A data set object to be associated with the given data set name. + dataset_config: A dictionary with dataset configuration. replace: Specifies whether to replace an existing dataset with the same name is allowed. @@ -654,6 +645,8 @@ def add( f"Dataset '{dataset_name}' has already been registered" ) self._datasets[dataset_name] = dataset + if dataset_config is not None: + self._datasets_config[dataset_name] = dataset_config self.datasets = _FrozenDatasets(self.datasets, {dataset_name: dataset}) def add_all( From cb5879d5067e6a9020ae0b3057a550ab5784d430 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 5 Sep 2024 15:48:59 +0100 Subject: [PATCH 32/77] Updated constructor and add methods Signed-off-by: Elena Khaustova --- kedro/io/data_catalog.py | 44 ++++++++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index 6be3d2905e..a78a7f2177 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -197,7 +197,7 @@ def __init__( # noqa: PLR0913 sorted in lexicographical order. default_pattern: A dictionary of the default catch-all pattern that overrides the default pattern provided through the runners. - config_resolver: + config_resolver: An instance of DataCatalogConfigResolver to resolve dataset patterns and configurations. 
Example: @@ -211,11 +211,15 @@ def __init__( # noqa: PLR0913 >>> catalog = DataCatalog(datasets={'cars': cars}) """ self._config_resolver = config_resolver or DataCatalogConfigResolver() - self._datasets = dict(datasets or {}) self._datasets_config = self._config_resolver.config - self.datasets = _FrozenDatasets(self._datasets) + self._datasets = {} + self.datasets = {} + + self.add_all(dict(datasets) or {}, datasets_configs=self._datasets_config) + self._load_versions = load_versions or {} self._save_version = save_version + self._use_rich_markup = _has_rich_handler() if feed_dict: @@ -608,8 +612,8 @@ def add( self, dataset_name: str, dataset: AbstractDataset, - dataset_config: dict[str, Any] | None = None, replace: bool = False, + dataset_config: dict[str, Any] | None = None, ) -> None: """Adds a new ``AbstractDataset`` object to the ``DataCatalog``. @@ -618,9 +622,10 @@ def add( registered yet. dataset: A data set object to be associated with the given data set name. - dataset_config: A dictionary with dataset configuration. replace: Specifies whether to replace an existing dataset with the same name is allowed. + dataset_config: A dictionary with dataset configuration. + Raises: DatasetAlreadyExistsError: When a data set with the same name @@ -645,12 +650,16 @@ def add( f"Dataset '{dataset_name}' has already been registered" ) self._datasets[dataset_name] = dataset - if dataset_config is not None: - self._datasets_config[dataset_name] = dataset_config + self._datasets_config[dataset_name] = ( + dataset_config if dataset_config is not None else {} + ) self.datasets = _FrozenDatasets(self.datasets, {dataset_name: dataset}) def add_all( - self, datasets: dict[str, AbstractDataset], replace: bool = False + self, + datasets: dict[str, AbstractDataset], + replace: bool = False, + datasets_configs: dict[str, dict[str, Any]] | None = None, ) -> None: """Adds a group of new data sets to the ``DataCatalog``. @@ -659,6 +668,7 @@ def add_all( instances. replace: Specifies whether to replace an existing dataset with the same name is allowed. + datasets_configs: A dictionary of dataset configurations. Raises: DatasetAlreadyExistsError: When a data set with the same name @@ -681,8 +691,8 @@ def add_all( >>> >>> assert catalog.list() == ["cars", "planes", "boats"] """ - for name, dataset in datasets.items(): - self.add(name, dataset, replace) + for ds_name, ds in datasets.items(): + self.add(ds_name, ds, replace, datasets_configs.get(ds_name, {})) def add_feed_dict(self, feed_dict: dict[str, Any], replace: bool = False) -> None: """Add datasets to the ``DataCatalog`` using the data provided through the `feed_dict`. 
@@ -719,13 +729,13 @@ def add_feed_dict(self, feed_dict: dict[str, Any], replace: bool = False) -> Non >>> >>> assert catalog.load("data_csv_dataset").equals(df) """ - for dataset_name in feed_dict: - if isinstance(feed_dict[dataset_name], AbstractDataset): - dataset = feed_dict[dataset_name] - else: - dataset = MemoryDataset(data=feed_dict[dataset_name]) # type: ignore[abstract] - - self.add(dataset_name, dataset, replace) + for ds_name, ds_data in feed_dict.items(): + dataset = ( + ds_data + if isinstance(ds_data, AbstractDataset) + else MemoryDataset(data=ds_data) + ) # type: ignore[abstract] + self.add(ds_name, dataset, replace) def list(self, regex_search: str | None = None) -> list[str]: """ From 9038e963281333ebe10316d297bce0ef723388fd Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 5 Sep 2024 16:14:13 +0100 Subject: [PATCH 33/77] Updated _get_dataset method Signed-off-by: Elena Khaustova --- kedro/io/catalog_config_resolver.py | 6 +- kedro/io/data_catalog.py | 174 ++-------------------------- 2 files changed, 11 insertions(+), 169 deletions(-) diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py index aaa102eddd..846ef6277f 100644 --- a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -182,7 +182,7 @@ def _extract_patterns( user_default = {} for ds_name, ds_config in config.items(): - if cls._is_pattern(ds_name): + if cls.is_pattern(ds_name): resolved_config = _resolve_credentials(ds_config, credentials) dataset_patterns[ds_name] = resolved_config @@ -206,12 +206,12 @@ def _init_configs( resolved_configs = {} for ds_name, ds_config in config.items(): - if not self._is_pattern(ds_name): + if not self.is_pattern(ds_name): resolved_configs[ds_name] = _resolve_credentials(ds_config, credentials) return resolved_configs - def resolve_dataset_patterns( + def resolve_dataset_pattern( self, datasets: str | list[str] ) -> dict[str, Any] | list[dict[str, Any]]: """Resolve dataset patterns and return resolved configurations based on the existing patterns.""" diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index a78a7f2177..56e046da68 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -14,8 +14,6 @@ import re from typing import Any, Dict -from parse import parse - from kedro.io import DataCatalogConfigResolver from kedro.io.core import ( AbstractDataset, @@ -36,57 +34,6 @@ WORDS_REGEX_PATTERN = re.compile(r"\W+") -def _get_credentials(credentials_name: str, credentials: dict[str, Any]) -> Any: - """Return a set of credentials from the provided credentials dict. - - Args: - credentials_name: Credentials name. - credentials: A dictionary with all credentials. - - Returns: - The set of requested credentials. - - Raises: - KeyError: When a data set with the given name has not yet been - registered. - - """ - try: - return credentials[credentials_name] - except KeyError as exc: - raise KeyError( - f"Unable to find credentials '{credentials_name}': check your data " - "catalog and credentials configuration. See " - "https://docs.kedro.org/en/stable/api/kedro.io.DataCatalog.html " - "for an example." - ) from exc - - -def _resolve_credentials( - config: dict[str, Any], credentials: dict[str, Any] -) -> dict[str, Any]: - """Return the dataset configuration where credentials are resolved using - credentials dictionary provided. - - Args: - config: Original dataset config, which may contain unresolved credentials. - credentials: A dictionary with all credentials. 
- - Returns: - The dataset config, where all the credentials are successfully resolved. - """ - config = copy.deepcopy(config) - - def _map_value(key: str, value: Any) -> Any: - if key == CREDENTIALS_KEY and isinstance(value, str): - return _get_credentials(value, credentials) - if isinstance(value, dict): - return {k: _map_value(k, v) for k, v in value.items()} - return value - - return {k: _map_value(k, v) for k, v in config.items()} - - def _sub_nonword_chars(dataset_name: str) -> str: """Replace non-word characters in data set names since Kedro 0.16.2. @@ -346,101 +293,22 @@ class to be loaded is specified with the key ``type`` and their config_resolver=config_resolver, ) - @staticmethod - def _is_pattern(pattern: str) -> bool: - """Check if a given string is a pattern. Assume that any name with '{' is a pattern.""" - return "{" in pattern - - @staticmethod - def _match_pattern(dataset_patterns: Patterns, dataset_name: str) -> str | None: - """Match a dataset name against patterns in a dictionary.""" - matches = ( - pattern - for pattern in dataset_patterns.keys() - if parse(pattern, dataset_name) - ) - return next(matches, None) - - @classmethod - def _sort_patterns(cls, dataset_patterns: Patterns) -> dict[str, dict[str, Any]]: - """Sort a dictionary of dataset patterns according to parsing rules. - - In order: - - 1. Decreasing specificity (number of characters outside the curly brackets) - 2. Decreasing number of placeholders (number of curly bracket pairs) - 3. Alphabetically - """ - sorted_keys = sorted( - dataset_patterns, - key=lambda pattern: ( - -(cls._specificity(pattern)), - -pattern.count("{"), - pattern, - ), - ) - catch_all = [ - pattern for pattern in sorted_keys if cls._specificity(pattern) == 0 - ] - if len(catch_all) > 1: - raise DatasetError( - f"Multiple catch-all patterns found in the catalog: {', '.join(catch_all)}. Only one catch-all pattern is allowed, remove the extras." - ) - return {key: dataset_patterns[key] for key in sorted_keys} - - @staticmethod - def _specificity(pattern: str) -> int: - """Helper function to check the length of exactly matched characters not inside brackets. 
- - Example: - :: - - >>> specificity("{namespace}.companies") = 10 - >>> specificity("{namespace}.{dataset}") = 1 - >>> specificity("france.companies") = 16 - """ - # Remove all the placeholders from the pattern and count the number of remaining chars - result = re.sub(r"\{.*?\}", "", pattern) - return len(result) - def _get_dataset( self, dataset_name: str, version: Version | None = None, suggest: bool = True, ) -> AbstractDataset: - matched_pattern = self._match_pattern( - self._dataset_patterns, dataset_name - ) or self._match_pattern(self._default_pattern, dataset_name) - if dataset_name not in self._datasets and matched_pattern: - # If the dataset is a patterned dataset, materialise it and add it to - # the catalog - config_copy = copy.deepcopy( - self._dataset_patterns.get(matched_pattern) - or self._default_pattern.get(matched_pattern) - or {} - ) - dataset_config = self._resolve_config( - dataset_name, matched_pattern, config_copy - ) - dataset = AbstractDataset.from_config( + ds_config = self._config_resolver.resolve_dataset_pattern(dataset_name) + + if ds_config is not None: + ds = AbstractDataset.from_config( dataset_name, - dataset_config, + ds_config, self._load_versions.get(dataset_name), self._save_version, ) - if ( - self._specificity(matched_pattern) == 0 - and matched_pattern in self._default_pattern - ): - self._logger.warning( - "Config from the dataset factory pattern '%s' in the catalog will be used to " - "override the default dataset creation for '%s'", - matched_pattern, - dataset_name, - ) - - self.add(dataset_name, dataset) + self.add(dataset_name, ds, dataset_config=ds_config) if dataset_name not in self._datasets: error_msg = f"Dataset '{dataset_name}' not found in the catalog" @@ -452,7 +320,9 @@ def _get_dataset( suggestions = ", ".join(matches) error_msg += f" - did you mean one of these instead: {suggestions}" raise DatasetNotFoundError(error_msg) + dataset = self._datasets[dataset_name] + if version and isinstance(dataset, AbstractVersionedDataset): # we only want to return a similar-looking dataset, # not modify the one stored in the current catalog @@ -467,34 +337,6 @@ def __contains__(self, dataset_name: str) -> bool: return True return False - @classmethod - def _resolve_config( - cls, - dataset_name: str, - matched_pattern: str, - config: dict, - ) -> dict[str, Any]: - """Get resolved AbstractDataset from a factory config""" - result = parse(matched_pattern, dataset_name) - # Resolve the factory config for the dataset - if isinstance(config, dict): - for key, value in config.items(): - config[key] = cls._resolve_config(dataset_name, matched_pattern, value) - elif isinstance(config, (list, tuple)): - config = [ - cls._resolve_config(dataset_name, matched_pattern, value) - for value in config - ] - elif isinstance(config, str) and "}" in config: - try: - config = str(config).format_map(result.named) - except KeyError as exc: - raise DatasetError( - f"Unable to resolve '{config}' from the pattern '{matched_pattern}'. Keys used in the configuration " - f"should be present in the dataset factory pattern." - ) from exc - return config - def load(self, name: str, version: str | None = None) -> Any: """Loads a registered data set. 
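To make the flow that patches 28-33 converge on concrete, here is a minimal, self-contained sketch of the dataset-factory resolution now centralised in DataCatalogConfigResolver. The catalog entry and the two helper functions are invented for illustration and are deliberately simpler than the real match_pattern/resolve_dataset_pattern implementation:

    from parse import parse

    # An illustrative dataset factory pattern, as it could appear in catalog.yml.
    catalog_config = {
        "{namespace}.companies": {
            "type": "pandas.CSVDataset",
            "filepath": "data/{namespace}/companies.csv",
        },
    }

    def match_pattern(patterns: dict, ds_name: str) -> str | None:
        # Return the first pattern the dataset name parses against, if any.
        return next((p for p in patterns if parse(p, ds_name)), None)

    def resolve(ds_name: str, pattern: str, config: dict) -> dict:
        # Substitute placeholders captured from the dataset name into the config.
        named = parse(pattern, ds_name).named
        return {
            key: value.format_map(named) if isinstance(value, str) else value
            for key, value in config.items()
        }

    pattern = match_pattern(catalog_config, "france.companies")
    print(resolve("france.companies", pattern, catalog_config[pattern]))
    # {'type': 'pandas.CSVDataset', 'filepath': 'data/france/companies.csv'}

Because patterns are sorted by decreasing specificity, an exact "france.companies" entry would win over "{namespace}.companies", and a catch-all pattern such as "{default}" is only ever consulted last.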
From cc89565db750273b44be00e2d1b796859c017eaf Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 5 Sep 2024 16:18:58 +0100 Subject: [PATCH 34/77] Updated __contains__ Signed-off-by: Elena Khaustova --- kedro/io/data_catalog.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index 56e046da68..2986b0b6a1 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -332,10 +332,9 @@ def _get_dataset( def __contains__(self, dataset_name: str) -> bool: """Check if an item is in the catalog as a materialised dataset or pattern""" - matched_pattern = self._match_pattern(self._dataset_patterns, dataset_name) - if dataset_name in self._datasets or matched_pattern: - return True - return False + return dataset_name in self._datasets or self._config_resolver.match_pattern( + dataset_name + ) def load(self, name: str, version: str | None = None) -> Any: """Loads a registered data set. From 59b676468a36b69079af6f500a0b14b197b71065 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 5 Sep 2024 16:31:07 +0100 Subject: [PATCH 35/77] Updated __eq__ and shallow_copy Signed-off-by: Elena Khaustova --- kedro/io/data_catalog.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index 2986b0b6a1..5d268e282e 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -631,26 +631,21 @@ def shallow_copy( Returns: Copy of the current object. """ - if not self._default_pattern and extra_dataset_patterns: - unsorted_dataset_patterns = { - **self._dataset_patterns, - **extra_dataset_patterns, - } - dataset_patterns = self._sort_patterns(unsorted_dataset_patterns) - else: - dataset_patterns = self._dataset_patterns + if extra_dataset_patterns: + self._config_resolver.add_runtime_patterns(extra_dataset_patterns) return self.__class__( datasets=self._datasets, - dataset_patterns=dataset_patterns, + dataset_patterns=self._config_resolver.dataset_patterns, + default_pattern=self._config_resolver.default_pattern, load_versions=self._load_versions, save_version=self._save_version, - default_pattern=self._default_pattern, + config_resolver=self._config_resolver, ) def __eq__(self, other) -> bool: # type: ignore[no-untyped-def] - return (self._datasets, self._dataset_patterns) == ( + return (self._datasets, self._config_resolver.dataset_patterns) == ( other._datasets, - other._dataset_patterns, + other._config_resolver._dataset_patterns, ) def confirm(self, name: str) -> None: From 4f5a3fbb8d1f8f1da8c4000af80d77245c948a8c Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 5 Sep 2024 16:36:50 +0100 Subject: [PATCH 36/77] Added __iter__ and __getitem__ Signed-off-by: Elena Khaustova --- kedro/io/data_catalog.py | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index 5d268e282e..883d5da78e 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -175,6 +175,24 @@ def __init__( # noqa: PLR0913 def __repr__(self) -> str: return self.datasets.__repr__() + def __iter__(self): + yield from self._datasets.values() + + def __getitem__(self, ds_name: str) -> AbstractDataset: + return self._get_dataset(ds_name) + + def __contains__(self, dataset_name: str) -> bool: + """Check if an item is in the catalog as a materialised dataset or pattern""" + return dataset_name in self._datasets or 
self._config_resolver.match_pattern( + dataset_name + ) + + def __eq__(self, other) -> bool: # type: ignore[no-untyped-def] + return (self._datasets, self._config_resolver.dataset_patterns) == ( + other._datasets, + other._config_resolver._dataset_patterns, + ) + @property def _logger(self) -> logging.Logger: return logging.getLogger(__name__) @@ -330,12 +348,6 @@ def _get_dataset( return dataset - def __contains__(self, dataset_name: str) -> bool: - """Check if an item is in the catalog as a materialised dataset or pattern""" - return dataset_name in self._datasets or self._config_resolver.match_pattern( - dataset_name - ) - def load(self, name: str, version: str | None = None) -> Any: """Loads a registered data set. @@ -621,7 +633,7 @@ def list(self, regex_search: str | None = None) -> list[str]: raise SyntaxError( f"Invalid regular expression provided: '{regex_search}'" ) from exc - return [dset_name for dset_name in self._datasets if pattern.search(dset_name)] + return [ds_name for ds_name in self._datasets if pattern.search(ds_name)] def shallow_copy( self, extra_dataset_patterns: Patterns | None = None @@ -642,12 +654,6 @@ def shallow_copy( config_resolver=self._config_resolver, ) - def __eq__(self, other) -> bool: # type: ignore[no-untyped-def] - return (self._datasets, self._config_resolver.dataset_patterns) == ( - other._datasets, - other._config_resolver._dataset_patterns, - ) - def confirm(self, name: str) -> None: """Confirm a dataset by its name. From 12ed6f2e39a2e3737f905323be8cb2f7c87daef6 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 5 Sep 2024 16:39:24 +0100 Subject: [PATCH 37/77] Removed unused imports Signed-off-by: Elena Khaustova --- kedro/io/data_catalog.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index 883d5da78e..7218ccaff5 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -12,9 +12,9 @@ import logging import pprint import re -from typing import Any, Dict +from typing import Any -from kedro.io import DataCatalogConfigResolver +from kedro.io.catalog_config_resolver import DataCatalogConfigResolver, Patterns from kedro.io.core import ( AbstractDataset, AbstractVersionedDataset, @@ -27,10 +27,6 @@ from kedro.io.memory_dataset import MemoryDataset from kedro.utils import _format_rich, _has_rich_handler -Patterns = Dict[str, Dict[str, Any]] - -CATALOG_KEY = "catalog" -CREDENTIALS_KEY = "credentials" WORDS_REGEX_PATTERN = re.compile(r"\W+") From a106cec92eda2ca74940d92b4ae872a86797f883 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 5 Sep 2024 16:47:38 +0100 Subject: [PATCH 38/77] Added TODO Signed-off-by: Elena Khaustova --- kedro/io/catalog_config_resolver.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py index 846ef6277f..1236fd397b 100644 --- a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -66,7 +66,7 @@ def _resolve_value(key: str, value: Any) -> Any: def _resolve_dataset_config( ds_name: str, pattern: str, - config: dict, + config: Any, ) -> dict[str, Any]: """Resolve dataset configuration based on the provided pattern.""" resolved_vars = parse(pattern, ds_name) @@ -201,6 +201,7 @@ def _init_configs( credentials: dict[str, dict[str, Any]] | None, ) -> dict[str, dict[str, Any]]: """Initialize the dataset configuration with resolved credentials.""" + # TODO: check if deep copies are required config = 
copy.deepcopy(config) or {}
         credentials = copy.deepcopy(credentials) or {}

From 6df04f77ed69efcd5e1b3f3718dc4ce0aa1c92b6 Mon Sep 17 00:00:00 2001
From: Elena Khaustova
Date: Thu, 5 Sep 2024 16:52:37 +0100
Subject: [PATCH 39/77] Updated runner.run()

Signed-off-by: Elena Khaustova
---
 kedro/runner/runner.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/kedro/runner/runner.py b/kedro/runner/runner.py
index 2ffd0389e4..6f165e87c0 100644
--- a/kedro/runner/runner.py
+++ b/kedro/runner/runner.py
@@ -83,7 +83,6 @@ def run(
         """
         hook_or_null_manager = hook_manager or _NullPluginManager()
-        catalog = catalog.shallow_copy()

         # Check which datasets used in the pipeline are in the catalog or match
         # a pattern in the catalog

From 8566e279b28fc7ae966ec0584eedc6e096098b7b Mon Sep 17 00:00:00 2001
From: Elena Khaustova
Date: Thu, 5 Sep 2024 16:59:00 +0100
Subject: [PATCH 40/77] Updated session

Signed-off-by: Elena Khaustova
---
 kedro/framework/session/session.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/kedro/framework/session/session.py b/kedro/framework/session/session.py
index 91928f7c4b..25a0f46896 100644
--- a/kedro/framework/session/session.py
+++ b/kedro/framework/session/session.py
@@ -397,10 +397,7 @@ def run(  # noqa: PLR0913
         try:
             if isinstance(runner, ThreadRunner):
                 for ds in filtered_pipeline.datasets():
-                    if catalog._match_pattern(
-                        catalog._dataset_patterns, ds
-                    ) or catalog._match_pattern(catalog._default_pattern, ds):
-                        _ = catalog._get_dataset(ds)
+                    _ = catalog._get_dataset(ds)
             run_result = runner.run(
                 filtered_pipeline, catalog, hook_manager, session_id
             )

From 2dcea33ad36c86e83072b11d0706d962d9406e8c Mon Sep 17 00:00:00 2001
From: Elena Khaustova
Date: Thu, 5 Sep 2024 18:20:10 +0100
Subject: [PATCH 41/77] Added config_resolver property

Signed-off-by: Elena Khaustova
---
 kedro/framework/session/session.py | 3 ++-
 kedro/io/data_catalog.py | 6 +++++-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/kedro/framework/session/session.py b/kedro/framework/session/session.py
index 25a0f46896..2b13cd1694 100644
--- a/kedro/framework/session/session.py
+++ b/kedro/framework/session/session.py
@@ -397,7 +397,8 @@ def run(  # noqa: PLR0913
         try:
             if isinstance(runner, ThreadRunner):
                 for ds in filtered_pipeline.datasets():
-                    _ = catalog._get_dataset(ds)
+                    if catalog.config_resolver.match_pattern(ds):
+                        _ = catalog._get_dataset(ds)
             run_result = runner.run(
                 filtered_pipeline, catalog, hook_manager, session_id
             )
diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py
index 7218ccaff5..abf9bf6353 100644
--- a/kedro/io/data_catalog.py
+++ b/kedro/io/data_catalog.py
@@ -189,6 +189,10 @@ def __eq__(self, other) -> bool:  # type: ignore[no-untyped-def]
             other._config_resolver._dataset_patterns,
         )

+    @property
+    def config_resolver(self):
+        return self._config_resolver
+
     @property
     def _logger(self) -> logging.Logger:
         return logging.getLogger(__name__)

From a46597fec0637f1fd268087bf0d65969bc39b3d7 Mon Sep 17 00:00:00 2001
From: Elena Khaustova
Date: Thu, 5 Sep 2024 18:31:02 +0100
Subject: [PATCH 42/77] Updated catalog list command

Signed-off-by: Elena Khaustova
---
 kedro/framework/cli/catalog.py | 36 ++++++++++++++++++------------------
 1 file changed, 17
insertions(+), 19 deletions(-) diff --git a/kedro/framework/cli/catalog.py b/kedro/framework/cli/catalog.py index 223980dade..01170dbd7c 100644 --- a/kedro/framework/cli/catalog.py +++ b/kedro/framework/cli/catalog.py @@ -28,6 +28,11 @@ def _create_session(package_name: str, **kwargs: Any) -> KedroSession: return KedroSession.create(**kwargs) +def is_parameter(dataset_name: str) -> bool: + """Check if dataset is a parameter.""" + return dataset_name.startswith("params:") or dataset_name == "parameters" + + @click.group(name="Kedro") def catalog_cli() -> None: # pragma: no cover pass @@ -88,21 +93,15 @@ def list_datasets(metadata: ProjectMetadata, pipeline: str, env: str) -> None: # resolve any factory datasets in the pipeline factory_ds_by_type = defaultdict(list) - for ds_name in default_ds: - matched_pattern = data_catalog._match_pattern( - data_catalog._dataset_patterns, ds_name - ) or data_catalog._match_pattern(data_catalog._default_pattern, ds_name) - if matched_pattern: - ds_config_copy = copy.deepcopy( - data_catalog._dataset_patterns.get(matched_pattern) - or data_catalog._default_pattern.get(matched_pattern) - or {} - ) - ds_config = data_catalog._resolve_config( - ds_name, matched_pattern, ds_config_copy + resolved_configs = data_catalog.config_resolver.resolve_dataset_pattern( + default_ds + ) + for ds_name, ds_config in zip(default_ds, resolved_configs): + if data_catalog.config_resolver.match_pattern(ds_name): + factory_ds_by_type[ds_config.get("type", "DefaultDataset")].append( + ds_name ) - factory_ds_by_type[ds_config["type"]].append(ds_name) default_ds = default_ds - set(chain.from_iterable(factory_ds_by_type.values())) @@ -128,12 +127,11 @@ def _map_type_to_datasets( datasets of the specific type as a value. """ mapping = defaultdict(list) # type: ignore[var-annotated] - for dataset in datasets: - is_param = dataset.startswith("params:") or dataset == "parameters" - if not is_param: - ds_type = datasets_meta[dataset].__class__.__name__ - if dataset not in mapping[ds_type]: - mapping[ds_type].append(dataset) + for dataset_name in datasets: + if not is_parameter(dataset_name): + ds_type = datasets_meta[dataset_name].__class__.__name__ + if dataset_name not in mapping[ds_type]: + mapping[ds_type].append(dataset_name) return mapping From 3787545f8b6afb2e5d83e9a1ae7d3b7db48337ab Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 5 Sep 2024 18:38:03 +0100 Subject: [PATCH 43/77] Updated catalog create command Signed-off-by: Elena Khaustova --- kedro/framework/cli/catalog.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/kedro/framework/cli/catalog.py b/kedro/framework/cli/catalog.py index 01170dbd7c..cebbdb9b28 100644 --- a/kedro/framework/cli/catalog.py +++ b/kedro/framework/cli/catalog.py @@ -168,20 +168,16 @@ def create_catalog(metadata: ProjectMetadata, pipeline_name: str, env: str) -> N f"'{pipeline_name}' pipeline not found! 
Existing pipelines: {existing_pipelines}" ) - pipe_datasets = { - ds_name - for ds_name in pipeline.datasets() - if not ds_name.startswith("params:") and ds_name != "parameters" + pipeline_datasets = { + ds_name for ds_name in pipeline.datasets() if not is_parameter(ds_name) } catalog_datasets = { - ds_name - for ds_name in context.catalog._datasets.keys() - if not ds_name.startswith("params:") and ds_name != "parameters" + ds_name for ds_name in context.catalog.list() if not is_parameter(ds_name) } # Datasets that are missing in Data Catalog - missing_ds = sorted(pipe_datasets - catalog_datasets) + missing_ds = sorted(pipeline_datasets - catalog_datasets) if missing_ds: catalog_path = ( context.project_path From 68d612d34c2276d3d8d00a24207ffeb38d34da99 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 5 Sep 2024 18:41:04 +0100 Subject: [PATCH 44/77] Updated catalog rank command Signed-off-by: Elena Khaustova --- kedro/framework/cli/catalog.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/kedro/framework/cli/catalog.py b/kedro/framework/cli/catalog.py index cebbdb9b28..c4b701f56e 100644 --- a/kedro/framework/cli/catalog.py +++ b/kedro/framework/cli/catalog.py @@ -215,12 +215,14 @@ def rank_catalog_factories(metadata: ProjectMetadata, env: str) -> None: session = _create_session(metadata.package_name, env=env) context = session.load_context() - catalog_factories = { - **context.catalog._dataset_patterns, - **context.catalog._default_pattern, - } + catalog_factories = list( + { + **context.catalog.config_resolver.dataset_patterns, + **context.catalog.config_resolver.default_pattern, + }.keys() + ) if catalog_factories: - click.echo(yaml.dump(list(catalog_factories.keys()))) + click.echo(yaml.dump(catalog_factories)) else: click.echo("There are no dataset factories in the catalog.") From af5bee9df3e10f79a7ee797d4746b7dbd02ed415 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 5 Sep 2024 18:54:29 +0100 Subject: [PATCH 45/77] Updated catalog resolve command Signed-off-by: Elena Khaustova --- kedro/framework/cli/catalog.py | 29 +++++++++-------------------- kedro/io/data_catalog.py | 6 +++++- 2 files changed, 14 insertions(+), 21 deletions(-) diff --git a/kedro/framework/cli/catalog.py b/kedro/framework/cli/catalog.py index c4b701f56e..297e0dcb9e 100644 --- a/kedro/framework/cli/catalog.py +++ b/kedro/framework/cli/catalog.py @@ -2,7 +2,6 @@ from __future__ import annotations -import copy from collections import defaultdict from itertools import chain from typing import TYPE_CHECKING, Any @@ -245,36 +244,26 @@ def resolve_patterns(metadata: ProjectMetadata, env: str) -> None: explicit_datasets = { ds_name: ds_config - for ds_name, ds_config in catalog_config.items() - if not data_catalog._is_pattern(ds_name) + for ds_name, ds_config in data_catalog.datasets_config.items() + if not is_parameter(ds_name) } target_pipelines = pipelines.keys() - datasets = set() + pipeline_datasets = set() for pipe in target_pipelines: pl_obj = pipelines.get(pipe) if pl_obj: - datasets.update(pl_obj.datasets()) + pipeline_datasets.update(pl_obj.datasets()) - for ds_name in datasets: - is_param = ds_name.startswith("params:") or ds_name == "parameters" - if ds_name in explicit_datasets or is_param: + for ds_name in pipeline_datasets: + if ds_name in explicit_datasets or is_parameter(ds_name): continue - matched_pattern = data_catalog._match_pattern( - data_catalog._dataset_patterns, ds_name - ) or data_catalog._match_pattern(data_catalog._default_pattern, ds_name) - 
if matched_pattern: - ds_config_copy = copy.deepcopy( - data_catalog._dataset_patterns.get(matched_pattern) - or data_catalog._default_pattern.get(matched_pattern) - or {} - ) + ds_config = data_catalog.config_resolver.resolve_dataset_pattern(ds_name) - ds_config = data_catalog._resolve_config( - ds_name, matched_pattern, ds_config_copy - ) + # Exclude MemoryDatasets not set in the catalog explicitly + if ds_config is not None: explicit_datasets[ds_name] = ds_config secho(yaml.dump(explicit_datasets)) diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index abf9bf6353..75cd65bf67 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -190,9 +190,13 @@ def __eq__(self, other) -> bool: # type: ignore[no-untyped-def] ) @property - def config_resolver(self): + def config_resolver(self) -> DataCatalogConfigResolver: return self._config_resolver + @property + def datasets_config(self) -> dict[str, dict[str, Any]]: + return self._datasets_config + @property def _logger(self) -> logging.Logger: return logging.getLogger(__name__) From e67ff0f7a16f22006631f80a54243dde69d25388 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Thu, 5 Sep 2024 19:02:36 +0100 Subject: [PATCH 46/77] Remove some methods Signed-off-by: Elena Khaustova --- kedro/io/data_catalog.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index 75cd65bf67..9b7ff7404e 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -171,12 +171,6 @@ def __init__( # noqa: PLR0913 def __repr__(self) -> str: return self.datasets.__repr__() - def __iter__(self): - yield from self._datasets.values() - - def __getitem__(self, ds_name: str) -> AbstractDataset: - return self._get_dataset(ds_name) - def __contains__(self, dataset_name: str) -> bool: """Check if an item is in the catalog as a materialised dataset or pattern""" return dataset_name in self._datasets or self._config_resolver.match_pattern( From 7b3afa21862089c4856a1b9a73ae726499eae091 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 6 Sep 2024 11:40:22 +0100 Subject: [PATCH 47/77] Removed ds configs from catalog Signed-off-by: Elena Khaustova --- kedro/framework/cli/catalog.py | 4 ++-- kedro/io/data_catalog.py | 27 +++++++-------------------- 2 files changed, 9 insertions(+), 22 deletions(-) diff --git a/kedro/framework/cli/catalog.py b/kedro/framework/cli/catalog.py index 297e0dcb9e..900d34d7e2 100644 --- a/kedro/framework/cli/catalog.py +++ b/kedro/framework/cli/catalog.py @@ -244,8 +244,8 @@ def resolve_patterns(metadata: ProjectMetadata, env: str) -> None: explicit_datasets = { ds_name: ds_config - for ds_name, ds_config in data_catalog.datasets_config.items() - if not is_parameter(ds_name) + for ds_name, ds_config in catalog_config.items() + if not data_catalog.config_resolver.is_pattern(ds_name) } target_pipelines = pipelines.keys() diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index 9b7ff7404e..e56610b5c2 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -105,8 +105,8 @@ def __init__( # noqa: PLR0913 self, datasets: dict[str, AbstractDataset] | None = None, feed_dict: dict[str, Any] | None = None, - dataset_patterns: Patterns | None = None, - default_pattern: Patterns | None = None, + dataset_patterns: Patterns | None = None, # Kept for interface compatibility + default_pattern: Patterns | None = None, # Kept for interface compatibility load_versions: dict[str, str] | None = None, save_version: str | None = None, 
config_resolver: DataCatalogConfigResolver = None, @@ -115,7 +115,7 @@ def __init__( # noqa: PLR0913 implementations to provide ``load`` and ``save`` capabilities from anywhere in the program. To use a ``DataCatalog``, you need to instantiate it with a dictionary of data sets. Then it will act as a - single point of reference for your calls, relaying load and save + single point of reference for your calls, relaying load and saveĀ§ functions to the underlying data sets. Args: @@ -154,11 +154,10 @@ def __init__( # noqa: PLR0913 >>> catalog = DataCatalog(datasets={'cars': cars}) """ self._config_resolver = config_resolver or DataCatalogConfigResolver() - self._datasets_config = self._config_resolver.config self._datasets = {} self.datasets = {} - self.add_all(dict(datasets) or {}, datasets_configs=self._datasets_config) + self.add_all(dict(datasets) or {}) self._load_versions = load_versions or {} self._save_version = save_version @@ -180,17 +179,13 @@ def __contains__(self, dataset_name: str) -> bool: def __eq__(self, other) -> bool: # type: ignore[no-untyped-def] return (self._datasets, self._config_resolver.dataset_patterns) == ( other._datasets, - other._config_resolver._dataset_patterns, + other._config_resolver.dataset_patterns, ) @property def config_resolver(self) -> DataCatalogConfigResolver: return self._config_resolver - @property - def datasets_config(self) -> dict[str, dict[str, Any]]: - return self._datasets_config - @property def _logger(self) -> logging.Logger: return logging.getLogger(__name__) @@ -324,7 +319,7 @@ def _get_dataset( self._load_versions.get(dataset_name), self._save_version, ) - self.add(dataset_name, ds, dataset_config=ds_config) + self.add(dataset_name, ds) if dataset_name not in self._datasets: error_msg = f"Dataset '{dataset_name}' not found in the catalog" @@ -464,7 +459,6 @@ def add( dataset_name: str, dataset: AbstractDataset, replace: bool = False, - dataset_config: dict[str, Any] | None = None, ) -> None: """Adds a new ``AbstractDataset`` object to the ``DataCatalog``. @@ -475,8 +469,6 @@ def add( set name. replace: Specifies whether to replace an existing dataset with the same name is allowed. - dataset_config: A dictionary with dataset configuration. - Raises: DatasetAlreadyExistsError: When a data set with the same name @@ -501,16 +493,12 @@ def add( f"Dataset '{dataset_name}' has already been registered" ) self._datasets[dataset_name] = dataset - self._datasets_config[dataset_name] = ( - dataset_config if dataset_config is not None else {} - ) self.datasets = _FrozenDatasets(self.datasets, {dataset_name: dataset}) def add_all( self, datasets: dict[str, AbstractDataset], replace: bool = False, - datasets_configs: dict[str, dict[str, Any]] | None = None, ) -> None: """Adds a group of new data sets to the ``DataCatalog``. @@ -519,7 +507,6 @@ def add_all( instances. replace: Specifies whether to replace an existing dataset with the same name is allowed. - datasets_configs: A dictionary of dataset configurations. Raises: DatasetAlreadyExistsError: When a data set with the same name @@ -543,7 +530,7 @@ def add_all( >>> assert catalog.list() == ["cars", "planes", "boats"] """ for ds_name, ds in datasets.items(): - self.add(ds_name, ds, replace, datasets_configs.get(ds_name, {})) + self.add(ds_name, ds, replace) def add_feed_dict(self, feed_dict: dict[str, Any], replace: bool = False) -> None: """Add datasets to the ``DataCatalog`` using the data provided through the `feed_dict`. 
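[Note: at this point in the series the split of responsibilities is in place: DataCatalog holds only materialised datasets, while the config resolver owns the raw configuration and the factory patterns, and patterned entries are materialised lazily on first access. A minimal sketch of the resulting behaviour; the catalog entries, file paths and the pandas.CSVDataset type (from kedro-datasets) are illustrative assumptions, not taken from these patches:

    from kedro.io import DataCatalog

    config = {
        # Explicit entry: instantiated eagerly by DataCatalog.from_config()
        "reviews": {"type": "pandas.CSVDataset", "filepath": "data/reviews.csv"},
        # Factory pattern: kept by the config resolver until first access
        "{brand}_cars": {"type": "pandas.CSVDataset", "filepath": "data/{brand}_cars.csv"},
    }
    catalog = DataCatalog.from_config(config)

    assert "reviews" in catalog.list()  # materialised up front
    assert "tesla_cars" in catalog      # __contains__ also consults the patterns
    # Resolve a patterned dataset config without materialising the dataset
    print(catalog.config_resolver.resolve_dataset_pattern("tesla_cars"))
    # expected: {'type': 'pandas.CSVDataset', 'filepath': 'data/tesla_cars.csv'}

Explicit entries are created up front by from_config(); a name such as "tesla_cars" is matched against "{brand}_cars" and only turned into a dataset object when _get_dataset() first sees it.]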
From 658a759628d2e55e958790229a5f7fb7c637bbfd Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 6 Sep 2024 12:41:49 +0100 Subject: [PATCH 48/77] Fixed lint Signed-off-by: Elena Khaustova --- kedro/framework/cli/catalog.py | 2 +- kedro/io/catalog_config_resolver.py | 19 +++++++++++-------- kedro/io/data_catalog.py | 24 ++++++++++++++---------- kedro/runner/parallel_runner.py | 6 ++++-- kedro/runner/runner.py | 3 ++- kedro/runner/sequential_runner.py | 6 ++++-- kedro/runner/thread_runner.py | 6 ++++-- 7 files changed, 40 insertions(+), 26 deletions(-) diff --git a/kedro/framework/cli/catalog.py b/kedro/framework/cli/catalog.py index 900d34d7e2..37ffebd13c 100644 --- a/kedro/framework/cli/catalog.py +++ b/kedro/framework/cli/catalog.py @@ -98,7 +98,7 @@ def list_datasets(metadata: ProjectMetadata, pipeline: str, env: str) -> None: ) for ds_name, ds_config in zip(default_ds, resolved_configs): if data_catalog.config_resolver.match_pattern(ds_name): - factory_ds_by_type[ds_config.get("type", "DefaultDataset")].append( + factory_ds_by_type[ds_config.get("type", "DefaultDataset")].append( # type: ignore[attr-defined] ds_name ) diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py index 1236fd397b..0aaae32af8 100644 --- a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -7,12 +7,14 @@ from parse import parse -Patterns = dict[str, dict[str, Any]] +Patterns = dict[str, dict[str, Any] | None] CREDENTIALS_KEY = "credentials" -def _fetch_credentials(credentials_name: str, credentials: dict[str, Any]) -> Any: +def _fetch_credentials( + credentials_name: str, credentials: dict[str, Any] | None +) -> Any: """Fetch the specified credentials from the provided credentials dictionary. Args: @@ -27,6 +29,8 @@ def _fetch_credentials(credentials_name: str, credentials: dict[str, Any]) -> An registered. """ + if credentials is None: + return None try: return credentials[credentials_name] except KeyError as exc: @@ -40,7 +44,7 @@ def _fetch_credentials(credentials_name: str, credentials: dict[str, Any]) -> An def _resolve_credentials( config: dict[str, Any], credentials: dict[str, Any] | None -) -> dict[str, Any]: +) -> dict[str, Any] | None: """Return the dataset configuration where credentials are resolved using credentials dictionary provided. 
@@ -67,7 +71,7 @@ def _resolve_dataset_config( ds_name: str, pattern: str, config: Any, -) -> dict[str, Any]: +) -> Any: """Resolve dataset configuration based on the provided pattern.""" resolved_vars = parse(pattern, ds_name) # Resolve the factory config for the dataset @@ -102,7 +106,7 @@ def __init__( self._resolved_configs = self._init_configs(config, credentials) @property - def config(self) -> dict[str, dict[str, Any]]: + def config(self) -> dict[str, dict[str, Any] | None]: return self._resolved_configs @property @@ -183,8 +187,7 @@ def _extract_patterns( for ds_name, ds_config in config.items(): if cls.is_pattern(ds_name): - resolved_config = _resolve_credentials(ds_config, credentials) - dataset_patterns[ds_name] = resolved_config + dataset_patterns[ds_name] = _resolve_credentials(ds_config, credentials) sorted_patterns = cls._sort_patterns(dataset_patterns) if sorted_patterns: @@ -199,7 +202,7 @@ def _init_configs( self, config: dict[str, dict[str, Any]] | None, credentials: dict[str, dict[str, Any]] | None, - ) -> dict[str, dict[str, Any]]: + ) -> dict[str, dict[str, Any] | None]: """Initialize the dataset configuration with resolved credentials.""" # TODO: check if deep copies are required config = copy.deepcopy(config) or {} diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index e56610b5c2..be488a71c4 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -47,13 +47,15 @@ class _FrozenDatasets: def __init__( self, - *datasets_collections: _FrozenDatasets | dict[str, AbstractDataset], + *datasets_collections: _FrozenDatasets | dict[str, AbstractDataset] | None, ): """Return a _FrozenDatasets instance from some datasets collections. Each collection could either be another _FrozenDatasets or a dictionary. 
""" self._original_names: dict[str, str] = {} for collection in datasets_collections: + if collection is None: + continue if isinstance(collection, _FrozenDatasets): self.__dict__.update(collection.__dict__) self._original_names.update(collection._original_names) @@ -109,7 +111,7 @@ def __init__( # noqa: PLR0913 default_pattern: Patterns | None = None, # Kept for interface compatibility load_versions: dict[str, str] | None = None, save_version: str | None = None, - config_resolver: DataCatalogConfigResolver = None, + config_resolver: DataCatalogConfigResolver | None = None, ) -> None: """``DataCatalog`` stores instances of ``AbstractDataset`` implementations to provide ``load`` and ``save`` capabilities from @@ -154,10 +156,10 @@ def __init__( # noqa: PLR0913 >>> catalog = DataCatalog(datasets={'cars': cars}) """ self._config_resolver = config_resolver or DataCatalogConfigResolver() - self._datasets = {} - self.datasets = {} + self._datasets: dict[str, AbstractDataset] = {} + self.datasets: _FrozenDatasets | None = None - self.add_all(dict(datasets) or {}) + self.add_all(datasets or {}) self._load_versions = load_versions or {} self._save_version = save_version @@ -172,8 +174,9 @@ def __repr__(self) -> str: def __contains__(self, dataset_name: str) -> bool: """Check if an item is in the catalog as a materialised dataset or pattern""" - return dataset_name in self._datasets or self._config_resolver.match_pattern( - dataset_name + return ( + dataset_name in self._datasets + or self._config_resolver.match_pattern(dataset_name) is not None ) def __eq__(self, other) -> bool: # type: ignore[no-untyped-def] @@ -267,6 +270,7 @@ class to be loaded is specified with the key ``type`` and their >>> df = catalog.load("cars") >>> catalog.save("boats", df) """ + catalog = catalog or {} datasets = {} config_resolver = DataCatalogConfigResolver(catalog, credentials) save_version = save_version or generate_timestamp() @@ -276,7 +280,7 @@ class to be loaded is specified with the key ``type`` and their if not config_resolver.is_pattern(ds_name): datasets[ds_name] = AbstractDataset.from_config( ds_name, - config_resolver.config[ds_name], + config_resolver.config[ds_name] or {}, load_versions.get(ds_name), save_version, ) @@ -315,7 +319,7 @@ def _get_dataset( if dataset_name not in self._datasets and ds_config is not None: ds = AbstractDataset.from_config( dataset_name, - ds_config, + ds_config, # type: ignore[arg-type] self._load_versions.get(dataset_name), self._save_version, ) @@ -571,7 +575,7 @@ def add_feed_dict(self, feed_dict: dict[str, Any], replace: bool = False) -> Non dataset = ( ds_data if isinstance(ds_data, AbstractDataset) - else MemoryDataset(data=ds_data) + else MemoryDataset(data=ds_data) # type: ignore[abstract] ) # type: ignore[abstract] self.add(ds_name, dataset, replace) diff --git a/kedro/runner/parallel_runner.py b/kedro/runner/parallel_runner.py index 62d7e1216b..e88d197b5b 100644 --- a/kedro/runner/parallel_runner.py +++ b/kedro/runner/parallel_runner.py @@ -106,7 +106,7 @@ def __init__( self, max_workers: int | None = None, is_async: bool = False, - extra_dataset_patterns: dict[str, dict[str, Any]] | None = None, + extra_dataset_patterns: dict[str, dict[str, Any] | None] | None = None, ): """ Instantiates the runner by creating a Manager. 
@@ -125,7 +125,9 @@ def __init__( Raises: ValueError: bad parameters passed """ - default_dataset_pattern = {"{default}": {"type": "SharedMemoryDataset"}} + default_dataset_pattern: dict[str, dict[str, Any] | None] | None = { + "{default}": {"type": "SharedMemoryDataset"} + } self._extra_dataset_patterns = extra_dataset_patterns or default_dataset_pattern super().__init__( is_async=is_async, extra_dataset_patterns=self._extra_dataset_patterns diff --git a/kedro/runner/runner.py b/kedro/runner/runner.py index 6f165e87c0..81436f7028 100644 --- a/kedro/runner/runner.py +++ b/kedro/runner/runner.py @@ -27,6 +27,7 @@ if TYPE_CHECKING: from pluggy import PluginManager + from kedro.io.catalog_config_resolver import Patterns from kedro.pipeline.node import Node @@ -38,7 +39,7 @@ class AbstractRunner(ABC): def __init__( self, is_async: bool = False, - extra_dataset_patterns: dict[str, dict[str, Any]] | None = None, + extra_dataset_patterns: Patterns | None = None, ): """Instantiates the runner class. diff --git a/kedro/runner/sequential_runner.py b/kedro/runner/sequential_runner.py index 48dac3cd54..8b22ba89f5 100644 --- a/kedro/runner/sequential_runner.py +++ b/kedro/runner/sequential_runner.py @@ -27,7 +27,7 @@ class SequentialRunner(AbstractRunner): def __init__( self, is_async: bool = False, - extra_dataset_patterns: dict[str, dict[str, Any]] | None = None, + extra_dataset_patterns: dict[str, dict[str, Any] | None] | None = None, ): """Instantiates the runner class. @@ -39,7 +39,9 @@ def __init__( for `SequentialRunner`. """ - default_dataset_pattern = {"{default}": {"type": "MemoryDataset"}} + default_dataset_pattern: dict[str, dict[str, Any] | None] | None = { + "{default}": {"type": "SharedMemoryDataset"} + } self._extra_dataset_patterns = extra_dataset_patterns or default_dataset_pattern super().__init__( is_async=is_async, extra_dataset_patterns=self._extra_dataset_patterns diff --git a/kedro/runner/thread_runner.py b/kedro/runner/thread_runner.py index b4751a602a..802f7f7de0 100644 --- a/kedro/runner/thread_runner.py +++ b/kedro/runner/thread_runner.py @@ -31,7 +31,7 @@ def __init__( self, max_workers: int | None = None, is_async: bool = False, - extra_dataset_patterns: dict[str, dict[str, Any]] | None = None, + extra_dataset_patterns: dict[str, dict[str, Any] | None] | None = None, ): """ Instantiates the runner. @@ -56,7 +56,9 @@ def __init__( "node inputs and outputs asynchronously with threads. " "Setting 'is_async' to False." 
) - default_dataset_pattern = {"{default}": {"type": "MemoryDataset"}} + default_dataset_pattern: dict[str, dict[str, Any] | None] | None = { + "{default}": {"type": "MemoryDataset"} + } self._extra_dataset_patterns = extra_dataset_patterns or default_dataset_pattern super().__init__( is_async=False, extra_dataset_patterns=self._extra_dataset_patterns From 7be2a8e0d78016ec761cc7fe2406b0a49caa0cb4 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 6 Sep 2024 12:45:13 +0100 Subject: [PATCH 49/77] Fixed typo Signed-off-by: Elena Khaustova --- kedro/runner/sequential_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro/runner/sequential_runner.py b/kedro/runner/sequential_runner.py index 8b22ba89f5..d4dab2628a 100644 --- a/kedro/runner/sequential_runner.py +++ b/kedro/runner/sequential_runner.py @@ -40,7 +40,7 @@ def __init__( """ default_dataset_pattern: dict[str, dict[str, Any] | None] | None = { - "{default}": {"type": "SharedMemoryDataset"} + "{default}": {"type": "MemoryDataset"} } self._extra_dataset_patterns = extra_dataset_patterns or default_dataset_pattern super().__init__( From 9e43a9a7cf00baa81ecd8622d72c9ee4a97d539e Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 6 Sep 2024 13:56:17 +0100 Subject: [PATCH 50/77] Added module docstring Signed-off-by: Elena Khaustova --- kedro/io/catalog_config_resolver.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py index 0aaae32af8..b9651157e2 100644 --- a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -1,3 +1,7 @@ +"""``DataCatalogConfigResolver`` resolves dataset configurations and datasets' +patterns based on catalog configuration and credentials provided. +""" + from __future__ import annotations import copy From 25b65019eee17db68c492decc6750596e5837f6c Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 6 Sep 2024 14:49:41 +0100 Subject: [PATCH 51/77] Removed None from Pattern type Signed-off-by: Elena Khaustova --- kedro/io/catalog_config_resolver.py | 10 +++++----- kedro/io/data_catalog.py | 2 +- kedro/runner/parallel_runner.py | 6 ++---- kedro/runner/runner.py | 3 +-- kedro/runner/sequential_runner.py | 6 ++---- kedro/runner/thread_runner.py | 6 ++---- 6 files changed, 13 insertions(+), 20 deletions(-) diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py index b9651157e2..08cb26adcf 100644 --- a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -11,7 +11,7 @@ from parse import parse -Patterns = dict[str, dict[str, Any] | None] +Patterns = dict[str, dict[str, Any]] CREDENTIALS_KEY = "credentials" @@ -47,8 +47,8 @@ def _fetch_credentials( def _resolve_credentials( - config: dict[str, Any], credentials: dict[str, Any] | None -) -> dict[str, Any] | None: + config: dict[str, Any], credentials: dict[str, Any] +) -> dict[str, Any]: """Return the dataset configuration where credentials are resolved using credentials dictionary provided. 
@@ -110,7 +110,7 @@ def __init__( self._resolved_configs = self._init_configs(config, credentials) @property - def config(self) -> dict[str, dict[str, Any] | None]: + def config(self) -> dict[str, dict[str, Any]]: return self._resolved_configs @property @@ -206,7 +206,7 @@ def _init_configs( self, config: dict[str, dict[str, Any]] | None, credentials: dict[str, dict[str, Any]] | None, - ) -> dict[str, dict[str, Any] | None]: + ) -> dict[str, dict[str, Any]]: """Initialize the dataset configuration with resolved credentials.""" # TODO: check if deep copies are required config = copy.deepcopy(config) or {} diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index be488a71c4..7a54765740 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -576,7 +576,7 @@ def add_feed_dict(self, feed_dict: dict[str, Any], replace: bool = False) -> Non ds_data if isinstance(ds_data, AbstractDataset) else MemoryDataset(data=ds_data) # type: ignore[abstract] - ) # type: ignore[abstract] + ) self.add(ds_name, dataset, replace) def list(self, regex_search: str | None = None) -> list[str]: diff --git a/kedro/runner/parallel_runner.py b/kedro/runner/parallel_runner.py index e88d197b5b..62d7e1216b 100644 --- a/kedro/runner/parallel_runner.py +++ b/kedro/runner/parallel_runner.py @@ -106,7 +106,7 @@ def __init__( self, max_workers: int | None = None, is_async: bool = False, - extra_dataset_patterns: dict[str, dict[str, Any] | None] | None = None, + extra_dataset_patterns: dict[str, dict[str, Any]] | None = None, ): """ Instantiates the runner by creating a Manager. @@ -125,9 +125,7 @@ def __init__( Raises: ValueError: bad parameters passed """ - default_dataset_pattern: dict[str, dict[str, Any] | None] | None = { - "{default}": {"type": "SharedMemoryDataset"} - } + default_dataset_pattern = {"{default}": {"type": "SharedMemoryDataset"}} self._extra_dataset_patterns = extra_dataset_patterns or default_dataset_pattern super().__init__( is_async=is_async, extra_dataset_patterns=self._extra_dataset_patterns diff --git a/kedro/runner/runner.py b/kedro/runner/runner.py index 81436f7028..6f165e87c0 100644 --- a/kedro/runner/runner.py +++ b/kedro/runner/runner.py @@ -27,7 +27,6 @@ if TYPE_CHECKING: from pluggy import PluginManager - from kedro.io.catalog_config_resolver import Patterns from kedro.pipeline.node import Node @@ -39,7 +38,7 @@ class AbstractRunner(ABC): def __init__( self, is_async: bool = False, - extra_dataset_patterns: Patterns | None = None, + extra_dataset_patterns: dict[str, dict[str, Any]] | None = None, ): """Instantiates the runner class. diff --git a/kedro/runner/sequential_runner.py b/kedro/runner/sequential_runner.py index d4dab2628a..48dac3cd54 100644 --- a/kedro/runner/sequential_runner.py +++ b/kedro/runner/sequential_runner.py @@ -27,7 +27,7 @@ class SequentialRunner(AbstractRunner): def __init__( self, is_async: bool = False, - extra_dataset_patterns: dict[str, dict[str, Any] | None] | None = None, + extra_dataset_patterns: dict[str, dict[str, Any]] | None = None, ): """Instantiates the runner class. @@ -39,9 +39,7 @@ def __init__( for `SequentialRunner`. 
""" - default_dataset_pattern: dict[str, dict[str, Any] | None] | None = { - "{default}": {"type": "MemoryDataset"} - } + default_dataset_pattern = {"{default}": {"type": "MemoryDataset"}} self._extra_dataset_patterns = extra_dataset_patterns or default_dataset_pattern super().__init__( is_async=is_async, extra_dataset_patterns=self._extra_dataset_patterns diff --git a/kedro/runner/thread_runner.py b/kedro/runner/thread_runner.py index 802f7f7de0..b4751a602a 100644 --- a/kedro/runner/thread_runner.py +++ b/kedro/runner/thread_runner.py @@ -31,7 +31,7 @@ def __init__( self, max_workers: int | None = None, is_async: bool = False, - extra_dataset_patterns: dict[str, dict[str, Any] | None] | None = None, + extra_dataset_patterns: dict[str, dict[str, Any]] | None = None, ): """ Instantiates the runner. @@ -56,9 +56,7 @@ def __init__( "node inputs and outputs asynchronously with threads. " "Setting 'is_async' to False." ) - default_dataset_pattern: dict[str, dict[str, Any] | None] | None = { - "{default}": {"type": "MemoryDataset"} - } + default_dataset_pattern = {"{default}": {"type": "MemoryDataset"}} self._extra_dataset_patterns = extra_dataset_patterns or default_dataset_pattern super().__init__( is_async=False, extra_dataset_patterns=self._extra_dataset_patterns From 3a646de40892fddcf04776a4421a75e078e49201 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 6 Sep 2024 14:54:45 +0100 Subject: [PATCH 52/77] Fixed docs failing to find class reference Signed-off-by: Elena Khaustova --- docs/source/conf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/conf.py b/docs/source/conf.py index 562f5a4b0e..4ac54eefbc 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -127,6 +127,7 @@ "typing.Type", "typing.Set", "kedro.config.config.ConfigLoader", + "kedro.io.catalog_config_resolver.DataCatalogConfigResolver", "kedro.io.core.AbstractDataset", "kedro.io.core.AbstractVersionedDataset", "kedro.io.core.DatasetError", From 5e5df4ab7a044c95094dd62b3d998b7b0b87b970 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 6 Sep 2024 14:59:16 +0100 Subject: [PATCH 53/77] Fixed docs failing to find class reference Signed-off-by: Elena Khaustova --- docs/source/conf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/conf.py b/docs/source/conf.py index 4ac54eefbc..635a5220a0 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -169,6 +169,7 @@ "D[k] if k in D, else d. d defaults to None.", "None. 
Update D from mapping/iterable E and F.", "Patterns", + "DataCatalogConfigResolver", ), "py:data": ( "typing.Any", From aa59a35b16520e99d069067672d909c9df4eccca Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 6 Sep 2024 15:17:17 +0100 Subject: [PATCH 54/77] Updated Patterns type Signed-off-by: Elena Khaustova --- kedro/io/catalog_config_resolver.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py index 08cb26adcf..f9d20294fd 100644 --- a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -7,11 +7,11 @@ import copy import logging import re -from typing import Any +from typing import Any, Dict from parse import parse -Patterns = dict[str, dict[str, Any]] +Patterns = Dict[str, Dict[str, Any]] CREDENTIALS_KEY = "credentials" From c7efa3e5954fbe6d518426cd9392a47b60b78ac9 Mon Sep 17 00:00:00 2001 From: Ankita Katiyar <110245118+ankatiyar@users.noreply.github.com> Date: Fri, 6 Sep 2024 16:35:09 +0100 Subject: [PATCH 55/77] Fix tests (#4149) * Fix most tests Signed-off-by: Ankita Katiyar * Fix most tests Signed-off-by: Ankita Katiyar --------- Signed-off-by: Ankita Katiyar --- kedro/io/catalog_config_resolver.py | 8 ++++++++ kedro/io/data_catalog.py | 6 +++++- tests/framework/cli/test_catalog.py | 3 +-- tests/framework/session/test_session.py | 2 +- tests/io/test_data_catalog.py | 26 ++++++++++++------------- 5 files changed, 27 insertions(+), 18 deletions(-) diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py index f9d20294fd..d771363c90 100644 --- a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -11,6 +11,8 @@ from parse import parse +from kedro.io.core import DatasetError + Patterns = Dict[str, Dict[str, Any]] CREDENTIALS_KEY = "credentials" @@ -214,6 +216,12 @@ def _init_configs( resolved_configs = {} for ds_name, ds_config in config.items(): + if not isinstance(ds_config, dict): + raise DatasetError( + f"Catalog entry '{ds_name}' is not a valid dataset configuration. " + "\nHint: If this catalog entry is intended for variable interpolation, " + "make sure that the key is preceded by an underscore." 
+ ) if not self.is_pattern(ds_name): resolved_configs[ds_name] = _resolve_credentials(ds_config, credentials) diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index 7a54765740..9c031c56ac 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -14,7 +14,11 @@ import re from typing import Any -from kedro.io.catalog_config_resolver import DataCatalogConfigResolver, Patterns +from kedro.io.catalog_config_resolver import ( + CREDENTIALS_KEY, # noqa: F401 + DataCatalogConfigResolver, + Patterns, +) from kedro.io.core import ( AbstractDataset, AbstractVersionedDataset, diff --git a/tests/framework/cli/test_catalog.py b/tests/framework/cli/test_catalog.py index f34034296e..b898b270f9 100644 --- a/tests/framework/cli/test_catalog.py +++ b/tests/framework/cli/test_catalog.py @@ -490,7 +490,6 @@ def test_rank_catalog_factories( mocked_context.catalog = DataCatalog.from_config( fake_catalog_with_overlapping_factories ) - print("!!!!", mocked_context.catalog._dataset_patterns) result = CliRunner().invoke( fake_project_cli, ["catalog", "rank"], obj=fake_metadata ) @@ -547,7 +546,7 @@ def test_catalog_resolve( mocked_context.catalog = DataCatalog.from_config( catalog=fake_catalog_config, credentials=fake_credentials_config ) - placeholder_ds = mocked_context.catalog._dataset_patterns.keys() + placeholder_ds = mocked_context.catalog.config_resolver.dataset_patterns.keys() pipeline_datasets = {"csv_example", "parquet_example", "explicit_dataset"} mocker.patch.object( diff --git a/tests/framework/session/test_session.py b/tests/framework/session/test_session.py index bc25db37c7..71c2fbb2f5 100644 --- a/tests/framework/session/test_session.py +++ b/tests/framework/session/test_session.py @@ -730,7 +730,7 @@ def test_run_thread_runner( } mocker.patch("kedro.framework.session.session.pipelines", pipelines_ret) mocker.patch( - "kedro.io.data_catalog.DataCatalog._match_pattern", + "kedro.io.data_catalog.DataCatalogConfigResolver.match_pattern", return_value=match_pattern, ) diff --git a/tests/io/test_data_catalog.py b/tests/io/test_data_catalog.py index dbec57e64d..0e3e44a6b5 100644 --- a/tests/io/test_data_catalog.py +++ b/tests/io/test_data_catalog.py @@ -846,7 +846,7 @@ def test_match_added_to_datasets_on_get(self, config_with_dataset_factories): catalog = DataCatalog.from_config(**config_with_dataset_factories) assert "{brand}_cars" not in catalog._datasets assert "tesla_cars" not in catalog._datasets - assert "{brand}_cars" in catalog._dataset_patterns + assert "{brand}_cars" in catalog.config_resolver._dataset_patterns tesla_cars = catalog._get_dataset("tesla_cars") assert isinstance(tesla_cars, CSVDataset) @@ -875,8 +875,8 @@ def test_patterns_not_in_catalog_datasets(self, config_with_dataset_factories): catalog = DataCatalog.from_config(**config_with_dataset_factories) assert "audi_cars" in catalog._datasets assert "{brand}_cars" not in catalog._datasets - assert "audi_cars" not in catalog._dataset_patterns - assert "{brand}_cars" in catalog._dataset_patterns + assert "audi_cars" not in catalog.config_resolver._dataset_patterns + assert "{brand}_cars" in catalog.config_resolver._dataset_patterns def test_explicit_entry_not_overwritten(self, config_with_dataset_factories): """Check that the existing catalog entry is not overwritten by config in pattern""" @@ -909,11 +909,7 @@ def test_sorting_order_patterns(self, config_with_dataset_factories_only_pattern "{dataset}s", "{user_default}", ] - assert ( - list(catalog._dataset_patterns.keys()) - + 
list(catalog._default_pattern.keys()) - == sorted_keys_expected - ) + assert catalog.config_resolver.list_patterns() == sorted_keys_expected def test_multiple_catch_all_patterns_not_allowed( self, config_with_dataset_factories @@ -929,7 +925,7 @@ def test_multiple_catch_all_patterns_not_allowed( } with pytest.raises( - DatasetError, match="Multiple catch-all patterns found in the catalog" + ValueError, match="Multiple catch-all patterns found in the catalog" ): DataCatalog.from_config(**config_with_dataset_factories) @@ -959,7 +955,7 @@ def test_sorting_order_with_other_dataset_through_extra_pattern( "{default}", ] assert ( - list(catalog_with_default._dataset_patterns.keys()) == sorted_keys_expected + catalog_with_default.config_resolver.list_patterns() == sorted_keys_expected ) def test_user_default_overwrites_runner_default(self): @@ -989,10 +985,12 @@ def test_user_default_overwrites_runner_default(self): "{dataset}s", "{a_default}", ] - assert "{a_default}" in catalog_with_runner_default._default_pattern assert ( - list(catalog_with_runner_default._dataset_patterns.keys()) - + list(catalog_with_runner_default._default_pattern.keys()) + "{a_default}" in catalog_with_runner_default.config_resolver.default_pattern + ) + assert ( + list(catalog_with_runner_default.config_resolver.dataset_patterns.keys()) + + list(catalog_with_runner_default.config_resolver.default_pattern.keys()) == sorted_keys_expected ) @@ -1019,7 +1017,7 @@ def test_unmatched_key_error_when_parsing_config( "Unable to resolve 'data/01_raw/{brand}_plane.pq' from the pattern '{type}@planes'. " "Keys used in the configuration should be present in the dataset factory pattern." ) - with pytest.raises(DatasetError, match=re.escape(pattern)): + with pytest.raises(KeyError, match=re.escape(pattern)): catalog._get_dataset("jet@planes") def test_factory_config_versioned( From 023ffc629924c7b318a020373c04a8f933a9ad23 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 6 Sep 2024 16:39:57 +0100 Subject: [PATCH 56/77] Returned constants to avoid breaking changes Signed-off-by: Elena Khaustova --- kedro/io/data_catalog.py | 1 + 1 file changed, 1 insertion(+) diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index 9c031c56ac..f5597c6cc0 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -31,6 +31,7 @@ from kedro.io.memory_dataset import MemoryDataset from kedro.utils import _format_rich, _has_rich_handler +CATALOG_KEY = "catalog" # Kept to avoid the breaking change WORDS_REGEX_PATTERN = re.compile(r"\W+") From 585b44f7315854a4dad2c7a271cf872d0ec37231 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Fri, 6 Sep 2024 18:26:09 +0100 Subject: [PATCH 57/77] Minor fix Signed-off-by: Elena Khaustova --- kedro/io/data_catalog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index f5597c6cc0..778fbb3bb4 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -285,7 +285,7 @@ class to be loaded is specified with the key ``type`` and their if not config_resolver.is_pattern(ds_name): datasets[ds_name] = AbstractDataset.from_config( ds_name, - config_resolver.config[ds_name] or {}, + config_resolver.config[ds_name], load_versions.get(ds_name), save_version, ) From e447078908690503026a0cc98546dff29d6f9649 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Mon, 9 Sep 2024 11:12:57 +0100 Subject: [PATCH 58/77] Updated test_sorting_order_with_other_dataset_through_extra_pattern Signed-off-by: Elena Khaustova --- 
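Note: the reordering asserted below follows from how the resolver now reports patterns. Catalog-defined patterns and runtime patterns (the extras passed in via shallow_copy()) are kept in separate groups, each sorted by decreasing specificity, then decreasing placeholder count, then alphabetically, with the runtime group listed last. A self-contained sketch of that ordering rule, reconstructed for illustration rather than copied from the PR:

    import re

    def specificity(pattern: str) -> int:
        # characters that remain once "{...}" placeholders are stripped
        return len(re.sub(r"\{.*?\}", "", pattern))

    def sort_key(pattern: str) -> tuple:
        return (-specificity(pattern), -pattern.count("{"), pattern)

    catalog_patterns = ["{dataset}s", "{namespace}_{dataset}", "{country}_companies"]
    runtime_patterns = ["{another}#csv", "{default}"]  # e.g. added through shallow_copy()

    print(sorted(catalog_patterns, key=sort_key) + sorted(runtime_patterns, key=sort_key))
    # ['{country}_companies', '{namespace}_{dataset}', '{dataset}s', '{another}#csv', '{default}']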
tests/io/test_data_catalog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/io/test_data_catalog.py b/tests/io/test_data_catalog.py index 0e3e44a6b5..9a118d8110 100644 --- a/tests/io/test_data_catalog.py +++ b/tests/io/test_data_catalog.py @@ -949,9 +949,9 @@ def test_sorting_order_with_other_dataset_through_extra_pattern( ) sorted_keys_expected = [ "{country}_companies", - "{another}#csv", "{namespace}_{dataset}", "{dataset}s", + "{another}#csv", "{default}", ] assert ( From 975e9685c937755511d2d7ab17b42cf0100e328e Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Mon, 9 Sep 2024 11:52:07 +0100 Subject: [PATCH 59/77] Removed odd properties Signed-off-by: Elena Khaustova --- kedro/framework/cli/catalog.py | 7 +------ kedro/io/catalog_config_resolver.py | 8 -------- kedro/io/data_catalog.py | 12 ++++++------ 3 files changed, 7 insertions(+), 20 deletions(-) diff --git a/kedro/framework/cli/catalog.py b/kedro/framework/cli/catalog.py index 37ffebd13c..f6a58664fb 100644 --- a/kedro/framework/cli/catalog.py +++ b/kedro/framework/cli/catalog.py @@ -214,12 +214,7 @@ def rank_catalog_factories(metadata: ProjectMetadata, env: str) -> None: session = _create_session(metadata.package_name, env=env) context = session.load_context() - catalog_factories = list( - { - **context.catalog.config_resolver.dataset_patterns, - **context.catalog.config_resolver.default_pattern, - }.keys() - ) + catalog_factories = context.catalog.config_resolver.list_patterns() if catalog_factories: click.echo(yaml.dump(catalog_factories)) else: diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py index d771363c90..8fc0c82aa6 100644 --- a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -115,14 +115,6 @@ def __init__( def config(self) -> dict[str, dict[str, Any]]: return self._resolved_configs - @property - def dataset_patterns(self) -> Patterns: - return self._dataset_patterns - - @property - def default_pattern(self) -> Patterns: - return self._default_pattern - @property def _logger(self) -> logging.Logger: return logging.getLogger(__name__) diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index 778fbb3bb4..7cd8c31690 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -185,9 +185,9 @@ def __contains__(self, dataset_name: str) -> bool: ) def __eq__(self, other) -> bool: # type: ignore[no-untyped-def] - return (self._datasets, self._config_resolver.dataset_patterns) == ( + return (self._datasets, self._config_resolver.list_patterns()) == ( other._datasets, - other._config_resolver.dataset_patterns, + other.config_resolver.list_patterns(), ) @property @@ -306,8 +306,8 @@ class to be loaded is specified with the key ``type`` and their return cls( datasets=datasets, - dataset_patterns=config_resolver.dataset_patterns, - default_pattern=config_resolver.default_pattern, + dataset_patterns=config_resolver._dataset_patterns, + default_pattern=config_resolver._default_pattern, load_versions=load_versions, save_version=save_version, config_resolver=config_resolver, @@ -641,8 +641,8 @@ def shallow_copy( self._config_resolver.add_runtime_patterns(extra_dataset_patterns) return self.__class__( datasets=self._datasets, - dataset_patterns=self._config_resolver.dataset_patterns, - default_pattern=self._config_resolver.default_pattern, + dataset_patterns=self._config_resolver._dataset_patterns, + default_pattern=self._config_resolver._default_pattern, load_versions=self._load_versions, 
save_version=self._save_version, config_resolver=self._config_resolver, From 11d782cf71f5ddc8bd2689de73e5cc7fd75afbd8 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Mon, 9 Sep 2024 11:58:55 +0100 Subject: [PATCH 60/77] Updated tests Signed-off-by: Elena Khaustova --- tests/framework/cli/test_catalog.py | 2 +- tests/io/test_data_catalog.py | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/framework/cli/test_catalog.py b/tests/framework/cli/test_catalog.py index b898b270f9..7a61c9e7a0 100644 --- a/tests/framework/cli/test_catalog.py +++ b/tests/framework/cli/test_catalog.py @@ -546,7 +546,7 @@ def test_catalog_resolve( mocked_context.catalog = DataCatalog.from_config( catalog=fake_catalog_config, credentials=fake_credentials_config ) - placeholder_ds = mocked_context.catalog.config_resolver.dataset_patterns.keys() + placeholder_ds = mocked_context.catalog.config_resolver.list_patterns() pipeline_datasets = {"csv_example", "parquet_example", "explicit_dataset"} mocker.patch.object( diff --git a/tests/io/test_data_catalog.py b/tests/io/test_data_catalog.py index 9a118d8110..be8ed0831e 100644 --- a/tests/io/test_data_catalog.py +++ b/tests/io/test_data_catalog.py @@ -984,13 +984,15 @@ def test_user_default_overwrites_runner_default(self): sorted_keys_expected = [ "{dataset}s", "{a_default}", + "{another}#csv", + "{default}", ] assert ( - "{a_default}" in catalog_with_runner_default.config_resolver.default_pattern + "{a_default}" + in catalog_with_runner_default.config_resolver._default_pattern ) assert ( - list(catalog_with_runner_default.config_resolver.dataset_patterns.keys()) - + list(catalog_with_runner_default.config_resolver.default_pattern.keys()) + catalog_with_runner_default.config_resolver.list_patterns() == sorted_keys_expected ) From e4abd2311f25534cf936635848f3e5d50883ed5f Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Mon, 9 Sep 2024 12:57:53 +0100 Subject: [PATCH 61/77] Removed None from _fetch_credentials input Signed-off-by: Elena Khaustova --- kedro/io/catalog_config_resolver.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py index 8fc0c82aa6..73aeb4a830 100644 --- a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -18,9 +18,7 @@ CREDENTIALS_KEY = "credentials" -def _fetch_credentials( - credentials_name: str, credentials: dict[str, Any] | None -) -> Any: +def _fetch_credentials(credentials_name: str, credentials: dict[str, Any]) -> Any: """Fetch the specified credentials from the provided credentials dictionary. Args: @@ -35,8 +33,6 @@ def _fetch_credentials( registered. 
""" - if credentials is None: - return None try: return credentials[credentials_name] except KeyError as exc: From 6433dd8dc47e43a19181dfbfcf7d861dab51efaf Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 10 Sep 2024 14:59:19 +0100 Subject: [PATCH 62/77] Renamed DataCatalogConfigResolver to CatalogConfigResolver Signed-off-by: Elena Khaustova --- docs/source/conf.py | 4 ++-- kedro/io/__init__.py | 4 ++-- kedro/io/catalog_config_resolver.py | 4 ++-- kedro/io/data_catalog.py | 12 ++++++------ tests/framework/session/test_session.py | 2 +- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 635a5220a0..2c3a2c4c00 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -127,7 +127,7 @@ "typing.Type", "typing.Set", "kedro.config.config.ConfigLoader", - "kedro.io.catalog_config_resolver.DataCatalogConfigResolver", + "kedro.io.catalog_config_resolver.CatalogConfigResolver", "kedro.io.core.AbstractDataset", "kedro.io.core.AbstractVersionedDataset", "kedro.io.core.DatasetError", @@ -169,7 +169,7 @@ "D[k] if k in D, else d. d defaults to None.", "None. Update D from mapping/iterable E and F.", "Patterns", - "DataCatalogConfigResolver", + "CatalogConfigResolver", ), "py:data": ( "typing.Any", diff --git a/kedro/io/__init__.py b/kedro/io/__init__.py index 5d17d6f058..4b4a2e1b52 100644 --- a/kedro/io/__init__.py +++ b/kedro/io/__init__.py @@ -5,7 +5,7 @@ from __future__ import annotations from .cached_dataset import CachedDataset -from .catalog_config_resolver import DataCatalogConfigResolver +from .catalog_config_resolver import CatalogConfigResolver from .core import ( AbstractDataset, AbstractVersionedDataset, @@ -24,7 +24,7 @@ "AbstractVersionedDataset", "CachedDataset", "DataCatalog", - "DataCatalogConfigResolver", + "CatalogConfigResolver", "DatasetAlreadyExistsError", "DatasetError", "DatasetNotFoundError", diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py index 73aeb4a830..d2e6a14a4b 100644 --- a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -1,4 +1,4 @@ -"""``DataCatalogConfigResolver`` resolves dataset configurations and datasets' +"""``CatalogConfigResolver`` resolves dataset configurations and datasets' patterns based on catalog configuration and credentials provided. """ @@ -93,7 +93,7 @@ def _resolve_dataset_config( return config -class DataCatalogConfigResolver: +class CatalogConfigResolver: """Resolves dataset configurations based on patterns and credentials.""" def __init__( diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index 7cd8c31690..f53a3ea2b9 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -16,7 +16,7 @@ from kedro.io.catalog_config_resolver import ( CREDENTIALS_KEY, # noqa: F401 - DataCatalogConfigResolver, + CatalogConfigResolver, Patterns, ) from kedro.io.core import ( @@ -116,7 +116,7 @@ def __init__( # noqa: PLR0913 default_pattern: Patterns | None = None, # Kept for interface compatibility load_versions: dict[str, str] | None = None, save_version: str | None = None, - config_resolver: DataCatalogConfigResolver | None = None, + config_resolver: CatalogConfigResolver | None = None, ) -> None: """``DataCatalog`` stores instances of ``AbstractDataset`` implementations to provide ``load`` and ``save`` capabilities from @@ -147,7 +147,7 @@ def __init__( # noqa: PLR0913 sorted in lexicographical order. 
default_pattern: A dictionary of the default catch-all pattern that overrides the default pattern provided through the runners. - config_resolver: An instance of DataCatalogConfigResolver to resolve dataset patterns and configurations. + config_resolver: An instance of CatalogConfigResolver to resolve dataset patterns and configurations. Example: @@ -160,7 +160,7 @@ def __init__( # noqa: PLR0913 >>> save_args={"index": False}) >>> catalog = DataCatalog(datasets={'cars': cars}) """ - self._config_resolver = config_resolver or DataCatalogConfigResolver() + self._config_resolver = config_resolver or CatalogConfigResolver() self._datasets: dict[str, AbstractDataset] = {} self.datasets: _FrozenDatasets | None = None @@ -191,7 +191,7 @@ def __eq__(self, other) -> bool: # type: ignore[no-untyped-def] ) @property - def config_resolver(self) -> DataCatalogConfigResolver: + def config_resolver(self) -> CatalogConfigResolver: return self._config_resolver @property @@ -277,7 +277,7 @@ class to be loaded is specified with the key ``type`` and their """ catalog = catalog or {} datasets = {} - config_resolver = DataCatalogConfigResolver(catalog, credentials) + config_resolver = CatalogConfigResolver(catalog, credentials) save_version = save_version or generate_timestamp() load_versions = copy.deepcopy(load_versions) or {} diff --git a/tests/framework/session/test_session.py b/tests/framework/session/test_session.py index 3e2deb38ea..086d581045 100644 --- a/tests/framework/session/test_session.py +++ b/tests/framework/session/test_session.py @@ -693,7 +693,7 @@ def test_run_thread_runner( } mocker.patch("kedro.framework.session.session.pipelines", pipelines_ret) mocker.patch( - "kedro.io.data_catalog.DataCatalogConfigResolver.match_pattern", + "kedro.io.data_catalog.CatalogConfigResolver.match_pattern", return_value=match_pattern, ) From 355576f70b8f825e1ade838cc0aaf236fe8dcc29 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 10 Sep 2024 15:40:39 +0100 Subject: [PATCH 63/77] Renamed _init_configs to _resolve_config_credentials Signed-off-by: Elena Khaustova --- kedro/io/catalog_config_resolver.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py index d2e6a14a4b..ed539baad9 100644 --- a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -105,7 +105,7 @@ def __init__( self._dataset_patterns, self._default_pattern = self._extract_patterns( config, credentials ) - self._resolved_configs = self._init_configs(config, credentials) + self._resolved_configs = self._resolve_config_credentials(config, credentials) @property def config(self) -> dict[str, dict[str, Any]]: @@ -192,7 +192,7 @@ def _extract_patterns( return sorted_patterns, user_default - def _init_configs( + def _resolve_config_credentials( self, config: dict[str, dict[str, Any]] | None, credentials: dict[str, dict[str, Any]] | None, From 39d9ff681413a8edbf79dcd41fb46e1207dbaa89 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 10 Sep 2024 16:02:40 +0100 Subject: [PATCH 64/77] Moved functions to the class Signed-off-by: Elena Khaustova --- kedro/io/catalog_config_resolver.py | 163 +++++++++++++++------------- 1 file changed, 85 insertions(+), 78 deletions(-) diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py index ed539baad9..a2d08f5f43 100644 --- a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -18,81 +18,6 @@ CREDENTIALS_KEY = "credentials" 
-def _fetch_credentials(credentials_name: str, credentials: dict[str, Any]) -> Any: - """Fetch the specified credentials from the provided credentials dictionary. - - Args: - credentials_name: Credentials name. - credentials: A dictionary with all credentials. - - Returns: - The set of requested credentials. - - Raises: - KeyError: When a data set with the given name has not yet been - registered. - - """ - try: - return credentials[credentials_name] - except KeyError as exc: - raise KeyError( - f"Unable to find credentials '{credentials_name}': check your data " - "catalog and credentials configuration. See " - "https://kedro.readthedocs.io/en/stable/kedro.io.DataCatalog.html " - "for an example." - ) from exc - - -def _resolve_credentials( - config: dict[str, Any], credentials: dict[str, Any] -) -> dict[str, Any]: - """Return the dataset configuration where credentials are resolved using - credentials dictionary provided. - - Args: - config: Original dataset config, which may contain unresolved credentials. - credentials: A dictionary with all credentials. - - Returns: - The dataset config, where all the credentials are successfully resolved. - """ - config = copy.deepcopy(config) - - def _resolve_value(key: str, value: Any) -> Any: - if key == CREDENTIALS_KEY and isinstance(value, str): - return _fetch_credentials(value, credentials) - if isinstance(value, dict): - return {k: _resolve_value(k, v) for k, v in value.items()} - return value - - return {k: _resolve_value(k, v) for k, v in config.items()} - - -def _resolve_dataset_config( - ds_name: str, - pattern: str, - config: Any, -) -> Any: - """Resolve dataset configuration based on the provided pattern.""" - resolved_vars = parse(pattern, ds_name) - # Resolve the factory config for the dataset - if isinstance(config, dict): - for key, value in config.items(): - config[key] = _resolve_dataset_config(ds_name, pattern, value) - elif isinstance(config, (list, tuple)): - config = [_resolve_dataset_config(ds_name, pattern, value) for value in config] - elif isinstance(config, str) and "}" in config: - try: - config = config.format_map(resolved_vars.named) - except KeyError as exc: - raise KeyError( - f"Unable to resolve '{config}' from the pattern '{pattern}'. Keys used in the configuration " - f"should be present in the dataset factory pattern." - ) from exc - return config - - class CatalogConfigResolver: """Resolves dataset configurations based on patterns and credentials.""" @@ -153,6 +78,84 @@ def _sort_patterns(cls, dataset_patterns: Patterns) -> Patterns: ) return {key: dataset_patterns[key] for key in sorted_keys} + @staticmethod + def _fetch_credentials(credentials_name: str, credentials: dict[str, Any]) -> Any: + """Fetch the specified credentials from the provided credentials dictionary. + + Args: + credentials_name: Credentials name. + credentials: A dictionary with all credentials. + + Returns: + The set of requested credentials. + + Raises: + KeyError: When a data set with the given name has not yet been + registered. + + """ + try: + return credentials[credentials_name] + except KeyError as exc: + raise KeyError( + f"Unable to find credentials '{credentials_name}': check your data " + "catalog and credentials configuration. See " + "https://kedro.readthedocs.io/en/stable/kedro.io.DataCatalog.html " + "for an example." 
+ ) from exc + + @classmethod + def _resolve_credentials( + cls, config: dict[str, Any], credentials: dict[str, Any] + ) -> dict[str, Any]: + """Return the dataset configuration where credentials are resolved using + credentials dictionary provided. + + Args: + config: Original dataset config, which may contain unresolved credentials. + credentials: A dictionary with all credentials. + + Returns: + The dataset config, where all the credentials are successfully resolved. + """ + config = copy.deepcopy(config) + + def _resolve_value(key: str, value: Any) -> Any: + if key == CREDENTIALS_KEY and isinstance(value, str): + return cls._fetch_credentials(value, credentials) + if isinstance(value, dict): + return {k: _resolve_value(k, v) for k, v in value.items()} + return value + + return {k: _resolve_value(k, v) for k, v in config.items()} + + @classmethod + def _resolve_dataset_config( + cls, + ds_name: str, + pattern: str, + config: Any, + ) -> Any: + """Resolve dataset configuration based on the provided pattern.""" + resolved_vars = parse(pattern, ds_name) + # Resolve the factory config for the dataset + if isinstance(config, dict): + for key, value in config.items(): + config[key] = cls._resolve_dataset_config(ds_name, pattern, value) + elif isinstance(config, (list, tuple)): + config = [ + cls._resolve_dataset_config(ds_name, pattern, value) for value in config + ] + elif isinstance(config, str) and "}" in config: + try: + config = config.format_map(resolved_vars.named) + except KeyError as exc: + raise KeyError( + f"Unable to resolve '{config}' from the pattern '{pattern}'. Keys used in the configuration " + f"should be present in the dataset factory pattern." + ) from exc + return config + def list_patterns(self) -> list[str]: """List al patterns available in the catalog.""" return ( @@ -181,7 +184,9 @@ def _extract_patterns( for ds_name, ds_config in config.items(): if cls.is_pattern(ds_name): - dataset_patterns[ds_name] = _resolve_credentials(ds_config, credentials) + dataset_patterns[ds_name] = cls._resolve_credentials( + ds_config, credentials + ) sorted_patterns = cls._sort_patterns(dataset_patterns) if sorted_patterns: @@ -211,7 +216,9 @@ def _resolve_config_credentials( "make sure that the key is preceded by an underscore." 
) if not self.is_pattern(ds_name): - resolved_configs[ds_name] = _resolve_credentials(ds_config, credentials) + resolved_configs[ds_name] = self._resolve_credentials( + ds_config, credentials + ) return resolved_configs @@ -233,7 +240,7 @@ def resolve_dataset_pattern( or self._runtime_patterns.get(matched_pattern) or {} ) - ds_config = _resolve_dataset_config( + ds_config = self._resolve_dataset_config( ds_name, matched_pattern, config_copy ) From 659c9daedf88d3993816c391f027cf5cd19ed1a4 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 10 Sep 2024 16:16:57 +0100 Subject: [PATCH 65/77] Refactored resolve_dataset_pattern Signed-off-by: Elena Khaustova --- kedro/io/catalog_config_resolver.py | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py index a2d08f5f43..99274095f1 100644 --- a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -170,6 +170,14 @@ def match_pattern(self, ds_name: str) -> str | None: matches = (pattern for pattern in all_patterns if parse(pattern, ds_name)) return next(matches, None) + def _get_pattern_config(self, pattern: str) -> dict[str, Any]: + return ( + self._dataset_patterns.get(pattern) + or self._default_pattern.get(pattern) + or self._runtime_patterns.get(pattern) + or {} + ) + @classmethod def _extract_patterns( cls, @@ -232,16 +240,9 @@ def resolve_dataset_pattern( for ds_name in datasets_lst: matched_pattern = self.match_pattern(ds_name) if matched_pattern and ds_name not in self._resolved_configs: - # If the dataset is a patterned dataset, materialise it and add it to - # the catalog - config_copy = copy.deepcopy( - self._dataset_patterns.get(matched_pattern) - or self._default_pattern.get(matched_pattern) - or self._runtime_patterns.get(matched_pattern) - or {} - ) + pattern_config = self._get_pattern_config(matched_pattern) ds_config = self._resolve_dataset_config( - ds_name, matched_pattern, config_copy + ds_name, matched_pattern, copy.deepcopy(pattern_config) ) if ( @@ -255,10 +256,8 @@ def resolve_dataset_pattern( ds_name, ) resolved_configs.append(ds_config) - elif ds_name in self._resolved_configs: - resolved_configs.append(self._resolved_configs.get(ds_name)) - else: - resolved_configs.append(None) + + resolved_configs.append(self._resolved_configs.get(ds_name, None)) return resolved_configs[0] if isinstance(datasets, str) else resolved_configs From 840b32a7d760bc69393b07921f59e3c7cad9c492 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 10 Sep 2024 17:27:00 +0100 Subject: [PATCH 66/77] Fixed refactored part Signed-off-by: Elena Khaustova --- kedro/io/catalog_config_resolver.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py index 99274095f1..a17b4725a5 100644 --- a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -256,8 +256,8 @@ def resolve_dataset_pattern( ds_name, ) resolved_configs.append(ds_config) - - resolved_configs.append(self._resolved_configs.get(ds_name, None)) + else: + resolved_configs.append(self._resolved_configs.get(ds_name, None)) return resolved_configs[0] if isinstance(datasets, str) else resolved_configs From 77f551c3c4a3a3e3f099f82e519a6c8efc372c8d Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 10 Sep 2024 17:27:34 +0100 Subject: [PATCH 67/77] Changed the order of arguments for DataCatalog constructor Signed-off-by: Elena 
Khaustova --- kedro/io/data_catalog.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index f53a3ea2b9..a195c9e47a 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -113,9 +113,9 @@ def __init__( # noqa: PLR0913 datasets: dict[str, AbstractDataset] | None = None, feed_dict: dict[str, Any] | None = None, dataset_patterns: Patterns | None = None, # Kept for interface compatibility - default_pattern: Patterns | None = None, # Kept for interface compatibility load_versions: dict[str, str] | None = None, save_version: str | None = None, + default_pattern: Patterns | None = None, # Kept for interface compatibility config_resolver: CatalogConfigResolver | None = None, ) -> None: """``DataCatalog`` stores instances of ``AbstractDataset`` @@ -307,9 +307,9 @@ class to be loaded is specified with the key ``type`` and their return cls( datasets=datasets, dataset_patterns=config_resolver._dataset_patterns, - default_pattern=config_resolver._default_pattern, load_versions=load_versions, save_version=save_version, + default_pattern=config_resolver._default_pattern, config_resolver=config_resolver, ) From 6e079a1194f9a6800ba3c59f59bdebea6b865d9f Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 10 Sep 2024 17:29:32 +0100 Subject: [PATCH 68/77] Replaced __getitem__ with .get() Signed-off-by: Elena Khaustova --- kedro/io/data_catalog.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index a195c9e47a..97111e22a9 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -122,7 +122,7 @@ def __init__( # noqa: PLR0913 implementations to provide ``load`` and ``save`` capabilities from anywhere in the program. To use a ``DataCatalog``, you need to instantiate it with a dictionary of data sets. Then it will act as a - single point of reference for your calls, relaying load and saveĀ§ + single point of reference for your calls, relaying load and save functions to the underlying data sets. Args: @@ -285,7 +285,7 @@ class to be loaded is specified with the key ``type`` and their if not config_resolver.is_pattern(ds_name): datasets[ds_name] = AbstractDataset.from_config( ds_name, - config_resolver.config[ds_name], + config_resolver.config.get(ds_name), load_versions.get(ds_name), save_version, ) From 1f7e5f88476a6bd7c8cd92bea151f945f4e2c797 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 10 Sep 2024 17:39:45 +0100 Subject: [PATCH 69/77] Updated catalog commands Signed-off-by: Elena Khaustova --- kedro/framework/cli/catalog.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/kedro/framework/cli/catalog.py b/kedro/framework/cli/catalog.py index f6a58664fb..4001b696f3 100644 --- a/kedro/framework/cli/catalog.py +++ b/kedro/framework/cli/catalog.py @@ -3,7 +3,7 @@ from __future__ import annotations from collections import defaultdict -from itertools import chain +from itertools import chain, filterfalse from typing import TYPE_CHECKING, Any import click @@ -126,11 +126,10 @@ def _map_type_to_datasets( datasets of the specific type as a value. 
""" mapping = defaultdict(list) # type: ignore[var-annotated] - for dataset_name in datasets: - if not is_parameter(dataset_name): - ds_type = datasets_meta[dataset_name].__class__.__name__ - if dataset_name not in mapping[ds_type]: - mapping[ds_type].append(dataset_name) + for dataset_name in filterfalse(is_parameter, datasets): + ds_type = datasets_meta[dataset_name].__class__.__name__ + if dataset_name not in mapping[ds_type]: + mapping[ds_type].append(dataset_name) return mapping @@ -167,13 +166,9 @@ def create_catalog(metadata: ProjectMetadata, pipeline_name: str, env: str) -> N f"'{pipeline_name}' pipeline not found! Existing pipelines: {existing_pipelines}" ) - pipeline_datasets = { - ds_name for ds_name in pipeline.datasets() if not is_parameter(ds_name) - } + pipeline_datasets = set(filterfalse(is_parameter, pipeline.datasets())) - catalog_datasets = { - ds_name for ds_name in context.catalog.list() if not is_parameter(ds_name) - } + catalog_datasets = set(filterfalse(is_parameter, context.catalog.list())) # Datasets that are missing in Data Catalog missing_ds = sorted(pipeline_datasets - catalog_datasets) From 80f0e3d938201527a73e7fed2dbf599148afffb7 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 10 Sep 2024 17:44:48 +0100 Subject: [PATCH 70/77] Moved warm up block outside of the try block Signed-off-by: Elena Khaustova --- kedro/framework/session/session.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kedro/framework/session/session.py b/kedro/framework/session/session.py index 2b13cd1694..caa3553954 100644 --- a/kedro/framework/session/session.py +++ b/kedro/framework/session/session.py @@ -394,11 +394,11 @@ def run( # noqa: PLR0913 run_params=record_data, pipeline=filtered_pipeline, catalog=catalog ) + if isinstance(runner, ThreadRunner): + for ds in filtered_pipeline.datasets(): + if catalog.config_resolver.match_pattern(ds): + _ = catalog._get_dataset(ds) try: - if isinstance(runner, ThreadRunner): - for ds in filtered_pipeline.datasets(): - if catalog.config_resolver.match_pattern(ds): - _ = catalog._get_dataset(ds) run_result = runner.run( filtered_pipeline, catalog, hook_manager, session_id ) From 017cda3ecc562cf21b9f94fd8b008e3eea73da79 Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 10 Sep 2024 17:59:51 +0100 Subject: [PATCH 71/77] Fixed linter Signed-off-by: Elena Khaustova --- kedro/io/data_catalog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index 97111e22a9..f025c52190 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -285,7 +285,7 @@ class to be loaded is specified with the key ``type`` and their if not config_resolver.is_pattern(ds_name): datasets[ds_name] = AbstractDataset.from_config( ds_name, - config_resolver.config.get(ds_name), + config_resolver.config.get(ds_name, {}), load_versions.get(ds_name), save_version, ) From cab6f06e2fc33cdd3a030ff17f80b887e6f1d72a Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Tue, 10 Sep 2024 18:06:52 +0100 Subject: [PATCH 72/77] Removed odd copying Signed-off-by: Elena Khaustova --- kedro/io/catalog_config_resolver.py | 9 ++++----- kedro/io/data_catalog.py | 3 +-- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py index a17b4725a5..f3548e4dd5 100644 --- a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -185,8 +185,8 @@ def _extract_patterns( credentials: 
From 017cda3ecc562cf21b9f94fd8b008e3eea73da79 Mon Sep 17 00:00:00 2001
From: Elena Khaustova
Date: Tue, 10 Sep 2024 17:59:51 +0100
Subject: [PATCH 71/77] Fixed linter

Signed-off-by: Elena Khaustova

---
 kedro/io/data_catalog.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py
index 97111e22a9..f025c52190 100644
--- a/kedro/io/data_catalog.py
+++ b/kedro/io/data_catalog.py
@@ -285,7 +285,7 @@ class to be loaded is specified with the key ``type`` and their
             if not config_resolver.is_pattern(ds_name):
                 datasets[ds_name] = AbstractDataset.from_config(
                     ds_name,
-                    config_resolver.config.get(ds_name),
+                    config_resolver.config.get(ds_name, {}),
                     load_versions.get(ds_name),
                     save_version,
                 )

From cab6f06e2fc33cdd3a030ff17f80b887e6f1d72a Mon Sep 17 00:00:00 2001
From: Elena Khaustova
Date: Tue, 10 Sep 2024 18:06:52 +0100
Subject: [PATCH 72/77] Removed odd copying

Signed-off-by: Elena Khaustova

---
 kedro/io/catalog_config_resolver.py | 9 ++++-----
 kedro/io/data_catalog.py            | 3 +--
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py
index a17b4725a5..f3548e4dd5 100644
--- a/kedro/io/catalog_config_resolver.py
+++ b/kedro/io/catalog_config_resolver.py
@@ -185,8 +185,8 @@ def _extract_patterns(
         credentials: dict[str, dict[str, Any]] | None,
     ) -> tuple[Patterns, Patterns]:
         """Extract and sort patterns from the configuration."""
-        config = copy.deepcopy(config) or {}
-        credentials = copy.deepcopy(credentials) or {}
+        config = config or {}
+        credentials = credentials or {}
         dataset_patterns = {}
         user_default = {}
 
@@ -211,9 +211,8 @@ def _resolve_config_credentials(
         credentials: dict[str, dict[str, Any]] | None,
     ) -> dict[str, dict[str, Any]]:
         """Initialize the dataset configuration with resolved credentials."""
-        # TODO: check if deep copies are required
-        config = copy.deepcopy(config) or {}
-        credentials = copy.deepcopy(credentials) or {}
+        config = config or {}
+        credentials = credentials or {}
         resolved_configs = {}
 
         for ds_name, ds_config in config.items():
diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py
index f025c52190..2b09c35e80 100644
--- a/kedro/io/data_catalog.py
+++ b/kedro/io/data_catalog.py
@@ -7,7 +7,6 @@
 
 from __future__ import annotations
 
-import copy
 import difflib
 import logging
 import pprint
@@ -279,7 +278,7 @@ class to be loaded is specified with the key ``type`` and their
         datasets = {}
         config_resolver = CatalogConfigResolver(catalog, credentials)
         save_version = save_version or generate_timestamp()
-        load_versions = copy.deepcopy(load_versions) or {}
+        load_versions = load_versions or {}
 
         for ds_name in catalog:

From 8f604d1da1428210b1792585bb1ab57a5de9b5b6 Mon Sep 17 00:00:00 2001
From: Elena Khaustova
Date: Wed, 11 Sep 2024 10:47:44 +0100
Subject: [PATCH 73/77] Updated release notes

Signed-off-by: Elena Khaustova

---
 RELEASE.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/RELEASE.md b/RELEASE.md
index 34e75ffb74..548b49a109 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -1,6 +1,8 @@
 # Upcoming Release
 
 ## Major features and improvements
+* Refactored `kedro run` and `kedro catalog` commands.
+* Moved pattern resolution logic from `DataCatalog` to a separate component - `CatalogConfigResolver`. Updated `DataCatalog` to use `CatalogConfigResolver` internally.
 * Made packaged Kedro projects return `session.run()` output to be used when running it in the interactive environment.
 * Enhanced `OmegaConfigLoader` configuration validation to detect duplicate keys at all parameter levels, ensuring comprehensive nested key checking.
 
 ## Bug fixes and other changes
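PATCH 72 deletes `copy.deepcopy` calls on inputs that are only ever read, never mutated — copying there cost time and memory without protecting anything. The copy that does matter survives: `resolve_dataset_pattern` still deep-copies a pattern's config before formatting placeholders into it. A sketch of the distinction, with hypothetical configs:

    import copy

    pattern_config = {"type": "pandas.CSVDataset", "filepath": "data/{name}.csv"}

    # Read-only traversal: a deepcopy here is pure overhead.
    types_seen = {pattern_config["type"]}
    assert types_seen == {"pandas.CSVDataset"}

    # Mutate-before-use: copy first so the shared pattern stays pristine.
    resolved = copy.deepcopy(pattern_config)
    resolved["filepath"] = resolved["filepath"].format(name="cars")

    assert pattern_config["filepath"] == "data/{name}.csv"  # untouched
    assert resolved["filepath"] == "data/cars.csv"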
             )
         return {key: dataset_patterns[key] for key in sorted_keys}
@@ -150,7 +150,7 @@ def _resolve_dataset_config(
         try:
             config = config.format_map(resolved_vars.named)
         except KeyError as exc:
-            raise KeyError(
+            raise DatasetError(
                 f"Unable to resolve '{config}' from the pattern '{pattern}'. Keys used in the configuration "
                 f"should be present in the dataset factory pattern."
             ) from exc
diff --git a/tests/io/test_data_catalog.py b/tests/io/test_data_catalog.py
index be8ed0831e..db777cc634 100644
--- a/tests/io/test_data_catalog.py
+++ b/tests/io/test_data_catalog.py
@@ -925,7 +925,7 @@ def test_multiple_catch_all_patterns_not_allowed(
         }
 
         with pytest.raises(
-            ValueError, match="Multiple catch-all patterns found in the catalog"
+            DatasetError, match="Multiple catch-all patterns found in the catalog"
        ):
             DataCatalog.from_config(**config_with_dataset_factories)
 
@@ -1019,7 +1019,7 @@ def test_unmatched_key_error_when_parsing_config(
             "Unable to resolve 'data/01_raw/{brand}_plane.pq' from the pattern '{type}@planes'. "
             "Keys used in the configuration should be present in the dataset factory pattern."
         )
-        with pytest.raises(KeyError, match=re.escape(pattern)):
+        with pytest.raises(DatasetError, match=re.escape(pattern)):
             catalog._get_dataset("jet@planes")

From 0a6946ab4032ffb51c74c93e1bd35784cecddbb9 Mon Sep 17 00:00:00 2001
From: Elena Khaustova
Date: Wed, 11 Sep 2024 14:13:05 +0100
Subject: [PATCH 75/77] Added _dataset_patterns and _default_pattern to
 _config_resolver to avoid breaking change

Signed-off-by: Elena Khaustova

---
 kedro/io/data_catalog.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py
index 2b09c35e80..475c18a148 100644
--- a/kedro/io/data_catalog.py
+++ b/kedro/io/data_catalog.py
@@ -160,6 +160,12 @@ def __init__(  # noqa: PLR0913
             >>> catalog = DataCatalog(datasets={'cars': cars})
         """
         self._config_resolver = config_resolver or CatalogConfigResolver()
+
+        # Kept to avoid breaking changes
+        if not config_resolver:
+            self._config_resolver._dataset_patterns = dataset_patterns or {}
+            self._config_resolver._default_pattern = default_pattern or {}
+
         self._datasets: dict[str, AbstractDataset] = {}
         self.datasets: _FrozenDatasets | None = None
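PATCH 74 converts the stray `ValueError`/`KeyError` raises into `DatasetError`, so catalog failures surface through one exception family and the tests assert on a single type; PATCH 75 then writes the legacy `dataset_patterns`/`default_pattern` arguments into the resolver's private attributes so older callers keep working. The shape of the unified-error contract, sketched with stand-in classes rather than the kedro hierarchy:

    import re

    class DatasetError(Exception):
        """Stand-in for kedro.io.core.DatasetError."""

    def sort_patterns(patterns: list) -> list:
        # Specificity 0 == nothing left outside curly braces (a catch-all).
        catch_all = [p for p in patterns if not re.sub(r"\{.*?\}", "", p)]
        if len(catch_all) > 1:
            # Raising the domain error instead of ValueError keeps the
            # public contract uniform, as PATCH 74 does.
            raise DatasetError(
                f"Multiple catch-all patterns found in the catalog: {', '.join(catch_all)}"
            )
        return sorted(patterns)

    assert sort_patterns(["{default}", "{a}_{b}"])  # one catch-all: fine
    try:
        sort_patterns(["{default}", "{dataset}"])  # two: rejected
    except DatasetError:
        pass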
"DefaultDataset")].append( ds_name ) @@ -253,7 +253,7 @@ def resolve_patterns(metadata: ProjectMetadata, env: str) -> None: ds_config = data_catalog.config_resolver.resolve_dataset_pattern(ds_name) # Exclude MemoryDatasets not set in the catalog explicitly - if ds_config is not None: + if ds_config: explicit_datasets[ds_name] = ds_config secho(yaml.dump(explicit_datasets)) diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py index ab679f1e4b..91218d030c 100644 --- a/kedro/io/catalog_config_resolver.py +++ b/kedro/io/catalog_config_resolver.py @@ -229,36 +229,29 @@ def _resolve_config_credentials( return resolved_configs - def resolve_dataset_pattern( - self, datasets: str | list[str] - ) -> dict[str, Any] | list[dict[str, Any]]: + def resolve_dataset_pattern(self, ds_name: str) -> dict[str, Any]: """Resolve dataset patterns and return resolved configurations based on the existing patterns.""" - datasets_lst = [datasets] if isinstance(datasets, str) else datasets - resolved_configs = [] - - for ds_name in datasets_lst: - matched_pattern = self.match_pattern(ds_name) - if matched_pattern and ds_name not in self._resolved_configs: - pattern_config = self._get_pattern_config(matched_pattern) - ds_config = self._resolve_dataset_config( - ds_name, matched_pattern, copy.deepcopy(pattern_config) + matched_pattern = self.match_pattern(ds_name) + + if matched_pattern and ds_name not in self._resolved_configs: + pattern_config = self._get_pattern_config(matched_pattern) + ds_config = self._resolve_dataset_config( + ds_name, matched_pattern, copy.deepcopy(pattern_config) + ) + + if ( + self._pattern_specificity(matched_pattern) == 0 + and matched_pattern in self._default_pattern + ): + self._logger.warning( + "Config from the dataset factory pattern '%s' in the catalog will be used to " + "override the default dataset creation for '%s'", + matched_pattern, + ds_name, ) + return ds_config - if ( - self._pattern_specificity(matched_pattern) == 0 - and matched_pattern in self._default_pattern - ): - self._logger.warning( - "Config from the dataset factory pattern '%s' in the catalog will be used to " - "override the default dataset creation for '%s'", - matched_pattern, - ds_name, - ) - resolved_configs.append(ds_config) - else: - resolved_configs.append(self._resolved_configs.get(ds_name, None)) - - return resolved_configs[0] if isinstance(datasets, str) else resolved_configs + return self._resolved_configs.get(ds_name, {}) def add_runtime_patterns(self, dataset_patterns: Patterns) -> None: """Add new runtime patterns and re-sort them.""" diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index 475c18a148..420f8857c8 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -326,10 +326,10 @@ def _get_dataset( ) -> AbstractDataset: ds_config = self._config_resolver.resolve_dataset_pattern(dataset_name) - if dataset_name not in self._datasets and ds_config is not None: + if dataset_name not in self._datasets and ds_config: ds = AbstractDataset.from_config( dataset_name, - ds_config, # type: ignore[arg-type] + ds_config, self._load_versions.get(dataset_name), self._save_version, ) From f5a7992a57b5407da8504be84eaf2a6888bce84f Mon Sep 17 00:00:00 2001 From: Elena Khaustova Date: Wed, 11 Sep 2024 15:34:40 +0100 Subject: [PATCH 77/77] Fixed linter Signed-off-by: Elena Khaustova --- kedro/io/catalog_config_resolver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro/io/catalog_config_resolver.py 
From f5a7992a57b5407da8504be84eaf2a6888bce84f Mon Sep 17 00:00:00 2001
From: Elena Khaustova
Date: Wed, 11 Sep 2024 15:34:40 +0100
Subject: [PATCH 77/77] Fixed linter

Signed-off-by: Elena Khaustova

---
 kedro/io/catalog_config_resolver.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kedro/io/catalog_config_resolver.py b/kedro/io/catalog_config_resolver.py
index 91218d030c..97ffbadd5f 100644
--- a/kedro/io/catalog_config_resolver.py
+++ b/kedro/io/catalog_config_resolver.py
@@ -249,7 +249,7 @@ def resolve_dataset_pattern(self, ds_name: str) -> dict[str, Any]:
                     matched_pattern,
                     ds_name,
                 )
-            return ds_config
+            return ds_config  # type: ignore[no-any-return]
 
         return self._resolved_configs.get(ds_name, {})
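The final lint fix silences mypy's `no-any-return`: `_resolve_dataset_config` is effectively `Any`-returning at this point, so passing its result straight out of a function annotated `-> dict[str, Any]` trips the check. The `# type: ignore` is the minimal patch; the alternative is to annotate the helper itself. Both options, sketched with simplified signatures rather than the kedro ones:

    from typing import Any

    def _resolve(name, pattern, config):  # untyped helper -> returns Any
        return config

    def resolve_ignored(name: str) -> dict[str, Any]:
        # Passing an Any value through a typed return triggers
        # [no-any-return]; the ignore suppresses it locally.
        return _resolve(name, "{x}", {})  # type: ignore[no-any-return]

    def _resolve_typed(
        name: str, pattern: str, config: dict[str, Any]
    ) -> dict[str, Any]:
        return config

    def resolve_clean(name: str) -> dict[str, Any]:
        return _resolve_typed(name, "{x}", {})  # no ignore needed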