kedro-org · ankatiyar · Jul 6, 2023 · May 22, 2023 · May 23, 2023 · Jun 2, 2023
@@ -13,6 +13,7 @@ importlib_resources>=1.3  # The `files()` API was introduced in `importlib_resou
 jmespath>=0.9.5, <1.0
 more_itertools~=9.0
 omegaconf~=2.3
+parse~=1.19.0
 pip-tools~=6.5
 pluggy~=1.0.0
 PyYAML>=4.2, <7.0

@@ -11,7 +11,9 @@
 import logging
 import re
 from collections import defaultdict
-from typing import Any
+from typing import Any, Iterable
+
+from parse import parse
 
 from kedro.io.core import (
     AbstractDataSet,
@@ -94,6 +96,20 @@ def _sub_nonword_chars(data_set_name: str) -> str:
     return re.sub(WORDS_REGEX_PATTERN, "__", data_set_name)
 
 
+def _specificity(pattern: str) -> int:
+    """Return the number of literal characters in a dataset factory pattern.
+
+    Only the characters outside ``{...}`` placeholders are counted.
+
+    Example -
+    _specificity("{namespace}.companies") = 10
+    _specificity("{namespace}.{dataset}") = 1
+    _specificity("france.companies") = 16
+
+    Args:
+        pattern: The factory pattern
+
+    Returns:
+        The combined length of the text that remains once every placeholder
+        has been taken out of the pattern.
+    """
+    # Splitting on the placeholders leaves just the literal fragments between them.
+    literal_fragments = re.split(r"\{.*?\}", pattern)
+    return sum(len(fragment) for fragment in literal_fragments)
+
+
 class _FrozenDatasets:
     """Helper class to access underlying loaded datasets"""
 
@@ -141,6 +157,7 @@ def __init__(
         data_sets: dict[str, AbstractDataSet] = None,
         feed_dict: dict[str, Any] = None,
         layers: dict[str, set[str]] = None,
+        dataset_patterns: dict[str, Any] = None,
     ) -> None:
         """``DataCatalog`` stores instances of ``AbstractDataSet``
         implementations to provide ``load`` and ``save`` capabilities from
@@ -170,7 +187,23 @@ def __init__(
         self._data_sets = dict(data_sets or {})
         self.datasets = _FrozenDatasets(self._data_sets)
         self.layers = layers
-
+        # Keep a record of all patterns in the catalog.
+        # {dataset pattern name : dataset pattern body}
+        self.dataset_patterns = dict(dataset_patterns or {})
+        # Sort all the patterns according to the parsing rules -
+        # 1. Decreasing specificity (no of characters outside the brackets)
+        # 2. Decreasing number of placeholders (no of curly brackets)
+        # 3. Alphabetical
+        self._sorted_dataset_patterns = sorted(
+            self.dataset_patterns.keys(),
+            key=lambda pattern: (
+                -(_specificity(pattern)),
+                -pattern.count("{"),
+                pattern,
+            ),
+        )
+        # Cache that stores {name : matched_pattern}
+        self._pattern_matches_cache: dict[str, str] = {}
         # import the feed dict
         if feed_dict:
             self.add_feed_dict(feed_dict)
@@ -257,6 +290,7 @@ class to be loaded is specified with the key ``type`` and their
             >>> catalog.save("boats", df)
         """
         data_sets = {}
+        dataset_patterns = {}
         catalog = copy.deepcopy(catalog) or {}
         credentials = copy.deepcopy(credentials) or {}
         save_version = save_version or generate_timestamp()
@@ -271,35 +305,54 @@ class to be loaded is specified with the key ``type`` and their
 
         layers: dict[str, set[str]] = defaultdict(set)
         for ds_name, ds_config in catalog.items():
-            ds_layer = ds_config.pop("layer", None)
-            if ds_layer is not None:
-                layers[ds_layer].add(ds_name)
-
-            ds_config = _resolve_credentials(ds_config, credentials)
-            data_sets[ds_name] = AbstractDataSet.from_config(
-                ds_name, ds_config, load_versions.get(ds_name), save_version
-            )
+            # Assume that any name with } in it is a dataset factory to be matched.
+            if "}" in ds_name:
+                # Add each factory to the dataset_patterns dict.
+                dataset_patterns[ds_name] = ds_config
+            else:
+                ds_layer = ds_config.pop("layer", None)
+                if ds_layer is not None:
+                    layers[ds_layer].add(ds_name)
 
+                ds_config = _resolve_credentials(ds_config, credentials)
+                data_sets[ds_name] = AbstractDataSet.from_config(
+                    ds_name, ds_config, load_versions.get(ds_name), save_version
+                )
         dataset_layers = layers or None
-        return cls(data_sets=data_sets, layers=dataset_layers)
+        return cls(
+            data_sets=data_sets,
+            layers=dataset_layers,
+            dataset_patterns=dataset_patterns,
+        )
 
     def _get_dataset(
         self, data_set_name: str, version: Version = None, suggest: bool = True
     ) -> AbstractDataSet:
         if data_set_name not in self._data_sets:
-            error_msg = f"DataSet '{data_set_name}' not found in the catalog"
-
-            # Flag to turn on/off fuzzy-matching which can be time consuming and
-            # slow down plugins like `kedro-viz`
-            if suggest:
-                matches = difflib.get_close_matches(
-                    data_set_name, self._data_sets.keys()
-                )
-                if matches:
-                    suggestions = ", ".join(matches)
-                    error_msg += f" - did you mean one of these instead: {suggestions}"
-
-            raise DataSetNotFoundError(error_msg)
+            # When a dataset is "used" in the pipeline that's not in the recorded catalog datasets,
+            # try to match it against the data factories in the catalog. If it's a match,
+            # resolve it to a dataset instance and add it to the catalog, so it only needs
+            # to be matched once and not every time the dataset is used in the pipeline.
+            if self.exists_in_catalog_config(data_set_name):
+                pattern = self._pattern_matches_cache[data_set_name]
+                matched_dataset = self._resolve_dataset(data_set_name, pattern)
+                self.add(data_set_name, matched_dataset)
+            else:
+                error_msg = f"DataSet '{data_set_name}' not found in the catalog"
+
+                # Flag to turn on/off fuzzy-matching which can be time consuming and
+                # slow down plugins like `kedro-viz`
+                if suggest:
+                    matches = difflib.get_close_matches(
+                        data_set_name, self._data_sets.keys()
+                    )
+                    if matches:
+                        suggestions = ", ".join(matches)
+                        error_msg += (
+                            f" - did you mean one of these instead: {suggestions}"
+                        )
+
+                raise DataSetNotFoundError(error_msg)
 
         data_set = self._data_sets[data_set_name]
         if version and isinstance(data_set, AbstractVersionedDataSet):
@@ -311,6 +364,28 @@ def _get_dataset(
 
         return data_set
 
+    def _resolve_dataset(
+        self, dataset_name: str, matched_pattern: str
+    ) -> AbstractDataSet:
+        """Build the dataset that ``dataset_name`` resolves to under a factory pattern.
+
+        The placeholder values are extracted by parsing ``dataset_name`` against
+        ``matched_pattern`` and substituted into every string found in a deep copy
+        of the pattern's config, including strings nested inside dicts and lists.
+        Non-string values are left untouched.
+
+        Args:
+            dataset_name: The concrete dataset name used in the pipeline.
+            matched_pattern: Key of ``self.dataset_patterns`` that the name matched.
+
+        Returns:
+            The dataset created by ``AbstractDataSet.from_config`` from the
+            resolved config.
+
+        Raises:
+            DataSetError: If ``dataset_name`` does not match ``matched_pattern``, or
+                if the config uses a placeholder that the pattern does not define.
+        """
+        result = parse(matched_pattern, dataset_name)
+        if result is None:
+            # Without a match there are no placeholder values to substitute.
+            raise DataSetError(
+                f"DataSet '{dataset_name}' does not match the pattern "
+                f"'{matched_pattern}'"
+            )
+        # result.named: gives access to all dict items in the match result.
+        placeholders = result.named
+
+        def _fill(value: Any) -> Any:
+            """Substitute placeholders in ``value``, descending into dicts and lists."""
+            if isinstance(value, dict):
+                return {key: _fill(item) for key, item in value.items()}
+            if isinstance(value, list):
+                return [_fill(item) for item in value]
+            if isinstance(value, str) and "}" in value:
+                # format_map fills in dict values into a string with {...}
+                # placeholders of the same key name.
+                return value.format_map(placeholders)
+            # Numbers, booleans, None, ... carry no placeholders.
+            return value
+
+        template_copy = copy.deepcopy(self.dataset_patterns[matched_pattern])
+        # Resolve the factory config for the dataset
+        for key, value in template_copy.items():
+            try:
+                template_copy[key] = _fill(value)
+            except KeyError as exc:
+                raise DataSetError(
+                    f"Unable to resolve '{key}' for the pattern '{matched_pattern}'"
+                ) from exc
+        # Create dataset from catalog template.
+        return AbstractDataSet.from_config(dataset_name, template_copy)
+
     def load(self, name: str, version: str = None) -> Any:
         """Loads a registered data set.
 
@@ -567,16 +642,47 @@ def list(self, regex_search: str | None = None) -> list[str]:
             ) from exc
         return [dset_name for dset_name in self._data_sets if pattern.search(dset_name)]
 
+    def exists_in_catalog_config(self, dataset_name: str) -> bool:
+        """Tell whether ``dataset_name`` is a registered dataset or matches a pattern.
+
+        A name that newly matches a pattern is remembered in
+        ``self._pattern_matches_cache`` so the pattern lookup is not repeated.
+
+        Args:
+            dataset_name: Name of the dataset to look up.
+
+        Returns:
+            True for an exact match, a cached match or a fresh pattern match,
+            otherwise False.
+        """
+        already_known = (
+            dataset_name in self._data_sets
+            or dataset_name in self._pattern_matches_cache
+        )
+        if already_known:
+            return True
+
+        pattern = self.match_name_against_patterns(dataset_name)
+        if not pattern:
+            return False
+
+        # Remember which pattern this name resolved to.
+        self._pattern_matches_cache[dataset_name] = pattern
+        return True
+
+    def match_name_against_patterns(self, dataset_name: str) -> str | None:
+        """Find the first dataset pattern that ``dataset_name`` parses against.
+
+        Patterns are tried in the order of ``self._sorted_dataset_patterns``.
+
+        Args:
+            dataset_name: Name of the dataset to match.
+
+        Returns:
+            The first matching pattern, or None when no pattern matches.
+        """
+        candidates = (
+            candidate
+            for candidate in self._sorted_dataset_patterns
+            if parse(candidate, dataset_name)
+        )
+        # The generator is lazy, so parsing stops at the first successful match.
+        return next(candidates, None)
+
     def shallow_copy(self) -> DataCatalog:
         """Returns a shallow copy of the current object.
 
         Returns:
             Copy of the current object.
         """
-        return DataCatalog(data_sets=self._data_sets, layers=self.layers)
+        # The dataset factory patterns are handed to the copy along with the
+        # registered datasets and layers.
+        return DataCatalog(
+            data_sets=self._data_sets,
+            layers=self.layers,
+            dataset_patterns=self.dataset_patterns,
+        )
 
     def __eq__(self, other):
-        return (self._data_sets, self.layers) == (other._data_sets, other.layers)
+        """Compare two catalogs by their datasets, layers and dataset patterns.
+
+        Returns ``NotImplemented`` when ``other`` is not a ``DataCatalog``.
+        """
+        # An unrelated object has no ``_data_sets``/``layers``/``dataset_patterns``;
+        # defer to Python's default handling instead of raising AttributeError.
+        if not isinstance(other, DataCatalog):
+            return NotImplemented
+        return (self._data_sets, self.layers, self.dataset_patterns) == (
+            other._data_sets,
+            other.layers,
+            other.dataset_patterns,
+        )
 
     def confirm(self, name: str) -> None:
         """Confirm a dataset by its name.

@@ -74,14 +74,35 @@ def run(
         hook_manager = hook_manager or _NullPluginManager()
         catalog = catalog.shallow_copy()
 
-        unsatisfied = pipeline.inputs() - set(catalog.list())
+        # Check which datasets used in the pipeline aren't in the catalog and don't match
+        # a pattern in the catalog
+        unregistered_ds = [
+            ds
+            for ds in pipeline.data_sets()
+            if not catalog.exists_in_catalog_config(ds)
+        ]
+
+        # Check if there are any input datasets that aren't in the catalog and
+        # don't match a pattern in the catalog.
+        unsatisfied = [
+            input_name
+            for input_name in pipeline.inputs()
+            if input_name in unregistered_ds
+        ]
-        # Check if there are any input datasets that aren't in the catalog and
-        # don't match a pattern in the catalog.
-        unsatisfied = [
-            input_name
-            for input_name in pipeline.inputs()
-            if input_name in unregistered_ds
-        ]
+        unsatisfied = pipeline.inputs() - set(registered_ds)
-        # Check if there are any input datasets that aren't in the catalog and
-        # don't match a pattern in the catalog.
-        unsatisfied = [
-            input_name
-            for input_name in pipeline.inputs()
-            if input_name in unregistered_ds
-        ]
+        unsatisfied = pipeline.inputs() - set(registered_ds)
         if unsatisfied:
             raise ValueError(
                 f"Pipeline input(s) {unsatisfied} not found in the DataCatalog"
             )
 
-        free_outputs = pipeline.outputs() - set(catalog.list())
-        unregistered_ds = pipeline.data_sets() - set(catalog.list())
+        # Check if there's any output datasets that aren't in the catalog and don't match a pattern
+        # in the catalog.
+        free_outputs = [
+            output_name
+            for output_name in pipeline.outputs()
+            if output_name in unregistered_ds
+        ]
+
+        # Create a default dataset for unregistered datasets
         for ds_name in unregistered_ds:
             catalog.add(ds_name, self.create_default_data_set(ds_name))