Avoid exponential call to rewrite dataset names when creating `_Froze…

…nDatasets` (#953)
kedro-org · Oct 26, 2021 · 8b06dc6 · 8b06dc6
1 parent f333099
commit 8b06dc6
Show file tree

Hide file tree

Showing 3 changed files with 35 additions and 7 deletions.
diff --git a/RELEASE.md b/RELEASE.md
@@ -19,6 +19,7 @@
 * Upgraded `pip-tools`, which is used by `kedro build-reqs`, to 6.4. This `pip-tools` version requires `pip>=21.2` while [adding support for `pip>=21.3`](https://github.com/jazzband/pip-tools/pull/1501). To upgrade `pip`, please refer to [their documentation](https://pip.pypa.io/en/stable/installing/#upgrading-pip).
 * Relaxed the bounds on the `plotly` requirement for `plotly.PlotlyDataSet`.
 * `kedro pipeline package <pipeline>` now raises an error if the `<pipeline>` argument doesn't look like a valid Python module path (e.g. has `/` instead of `.`).
+* Fixed slow startup because of catalog processing by reducing the exponential growth of extra processing during `_FrozenDatasets` creations.
 
 ## Minor breaking changes to the API
 

diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py
@@ -56,6 +56,7 @@
 
 CATALOG_KEY = "catalog"
 CREDENTIALS_KEY = "credentials"
+WORDS_REGEX_PATTERN = re.compile(r"\W+")
 
 
 def _get_credentials(
@@ -120,17 +121,31 @@ def _sub_nonword_chars(data_set_name: str) -> str:
     Returns:
         The name used in `DataCatalog.datasets`.
     """
-    return re.sub(r"\W+", "__", data_set_name)
+    return re.sub(WORDS_REGEX_PATTERN, "__", data_set_name)
 
 
 class _FrozenDatasets:
     """Helper class to access underlying loaded datasets"""
 
-    def __init__(self, datasets):
-        # Non-word characters in dataset names are replaced with `__`
-        # for easy access to transcoded/prefixed datasets.
-        datasets = {_sub_nonword_chars(key): value for key, value in datasets.items()}
-        self.__dict__.update(**datasets)
+    def __init__(
+        self,
+        *datasets_collections: Union["_FrozenDatasets", Dict[str, AbstractDataSet]],
+    ):
+        """Return a _FrozenDatasets instance from some datasets collections.
+        Each collection could either be another _FrozenDatasets or a dictionary.
+        """
+        for collection in datasets_collections:
+            if isinstance(collection, _FrozenDatasets):
+                self.__dict__.update(collection.__dict__)
+            else:
+                # Non-word characters in dataset names are replaced with `__`
+                # for easy access to transcoded/prefixed datasets.
+                self.__dict__.update(
+                    {
+                        _sub_nonword_chars(dataset_name): dataset
+                        for dataset_name, dataset in collection.items()
+                    }
+                )
 
     # Don't allow users to add/change attributes on the fly
     def __setattr__(self, key, value):
@@ -524,7 +539,7 @@ def add(
                 )
         self._data_sets[data_set_name] = data_set
         self._transformers[data_set_name] = list(self._default_transformers)
-        self.datasets = _FrozenDatasets(self._data_sets)
+        self.datasets = _FrozenDatasets(self.datasets, {data_set_name: data_set})
 
     def add_all(
         self, data_sets: Dict[str, AbstractDataSet], replace: bool = False

diff --git a/tests/io/test_data_catalog.py b/tests/io/test_data_catalog.py
@@ -302,6 +302,18 @@ def test_adding_datasets_not_allowed(self, data_catalog_from_config):
         with pytest.raises(AttributeError, match=pattern):
             data_catalog_from_config.datasets.new_dataset = None
 
+    def test_add_feed_dict_should_grow_linearly(self, mocker, data_catalog_from_config):
+        """Check number of calls to `_sub_nonword_chars` when adding feed dict
+        should grow linearly with the number of keys in the dict.
+        Simulate this issue: https://github.com/quantumblacklabs/kedro/issues/951
+        """
+        mock_sub_nonword_chars = mocker.patch(
+            "kedro.io.data_catalog._sub_nonword_chars"
+        )
+        feed_dict = {"key1": "val1", "key2": "val2", "key3": "val3", "key4": "val4"}
+        data_catalog_from_config.add_feed_dict(feed_dict)
+        assert mock_sub_nonword_chars.call_count == len(feed_dict)
+
     def test_mutating_datasets_not_allowed(self, data_catalog_from_config):
         """Check error if user tries to update the datasets attribute"""
         pattern = "Please change datasets through configuration."