Skip to content

Commit

Permalink
Avoid exponential call to rewrite dataset names when creating `_Froze…
Browse files Browse the repository at this point in the history
…nDatasets` (#953)
  • Loading branch information
limdauto authored Oct 26, 2021
1 parent f333099 commit 8b06dc6
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 7 deletions.
1 change: 1 addition & 0 deletions RELEASE.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
* Upgraded `pip-tools`, which is used by `kedro build-reqs`, to 6.4. This `pip-tools` version requires `pip>=21.2` while [adding support for `pip>=21.3`](https://github.com/jazzband/pip-tools/pull/1501). To upgrade `pip`, please refer to [their documentation](https://pip.pypa.io/en/stable/installing/#upgrading-pip).
* Relaxed the bounds on the `plotly` requirement for `plotly.PlotlyDataSet`.
* `kedro pipeline package <pipeline>` now raises an error if the `<pipeline>` argument doesn't look like a valid Python module path (e.g. has `/` instead of `.`).
* Fixed slow startup because of catalog processing by reducing the exponential growth of extra processing during `_FrozenDatasets` creations.

## Minor breaking changes to the API

Expand Down
29 changes: 22 additions & 7 deletions kedro/io/data_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@

CATALOG_KEY = "catalog"
CREDENTIALS_KEY = "credentials"
WORDS_REGEX_PATTERN = re.compile(r"\W+")


def _get_credentials(
Expand Down Expand Up @@ -120,17 +121,31 @@ def _sub_nonword_chars(data_set_name: str) -> str:
Returns:
The name used in `DataCatalog.datasets`.
"""
return re.sub(r"\W+", "__", data_set_name)
return re.sub(WORDS_REGEX_PATTERN, "__", data_set_name)


class _FrozenDatasets:
"""Helper class to access underlying loaded datasets"""

def __init__(self, datasets):
# Non-word characters in dataset names are replaced with `__`
# for easy access to transcoded/prefixed datasets.
datasets = {_sub_nonword_chars(key): value for key, value in datasets.items()}
self.__dict__.update(**datasets)
def __init__(
self,
*datasets_collections: Union["_FrozenDatasets", Dict[str, AbstractDataSet]],
):
"""Return a _FrozenDatasets instance from some datasets collections.
Each collection could either be another _FrozenDatasets or a dictionary.
"""
for collection in datasets_collections:
if isinstance(collection, _FrozenDatasets):
self.__dict__.update(collection.__dict__)
else:
# Non-word characters in dataset names are replaced with `__`
# for easy access to transcoded/prefixed datasets.
self.__dict__.update(
{
_sub_nonword_chars(dataset_name): dataset
for dataset_name, dataset in collection.items()
}
)

# Don't allow users to add/change attributes on the fly
def __setattr__(self, key, value):
Expand Down Expand Up @@ -524,7 +539,7 @@ def add(
)
self._data_sets[data_set_name] = data_set
self._transformers[data_set_name] = list(self._default_transformers)
self.datasets = _FrozenDatasets(self._data_sets)
self.datasets = _FrozenDatasets(self.datasets, {data_set_name: data_set})

def add_all(
self, data_sets: Dict[str, AbstractDataSet], replace: bool = False
Expand Down
12 changes: 12 additions & 0 deletions tests/io/test_data_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,6 +302,18 @@ def test_adding_datasets_not_allowed(self, data_catalog_from_config):
with pytest.raises(AttributeError, match=pattern):
data_catalog_from_config.datasets.new_dataset = None

def test_add_feed_dict_should_grow_linearly(self, mocker, data_catalog_from_config):
"""Check number of calls to `_sub_nonword_chars` when adding feed dict
should grow linearly with the number of keys in the dict.
Simulate this issue: https://github.com/quantumblacklabs/kedro/issues/951
"""
mock_sub_nonword_chars = mocker.patch(
"kedro.io.data_catalog._sub_nonword_chars"
)
feed_dict = {"key1": "val1", "key2": "val2", "key3": "val3", "key4": "val4"}
data_catalog_from_config.add_feed_dict(feed_dict)
assert mock_sub_nonword_chars.call_count == len(feed_dict)

def test_mutating_datasets_not_allowed(self, data_catalog_from_config):
"""Check error if user tries to update the datasets attribute"""
pattern = "Please change datasets through configuration."
Expand Down

0 comments on commit 8b06dc6

Please sign in to comment.