Skip to content

Commit

Permalink
Gracefully handle non-ASCII chars in dataset names 🌶️ (#487)
Browse files Browse the repository at this point in the history
  • Loading branch information
deepyaman committed Sep 10, 2020
1 parent 5ba5940 commit 93e4cc3
Show file tree
Hide file tree
Showing 4 changed files with 25 additions and 23 deletions.
5 changes: 3 additions & 2 deletions RELEASE.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,18 @@

## Thanks for supporting contributions


# Upcoming 0.16.6 release

## Major features and improvements

## Bug fixes and other changes
* Improved handling of non-ASCII word characters in dataset names.
- For example, a dataset named `jalapeño` will be accessible as `DataCatalog.datasets.jalapeño` rather than `DataCatalog.datasets.jalape__o`.

## Breaking changes to the API

## Thanks for supporting contributions

[Deepyaman Datta](https://github.com/deepyaman)

# Release 0.16.5

Expand Down
21 changes: 15 additions & 6 deletions kedro/io/data_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,16 +110,25 @@ def _map_value(key: str, value: Any) -> Any:
return {k: _map_value(k, v) for k, v in config.items()}


def _sub_nonword_chars(data_set_name: str) -> str:
"""Replace non-word characters in data set names since Kedro 0.16.2.
Args:
data_set_name: The data set name registered in the data catalog.
Returns:
The name used in `DataCatalog.datasets`.
"""
return re.sub(r"\W+", "__", data_set_name)


class _FrozenDatasets:
"""Helper class to access underlying loaded datasets"""

def __init__(self, datasets):
# Non-alphanumeric characters (except underscore) in dataset name
# are replaced with `__`, for easy access to transcoded/prefixed datasets.
datasets = {
re.sub("[^0-9a-zA-Z_]+", "__", key): value
for key, value in datasets.items()
}
# Non-word characters in dataset names are replaced with `__`
# for easy access to transcoded/prefixed datasets.
datasets = {_sub_nonword_chars(key): value for key, value in datasets.items()}
self.__dict__.update(**datasets)

# Don't allow users to add/change attributes on the fly
Expand Down
11 changes: 7 additions & 4 deletions tests/io/test_data_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -625,15 +625,18 @@ def test_load_version_on_unversioned_dataset(
with pytest.raises(DataSetError):
catalog.load("boats", version="first")

def test_replacing_non_alphanumeric_characters(self):
"""Test replacing non alphanumeric characters in datasets names"""
def test_replacing_nonword_characters(self):
"""Test replacing non-word characters in dataset names"""
csv = CSVDataSet(filepath="abc.csv")
datasets = {"ds1@spark": csv, "ds2_spark": csv, "ds3.csv": csv}
datasets = {"ds1@spark": csv, "ds2_spark": csv, "ds3.csv": csv, "jalapeño": csv}

catalog = DataCatalog(data_sets=datasets)
assert "ds1@spark" not in catalog.datasets.__dict__
assert "ds2__spark" not in catalog.datasets.__dict__
assert "ds3.csv" not in catalog.datasets.__dict__
assert "jalape__o" not in catalog.datasets.__dict__

assert "ds2_spark" in catalog.datasets.__dict__
assert "ds1__spark" in catalog.datasets.__dict__
assert "ds2_spark" in catalog.datasets.__dict__
assert "ds3__csv" in catalog.datasets.__dict__
assert "jalapeño" in catalog.datasets.__dict__
11 changes: 0 additions & 11 deletions tools/license_and_headers.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,17 +50,6 @@ def files_missing_substring(file_names, substring):
if content.strip() and substring not in content:
yield file_name

# In some locales Python 3.5 on Windows can't deal with non ascii chars in source files
try:
content.encode("ascii")
except UnicodeError as e:
print(
"Non ascii characters in {} after '{}'".format(
file_name, content[e.start - 30 : e.start]
)
)
yield file_name


def main():
with open(LICENSE_MD) as header_f:
Expand Down

0 comments on commit 93e4cc3

Please sign in to comment.