From 5b4266f8605bf85c2bec42b37d85ba1ac942e94a Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Sun, 16 Aug 2020 13:44:35 -0400 Subject: [PATCH 1/9] =?UTF-8?q?Can=20you=20handle=20"jalape=C3=B1o"=3F=20:?= =?UTF-8?q?hot=5Fpepper:?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/io/test_data_catalog.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/io/test_data_catalog.py b/tests/io/test_data_catalog.py index c1055ec65e..3d32a5c3d4 100644 --- a/tests/io/test_data_catalog.py +++ b/tests/io/test_data_catalog.py @@ -628,12 +628,15 @@ def test_load_version_on_unversioned_dataset( def test_replacing_non_alphanumeric_characters(self): """Test replacing non alphanumeric characters in datasets names""" csv = CSVDataSet(filepath="abc.csv") - datasets = {"ds1@spark": csv, "ds2_spark": csv, "ds3.csv": csv} + datasets = {"ds1@spark": csv, "ds2_spark": csv, "ds3.csv": csv, "jalapeño": csv} catalog = DataCatalog(data_sets=datasets) assert "ds1@spark" not in catalog.datasets.__dict__ + assert "ds2__spark" not in catalog.datasets.__dict__ assert "ds3.csv" not in catalog.datasets.__dict__ + assert "jalape__o" not in catalog.datasets.__dict__ - assert "ds2_spark" in catalog.datasets.__dict__ assert "ds1__spark" in catalog.datasets.__dict__ + assert "ds2_spark" in catalog.datasets.__dict__ assert "ds3__csv" in catalog.datasets.__dict__ + assert "jalapeño" in catalog.datasets.__dict__ From ffe807a456bd3f10cd0cedcdfb223eb8f6a1cdb7 Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Sun, 16 Aug 2020 14:04:52 -0400 Subject: [PATCH 2/9] Update data_catalog.py --- kedro/io/data_catalog.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index cf1beb723f..1a039d8edb 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -116,10 +116,7 @@ class _FrozenDatasets: def __init__(self, datasets): # Non-alphanumeric characters (except underscore) in dataset name # are replaced with `__`, for easy access to transcoded/prefixed datasets. - datasets = { - re.sub("[^0-9a-zA-Z_]+", "__", key): value - for key, value in datasets.items() - } + datasets = {re.sub(r"\W+", "__", key): value for key, value in datasets.items()} self.__dict__.update(**datasets) # Don't allow users to add/change attributes on the fly From 9051fb3c826f66f90299b744e64ad42b0b7d0189 Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Sun, 16 Aug 2020 14:24:32 -0400 Subject: [PATCH 3/9] Update license_and_headers.py --- tools/license_and_headers.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/tools/license_and_headers.py b/tools/license_and_headers.py index 7d4f6604f4..863c9d27cb 100644 --- a/tools/license_and_headers.py +++ b/tools/license_and_headers.py @@ -50,17 +50,6 @@ def files_missing_substring(file_names, substring): if content.strip() and substring not in content: yield file_name - # In some locales Python 3.5 on Windows can't deal with non ascii chars in source files - try: - content.encode("ascii") - except UnicodeError as e: - print( - "Non ascii characters in {} after '{}'".format( - file_name, content[e.start - 30 : e.start] - ) - ) - yield file_name - def main(): with open(LICENSE_MD) as header_f: From 3847064f99c83dec5674fc573d4fd3ffa0156c7f Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Tue, 18 Aug 2020 12:27:36 -0400 Subject: [PATCH 4/9] Update RELEASE.md --- RELEASE.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/RELEASE.md b/RELEASE.md index 0ed61dbd54..1269d26859 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -3,10 +3,13 @@ ## Major features and improvements ## Bug fixes and other changes +* Improved handling of non-ASCII word characters in dataset names. + - For example, a dataset named `jalapeño` will be accessible as `DataCatalog.datasets.jalapeño` rather than `DataCatalog.datasets.jalape__o`. ## Breaking changes to the API ## Thanks for supporting contributions +[Deepyaman Datta](https://github.com/deepyaman) # Upcoming Release 0.16.5 From 5b78fca36482a81d558b3ddcd8555604608987e1 Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Tue, 18 Aug 2020 12:31:29 -0400 Subject: [PATCH 5/9] Update data_catalog.py --- kedro/io/data_catalog.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index 1a039d8edb..f67d4eaee7 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -114,8 +114,8 @@ class _FrozenDatasets: """Helper class to access underlying loaded datasets""" def __init__(self, datasets): - # Non-alphanumeric characters (except underscore) in dataset name - # are replaced with `__`, for easy access to transcoded/prefixed datasets. + # Non-word characters in dataset names are replaced with `__` + # for easy access to transcoded/prefixed datasets. datasets = {re.sub(r"\W+", "__", key): value for key, value in datasets.items()} self.__dict__.update(**datasets) From f2e952662410c2e0194791887652d44d5f226dde Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Tue, 18 Aug 2020 12:32:24 -0400 Subject: [PATCH 6/9] Update test_data_catalog.py --- tests/io/test_data_catalog.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/io/test_data_catalog.py b/tests/io/test_data_catalog.py index 3d32a5c3d4..503dbf1346 100644 --- a/tests/io/test_data_catalog.py +++ b/tests/io/test_data_catalog.py @@ -625,8 +625,8 @@ def test_load_version_on_unversioned_dataset( with pytest.raises(DataSetError): catalog.load("boats", version="first") - def test_replacing_non_alphanumeric_characters(self): - """Test replacing non alphanumeric characters in datasets names""" + def test_replacing_nonword_characters(self): + """Test replacing non-word characters in dataset names""" csv = CSVDataSet(filepath="abc.csv") datasets = {"ds1@spark": csv, "ds2_spark": csv, "ds3.csv": csv, "jalapeño": csv} From 7bab74bc6a5c3ae7a9778f95d1b9d96f9a22ad8a Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Fri, 21 Aug 2020 07:03:10 -0400 Subject: [PATCH 7/9] Refactor substitution logic to a reusable function --- kedro/io/data_catalog.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py index f67d4eaee7..7794491f01 100644 --- a/kedro/io/data_catalog.py +++ b/kedro/io/data_catalog.py @@ -110,13 +110,25 @@ def _map_value(key: str, value: Any) -> Any: return {k: _map_value(k, v) for k, v in config.items()} +def _sub_nonword_chars(data_set_name: str) -> str: + """Replace non-word characters in data set names since Kedro 0.16.2. + + Args: + data_set_name: The data set name registered in the data catalog. + + Returns: + The name used in `DataCatalog.datasets`. + """ + return re.sub(r"\W+", "__", data_set_name) + + class _FrozenDatasets: """Helper class to access underlying loaded datasets""" def __init__(self, datasets): # Non-word characters in dataset names are replaced with `__` # for easy access to transcoded/prefixed datasets. - datasets = {re.sub(r"\W+", "__", key): value for key, value in datasets.items()} + datasets = {_sub_nonword_chars(key): value for key, value in datasets.items()} self.__dict__.update(**datasets) # Don't allow users to add/change attributes on the fly From 1913f9e260ff24d42e0b6b180c7a38417e4ce62e Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Thu, 10 Sep 2020 09:19:36 -0600 Subject: [PATCH 8/9] Update RELEASE.md --- RELEASE.md | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/RELEASE.md b/RELEASE.md index 5004b3fdc7..df19786a84 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -3,25 +3,23 @@ ## Major features and improvements ## Bug fixes and other changes -* Improved handling of non-ASCII word characters in dataset names. - - For example, a dataset named `jalapeño` will be accessible as `DataCatalog.datasets.jalapeño` rather than `DataCatalog.datasets.jalape__o`. ## Breaking changes to the API ## Thanks for supporting contributions -[Deepyaman Datta](https://github.com/deepyaman) - # Upcoming 0.16.6 release ## Major features and improvements ## Bug fixes and other changes +* Improved handling of non-ASCII word characters in dataset names. + - For example, a dataset named `jalapeño` will be accessible as `DataCatalog.datasets.jalapeño` rather than `DataCatalog.datasets.jalape__o`. ## Breaking changes to the API ## Thanks for supporting contributions - +[Deepyaman Datta](https://github.com/deepyaman) # Release 0.16.5 From ecf17e0e77900dc4f060a60d14f23b3f6ad176fb Mon Sep 17 00:00:00 2001 From: Kiyo <921kiyo@users.noreply.github.com> Date: Thu, 10 Sep 2020 17:12:49 +0100 Subject: [PATCH 9/9] Trigger CI