Gracefully handle non-ASCII chars in dataset names 🌶️ (#487)

kedro-org · Sep 10, 2020 · 93e4cc3
1 parent 5ba5940
commit 93e4cc3
Show file tree

Hide file tree

Showing 4 changed files with 25 additions and 23 deletions.
diff --git a/RELEASE.md b/RELEASE.md
@@ -8,17 +8,18 @@
 
 ## Thanks for supporting contributions
 
-
 # Upcoming 0.16.6 release
 
 ## Major features and improvements
 
 ## Bug fixes and other changes
+* Improved handling of non-ASCII word characters in dataset names.
+  - For example, a dataset named `jalapeño` will be accessible as `DataCatalog.datasets.jalapeño` rather than `DataCatalog.datasets.jalape__o`.
 
 ## Breaking changes to the API
 
 ## Thanks for supporting contributions
-
+[Deepyaman Datta](https://github.com/deepyaman)
 
 # Release 0.16.5
 

diff --git a/kedro/io/data_catalog.py b/kedro/io/data_catalog.py
@@ -110,16 +110,25 @@ def _map_value(key: str, value: Any) -> Any:
     return {k: _map_value(k, v) for k, v in config.items()}
 
 
+def _sub_nonword_chars(data_set_name: str) -> str:
+    """Replace non-word characters in data set names since Kedro 0.16.2.
+
+    Args:
+        data_set_name: The data set name registered in the data catalog.
+
+    Returns:
+        The name used in `DataCatalog.datasets`.
+    """
+    return re.sub(r"\W+", "__", data_set_name)
+
+
 class _FrozenDatasets:
     """Helper class to access underlying loaded datasets"""
 
     def __init__(self, datasets):
-        # Non-alphanumeric characters (except underscore) in dataset name
-        # are replaced with `__`, for easy access to transcoded/prefixed datasets.
-        datasets = {
-            re.sub("[^0-9a-zA-Z_]+", "__", key): value
-            for key, value in datasets.items()
-        }
+        # Non-word characters in dataset names are replaced with `__`
+        # for easy access to transcoded/prefixed datasets.
+        datasets = {_sub_nonword_chars(key): value for key, value in datasets.items()}
         self.__dict__.update(**datasets)
 
     # Don't allow users to add/change attributes on the fly

diff --git a/tests/io/test_data_catalog.py b/tests/io/test_data_catalog.py
@@ -625,15 +625,18 @@ def test_load_version_on_unversioned_dataset(
         with pytest.raises(DataSetError):
             catalog.load("boats", version="first")
 
-    def test_replacing_non_alphanumeric_characters(self):
-        """Test replacing non alphanumeric characters in datasets names"""
+    def test_replacing_nonword_characters(self):
+        """Test replacing non-word characters in dataset names"""
         csv = CSVDataSet(filepath="abc.csv")
-        datasets = {"ds1@spark": csv, "ds2_spark": csv, "ds3.csv": csv}
+        datasets = {"ds1@spark": csv, "ds2_spark": csv, "ds3.csv": csv, "jalapeño": csv}
 
         catalog = DataCatalog(data_sets=datasets)
         assert "ds1@spark" not in catalog.datasets.__dict__
+        assert "ds2__spark" not in catalog.datasets.__dict__
         assert "ds3.csv" not in catalog.datasets.__dict__
+        assert "jalape__o" not in catalog.datasets.__dict__
 
-        assert "ds2_spark" in catalog.datasets.__dict__
         assert "ds1__spark" in catalog.datasets.__dict__
+        assert "ds2_spark" in catalog.datasets.__dict__
         assert "ds3__csv" in catalog.datasets.__dict__
+        assert "jalapeño" in catalog.datasets.__dict__
diff --git a/tools/license_and_headers.py b/tools/license_and_headers.py
@@ -50,17 +50,6 @@ def files_missing_substring(file_names, substring):
             if content.strip() and substring not in content:
                 yield file_name
 
-            # In some locales Python 3.5 on Windows can't deal with non ascii chars in source files
-            try:
-                content.encode("ascii")
-            except UnicodeError as e:
-                print(
-                    "Non ascii characters in {} after '{}'".format(
-                        file_name, content[e.start - 30 : e.start]
-                    )
-                )
-                yield file_name
-
 
 def main():
     with open(LICENSE_MD) as header_f: