From a2fe40e77ab5b9e94734730542f91537325f419a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?A=C3=A9cio=20Santos?= Date: Wed, 7 Aug 2024 18:46:28 -0400 Subject: [PATCH] feat: Consistently use np.nan for missing values --- bdikit/api.py | 12 +++++----- .../value_mapping/value_mappers.py | 7 +++--- tests/test_api.py | 7 +++--- tests/test_value_mapping.py | 24 +++++++++++++++++++ 4 files changed, 38 insertions(+), 12 deletions(-) diff --git a/bdikit/api.py b/bdikit/api.py index eecee604..2bc833ec 100644 --- a/bdikit/api.py +++ b/bdikit/api.py @@ -91,7 +91,7 @@ def match_schema( source: pd.DataFrame, target: Union[str, pd.DataFrame] = "gdc", method: Union[str, BaseSchemaMatcher] = DEFAULT_SCHEMA_MATCHING_METHOD, - method_args: Dict[str, Any] = None, + method_args: Dict[str, Any] = {}, ) -> pd.DataFrame: """ Performs schema mapping between the source table and the given target schema. The @@ -117,8 +117,6 @@ def match_schema( target_table = target if isinstance(method, str): - if method_args is None: - method_args = {} matcher_instance = SchemaMatchers.get_instance(method, **method_args) elif isinstance(method, BaseSchemaMatcher): matcher_instance = method @@ -327,7 +325,9 @@ def match_values( return result -def _value_matching_result_to_df(matching_result: ValueMatchingResult) -> pd.DataFrame: +def _value_matching_result_to_df( + matching_result: ValueMatchingResult, default_unmatched: Any = np.nan +) -> pd.DataFrame: """ Transforms the list of matches and unmatched values into a DataFrame. """ @@ -341,8 +341,8 @@ def _value_matching_result_to_df(matching_result: ValueMatchingResult) -> pd.Dat data=list( zip( unmatched_values, - [None] * len(unmatched_values), - [None] * len(unmatched_values), + [default_unmatched] * len(unmatched_values), + [default_unmatched] * len(unmatched_values), ) ), columns=["source", "target", "similarity"], diff --git a/bdikit/mapping_algorithms/value_mapping/value_mappers.py b/bdikit/mapping_algorithms/value_mapping/value_mappers.py index d6ad11d3..3a47dbe5 100644 --- a/bdikit/mapping_algorithms/value_mapping/value_mappers.py +++ b/bdikit/mapping_algorithms/value_mapping/value_mappers.py @@ -1,4 +1,5 @@ import pandas as pd +import numpy as np from typing import Any, Callable from collections import defaultdict @@ -53,12 +54,12 @@ class DictionaryMapper(ValueMapper): values stored in the provided dictionary. """ - def __init__(self, dictionary: dict, missing_data_value: Any = None): - self.dictionary = defaultdict(lambda: missing_data_value, dictionary) + def __init__(self, dictionary: dict, missing_key_value: Any = np.nan): + self.dictionary = defaultdict(lambda: missing_key_value, dictionary) def map(self, input_column: pd.Series) -> pd.Series: """ Transforms the values in the input_column to the values specified in the dictionary provided using the object constructor. """ - return input_column.map(self.dictionary, na_action="ignore") + return input_column.map(self.dictionary, na_action=None) diff --git a/tests/test_api.py b/tests/test_api.py index 6d1be181..d3560cf2 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -1,4 +1,5 @@ import bdikit as bdi +import numpy as np import pandas as pd from bdikit.mapping_algorithms.value_mapping.value_mappers import ( FunctionValueMapper, @@ -237,7 +238,7 @@ def test_end_to_end_api_integration(): # matching values found during the value matching step assert isinstance(df_mapped, pd.DataFrame) assert "tgt_column" in df_mapped.columns - assert df_mapped["tgt_column"].tolist() == ["apple", "banana", "orange", None] + assert df_mapped["tgt_column"].tolist() == ["apple", "banana", "orange", np.nan] # when: pass output of match_values() to merge_mappings() and then to # materialize_mapping() @@ -247,7 +248,7 @@ def test_end_to_end_api_integration(): # then: the column must be ranamed and values must be mapped assert isinstance(df_mapped, pd.DataFrame) assert "tgt_column" in df_mapped.columns - assert df_mapped["tgt_column"].tolist() == ["apple", "banana", "orange", None] + assert df_mapped["tgt_column"].tolist() == ["apple", "banana", "orange", np.nan] # when: user mappings are specified in merge_mappings() user_mappings = [ @@ -267,7 +268,7 @@ def test_end_to_end_api_integration(): # then: user mappings take precedence, so the column must be ranamed and # values must be mapped according the provide user_mappings assert "tgt_column" in df_mapped.columns - assert df_mapped["tgt_column"].tolist() == ["APPLE", "BANANA", "ORANGE", None] + assert df_mapped["tgt_column"].tolist() == ["APPLE", "BANANA", "ORANGE", np.nan] def test_top_matches_and_match_values_integration(): diff --git a/tests/test_value_mapping.py b/tests/test_value_mapping.py index ea782adb..5a20f1fb 100644 --- a/tests/test_value_mapping.py +++ b/tests/test_value_mapping.py @@ -1,4 +1,5 @@ import pandas as pd +import numpy as np from bdikit.mapping_algorithms.value_mapping.value_mappers import ( FunctionValueMapper, DictionaryMapper, @@ -30,6 +31,29 @@ def test_dictionary_mapper(): assert mapped_column.eq([1, 2, 3, 4, 5]).all() +def test_dictionary_mapper_missing_key(): + # given + str_column = pd.Series(data=["a", "b", "c", "d", "e", None, np.nan], name="column_str") + + # when + dict_mapper = DictionaryMapper(dictionary={"a": "1", "b": "2", "c": "3", "d": "4"}, missing_key_value=np.nan) + mapped_column = dict_mapper.map(str_column) + # then + assert mapped_column[0:4].tolist() == ['1', '2', '3', '4'] + assert np.isnan(mapped_column[4]) + assert np.isnan(mapped_column[5]) + assert np.isnan(mapped_column[6]) + + # when + dict_mapper = DictionaryMapper(dictionary={"a": 1, "b": 2, "c": 3, "d": 4}, missing_key_value=np.nan) + mapped_column = dict_mapper.map(str_column) + # then + assert mapped_column[0:4].tolist() == [1.0, 2.0, 3.0, 4.0] + assert np.isnan(mapped_column[4]) + assert np.isnan(mapped_column[5]) + assert np.isnan(mapped_column[6]) + + def test_custom_function_mapper(): # given str_column = pd.Series(data=["a", "b", "c", "d", "e"], name="column_str")