Skip to content

Commit

Permalink
feat: Consistently use np.nan for missing values
Browse files Browse the repository at this point in the history
  • Loading branch information
aecio committed Aug 7, 2024
1 parent 2ae05e1 commit a2fe40e
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 12 deletions.
12 changes: 6 additions & 6 deletions bdikit/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ def match_schema(
source: pd.DataFrame,
target: Union[str, pd.DataFrame] = "gdc",
method: Union[str, BaseSchemaMatcher] = DEFAULT_SCHEMA_MATCHING_METHOD,
method_args: Dict[str, Any] = None,
method_args: Dict[str, Any] = {},
) -> pd.DataFrame:
"""
Performs schema mapping between the source table and the given target schema. The
Expand All @@ -117,8 +117,6 @@ def match_schema(
target_table = target

if isinstance(method, str):
if method_args is None:
method_args = {}
matcher_instance = SchemaMatchers.get_instance(method, **method_args)
elif isinstance(method, BaseSchemaMatcher):
matcher_instance = method
Expand Down Expand Up @@ -327,7 +325,9 @@ def match_values(
return result


def _value_matching_result_to_df(matching_result: ValueMatchingResult) -> pd.DataFrame:
def _value_matching_result_to_df(
matching_result: ValueMatchingResult, default_unmatched: Any = np.nan
) -> pd.DataFrame:
"""
Transforms the list of matches and unmatched values into a DataFrame.
"""
Expand All @@ -341,8 +341,8 @@ def _value_matching_result_to_df(matching_result: ValueMatchingResult) -> pd.Dat
data=list(
zip(
unmatched_values,
[None] * len(unmatched_values),
[None] * len(unmatched_values),
[default_unmatched] * len(unmatched_values),
[default_unmatched] * len(unmatched_values),
)
),
columns=["source", "target", "similarity"],
Expand Down
7 changes: 4 additions & 3 deletions bdikit/mapping_algorithms/value_mapping/value_mappers.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import pandas as pd
import numpy as np
from typing import Any, Callable
from collections import defaultdict

Expand Down Expand Up @@ -53,12 +54,12 @@ class DictionaryMapper(ValueMapper):
values stored in the provided dictionary.
"""

def __init__(self, dictionary: dict, missing_data_value: Any = None):
self.dictionary = defaultdict(lambda: missing_data_value, dictionary)
def __init__(self, dictionary: dict, missing_key_value: Any = np.nan):
self.dictionary = defaultdict(lambda: missing_key_value, dictionary)

def map(self, input_column: pd.Series) -> pd.Series:
"""
Transforms the values in the input_column to the values specified in
the dictionary provided using the object constructor.
"""
return input_column.map(self.dictionary, na_action="ignore")
return input_column.map(self.dictionary, na_action=None)
7 changes: 4 additions & 3 deletions tests/test_api.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import bdikit as bdi
import numpy as np
import pandas as pd
from bdikit.mapping_algorithms.value_mapping.value_mappers import (
FunctionValueMapper,
Expand Down Expand Up @@ -237,7 +238,7 @@ def test_end_to_end_api_integration():
# matching values found during the value matching step
assert isinstance(df_mapped, pd.DataFrame)
assert "tgt_column" in df_mapped.columns
assert df_mapped["tgt_column"].tolist() == ["apple", "banana", "orange", None]
assert df_mapped["tgt_column"].tolist() == ["apple", "banana", "orange", np.nan]

# when: pass output of match_values() to merge_mappings() and then to
# materialize_mapping()
Expand All @@ -247,7 +248,7 @@ def test_end_to_end_api_integration():
# then: the column must be renamed and values must be mapped
assert isinstance(df_mapped, pd.DataFrame)
assert "tgt_column" in df_mapped.columns
assert df_mapped["tgt_column"].tolist() == ["apple", "banana", "orange", None]
assert df_mapped["tgt_column"].tolist() == ["apple", "banana", "orange", np.nan]

# when: user mappings are specified in merge_mappings()
user_mappings = [
Expand All @@ -267,7 +268,7 @@ def test_end_to_end_api_integration():
# then: user mappings take precedence, so the column must be renamed and
# values must be mapped according to the provided user_mappings
assert "tgt_column" in df_mapped.columns
assert df_mapped["tgt_column"].tolist() == ["APPLE", "BANANA", "ORANGE", None]
assert df_mapped["tgt_column"].tolist() == ["APPLE", "BANANA", "ORANGE", np.nan]


def test_top_matches_and_match_values_integration():
Expand Down
24 changes: 24 additions & 0 deletions tests/test_value_mapping.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import pandas as pd
import numpy as np
from bdikit.mapping_algorithms.value_mapping.value_mappers import (
FunctionValueMapper,
DictionaryMapper,
Expand Down Expand Up @@ -30,6 +31,29 @@ def test_dictionary_mapper():
assert mapped_column.eq([1, 2, 3, 4, 5]).all()


def test_dictionary_mapper_missing_key():
    """Check that DictionaryMapper returns NaN for unmapped and missing inputs.

    The input column holds four keys present in the dictionary, one key that
    is absent ("e"), and two missing entries (None and NaN). The last three
    positions are expected to come back as NaN, for both string-valued and
    integer-valued dictionaries.
    """
    # given
    str_column = pd.Series(
        data=["a", "b", "c", "d", "e", None, np.nan],
        name="column_str",
    )
    nan_positions = (4, 5, 6)

    # when: the dictionary maps to string values
    string_mapper = DictionaryMapper(
        dictionary={"a": "1", "b": "2", "c": "3", "d": "4"},
        missing_key_value=np.nan,
    )
    mapped_column = string_mapper.map(str_column)

    # then
    assert mapped_column[0:4].tolist() == ["1", "2", "3", "4"]
    for position in nan_positions:
        assert np.isnan(mapped_column[position])

    # when: the dictionary maps to integer values
    integer_mapper = DictionaryMapper(
        dictionary={"a": 1, "b": 2, "c": 3, "d": 4},
        missing_key_value=np.nan,
    )
    mapped_column = integer_mapper.map(str_column)

    # then: the expected values are compared as floats
    assert mapped_column[0:4].tolist() == [1.0, 2.0, 3.0, 4.0]
    for position in nan_positions:
        assert np.isnan(mapped_column[position])


def test_custom_function_mapper():
# given
str_column = pd.Series(data=["a", "b", "c", "d", "e"], name="column_str")
Expand Down

0 comments on commit a2fe40e

Please sign in to comment.