Skip to content

Commit

Permalink
bugfix: Support repeated source columns in match_values()
Browse files Browse the repository at this point in the history
This commit fixes an edge case that breaks the match_values()
function when the input contains repeated source columns.
Previously, the matches were computed for only one of the
repeated mappings. This fix allows the output of top_matches(),
which contains repeated source values, to be passed as a
parameter of the match_values() function.
  • Loading branch information
aecio committed Jul 25, 2024
1 parent 9ed85c4 commit fe7eeff
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 5 deletions.
13 changes: 8 additions & 5 deletions bdikit/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,8 +265,10 @@ def match_values(
"containing the 'source' and 'target' columns."
)

column_mapping_dict = mapping_df.set_index("source")["target"].to_dict()
for source_column in column_mapping_dict.keys():
column_mapping_list = mapping_df.to_dict(orient="records")

for mapping in column_mapping_list:
source_column = mapping["source"]
if source_column not in source.columns:
raise ValueError(
f"The source column '{source_column}' is not present in the source dataset."
Expand All @@ -286,7 +288,7 @@ def match_values(
)

value_matcher = ValueMatchers.get_instance(method)
matches = _match_values(source, target_domain, column_mapping_dict, value_matcher)
matches = _match_values(source, target_domain, column_mapping_list, value_matcher)

result = [
_value_matching_result_to_df(matching_result) for matching_result in matches
Expand Down Expand Up @@ -332,13 +334,14 @@ def _value_matching_result_to_df(matching_result: ValueMatchingResult) -> pd.Dat
def _match_values(
dataset: pd.DataFrame,
target_domain: Dict[str, Optional[List[str]]],
column_mapping: Dict[str, str],
column_mapping: List[Dict],
value_matcher: BaseValueMatcher,
) -> List[ValueMatchingResult]:

mapping_results: List[ValueMatchingResult] = []

for source_column, target_column in column_mapping.items():
for mapping in column_mapping:
source_column, target_column = mapping["source"], mapping["target"]

# 1. Select candidate columns for value mapping
target_domain_list = target_domain[target_column]
Expand Down
37 changes: 37 additions & 0 deletions tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,3 +268,40 @@ def test_end_to_end_api_integration():
# values must be mapped according to the provided user_mappings
assert "tgt_column" in df_mapped.columns
assert df_mapped["tgt_column"].tolist() == ["APPLE", "BANANA", "ORANGE", None]


def test_top_matches_integration():
    """Check that the output of top_matches() can be passed to match_values().

    The source dataset has a single column ("fruits"), so every row returned
    by top_matches() repeats the same source column. The test asserts that
    match_values() still yields one result DataFrame per mapping row.
    """
    # given
    source_df = pd.DataFrame(
        {"fruits": ["Red Apple", "Banana", "Oorange", "Strawberry"]}
    )
    target_df = pd.DataFrame(
        {
            "fruit_types": ["apple", "banana", "orange", "kiwi", "grapes"],
            "fruit_names": ["apple", "banana", "melon", "kiwi", "grapes"],
            "fruit_id": ["1", "2", "3", "4", "5"],
        }
    )
    expected_columns = ("source", "target", "similarity")
    candidate_targets = ["fruit_types", "fruit_names", "fruit_id"]

    # when
    column_matches = bdi.top_matches(source_df, target=target_df)

    # then
    assert len(column_matches.index) == 3
    for column_name in expected_columns:
        assert column_name in column_matches.columns

    # when
    value_matches = bdi.match_values(
        source_df, target_df, column_mapping=column_matches, method="tfidf"
    )

    # then: one result per mapping row, even though the source column repeats
    assert isinstance(value_matches, list)
    assert len(value_matches) == 3
    for match_df in value_matches:
        assert isinstance(match_df, pd.DataFrame)
        for column_name in expected_columns:
            assert column_name in match_df.columns
        assert match_df.attrs["source"] == "fruits"
        assert match_df.attrs["target"] in candidate_targets

0 comments on commit fe7eeff

Please sign in to comment.