bugfix: Support repeated source columns in match_values()

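This commit fixes an edge case that broke the match_values() function when the input column mapping contained repeated source columns. Previously, the mapping was converted to a dict keyed by source column, so repeated sources collapsed into a single entry and matches were computed for only one of the repeated mappings. The mapping is now processed as a list of (source, target) records, which allows the output of top_matches(), which contains repeated source values, to be passed as the column_mapping parameter of match_values().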
VIDA-NYU · Jul 25, 2024 · fe7eeff · fe7eeff
1 parent 9ed85c4
commit fe7eeff
Show file tree

Hide file tree

Showing 2 changed files with 45 additions and 5 deletions.
diff --git a/bdikit/api.py b/bdikit/api.py
@@ -265,8 +265,10 @@ def match_values(
             "containing the 'source' and 'target' columns."
         )
 
-    column_mapping_dict = mapping_df.set_index("source")["target"].to_dict()
-    for source_column in column_mapping_dict.keys():
+    column_mapping_list = mapping_df.to_dict(orient="records")
+
+    for mapping in column_mapping_list:
+        source_column = mapping["source"]
         if source_column not in source.columns:
             raise ValueError(
                 f"The source column '{source_column}' is not present in the source dataset."
@@ -286,7 +288,7 @@ def match_values(
         )
 
     value_matcher = ValueMatchers.get_instance(method)
-    matches = _match_values(source, target_domain, column_mapping_dict, value_matcher)
+    matches = _match_values(source, target_domain, column_mapping_list, value_matcher)
 
     result = [
         _value_matching_result_to_df(matching_result) for matching_result in matches
@@ -332,13 +334,14 @@ def _value_matching_result_to_df(matching_result: ValueMatchingResult) -> pd.Dat
 def _match_values(
     dataset: pd.DataFrame,
     target_domain: Dict[str, Optional[List[str]]],
-    column_mapping: Dict[str, str],
+    column_mapping: List[Dict],
     value_matcher: BaseValueMatcher,
 ) -> List[ValueMatchingResult]:
 
     mapping_results: List[ValueMatchingResult] = []
 
-    for source_column, target_column in column_mapping.items():
+    for mapping in column_mapping:
+        source_column, target_column = mapping["source"], mapping["target"]
 
         # 1. Select candidate columns for value mapping
         target_domain_list = target_domain[target_column]

diff --git a/tests/test_api.py b/tests/test_api.py
@@ -268,3 +268,40 @@ def test_end_to_end_api_integration():
     # values must be mapped according the provide user_mappings
     assert "tgt_column" in df_mapped.columns
     assert df_mapped["tgt_column"].tolist() == ["APPLE", "BANANA", "ORANGE", None]
+
+
+def test_top_matches_integration():
+    # given
+    df_source = pd.DataFrame(
+        {"fruits": ["Red Apple", "Banana", "Oorange", "Strawberry"]}
+    )
+    df_target = pd.DataFrame(
+        {
+            "fruit_types": ["apple", "banana", "orange", "kiwi", "grapes"],
+            "fruit_names": ["apple", "banana", "melon", "kiwi", "grapes"],
+            "fruit_id": ["1", "2", "3", "4", "5"],
+        }
+    )
+
+    # when
+    df_matches = bdi.top_matches(df_source, target=df_target)
+
+    # then
+    assert len(df_matches.index) == 3
+    assert "source" in df_matches.columns
+    assert "target" in df_matches.columns
+    assert "similarity" in df_matches.columns
+
+    # when
+    df_matches = bdi.match_values(
+        df_source, df_target, column_mapping=df_matches, method="tfidf"
+    )
+    assert isinstance(df_matches, list)
+    assert len(df_matches) == 3
+    for df in df_matches:
+        assert isinstance(df, pd.DataFrame)
+        assert "source" in df.columns
+        assert "target" in df.columns
+        assert "similarity" in df.columns
+        assert df.attrs["source"] == "fruits"
+        assert df.attrs["target"] in ["fruit_types", "fruit_names", "fruit_id"]