From e6881d13b096e22fa141c5b3d74fd94a60cf27f6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?A=C3=A9cio=20Santos?= <aecio.solando@gmail.com>
Date: Fri, 12 Jul 2024 19:45:02 -0400
Subject: [PATCH] refactor: Merge preview_value_mappings() and match_values()

The commit simplifies the API by removing preview_value_mappings()
and merging its functionality into match_values(). The functionality
that was provided by preview_value_mappings() is now fully supported
by match_values().
---
 bdikit/api.py                          | 163 ++++++++++++-------------
 examples/analyzing_one_attribute.ipynb |   6 +-
 examples/getting-started.ipynb         |  20 +--
 tests/test_api.py                      |   8 +-
 4 files changed, 98 insertions(+), 99 deletions(-)

diff --git a/bdikit/api.py b/bdikit/api.py
index 465d7b97..20fd4344 100644
--- a/bdikit/api.py
+++ b/bdikit/api.py
@@ -271,9 +271,9 @@ class ValueMatchingResult(TypedDict):
 def match_values(
     source: pd.DataFrame,
     target: Union[str, pd.DataFrame],
-    column_mapping: pd.DataFrame,
+    column_mapping: Union[Tuple[str, str], pd.DataFrame],
     method: str = DEFAULT_VALUE_MATCHING_METHOD,
-) -> List[ValueMatchingResult]:
+) -> Union[pd.DataFrame, List[Dict]]:
     """
     Finds matches between column values from the source dataset and column
     values of the target domain (a pd.DataFrame or a standard dictionary such
@@ -282,10 +282,18 @@ def match_values(
     Args:
         source (pd.DataFrame): The source dataset containing the columns to be
           matched.
+
         target (Union[str, pd.DataFrame]): The target domain to match the
           values to. It can be either a DataFrame or a standard vocabulary name.
-        column_mapping (pd.DataFrame): A DataFrame containing the mapping
-          between source and target columns.
+
+        column_mapping (Union[Tuple[str, str], pd.DataFrame]): A tuple or a
+          DataFrame containing the mappings between source and target columns.
+
+          - If a tuple is provided, it should contain two strings where the first
+            is the source column and the second is the target column.
+          - If a DataFrame is provided, it should contain 'source' and 'target'
+            column names where each row specifies a column mapping.
+
         method (str, optional): The name of the method to use for value
           matching.
 
@@ -299,12 +307,28 @@ def match_values(
         ValueError: If the target is neither a DataFrame nor a standard vocabulary name.
         ValueError: If the source column is not present in the source dataset.
     """
-    if not all(k in column_mapping.columns for k in ["source", "target"]):
+    if isinstance(column_mapping, pd.DataFrame):
+        if not all(k in column_mapping.columns for k in ["source", "target"]):
+            raise ValueError(
+                "The column_mapping DataFrame must contain 'source' and 'target' columns."
+            )
+        mapping_df = column_mapping
+    elif isinstance(column_mapping, tuple):
+        mapping_df = pd.DataFrame(
+            [
+                {
+                    "source": column_mapping[0],
+                    "target": column_mapping[1],
+                }
+            ]
+        )
+    else:
         raise ValueError(
-            "The column_mapping DataFrame must contain 'source' and 'target' columns."
+            "The column_mapping must be a DataFrame or a tuple of two strings "
+            "containing the 'source' and 'target' columns."
         )
 
-    column_mapping_dict = column_mapping.set_index("source")["target"].to_dict()
+    column_mapping_dict = mapping_df.set_index("source")["target"].to_dict()
     for source_column in column_mapping_dict.keys():
         if source_column not in source.columns:
             raise ValueError(
@@ -312,7 +336,7 @@ def match_values(
             )
 
     if isinstance(target, str) and target == "gdc":
-        column_names = column_mapping["target"].unique().tolist()
+        column_names = mapping_df["target"].unique().tolist()
         target_domain = get_gdc_data(column_names)
     elif isinstance(target, pd.DataFrame):
         target_domain = {
@@ -326,7 +350,49 @@ def match_values(
 
     value_matcher = ValueMatchers.get_instance(method)
     matches = _match_values(source, target_domain, column_mapping_dict, value_matcher)
-    return matches
+
+    result = [
+        {
+            "source": matching_result["source"],
+            "target": matching_result["target"],
+            "coverage": matching_result["coverage"],
+            "matches": _value_matching_result_to_df(matching_result),
+        }
+        for matching_result in matches
+    ]
+
+    if isinstance(column_mapping, tuple):
+        # If only a single mapping is provided (as a tuple), we return the result
+        # directly as a DataFrame to make it easier to display it in notebooks.
+        assert len(result) == 1
+        assert isinstance(result[0]["matches"], pd.DataFrame)
+        return result[0]["matches"]
+    else:
+        return result
+
+
+def _value_matching_result_to_df(matching_result: ValueMatchingResult) -> pd.DataFrame:
+    """
+    Transforms the list of matches and unmatched values into a DataFrame.
+    """
+    matches_df = pd.DataFrame(
+        data=matching_result["matches"],
+        columns=["source", "target", "similarity"],
+    )
+
+    unmatched_values = matching_result["unmatch_values"]
+    unmatched_df = pd.DataFrame(
+        data=list(
+            zip(
+                unmatched_values,
+                [None] * len(unmatched_values),
+                [None] * len(unmatched_values),
+            )
+        ),
+        columns=["source", "target", "similarity"],
+    )
+
+    return pd.concat([matches_df, unmatched_df], ignore_index=True)
 
 
 def _match_values(
@@ -401,73 +467,6 @@ def _skip_values(unique_values: np.ndarray, max_length: int = 50):
         return False
 
 
-def preview_value_mappings(
-    dataset: pd.DataFrame,
-    column_mapping: Union[Tuple[str, str], pd.DataFrame],
-    target: Union[str, pd.DataFrame] = "gdc",
-    method: str = "tfidf",
-) -> List[Dict]:
-    """
-    Print the value mappings in a human-readable format.
-    """
-    if isinstance(column_mapping, pd.DataFrame):
-        mapping_df = column_mapping
-    elif isinstance(column_mapping, tuple):
-        mapping_df = pd.DataFrame(
-            [
-                {
-                    "source": column_mapping[0],
-                    "target": column_mapping[1],
-                }
-            ]
-        )
-    else:
-        raise ValueError(
-            "The column_mapping must be a DataFrame or a tuple of two strings."
-        )
-
-    value_mappings = match_values(
-        dataset, target=target, column_mapping=mapping_df, method=method
-    )
-
-    result = []
-    for matching_result in value_mappings:
-
-        # transform matches and unmatched values into DataFrames
-        matches_df = pd.DataFrame(
-            data=matching_result["matches"],
-            columns=["source", "target", "similarity"],
-        )
-
-        unmatched_values = matching_result["unmatch_values"]
-        unmatched_df = pd.DataFrame(
-            data=list(
-                zip(
-                    unmatched_values,
-                    [None] * len(unmatched_values),
-                    [None] * len(unmatched_values),
-                )
-            ),
-            columns=["source", "target", "similarity"],
-        )
-
-        result.append(
-            {
-                "source": matching_result["source"],
-                "target": matching_result["target"],
-                "mapping": pd.concat([matches_df, unmatched_df], ignore_index=True),
-            }
-        )
-
-    if isinstance(column_mapping, tuple):
-        # If only a single mapping is provided (as a tuple), we return the result
-        # directly as a DataFrame to make it easier to display it in notebooks.
-        assert len(result) == 1
-        return result[0]["mapping"]
-    else:
-        return result
-
-
 def preview_domain(
     dataset: Union[str, pd.DataFrame],
     column: str,
@@ -684,16 +683,16 @@ def create_mapper(
                     # so call this funtion recursively create it
                     return create_mapper(input["mapper"])
 
-            # This could be the ouput of match_values(), so we can create a
-            # DictionaryMapper based on the value matches
+            # This could be a list of value matches (i.e., ValueMatch
+            # or tuple(source, target)) provided by the user
             if "matches" in input and isinstance(input["matches"], List):
                 return _create_mapper_from_value_matches(input["matches"])
 
-            # This could be the ouput of preview_value_mappings(), so we can
-            # create a DictionaryMapper based on the value matches
-            if "mapping" in input and isinstance(input["mapping"], pd.DataFrame):
+            if "matches" in input and isinstance(input["matches"], pd.DataFrame):
+                # This could be the output of match_values(), so we can
+                # create a DictionaryMapper based on the value matches
                 return DictionaryMapper(
-                    input["mapping"].set_index("source")["target"].to_dict()
+                    input["matches"].set_index("source")["target"].to_dict()
                 )
 
             # This could be the output of match_schema(), but the user did not
diff --git a/examples/analyzing_one_attribute.ipynb b/examples/analyzing_one_attribute.ipynb
index 1b6fe70f..f938f004 100644
--- a/examples/analyzing_one_attribute.ipynb
+++ b/examples/analyzing_one_attribute.ipynb
@@ -833,7 +833,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "We can find the matches of these values using the method `preview_value_mappings`."
+    "We can find the matches of these values using the function `match_values()`."
    ]
   },
   {
@@ -957,7 +957,7 @@
     }
    ],
    "source": [
-    "value_mappings = bdi.preview_value_mappings(\n",
+    "value_mappings = bdi.match_values(\n",
     "        dataset,\n",
     "        column_mapping=('FIGO_stage', 'figo_stage'),\n",
     "        target='gdc'\n",
@@ -1002,7 +1002,7 @@
     "    display(preview_domain_target)\n",
     "    \n",
     "    print(f'Value mappings {column_mapping}:')\n",
-    "    value_mappings = bdi.preview_value_mappings(dataset, column_mapping, target=target)\n",
+    "    value_mappings = bdi.match_values(dataset, target=target, column_mapping=column_mapping)\n",
     "    display(value_mappings)\n",
     "    \n",
     "    return column_mapping, value_mappings"
diff --git a/examples/getting-started.ipynb b/examples/getting-started.ipynb
index 16e5cdd6..9ea9db5e 100644
--- a/examples/getting-started.ipynb
+++ b/examples/getting-started.ipynb
@@ -1384,7 +1384,7 @@
     "### Finding correct value mappings\n",
     "\n",
     "After finding the correct column, we need to find appropriate value mappings. \n",
-    "Using `preview_value_mappings()`, we can inspect what the possible value mappings for this would look like after the harmonization.\n",
+    "Using `match_values()`, we can inspect what the possible value mappings for this would look like after the harmonization.\n",
     "\n",
     "`bdi-kit` implements multiple methods for value mapping discovery, including:\n",
     "\n",
@@ -1469,7 +1469,7 @@
     }
    ],
    "source": [
-    "bdi.preview_value_mappings(\n",
+    "bdi.match_values(\n",
     "    dataset, column_mapping=(\"Histologic_type\", \"primary_diagnosis\"), target=\"gdc\", method=\"edit_distance\"\n",
     ")"
    ]
@@ -1548,7 +1548,7 @@
     }
    ],
    "source": [
-    "bdi.preview_value_mappings(\n",
+    "bdi.match_values(\n",
     "    dataset, column_mapping=(\"Histologic_type\", \"primary_diagnosis\"), target=\"gdc\", method=\"tfidf\"\n",
     ")"
    ]
@@ -1627,7 +1627,7 @@
     }
    ],
    "source": [
-    "bdi.preview_value_mappings(\n",
+    "bdi.match_values(\n",
     "    dataset, column_mapping=(\"Histologic_type\", \"primary_diagnosis\"), target=\"gdc\", method=\"embedding\"\n",
     ")"
    ]
@@ -2281,7 +2281,7 @@
     }
    ],
    "source": [
-    "mappings = bdi.preview_value_mappings(\n",
+    "mappings = bdi.match_values(\n",
     "    dataset,\n",
     "    column_mapping=column_mappings,\n",
     "    target=\"gdc\",\n",
@@ -2290,7 +2290,7 @@
     "\n",
     "for mapping in mappings:\n",
     "    print(f\"{mapping['source']} => {mapping['target']}\")\n",
-    "    display(mapping[\"mapping\"])\n",
+    "    display(mapping[\"matches\"])\n",
     "    print(\"\")"
    ]
   },
@@ -2305,7 +2305,7 @@
     "- Ethnicity\n",
     "- Tumor_Site\n",
     "\n",
-    "For race, we need to fix: `nan` -> `merican indian or alaska native`."
+    "For race, we need to fix: `nan` -> `american indian or alaska native`."
    ]
   },
   {
@@ -2389,7 +2389,7 @@
     }
    ],
    "source": [
-    "race_vmap = bdi.preview_value_mappings(\n",
+    "race_vmap = bdi.match_values(\n",
     "    dataset,\n",
     "    column_mapping=(\"Race\", \"race\"),\n",
     "    target=\"gdc\",\n",
@@ -2557,7 +2557,7 @@
     }
    ],
    "source": [
-    "ethinicity_vmap = bdi.preview_value_mappings(\n",
+    "ethinicity_vmap = bdi.match_values(\n",
     "    dataset,\n",
     "    column_mapping=(\"Ethnicity\", \"ethnicity\"),\n",
     "    target=\"gdc\",\n",
@@ -2711,7 +2711,7 @@
     }
    ],
    "source": [
-    "bdi.preview_value_mappings(\n",
+    "bdi.match_values(\n",
     "    dataset, column_mapping=(\"Tumor_Site\", \"tissue_or_organ_of_origin\"), target=\"gdc\", method=\"tfidf\"\n",
     ")"
    ]
diff --git a/tests/test_api.py b/tests/test_api.py
index 4cdd27c1..a7498a98 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -203,8 +203,8 @@ def test_value_mapping_dataframe():
     assert mapping is not None
     assert mapping["source"] == "src_column"
     assert mapping["target"] == "tgt_column"
-    assert isinstance(mapping["matches"], list)
-    assert len(mapping["matches"]) == 3
+    assert isinstance(mapping["matches"], pd.DataFrame)
+    assert len(mapping["matches"].index) == len(df_source.index)
 
 
 def test_end_to_end_api_integration():
@@ -248,8 +248,8 @@ def test_end_to_end_api_integration():
     assert mapping is not None
     assert mapping["source"] == "src_column"
     assert mapping["target"] == "tgt_column"
-    assert isinstance(mapping["matches"], list)
-    assert len(mapping["matches"]) == 3
+    assert isinstance(mapping["matches"], pd.DataFrame)
+    assert len(mapping["matches"]) == len(df_source)
 
     # when: pass output of match_values() to materialize_mapping(),
     df_mapped = bdi.materialize_mapping(df_source, value_mappings)