From e6881d13b096e22fa141c5b3d74fd94a60cf27f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?A=C3=A9cio=20Santos?= Date: Fri, 12 Jul 2024 19:45:02 -0400 Subject: [PATCH] refactor: Merge preview_value_mappings() and match_values() The commit simplifies the API by removing preview_value_mappings() and merging its functionality into match_values(). The functionality that was provided by preview_value_mappings() is now fully supported by match_values(). --- bdikit/api.py | 163 ++++++++++++------------- examples/analyzing_one_attribute.ipynb | 6 +- examples/getting-started.ipynb | 20 +-- tests/test_api.py | 8 +- 4 files changed, 98 insertions(+), 99 deletions(-) diff --git a/bdikit/api.py b/bdikit/api.py index 465d7b97..20fd4344 100644 --- a/bdikit/api.py +++ b/bdikit/api.py @@ -271,9 +271,9 @@ class ValueMatchingResult(TypedDict): def match_values( source: pd.DataFrame, target: Union[str, pd.DataFrame], - column_mapping: pd.DataFrame, + column_mapping: Union[Tuple[str, str], pd.DataFrame], method: str = DEFAULT_VALUE_MATCHING_METHOD, -) -> List[ValueMatchingResult]: +) -> Union[pd.DataFrame, List[Dict]]: """ Finds matches between column values from the source dataset and column values of the target domain (a pd.DataFrame or a standard dictionary such @@ -282,10 +282,18 @@ def match_values( Args: source (pd.DataFrame): The source dataset containing the columns to be matched. + target (Union[str, pd.DataFrame]): The target domain to match the values to. It can be either a DataFrame or a standard vocabulary name. - column_mapping (pd.DataFrame): A DataFrame containing the mapping - between source and target columns. + + column_mapping (Union[Tuple[str, str], pd.DataFrame]): A tuple or a + DataFrame containing the mappings between source and target columns. + + - If a tuple is provided, it should contain two strings where the first + is the source column and the second is the target column. 
+ - If a DataFrame is provided, it should contain 'source' and 'target' + column names where each row specifies a column mapping. + method (str, optional): The name of the method to use for value matching. @@ -299,12 +307,28 @@ def match_values( ValueError: If the target is neither a DataFrame nor a standard vocabulary name. ValueError: If the source column is not present in the source dataset. """ - if not all(k in column_mapping.columns for k in ["source", "target"]): + if isinstance(column_mapping, pd.DataFrame): + if not all(k in column_mapping.columns for k in ["source", "target"]): + raise ValueError( + "The column_mapping DataFrame must contain 'source' and 'target' columns." + ) + mapping_df = column_mapping + elif isinstance(column_mapping, tuple): + mapping_df = pd.DataFrame( + [ + { + "source": column_mapping[0], + "target": column_mapping[1], + } + ] + ) + else: raise ValueError( - "The column_mapping DataFrame must contain 'source' and 'target' columns." + "The column_mapping must be a DataFrame or a tuple of two strings " + "containing the 'source' and 'target' columns." 
) - column_mapping_dict = column_mapping.set_index("source")["target"].to_dict() + column_mapping_dict = mapping_df.set_index("source")["target"].to_dict() for source_column in column_mapping_dict.keys(): if source_column not in source.columns: raise ValueError( @@ -312,7 +336,7 @@ def match_values( ) if isinstance(target, str) and target == "gdc": - column_names = column_mapping["target"].unique().tolist() + column_names = mapping_df["target"].unique().tolist() target_domain = get_gdc_data(column_names) elif isinstance(target, pd.DataFrame): target_domain = { @@ -326,7 +350,49 @@ def match_values( value_matcher = ValueMatchers.get_instance(method) matches = _match_values(source, target_domain, column_mapping_dict, value_matcher) - return matches + + result = [ + { + "source": matching_result["source"], + "target": matching_result["target"], + "coverage": matching_result["coverage"], + "matches": _value_matching_result_to_df(matching_result), + } + for matching_result in matches + ] + + if isinstance(column_mapping, tuple): + # If only a single mapping is provided (as a tuple), we return the result + # directly as a DataFrame to make it easier to display it in notebooks. + assert len(result) == 1 + assert isinstance(result[0]["matches"], pd.DataFrame) + return result[0]["matches"] + else: + return result + + +def _value_matching_result_to_df(matching_result: ValueMatchingResult) -> pd.DataFrame: + """ + Transforms the list of matches and unmatched values into a DataFrame. 
+ """ + matches_df = pd.DataFrame( + data=matching_result["matches"], + columns=["source", "target", "similarity"], + ) + + unmatched_values = matching_result["unmatch_values"] + unmatched_df = pd.DataFrame( + data=list( + zip( + unmatched_values, + [None] * len(unmatched_values), + [None] * len(unmatched_values), + ) + ), + columns=["source", "target", "similarity"], + ) + + return pd.concat([matches_df, unmatched_df], ignore_index=True) def _match_values( @@ -401,73 +467,6 @@ def _skip_values(unique_values: np.ndarray, max_length: int = 50): return False -def preview_value_mappings( - dataset: pd.DataFrame, - column_mapping: Union[Tuple[str, str], pd.DataFrame], - target: Union[str, pd.DataFrame] = "gdc", - method: str = "tfidf", -) -> List[Dict]: - """ - Print the value mappings in a human-readable format. - """ - if isinstance(column_mapping, pd.DataFrame): - mapping_df = column_mapping - elif isinstance(column_mapping, tuple): - mapping_df = pd.DataFrame( - [ - { - "source": column_mapping[0], - "target": column_mapping[1], - } - ] - ) - else: - raise ValueError( - "The column_mapping must be a DataFrame or a tuple of two strings." 
- ) - - value_mappings = match_values( - dataset, target=target, column_mapping=mapping_df, method=method - ) - - result = [] - for matching_result in value_mappings: - - # transform matches and unmatched values into DataFrames - matches_df = pd.DataFrame( - data=matching_result["matches"], - columns=["source", "target", "similarity"], - ) - - unmatched_values = matching_result["unmatch_values"] - unmatched_df = pd.DataFrame( - data=list( - zip( - unmatched_values, - [None] * len(unmatched_values), - [None] * len(unmatched_values), - ) - ), - columns=["source", "target", "similarity"], - ) - - result.append( - { - "source": matching_result["source"], - "target": matching_result["target"], - "mapping": pd.concat([matches_df, unmatched_df], ignore_index=True), - } - ) - - if isinstance(column_mapping, tuple): - # If only a single mapping is provided (as a tuple), we return the result - # directly as a DataFrame to make it easier to display it in notebooks. - assert len(result) == 1 - return result[0]["mapping"] - else: - return result - - def preview_domain( dataset: Union[str, pd.DataFrame], column: str, @@ -684,16 +683,16 @@ def create_mapper( # so call this funtion recursively create it return create_mapper(input["mapper"]) - # This could be the ouput of match_values(), so we can create a - # DictionaryMapper based on the value matches + # This could be a list of value matches (i.e., ValueMatch + # or tuple(source, target)) provided by the user if "matches" in input and isinstance(input["matches"], List): return _create_mapper_from_value_matches(input["matches"]) - # This could be the ouput of preview_value_mappings(), so we can - # create a DictionaryMapper based on the value matches - if "mapping" in input and isinstance(input["mapping"], pd.DataFrame): + if "matches" in input and isinstance(input["matches"], pd.DataFrame): + # This could be the output of match_values(), so we can + # create a DictionaryMapper based on the value matches return 
DictionaryMapper( - input["mapping"].set_index("source")["target"].to_dict() + input["matches"].set_index("source")["target"].to_dict() ) # This could be the output of match_schema(), but the user did not diff --git a/examples/analyzing_one_attribute.ipynb b/examples/analyzing_one_attribute.ipynb index 1b6fe70f..f938f004 100644 --- a/examples/analyzing_one_attribute.ipynb +++ b/examples/analyzing_one_attribute.ipynb @@ -833,7 +833,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can find the matches of these values using the method `preview_value_mappings`." + "We can find the matches of these values using the function `match_values()`." ] }, { @@ -957,7 +957,7 @@ } ], "source": [ - "value_mappings = bdi.preview_value_mappings(\n", + "value_mappings = bdi.match_values(\n", " dataset,\n", " column_mapping=('FIGO_stage', 'figo_stage'),\n", " target='gdc'\n", @@ -1002,7 +1002,7 @@ " display(preview_domain_target)\n", " \n", " print(f'Value mappings {column_mapping}:')\n", - " value_mappings = bdi.preview_value_mappings(dataset, column_mapping, target=target)\n", + " value_mappings = bdi.match_values(dataset, target=target, column_mapping=column_mapping)\n", " display(value_mappings)\n", " \n", " return column_mapping, value_mappings" diff --git a/examples/getting-started.ipynb b/examples/getting-started.ipynb index 16e5cdd6..9ea9db5e 100644 --- a/examples/getting-started.ipynb +++ b/examples/getting-started.ipynb @@ -1384,7 +1384,7 @@ "### Finding correct value mappings\n", "\n", "After finding the correct column, we need to find appropriate value mappings. 
\n", - "Using `preview_value_mappings()`, we can inspect what the possible value mappings for this would look like after the harmonization.\n", + "Using `match_values()`, we can inspect what the possible value mappings for this would look like after the harmonization.\n", "\n", "`bdi-kit` implements multiple methods for value mapping discovery, including:\n", "\n", @@ -1469,7 +1469,7 @@ } ], "source": [ - "bdi.preview_value_mappings(\n", + "bdi.match_values(\n", " dataset, column_mapping=(\"Histologic_type\", \"primary_diagnosis\"), target=\"gdc\", method=\"edit_distance\"\n", ")" ] @@ -1548,7 +1548,7 @@ } ], "source": [ - "bdi.preview_value_mappings(\n", + "bdi.match_values(\n", " dataset, column_mapping=(\"Histologic_type\", \"primary_diagnosis\"), target=\"gdc\", method=\"tfidf\"\n", ")" ] @@ -1627,7 +1627,7 @@ } ], "source": [ - "bdi.preview_value_mappings(\n", + "bdi.match_values(\n", " dataset, column_mapping=(\"Histologic_type\", \"primary_diagnosis\"), target=\"gdc\", method=\"embedding\"\n", ")" ] @@ -2281,7 +2281,7 @@ } ], "source": [ - "mappings = bdi.preview_value_mappings(\n", + "mappings = bdi.match_values(\n", " dataset,\n", " column_mapping=column_mappings,\n", " target=\"gdc\",\n", @@ -2290,7 +2290,7 @@ "\n", "for mapping in mappings:\n", " print(f\"{mapping['source']} => {mapping['target']}\")\n", - " display(mapping[\"mapping\"])\n", + " display(mapping[\"matches\"])\n", " print(\"\")" ] }, @@ -2305,7 +2305,7 @@ "- Ethnicity\n", "- Tumor_Site\n", "\n", - "For race, we need to fix: `nan` -> `merican indian or alaska native`." + "For race, we need to fix: `nan` -> `american indian or alaska native`." 
] }, { @@ -2389,7 +2389,7 @@ } ], "source": [ - "race_vmap = bdi.preview_value_mappings(\n", + "race_vmap = bdi.match_values(\n", " dataset,\n", " column_mapping=(\"Race\", \"race\"),\n", " target=\"gdc\",\n", @@ -2557,7 +2557,7 @@ } ], "source": [ - "ethinicity_vmap = bdi.preview_value_mappings(\n", + "ethinicity_vmap = bdi.match_values(\n", " dataset,\n", " column_mapping=(\"Ethnicity\", \"ethnicity\"),\n", " target=\"gdc\",\n", @@ -2711,7 +2711,7 @@ } ], "source": [ - "bdi.preview_value_mappings(\n", + "bdi.match_values(\n", " dataset, column_mapping=(\"Tumor_Site\", \"tissue_or_organ_of_origin\"), target=\"gdc\", method=\"tfidf\"\n", ")" ] diff --git a/tests/test_api.py b/tests/test_api.py index 4cdd27c1..a7498a98 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -203,8 +203,8 @@ def test_value_mapping_dataframe(): assert mapping is not None assert mapping["source"] == "src_column" assert mapping["target"] == "tgt_column" - assert isinstance(mapping["matches"], list) - assert len(mapping["matches"]) == 3 + assert isinstance(mapping["matches"], pd.DataFrame) + assert len(mapping["matches"].index) == len(df_source.index) def test_end_to_end_api_integration(): @@ -248,8 +248,8 @@ def test_end_to_end_api_integration(): assert mapping is not None assert mapping["source"] == "src_column" assert mapping["target"] == "tgt_column" - assert isinstance(mapping["matches"], list) - assert len(mapping["matches"]) == 3 + assert isinstance(mapping["matches"], pd.DataFrame) + assert len(mapping["matches"]) == len(df_source) # when: pass output of match_values() to materialize_mapping(), df_mapped = bdi.materialize_mapping(df_source, value_mappings)