Skip to content

Commit

Permalink
refactor: Merge preview_value_mappings() and match_values()
Browse files Browse the repository at this point in the history
The commit simplifies the API by removing preview_value_mappings()
and merging its functionality into match_values(). The functionality
that was provided by preview_value_mappings() is now fully supported
by match_values().
  • Loading branch information
aecio committed Jul 17, 2024
1 parent 0f187ab commit e6881d1
Show file tree
Hide file tree
Showing 4 changed files with 98 additions and 99 deletions.
163 changes: 81 additions & 82 deletions bdikit/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,9 +271,9 @@ class ValueMatchingResult(TypedDict):
def match_values(
source: pd.DataFrame,
target: Union[str, pd.DataFrame],
column_mapping: pd.DataFrame,
column_mapping: Union[Tuple[str, str], pd.DataFrame],
method: str = DEFAULT_VALUE_MATCHING_METHOD,
) -> List[ValueMatchingResult]:
) -> Union[pd.DataFrame, List[Dict]]:
"""
Finds matches between column values from the source dataset and column
values of the target domain (a pd.DataFrame or a standard dictionary such
Expand All @@ -282,10 +282,18 @@ def match_values(
Args:
source (pd.DataFrame): The source dataset containing the columns to be
matched.
target (Union[str, pd.DataFrame]): The target domain to match the
values to. It can be either a DataFrame or a standard vocabulary name.
column_mapping (pd.DataFrame): A DataFrame containing the mapping
between source and target columns.
column_mapping (Union[Tuple[str, str], pd.DataFrame]): A tuple or a
DataFrame containing the mappings between source and target columns.
- If a tuple is provided, it should contain two strings where the first
is the source column and the second is the target column.
- If a DataFrame is provided, it should contain 'source' and 'target'
column names where each row specifies a column mapping.
method (str, optional): The name of the method to use for value
matching.
Expand All @@ -299,20 +307,36 @@ def match_values(
ValueError: If the target is neither a DataFrame nor a standard vocabulary name.
ValueError: If the source column is not present in the source dataset.
"""
if not all(k in column_mapping.columns for k in ["source", "target"]):
if isinstance(column_mapping, pd.DataFrame):
if not all(k in column_mapping.columns for k in ["source", "target"]):
raise ValueError(
"The column_mapping DataFrame must contain 'source' and 'target' columns."
)
mapping_df = column_mapping
elif isinstance(column_mapping, tuple):
mapping_df = pd.DataFrame(
[
{
"source": column_mapping[0],
"target": column_mapping[1],
}
]
)
else:
raise ValueError(
"The column_mapping DataFrame must contain 'source' and 'target' columns."
"The column_mapping must be a DataFrame or a tuple of two strings "
"containing the 'source' and 'target' columns."
)

column_mapping_dict = column_mapping.set_index("source")["target"].to_dict()
column_mapping_dict = mapping_df.set_index("source")["target"].to_dict()
for source_column in column_mapping_dict.keys():
if source_column not in source.columns:
raise ValueError(
f"The source column '{source_column}' is not present in the source dataset."
)

if isinstance(target, str) and target == "gdc":
column_names = column_mapping["target"].unique().tolist()
column_names = mapping_df["target"].unique().tolist()
target_domain = get_gdc_data(column_names)
elif isinstance(target, pd.DataFrame):
target_domain = {
Expand All @@ -326,7 +350,49 @@ def match_values(

value_matcher = ValueMatchers.get_instance(method)
matches = _match_values(source, target_domain, column_mapping_dict, value_matcher)
return matches

result = [
{
"source": matching_result["source"],
"target": matching_result["target"],
"coverage": matching_result["coverage"],
"matches": _value_matching_result_to_df(matching_result),
}
for matching_result in matches
]

if isinstance(column_mapping, tuple):
# If only a single mapping is provided (as a tuple), we return the result
# directly as a DataFrame to make it easier to display it in notebooks.
assert len(result) == 1
assert isinstance(result[0]["matches"], pd.DataFrame)
return result[0]["matches"]
else:
return result


def _value_matching_result_to_df(matching_result: ValueMatchingResult) -> pd.DataFrame:
"""
Transforms the list of matches and unmatched values into a DataFrame.
"""
matches_df = pd.DataFrame(
data=matching_result["matches"],
columns=["source", "target", "similarity"],
)

unmatched_values = matching_result["unmatch_values"]
unmatched_df = pd.DataFrame(
data=list(
zip(
unmatched_values,
[None] * len(unmatched_values),
[None] * len(unmatched_values),
)
),
columns=["source", "target", "similarity"],
)

return pd.concat([matches_df, unmatched_df], ignore_index=True)


def _match_values(
Expand Down Expand Up @@ -401,73 +467,6 @@ def _skip_values(unique_values: np.ndarray, max_length: int = 50):
return False


def preview_value_mappings(
    dataset: pd.DataFrame,
    column_mapping: Union[Tuple[str, str], pd.DataFrame],
    target: Union[str, pd.DataFrame] = "gdc",
    method: str = "tfidf",
) -> Union[pd.DataFrame, List[Dict]]:
    """
    Build the value mappings in a human-readable (tabular) format.

    Nothing is printed: the value matches computed by match_values() are
    converted into DataFrames and returned to the caller.

    Args:
        dataset (pd.DataFrame): The source dataset, forwarded to match_values().
        column_mapping (Union[Tuple[str, str], pd.DataFrame]): Either a tuple
            ``(source_column, target_column)`` or a DataFrame that is passed
            unchanged to match_values().
        target (Union[str, pd.DataFrame], optional): The target domain,
            forwarded to match_values(). Defaults to "gdc".
        method (str, optional): The value matching method name, forwarded to
            match_values(). Defaults to "tfidf".

    Returns:
        Union[pd.DataFrame, List[Dict]]: When column_mapping is a tuple, a
        single DataFrame with columns 'source', 'target' and 'similarity'
        (matched values first, then unmatched values with empty 'target' and
        'similarity'). Otherwise, a list with one dictionary per value
        matching result holding the keys 'source', 'target' and 'mapping',
        where 'mapping' is such a DataFrame.

    Raises:
        ValueError: If column_mapping is neither a DataFrame nor a tuple of
            exactly two elements.
    """
    is_single_mapping = isinstance(column_mapping, tuple)

    if isinstance(column_mapping, pd.DataFrame):
        mapping_df = column_mapping
    elif is_single_mapping and len(column_mapping) == 2:
        mapping_df = pd.DataFrame(
            [
                {
                    "source": column_mapping[0],
                    "target": column_mapping[1],
                }
            ]
        )
    else:
        # Also reached for tuples of the wrong length, which previously
        # failed with an IndexError or silently ignored the extra elements.
        raise ValueError(
            "The column_mapping must be a DataFrame or a tuple of two strings."
        )

    value_mappings = match_values(
        dataset, target=target, column_mapping=mapping_df, method=method
    )

    mapping_columns = ["source", "target", "similarity"]
    result = []
    for matching_result in value_mappings:
        # Transform matches and unmatched values into DataFrames that share
        # the same columns so they can be stacked into a single table.
        matches_df = pd.DataFrame(
            data=matching_result["matches"],
            columns=mapping_columns,
        )

        unmatched_values = matching_result["unmatch_values"]
        padding = [None] * len(unmatched_values)
        unmatched_df = pd.DataFrame(
            data=list(zip(unmatched_values, padding, padding)),
            columns=mapping_columns,
        )

        result.append(
            {
                "source": matching_result["source"],
                "target": matching_result["target"],
                "mapping": pd.concat([matches_df, unmatched_df], ignore_index=True),
            }
        )

    if is_single_mapping:
        # If only a single mapping is provided (as a tuple), we return the result
        # directly as a DataFrame to make it easier to display it in notebooks.
        assert len(result) == 1
        return result[0]["mapping"]

    return result


def preview_domain(
dataset: Union[str, pd.DataFrame],
column: str,
Expand Down Expand Up @@ -684,16 +683,16 @@ def create_mapper(
# so call this function recursively to create it
return create_mapper(input["mapper"])

# This could be the ouput of match_values(), so we can create a
# DictionaryMapper based on the value matches
# This could be a list of value matches (i.e., ValueMatch
# or tuple(source, target)) provided by the user
if "matches" in input and isinstance(input["matches"], List):
return _create_mapper_from_value_matches(input["matches"])

# This could be the ouput of preview_value_mappings(), so we can
# create a DictionaryMapper based on the value matches
if "mapping" in input and isinstance(input["mapping"], pd.DataFrame):
if "matches" in input and isinstance(input["matches"], pd.DataFrame):
# This could be the output of match_values(), so we can
# create a DictionaryMapper based on the value matches
return DictionaryMapper(
input["mapping"].set_index("source")["target"].to_dict()
input["matches"].set_index("source")["target"].to_dict()
)

# This could be the output of match_schema(), but the user did not
Expand Down
6 changes: 3 additions & 3 deletions examples/analyzing_one_attribute.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -833,7 +833,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"We can find the matches of these values using the method `preview_value_mappings`."
"We can find the matches of these values using the function `match_values()`."
]
},
{
Expand Down Expand Up @@ -957,7 +957,7 @@
}
],
"source": [
"value_mappings = bdi.preview_value_mappings(\n",
"value_mappings = bdi.match_values(\n",
" dataset,\n",
" column_mapping=('FIGO_stage', 'figo_stage'),\n",
" target='gdc'\n",
Expand Down Expand Up @@ -1002,7 +1002,7 @@
" display(preview_domain_target)\n",
" \n",
" print(f'Value mappings {column_mapping}:')\n",
" value_mappings = bdi.preview_value_mappings(dataset, column_mapping, target=target)\n",
" value_mappings = bdi.match_values(dataset, target=target, column_mapping=column_mapping)\n",
" display(value_mappings)\n",
" \n",
" return column_mapping, value_mappings"
Expand Down
20 changes: 10 additions & 10 deletions examples/getting-started.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -1384,7 +1384,7 @@
"### Finding correct value mappings\n",
"\n",
"After finding the correct column, we need to find appropriate value mappings. \n",
"Using `preview_value_mappings()`, we can inspect what the possible value mappings for this would look like after the harmonization.\n",
"Using `match_values()`, we can inspect what the possible value mappings for this would look like after the harmonization.\n",
"\n",
"`bdi-kit` implements multiple methods for value mapping discovery, including:\n",
"\n",
Expand Down Expand Up @@ -1469,7 +1469,7 @@
}
],
"source": [
"bdi.preview_value_mappings(\n",
"bdi.match_values(\n",
" dataset, column_mapping=(\"Histologic_type\", \"primary_diagnosis\"), target=\"gdc\", method=\"edit_distance\"\n",
")"
]
Expand Down Expand Up @@ -1548,7 +1548,7 @@
}
],
"source": [
"bdi.preview_value_mappings(\n",
"bdi.match_values(\n",
" dataset, column_mapping=(\"Histologic_type\", \"primary_diagnosis\"), target=\"gdc\", method=\"tfidf\"\n",
")"
]
Expand Down Expand Up @@ -1627,7 +1627,7 @@
}
],
"source": [
"bdi.preview_value_mappings(\n",
"bdi.match_values(\n",
" dataset, column_mapping=(\"Histologic_type\", \"primary_diagnosis\"), target=\"gdc\", method=\"embedding\"\n",
")"
]
Expand Down Expand Up @@ -2281,7 +2281,7 @@
}
],
"source": [
"mappings = bdi.preview_value_mappings(\n",
"mappings = bdi.match_values(\n",
" dataset,\n",
" column_mapping=column_mappings,\n",
" target=\"gdc\",\n",
Expand All @@ -2290,7 +2290,7 @@
"\n",
"for mapping in mappings:\n",
" print(f\"{mapping['source']} => {mapping['target']}\")\n",
" display(mapping[\"mapping\"])\n",
" display(mapping[\"matches\"])\n",
" print(\"\")"
]
},
Expand All @@ -2305,7 +2305,7 @@
"- Ethnicity\n",
"- Tumor_Site\n",
"\n",
"For race, we need to fix: `nan` -> `merican indian or alaska native`."
"For race, we need to fix: `nan` -> `american indian or alaska native`."
]
},
{
Expand Down Expand Up @@ -2389,7 +2389,7 @@
}
],
"source": [
"race_vmap = bdi.preview_value_mappings(\n",
"race_vmap = bdi.match_values(\n",
" dataset,\n",
" column_mapping=(\"Race\", \"race\"),\n",
" target=\"gdc\",\n",
Expand Down Expand Up @@ -2557,7 +2557,7 @@
}
],
"source": [
"ethinicity_vmap = bdi.preview_value_mappings(\n",
"ethinicity_vmap = bdi.match_values(\n",
" dataset,\n",
" column_mapping=(\"Ethnicity\", \"ethnicity\"),\n",
" target=\"gdc\",\n",
Expand Down Expand Up @@ -2711,7 +2711,7 @@
}
],
"source": [
"bdi.preview_value_mappings(\n",
"bdi.match_values(\n",
" dataset, column_mapping=(\"Tumor_Site\", \"tissue_or_organ_of_origin\"), target=\"gdc\", method=\"tfidf\"\n",
")"
]
Expand Down
8 changes: 4 additions & 4 deletions tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,8 +203,8 @@ def test_value_mapping_dataframe():
assert mapping is not None
assert mapping["source"] == "src_column"
assert mapping["target"] == "tgt_column"
assert isinstance(mapping["matches"], list)
assert len(mapping["matches"]) == 3
assert isinstance(mapping["matches"], pd.DataFrame)
assert len(mapping["matches"].index) == len(df_source.index)


def test_end_to_end_api_integration():
Expand Down Expand Up @@ -248,8 +248,8 @@ def test_end_to_end_api_integration():
assert mapping is not None
assert mapping["source"] == "src_column"
assert mapping["target"] == "tgt_column"
assert isinstance(mapping["matches"], list)
assert len(mapping["matches"]) == 3
assert isinstance(mapping["matches"], pd.DataFrame)
assert len(mapping["matches"]) == len(df_source)

# when: pass output of match_values() to materialize_mapping(),
df_mapped = bdi.materialize_mapping(df_source, value_mappings)
Expand Down

0 comments on commit e6881d1

Please sign in to comment.