Skip to content

Commit

Permalink
Use a dictionary to send additional args
Browse files Browse the repository at this point in the history
  • Loading branch information
roquelopez committed Jul 17, 2024
1 parent b6c8280 commit 32f47ed
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 25 deletions.
36 changes: 13 additions & 23 deletions bdikit/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,7 @@
from bdikit.mapping_algorithms.scope_reducing._algorithms.contrastive_learning.cl_api import (
DEFAULT_CL_MODEL,
)
from bdikit.mapping_algorithms.column_mapping.topk_matchers import (
CLTopkColumnMatcher,
)
from bdikit.mapping_algorithms.column_mapping.topk_matchers import CLTopkColumnMatcher
from bdikit.mapping_algorithms.value_mapping.algorithms import (
ValueMatch,
BaseValueMatcher,
Expand Down Expand Up @@ -93,7 +91,7 @@ def match_schema(
source: pd.DataFrame,
target: Union[str, pd.DataFrame] = "gdc",
method: Union[str, BaseSchemaMatcher] = DEFAULT_SCHEMA_MATCHING_METHOD,
**method_kwargs: Mapping[str, Any],
method_kwargs: Dict[str, Any] = None,
) -> pd.DataFrame:
"""
Performs schema mapping between the source table and the given target schema. The
Expand All @@ -105,7 +103,7 @@ def match_schema(
source (pd.DataFrame): The source table to be mapped.
target (Union[str, pd.DataFrame], optional): The target table or standard data vocabulary. Defaults to "gdc".
method (str, optional): The method used for mapping. Defaults to "coma".
method_kwargs (Mapping[str, Any], optional): The keyword arguments of the method for schema matching.
method_kwargs (Dict[str, Any], optional): The keyword arguments of the method for schema matching.
Returns:
pd.DataFrame: A DataFrame containing the mapping results with columns "source" and "target".
Expand All @@ -119,6 +117,8 @@ def match_schema(
target_table = target

if isinstance(method, str):
if method_kwargs is None:
method_kwargs = {}
matcher_instance = SchemaMatchers.get_instance(method, **method_kwargs)
elif isinstance(method, BaseSchemaMatcher):
matcher_instance = method
Expand Down Expand Up @@ -292,7 +292,7 @@ def match_values(
target: Union[str, pd.DataFrame],
column_mapping: Union[Tuple[str, str], pd.DataFrame],
method: str = DEFAULT_VALUE_MATCHING_METHOD,
**method_kwargs: Mapping[str, Any],
method_kwargs: Dict[str, Any] = None,
) -> Union[pd.DataFrame, List[Dict]]:
"""
Finds matches between column values from the source dataset and column
Expand All @@ -316,7 +316,7 @@ def match_values(
method (str, optional): The name of the method to use for value
matching.
method_kwargs (Mapping[str, Any], optional): The keyword arguments of the
method_kwargs (Dict[str, Any], optional): The keyword arguments of the
method for value matching.
Returns:
Expand All @@ -337,12 +337,7 @@ def match_values(
mapping_df = column_mapping
elif isinstance(column_mapping, tuple):
mapping_df = pd.DataFrame(
[
{
"source": column_mapping[0],
"target": column_mapping[1],
}
]
[{"source": column_mapping[0], "target": column_mapping[1],}]
)
else:
raise ValueError(
Expand Down Expand Up @@ -370,6 +365,8 @@ def match_values(
"The target must be a DataFrame or a standard vocabulary name."
)

if method_kwargs is None:
method_kwargs = {}
value_matcher = ValueMatchers.get_instance(method, **method_kwargs)
matches = _match_values(source, target_domain, column_mapping_dict, value_matcher)

Expand Down Expand Up @@ -398,8 +395,7 @@ def _value_matching_result_to_df(matching_result: ValueMatchingResult) -> pd.Dat
Transforms the list of matches and unmatched values into a DataFrame.
"""
matches_df = pd.DataFrame(
data=matching_result["matches"],
columns=["source", "target", "similarity"],
data=matching_result["matches"], columns=["source", "target", "similarity"],
)

unmatched_values = matching_result["unmatch_values"]
Expand Down Expand Up @@ -490,9 +486,7 @@ def _skip_values(unique_values: np.ndarray, max_length: int = 50):


def preview_domain(
dataset: Union[str, pd.DataFrame],
column: str,
limit: Optional[int] = None,
dataset: Union[str, pd.DataFrame], column: str, limit: Optional[int] = None,
) -> pd.DataFrame:
"""
Preview the domain, i.e. set of unique values, column description and value description
Expand Down Expand Up @@ -622,11 +616,7 @@ def check_duplicates(mappings: List):
mapper = create_mapper(mapping)

final_mappings.append(
{
"source": source_column,
"target": target_column,
"mapper": mapper,
}
{"source": source_column, "target": target_column, "mapper": mapper,}
)

return final_mappings
Expand Down
4 changes: 2 additions & 2 deletions examples/changing_parameters.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -545,7 +545,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"We can also send additional arguments for the matching algorithm. For instance, we can use BioBert model to improve the results, we just need to set the `model_path` parameter:"
"We can also send additional arguments for the matching algorithm. For instance, we can use the BioBert model to improve the results; we just need to set the `model_name` parameter through `method_kwargs`:"
]
},
{
Expand Down Expand Up @@ -627,7 +627,7 @@
" column_mapping=('Tumor_Site', 'tissue_or_organ_of_origin'),\n",
" target='gdc',\n",
" method='embedding',\n",
" model_path='pritamdeka/BioBert-PubMed200kRCT'\n",
" method_kwargs= {'model_name': 'pritamdeka/BioBert-PubMed200kRCT'}\n",
" )\n",
"value_mappings"
]
Expand Down

0 comments on commit 32f47ed

Please sign in to comment.