Use a dictionary to send additional args

VIDA-NYU · Jul 17, 2024 · 32f47ed · 32f47ed
1 parent b6c8280
commit 32f47ed
Show file tree

Hide file tree

Showing 2 changed files with 15 additions and 25 deletions.
diff --git a/bdikit/api.py b/bdikit/api.py
@@ -33,9 +33,7 @@
 from bdikit.mapping_algorithms.scope_reducing._algorithms.contrastive_learning.cl_api import (
     DEFAULT_CL_MODEL,
 )
-from bdikit.mapping_algorithms.column_mapping.topk_matchers import (
-    CLTopkColumnMatcher,
-)
+from bdikit.mapping_algorithms.column_mapping.topk_matchers import CLTopkColumnMatcher
 from bdikit.mapping_algorithms.value_mapping.algorithms import (
     ValueMatch,
     BaseValueMatcher,
@@ -93,7 +91,7 @@ def match_schema(
     source: pd.DataFrame,
     target: Union[str, pd.DataFrame] = "gdc",
     method: Union[str, BaseSchemaMatcher] = DEFAULT_SCHEMA_MATCHING_METHOD,
-    **method_kwargs: Mapping[str, Any],
+    method_kwargs: Dict[str, Any] = None,
 ) -> pd.DataFrame:
     """
     Performs schema mapping between the source table and the given target schema. The
@@ -105,7 +103,7 @@ def match_schema(
         source (pd.DataFrame): The source table to be mapped.
         target (Union[str, pd.DataFrame], optional): The target table or standard data vocabulary. Defaults to "gdc".
         method (str, optional): The method used for mapping. Defaults to "coma".
-        method_kwargs (Mapping[str, Any], optional): The keyword arguments of the method for schema matching.
+        method_kwargs (Dict[str, Any], optional): The keyword arguments of the method for schema matching.
 
     Returns:
         pd.DataFrame: A DataFrame containing the mapping results with columns "source" and "target".
@@ -119,6 +117,8 @@ def match_schema(
         target_table = target
 
     if isinstance(method, str):
+        if method_kwargs is None:
+            method_kwargs = {}
         matcher_instance = SchemaMatchers.get_instance(method, **method_kwargs)
     elif isinstance(method, BaseSchemaMatcher):
         matcher_instance = method
@@ -292,7 +292,7 @@ def match_values(
     target: Union[str, pd.DataFrame],
     column_mapping: Union[Tuple[str, str], pd.DataFrame],
     method: str = DEFAULT_VALUE_MATCHING_METHOD,
-    **method_kwargs: Mapping[str, Any],
+    method_kwargs: Dict[str, Any] = None,
 ) -> Union[pd.DataFrame, List[Dict]]:
     """
     Finds matches between column values from the source dataset and column
@@ -316,7 +316,7 @@ def match_values(
 
         method (str, optional): The name of the method to use for value
           matching.
-        method_kwargs (Mapping[str, Any], optional): The keyword arguments of the
+        method_kwargs (Dict[str, Any], optional): The keyword arguments of the
             method for value matching.
 
     Returns:
@@ -337,12 +337,7 @@ def match_values(
         mapping_df = column_mapping
     elif isinstance(column_mapping, tuple):
         mapping_df = pd.DataFrame(
-            [
-                {
-                    "source": column_mapping[0],
-                    "target": column_mapping[1],
-                }
-            ]
+            [{"source": column_mapping[0], "target": column_mapping[1],}]
         )
     else:
         raise ValueError(
@@ -370,6 +365,8 @@ def match_values(
             "The target must be a DataFrame or a standard vocabulary name."
         )
 
+    if method_kwargs is None:
+        method_kwargs = {}
     value_matcher = ValueMatchers.get_instance(method, **method_kwargs)
     matches = _match_values(source, target_domain, column_mapping_dict, value_matcher)
 
@@ -398,8 +395,7 @@ def _value_matching_result_to_df(matching_result: ValueMatchingResult) -> pd.Dat
     Transforms the list of matches and unmatched values into a DataFrame.
     """
     matches_df = pd.DataFrame(
-        data=matching_result["matches"],
-        columns=["source", "target", "similarity"],
+        data=matching_result["matches"], columns=["source", "target", "similarity"],
     )
 
     unmatched_values = matching_result["unmatch_values"]
@@ -490,9 +486,7 @@ def _skip_values(unique_values: np.ndarray, max_length: int = 50):
 
 
 def preview_domain(
-    dataset: Union[str, pd.DataFrame],
-    column: str,
-    limit: Optional[int] = None,
+    dataset: Union[str, pd.DataFrame], column: str, limit: Optional[int] = None,
 ) -> pd.DataFrame:
     """
     Preview the domain, i.e. set of unique values, column description and value description
@@ -622,11 +616,7 @@ def check_duplicates(mappings: List):
         mapper = create_mapper(mapping)
 
         final_mappings.append(
-            {
-                "source": source_column,
-                "target": target_column,
-                "mapper": mapper,
-            }
+            {"source": source_column, "target": target_column, "mapper": mapper,}
         )
 
     return final_mappings

diff --git a/examples/changing_parameters.ipynb b/examples/changing_parameters.ipynb
@@ -545,7 +545,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "We can also send additional arguments for the matching algorithm. For instance, we can use BioBert model to improve the results, we just need to set the `model_path` parameter:"
+    "We can also send additional arguments for the matching algorithm. For instance, we can use the BioBert model to improve the results; we just need to set the `model_name` parameter through `method_kwargs`:"
    ]
   },
   {
@@ -627,7 +627,7 @@
     "        column_mapping=('Tumor_Site', 'tissue_or_organ_of_origin'),\n",
     "        target='gdc',\n",
     "        method='embedding',\n",
-    "        model_path='pritamdeka/BioBert-PubMed200kRCT'\n",
+    "        method_kwargs= {'model_name': 'pritamdeka/BioBert-PubMed200kRCT'}\n",
     "    )\n",
     "value_mappings"
    ]