Data integration API implementation #53

Merged
merged 24 commits on Jun 27, 2024
Changes from 5 commits
Commits
24 commits
d3ff7f0
feat(api): Add new function bdi.match_columns()
aecio Jun 6, 2024
ca8f0b8
feat(api): Add new function bdi.top_matches()
aecio Jun 6, 2024
ee9c09f
feat(api): Added bdi.materialize_mapping() and basic value mappers
aecio Jun 7, 2024
49ab313
Merge branch 'devel' into new_api
aecio Jun 7, 2024
adae6d7
Change column matching API to be stateless
aecio Jun 7, 2024
bc79764
Adding TwoPhase ColumnMatch algorithm based on CTLearning
EduardoPena Jun 10, 2024
ed4d42c
Formatting with black, small fix for columns with no matches
EduardoPena Jun 10, 2024
5e1f147
Formatting using black variation
EduardoPena Jun 10, 2024
c85de2b
Moving model_name, top_k to constructor parameters for better class reuse
EduardoPena Jun 11, 2024
176fd14
Merge pull request #55 from VIDA-NYU/new_api_column_matching_algs
EduardoPena Jun 11, 2024
838f7e6
feat(api): add match_values() and preview_value_mappings()
aecio Jun 12, 2024
1e71758
test(api): Add end-to-end API integration test
aecio Jun 20, 2024
3fc6b94
feat(api): Add bdi.preview_domains()
aecio Jun 21, 2024
ebe528b
refactor(api): Make API inputs more compatible
aecio Jun 22, 2024
c1d3812
feat(api): Support an object as a method in match_columns()
aecio Jun 24, 2024
c84023a
Cast dictionary key to string
roquelopez Jun 25, 2024
718b3d6
Fix #57
roquelopez Jun 25, 2024
e55f070
Change default algorithm for column mapping
roquelopez Jun 25, 2024
b4f8fb1
Make EditAlgorithm return values between 0 and 1
roquelopez Jun 25, 2024
39fdfd9
Update threshold to fix test
roquelopez Jun 25, 2024
be7c1bd
refactor: Extract TopkColumnMatcher from ContrastiveLearningAPI
aecio Jun 26, 2024
154f5e9
Additional documentation and implementation improvements
aecio Jun 27, 2024
5f65b62
Rename notebook to doc_gdc_harmonization.ipynb
aecio Jun 27, 2024
732c83d
Fix API documentation
roquelopez Jun 27, 2024
1 change: 1 addition & 0 deletions bdikit/__init__.py
@@ -1,3 +1,4 @@
 __version__ = "0.2.0.dev0"
 # To shortcut the import path
 from bdikit.api import APIManager
+from bdikit.functional_api import *
138 changes: 138 additions & 0 deletions bdikit/functional_api.py
@@ -0,0 +1,138 @@
from enum import Enum
from os.path import join, dirname
from typing import Union, Type, List, Optional
import pandas as pd
from bdikit.download import get_cached_model_or_download
from bdikit.mapping_algorithms.column_mapping.algorithms import (
    BaseColumnMappingAlgorithm,
    SimFloodAlgorithm,
    ComaAlgorithm,
    CupidAlgorithm,
    DistributionBasedAlgorithm,
    JaccardDistanceAlgorithm,
    GPTAlgorithm,
)
from bdikit.mapping_algorithms.value_mapping.value_mappers import ValueMapper
from bdikit.mapping_algorithms.scope_reducing._algorithms.contrastive_learning.cl_api import (
    ContrastiveLearningAPI,
)

GDC_DATA_PATH = join(dirname(__file__), "./resource/gdc_table.csv")


class ColumnMappingMethod(Enum):
    SIMFLOOD = ("similarity_flooding", SimFloodAlgorithm)
    COMA = ("coma", ComaAlgorithm)
    CUPID = ("cupid", CupidAlgorithm)
    DISTRIBUTION_BASED = ("distribution_based", DistributionBasedAlgorithm)
    JACCARD_DISTANCE = ("jaccard_distance", JaccardDistanceAlgorithm)
    GPT = ("gpt", GPTAlgorithm)

    def __init__(
        self, method_name: str, method_class: Type[BaseColumnMappingAlgorithm]
    ):
        self.method_name = method_name
        self.method_class = method_class

    @staticmethod
    def get_instance(method_name: str) -> BaseColumnMappingAlgorithm:
        methods = {
            method.method_name: method.method_class for method in ColumnMappingMethod
        }
        try:
            return methods[method_name]()
        except KeyError:
            names = ", ".join(list(methods.keys()))
            raise ValueError(
                f"The {method_name} algorithm is not supported. "
                f"Supported algorithms are: {names}"
            )


def match_columns(
    source: pd.DataFrame,
    target: Union[str, pd.DataFrame] = "gdc",
    method: str = ColumnMappingMethod.SIMFLOOD.method_name,
) -> pd.DataFrame:
    """
    Performs schema mapping between the source table and the given target. The
    target is either a DataFrame or a string representing a standard data vocabulary.
    """
    if isinstance(target, str):
        target_table = _load_table_for_standard(target)
    else:
        target_table = target

    matcher_instance = ColumnMappingMethod.get_instance(method)
    matches = matcher_instance.map(source, target_table)

    return pd.DataFrame(matches.items(), columns=["source", "target"])


def _load_table_for_standard(name: str) -> pd.DataFrame:
    """
    Load the table for the given standard data vocabulary. Currently, only the
    GDC standard is supported.
    """
    if name == "gdc":
        return pd.read_csv(GDC_DATA_PATH)
    else:
        raise ValueError(f"The {name} standard is not supported")


def top_matches(
    source: pd.DataFrame,
    columns: Optional[List[str]] = None,
    target: Union[str, pd.DataFrame] = "gdc",
    top_k: int = 10,
) -> pd.DataFrame:
    """
    Returns the top-k matches between the source and target tables.
    """

    if isinstance(target, str):
        target_table = _load_table_for_standard(target)
    else:
        target_table = target

    if columns is not None and len(columns) > 0:
        selected_columns = source[columns]
    else:
        selected_columns = source

    model_path = get_cached_model_or_download("cl-reducer-v0.1")
    api = ContrastiveLearningAPI(model_path=model_path, top_k=top_k)
    _, scopes_json = api.get_recommendations(selected_columns, target=target_table)

    dfs = []
    for scope in scopes_json:
        matches = pd.DataFrame(
            scope["Top k columns"], columns=["matches", "similarity"]
        )
        matches["source"] = scope["Candidate column"]
        matches = matches[["source", "matches", "similarity"]]
        dfs.append(matches.sort_values(by="similarity", ascending=False))

    return pd.concat(dfs, ignore_index=True)


def materialize_mapping(
    input_dataframe: pd.DataFrame, target: List[dict]
) -> pd.DataFrame:
    output_dataframe = pd.DataFrame()
    for mapping_spec in target:
        from_column_name = mapping_spec["from"]
        to_column_name = mapping_spec["to"]
        value_mapper = mapping_spec["mapper"]
        output_dataframe[to_column_name] = map_column_values(
            input_dataframe[from_column_name], to_column_name, value_mapper
        )
    return output_dataframe


def map_column_values(
    input_column: pd.Series, target: str, value_mapper: ValueMapper
) -> pd.Series:
    new_column = value_mapper.map(input_column)
    new_column.name = target
    return new_column
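The functions above make up the core of the new functional API. The following is a rough usage sketch based only on the signatures in this file; the source CSV file, its column names, and the value-mapping dictionary are hypothetical:

import pandas as pd
import bdikit as bdi
from bdikit.mapping_algorithms.value_mapping.value_mappers import DictionaryMapper

# Hypothetical source table.
source = pd.read_csv("source.csv")

# Map source columns to the GDC vocabulary with an explicitly chosen matcher.
column_mappings = bdi.match_columns(source, target="gdc", method="jaccard_distance")

# Inspect the top-10 candidate target columns for a single source column.
candidates = bdi.top_matches(source, columns=["Gender"], target="gdc", top_k=10)

# Materialize a harmonized table from an explicit mapping specification.
harmonized = bdi.materialize_mapping(
    source,
    [
        {
            "from": "Gender",
            "to": "gender",
            "mapper": DictionaryMapper({"M": "male", "F": "female"}),
        }
    ],
)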
46 changes: 19 additions & 27 deletions bdikit/mapping_algorithms/column_mapping/algorithms.py
@@ -14,23 +14,16 @@
 
 
 class BaseColumnMappingAlgorithm:
-    def __init__(self, dataset, global_table):
-        self._dataset = dataset
-        self._global_table = global_table
-
-    def map(self) -> Dict[str, str]:
+    def map(self, dataset: pd.DataFrame, global_table: pd.DataFrame) -> Dict[str, str]:
         raise NotImplementedError("Subclasses must implement this method")
 
 
 class ValentineColumnMappingAlgorithm(BaseColumnMappingAlgorithm):
-    def __init__(self, dataset, global_table, matcher: BaseMatcher):
-        super().__init__(dataset, global_table)
+    def __init__(self, matcher: BaseMatcher):
         self.matcher = matcher
 
-    def map(self) -> Dict[str, str]:
-        matches: MatcherResults = valentine_match(
-            self._dataset, self._global_table, self.matcher
-        )
+    def map(self, dataset: pd.DataFrame, global_table: pd.DataFrame) -> Dict[str, str]:
+        matches: MatcherResults = valentine_match(dataset, global_table, self.matcher)
         mappings = {}
         for match in matches.one_to_one():
             dataset_candidate = match[0][1]
@@ -40,42 +33,41 @@ def map(self) -> Dict[str, str]:
 
 
 class SimFloodAlgorithm(ValentineColumnMappingAlgorithm):
-    def __init__(self, dataset, global_table):
-        super().__init__(dataset, global_table, SimilarityFlooding())
+    def __init__(self):
+        super().__init__(SimilarityFlooding())
 
 
 class ComaAlgorithm(ValentineColumnMappingAlgorithm):
-    def __init__(self, dataset, global_table):
-        super().__init__(dataset, global_table, Coma())
+    def __init__(self):
+        super().__init__(Coma())
 
 
 class CupidAlgorithm(ValentineColumnMappingAlgorithm):
-    def __init__(self, dataset, global_table):
-        super().__init__(dataset, global_table, Cupid())
+    def __init__(self):
+        super().__init__(Cupid())
 
 
 class DistributionBasedAlgorithm(ValentineColumnMappingAlgorithm):
-    def __init__(self, dataset, global_table):
-        super().__init__(dataset, global_table, DistributionBased())
+    def __init__(self):
+        super().__init__(DistributionBased())
 
 
 class JaccardDistanceAlgorithm(ValentineColumnMappingAlgorithm):
-    def __init__(self, dataset, global_table):
-        super().__init__(dataset, global_table, JaccardDistanceMatcher())
+    def __init__(self):
+        super().__init__(JaccardDistanceMatcher())
 
 
 class GPTAlgorithm(BaseColumnMappingAlgorithm):
-    def __init__(self, dataset, global_table):
-        super().__init__(dataset, global_table)
+    def __init__(self):
         self.client = OpenAI()
 
-    def map(self):
-        global_columns = self._global_table.columns
+    def map(self, dataset: pd.DataFrame, global_table: pd.DataFrame):
+        global_columns = global_table.columns
         labels = ", ".join(global_columns)
-        candidate_columns = self._dataset.columns
+        candidate_columns = dataset.columns
         mappings = {}
         for column in candidate_columns:
-            col = self._dataset[column]
+            col = dataset[column]
             values = col.drop_duplicates().dropna()
             if len(values) > 15:
                 rows = values.sample(15).tolist()
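After this refactoring the matchers are stateless: they are constructed without any data and receive both tables through map(). A minimal sketch of the resulting call pattern, with the two toy tables invented for illustration:

import pandas as pd
from bdikit.mapping_algorithms.column_mapping.algorithms import JaccardDistanceAlgorithm

# Hypothetical toy tables; any two DataFrames with named columns work.
source = pd.DataFrame({"patient_gender": ["M", "F"], "age_at_dx": [61, 54]})
target = pd.DataFrame({"gender": ["male", "female"], "age_at_diagnosis": [50, 70]})

# The matcher holds no table state, so one instance can be reused across tables.
matcher = JaccardDistanceAlgorithm()
mappings = matcher.map(source, target)  # dict of source column -> target column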
bdikit/mapping_algorithms/scope_reducing/_algorithms/contrastive_learning/cl_api.py
@@ -1,5 +1,5 @@
 import os
-from typing import List
+from typing import List, Optional, Union
 
 import numpy as np
 import pandas as pd
@@ -37,8 +37,16 @@ def load_checkpoint(self, lm="roberta"):
 
         return model
 
-    def get_recommendations(self, table: pd.DataFrame):
-        gdc_ds = pd.read_csv(GDC_TABLE_PATH)
+    def get_recommendations(
+        self, table: pd.DataFrame, target: Optional[Union[str, pd.DataFrame]] = None
+    ):
+        if target is None or (isinstance(target, str) and target == "gdc"):
+            gdc_ds = pd.read_csv(GDC_TABLE_PATH)
+        elif isinstance(target, pd.DataFrame):
+            gdc_ds = target
+        else:
+            raise ValueError("Target must be a DataFrame or 'gdc'")
+
         l_features = self._load_table_tokens(table)
         r_features = self._load_table_tokens(gdc_ds)
         cosine_sim = cosine_similarity(l_features, r_features)
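The new target parameter lets the contrastive-learning recommender rank columns against an arbitrary DataFrame instead of only the bundled GDC table. A rough sketch of a direct call, mirroring how top_matches() in functional_api.py uses this class; the tables are hypothetical and the model name is the one referenced there:

import pandas as pd
from bdikit.download import get_cached_model_or_download
from bdikit.mapping_algorithms.scope_reducing._algorithms.contrastive_learning.cl_api import (
    ContrastiveLearningAPI,
)

source = pd.DataFrame({"Gender": ["M", "F"]})        # hypothetical source table
custom_target = pd.DataFrame({"gender": ["male"]})   # hypothetical target vocabulary

model_path = get_cached_model_or_download("cl-reducer-v0.1")
api = ContrastiveLearningAPI(model_path=model_path, top_k=5)

# Passing a DataFrame ranks against it; passing "gdc" or nothing uses the GDC table.
_, scopes_json = api.get_recommendations(source, target=custom_target)
for scope in scopes_json:
    print(scope["Candidate column"], scope["Top k columns"])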
62 changes: 62 additions & 0 deletions bdikit/mapping_algorithms/value_mapping/value_mappers.py
@@ -0,0 +1,62 @@
import pandas as pd


class ValueMapper:
    """
    A ValueMapper represents objects that transform the values in an input
    column into the values of a new output column.
    """

    def map(self, input_column: pd.Series) -> pd.Series:
        """
        Every concrete ValueMapper should implement this method, which takes a
        pandas Series as input and returns a new pandas Series with transformed
        values.
        """
        pass


class IdentityValueMapper(ValueMapper):
    """
    A column mapper that maps each value in the input column to itself.
    """

    def map(self, input_column: pd.Series) -> pd.Series:
        """
        Simply copies the values in input_column to the output column.
        """
        return input_column.copy()


class FunctionValueMapper(ValueMapper):
    """
    A column mapper that transforms each value in the input column using the
    provided custom function.
    """

    def __init__(self, function):
        self.function = function

    def map(self, input_column: pd.Series) -> pd.Series:
        """
        Applies the given function to each value in input_column to generate
        the output column.
        """
        return input_column.map(self.function)


class DictionaryMapper(ValueMapper):
    """
    A column mapper that transforms each value in the input column using the
    values stored in the provided dictionary.
    """

    def __init__(self, dictionary: dict):
        self.dictionary = dictionary

    def map(self, input_column: pd.Series) -> pd.Series:
        """
        Transforms the values in input_column into the values specified in
        the dictionary provided to the object constructor.
        """
        return input_column.map(self.dictionary)
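A short sketch of how these three mappers might be applied to a column; the input Series and the mapping dictionary are invented for illustration:

import pandas as pd
from bdikit.mapping_algorithms.value_mapping.value_mappers import (
    IdentityValueMapper,
    FunctionValueMapper,
    DictionaryMapper,
)

gender = pd.Series(["M", "F", "M"], name="Gender")  # hypothetical input column

IdentityValueMapper().map(gender)                           # copies values unchanged
FunctionValueMapper(str.lower).map(gender)                  # "m", "f", "m"
DictionaryMapper({"M": "male", "F": "female"}).map(gender)  # "male", "female", "male"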