Expose parameters of the matching algorithms

VIDA-NYU · Jul 17, 2024 · 828f6a9
1 parent 26c9cf5
commit 828f6a9
Show file tree

Hide file tree

Showing 2 changed files with 116 additions and 20 deletions.
diff --git a/bdikit/mapping_algorithms/column_mapping/algorithms.py b/bdikit/mapping_algorithms/column_mapping/algorithms.py
@@ -1,5 +1,5 @@
 import pandas as pd
-from typing import Dict, Optional
+from typing import Dict, Optional, Callable
 from valentine import valentine_match
 from valentine.algorithms import (
     SimilarityFlooding,
@@ -10,6 +10,7 @@
     BaseMatcher,
 )
 from valentine.algorithms.matcher_results import MatcherResults
+from valentine.algorithms.jaccard_distance import StringDistanceFunction
 from openai import OpenAI
 from bdikit.mapping_algorithms.scope_reducing._algorithms.contrastive_learning.cl_api import (
     DEFAULT_CL_MODEL,
@@ -48,28 +49,81 @@ def map(self, dataset: pd.DataFrame, global_table: pd.DataFrame) -> Dict[str, st
 
 
 class SimFloodSchemaMatcher(ValentineSchemaMatcher):
-    def __init__(self):
-        super().__init__(SimilarityFlooding())
+    def __init__(
+        self, coeff_policy: str = "inverse_average", formula: str = "formula_c"
+    ):
+        super().__init__(SimilarityFlooding(coeff_policy=coeff_policy, formula=formula))
 
 
 class ComaSchemaMatcher(ValentineSchemaMatcher):
-    def __init__(self):
-        super().__init__(Coma())
+    def __init__(
+        self, max_n: int = 0, use_instances: bool = False, java_xmx: str = "1024m"
+    ):
+        super().__init__(
+            Coma(max_n=max_n, use_instances=use_instances, java_xmx=java_xmx)
+        )
 
 
 class CupidSchemaMatcher(ValentineSchemaMatcher):
-    def __init__(self):
-        super().__init__(Cupid())
+    def __init__(
+        self,
+        leaf_w_struct: float = 0.2,
+        w_struct: float = 0.2,
+        th_accept: float = 0.7,
+        th_high: float = 0.6,
+        th_low: float = 0.35,
+        c_inc: float = 1.2,
+        c_dec: float = 0.9,
+        th_ns: float = 0.7,
+        parallelism: int = 1,
+    ):
+        super().__init__(
+            Cupid(
+                leaf_w_struct=leaf_w_struct,
+                w_struct=w_struct,
+                th_accept=th_accept,
+                th_high=th_high,
+                th_low=th_low,
+                c_inc=c_inc,
+                c_dec=c_dec,
+                th_ns=th_ns,
+                parallelism=parallelism,
+            )
+        )
 
 
 class DistributionBasedSchemaMatcher(ValentineSchemaMatcher):
-    def __init__(self):
-        super().__init__(DistributionBased())
+    def __init__(
+        self,
+        threshold1: float = 0.15,
+        threshold2: float = 0.15,
+        quantiles: int = 256,
+        process_num: int = 1,
+    ):
+        super().__init__(
+            DistributionBased(
+                threshold1=threshold1,
+                threshold2=threshold2,
+                quantiles=quantiles,
+                process_num=process_num,
+            )
+        )
 
 
 class JaccardSchemaMatcher(ValentineSchemaMatcher):
-    def __init__(self):
-        super().__init__(JaccardDistanceMatcher())
+    def __init__(
+        self,
+        threshold_dist: float = 0.8,
+        distance_fun: Callable[[str, str], float] = StringDistanceFunction.Levenshtein,
+        process_num: int = 1,
+    ):
+        super().__init__(
+            JaccardDistanceMatcher(
+                threshold_dist=threshold_dist,
+                distance_fun=distance_fun,
+                process_num=process_num,
+            )
+        )
 
 
 class GPTSchemaMatcher(BaseSchemaMatcher):

diff --git a/bdikit/mapping_algorithms/value_mapping/algorithms.py b/bdikit/mapping_algorithms/value_mapping/algorithms.py
@@ -1,4 +1,4 @@
-from typing import List, NamedTuple, Callable
+from typing import List, NamedTuple, Callable, Tuple
 import ast
 from openai import OpenAI
 from polyfuzz import PolyFuzz
@@ -68,22 +68,42 @@ class TFIDFValueMatcher(PolyFuzzValueMatcher):
     Value matching algorithm based on the TF-IDF similarity between values.
     """
 
-    def __init__(self):
-        super().__init__(PolyFuzz(method=TFIDF(n_gram_range=(1, 3), min_similarity=0)))
+    def __init__(
+        self,
+        n_gram_range: Tuple[int, int] = (1, 3),
+        clean_string: bool = True,
+        min_similarity: float = 0.0,
+        top_n: int = 1,
+        cosine_method: str = "sparse",
+    ):
+
+        super().__init__(
+            PolyFuzz(
+                method=TFIDF(
+                    n_gram_range=n_gram_range,
+                    clean_string=clean_string,
+                    min_similarity=min_similarity,
+                    top_n=top_n,
+                    cosine_method=cosine_method,
+                )
+            )
+        )
 
 
 class EditDistanceValueMatcher(PolyFuzzValueMatcher):
     """
     Value matching algorithm based on the edit distance between values.
     """
 
-    def __init__(self, scorer: Callable[[str, str], float] = fuzz.ratio):
+    def __init__(
+        self, scorer: Callable[[str, str], float] = fuzz.ratio, n_jobs: int = -1
+    ):
         # Return scores between 0 and 1
         normalized_scorer = lambda str1, str2: scorer(str1, str2) / 100.0
         super().__init__(
             PolyFuzz(
                 method=EditDistance(
-                    n_jobs=-1, scorer=normalized_scorer, normalize=False
+                    n_jobs=n_jobs, scorer=normalized_scorer, normalize=False
                 )
             )
         )
@@ -94,9 +114,20 @@ class EmbeddingValueMatcher(PolyFuzzValueMatcher):
     Value matching algorithm based on the cosine similarity of value embeddings.
     """
 
-    def __init__(self, model_path: str = "bert-base-multilingual-cased"):
+    def __init__(
+        self,
+        model_path: str = "bert-base-multilingual-cased",
+        min_similarity: float = 0.0,
+        top_n: int = 1,
+        cosine_method: str = "sparse",
+    ):
         embeddings = TransformerWordEmbeddings(model_path)
-        method = Embeddings(embeddings, min_similarity=0, model_id="embedding_model")
+        method = Embeddings(
+            embeddings,
+            min_similarity=min_similarity,
+            top_n=top_n,
+            cosine_method=cosine_method,
+        )
         super().__init__(PolyFuzz(method))
 
 
@@ -105,9 +136,20 @@ class FastTextAlgorithm(PolyFuzzValueMatcher):
     Value matching algorithm based on the cosine similarity of FastText embeddings.
     """
 
-    def __init__(self, model_name: str = "en-crawl"):
+    def __init__(
+        self,
+        model_name: str = "en-crawl",
+        min_similarity: float = 0.0,
+        top_n: int = 1,
+        cosine_method: str = "sparse",
+    ):
         embeddings = WordEmbeddings(model_name)
-        method = Embeddings(embeddings, min_similarity=0)
+        method = Embeddings(
+            embeddings,
+            min_similarity=min_similarity,
+            top_n=top_n,
+            cosine_method=cosine_method,
+        )
         super().__init__(PolyFuzz(method))