Skip to content

Commit

Permalink
Expose parameters of the matching algorithms
Browse files Browse the repository at this point in the history
  • Loading branch information
roquelopez committed Jul 17, 2024
1 parent 26c9cf5 commit 828f6a9
Show file tree
Hide file tree
Showing 2 changed files with 116 additions and 20 deletions.
76 changes: 65 additions & 11 deletions bdikit/mapping_algorithms/column_mapping/algorithms.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import pandas as pd
from typing import Dict, Optional
from typing import Dict, Optional, Callable
from valentine import valentine_match
from valentine.algorithms import (
SimilarityFlooding,
Expand All @@ -10,6 +10,7 @@
BaseMatcher,
)
from valentine.algorithms.matcher_results import MatcherResults
from valentine.algorithms.jaccard_distance import StringDistanceFunction
from openai import OpenAI
from bdikit.mapping_algorithms.scope_reducing._algorithms.contrastive_learning.cl_api import (
DEFAULT_CL_MODEL,
Expand Down Expand Up @@ -48,28 +49,81 @@ def map(self, dataset: pd.DataFrame, global_table: pd.DataFrame) -> Dict[str, st


class SimFloodSchemaMatcher(ValentineSchemaMatcher):
def __init__(self):
super().__init__(SimilarityFlooding())
def __init__(
self, coeff_policy: str = "inverse_average", formula: str = "formula_c"
):
super().__init__(SimilarityFlooding(coeff_policy=coeff_policy, formula=formula))


class ComaSchemaMatcher(ValentineSchemaMatcher):
def __init__(self):
super().__init__(Coma())
def __init__(
self, max_n: int = 0, use_instances: bool = False, java_xmx: str = "1024m"
):
super().__init__(
Coma(max_n=max_n, use_instances=use_instances, java_xmx=java_xmx)
)


class CupidSchemaMatcher(ValentineSchemaMatcher):
def __init__(self):
super().__init__(Cupid())
def __init__(
self,
leaf_w_struct: float = 0.2,
w_struct: float = 0.2,
th_accept: float = 0.7,
th_high: float = 0.6,
th_low: float = 0.35,
c_inc: float = 1.2,
c_dec: float = 0.9,
th_ns: float = 0.7,
parallelism: int = 1,
):
super().__init__(
Cupid(
leaf_w_struct=leaf_w_struct,
w_struct=w_struct,
th_accept=th_accept,
th_high=th_high,
th_low=th_low,
c_inc=c_inc,
c_dec=c_dec,
th_ns=th_ns,
parallelism=parallelism,
)
)


class DistributionBasedSchemaMatcher(ValentineSchemaMatcher):
def __init__(self):
super().__init__(DistributionBased())
def __init__(
self,
threshold1: float = 0.15,
threshold2: float = 0.15,
quantiles: int = 256,
process_num: int = 1,
):
super().__init__(
DistributionBased(
threshold1=threshold1,
threshold2=threshold2,
quantiles=quantiles,
process_num=process_num,
)
)


class JaccardSchemaMatcher(ValentineSchemaMatcher):
def __init__(self):
super().__init__(JaccardDistanceMatcher())
def __init__(
self,
threshold_dist: float = 0.8,
distance_fun: Callable[[str, str], float] = StringDistanceFunction.Levenshtein,
process_num: int = 1,
):
super().__init__(
JaccardDistanceMatcher(
threshold_dist=threshold_dist,
distance_fun=distance_fun,
process_num=process_num,
)
)


class GPTSchemaMatcher(BaseSchemaMatcher):
Expand Down
60 changes: 51 additions & 9 deletions bdikit/mapping_algorithms/value_mapping/algorithms.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import List, NamedTuple, Callable
from typing import List, NamedTuple, Callable, Tuple
import ast
from openai import OpenAI
from polyfuzz import PolyFuzz
Expand Down Expand Up @@ -68,22 +68,42 @@ class TFIDFValueMatcher(PolyFuzzValueMatcher):
Value matching algorithm based on the TF-IDF similarity between values.
"""

def __init__(self):
super().__init__(PolyFuzz(method=TFIDF(n_gram_range=(1, 3), min_similarity=0)))
def __init__(
self,
n_gram_range: Tuple[int, int] = (1, 3),
clean_string: bool = True,
min_similarity: float = 0.0,
top_n: int = 1,
cosine_method: str = "sparse",
):

super().__init__(
PolyFuzz(
method=TFIDF(
n_gram_range=n_gram_range,
clean_string=clean_string,
min_similarity=min_similarity,
top_n=top_n,
cosine_method=cosine_method,
)
)
)


class EditDistanceValueMatcher(PolyFuzzValueMatcher):
"""
Value matching algorithm based on the edit distance between values.
"""

def __init__(self, scorer: Callable[[str, str], float] = fuzz.ratio):
def __init__(
self, scorer: Callable[[str, str], float] = fuzz.ratio, n_jobs: int = -1
):
# Return scores between 0 and 1
normalized_scorer = lambda str1, str2: scorer(str1, str2) / 100.0
super().__init__(
PolyFuzz(
method=EditDistance(
n_jobs=-1, scorer=normalized_scorer, normalize=False
n_jobs=n_jobs, scorer=normalized_scorer, normalize=False
)
)
)
Expand All @@ -94,9 +114,20 @@ class EmbeddingValueMatcher(PolyFuzzValueMatcher):
Value matching algorithm based on the cosine similarity of value embeddings.
"""

def __init__(self, model_path: str = "bert-base-multilingual-cased"):
def __init__(
self,
model_path: str = "bert-base-multilingual-cased",
min_similarity: float = 0.0,
top_n: int = 1,
cosine_method: str = "sparse",
):
embeddings = TransformerWordEmbeddings(model_path)
method = Embeddings(embeddings, min_similarity=0, model_id="embedding_model")
method = Embeddings(
embeddings,
min_similarity=min_similarity,
top_n=top_n,
cosine_method=cosine_method,
)
super().__init__(PolyFuzz(method))


Expand All @@ -105,9 +136,20 @@ class FastTextAlgorithm(PolyFuzzValueMatcher):
Value matching algorithm based on the cosine similarity of FastText embeddings.
"""

def __init__(self, model_name: str = "en-crawl"):
def __init__(
self,
model_name: str = "en-crawl",
min_similarity: float = 0.0,
top_n: int = 1,
cosine_method: str = "sparse",
):
embeddings = WordEmbeddings(model_name)
method = Embeddings(embeddings, min_similarity=0)
method = Embeddings(
embeddings,
min_similarity=min_similarity,
top_n=top_n,
cosine_method=cosine_method,
)
super().__init__(PolyFuzz(method))


Expand Down

0 comments on commit 828f6a9

Please sign in to comment.