diff --git a/bdikit/mapping_algorithms/column_mapping/algorithms.py b/bdikit/mapping_algorithms/column_mapping/algorithms.py index 417b7d81..3560db2a 100644 --- a/bdikit/mapping_algorithms/column_mapping/algorithms.py +++ b/bdikit/mapping_algorithms/column_mapping/algorithms.py @@ -1,5 +1,5 @@ import pandas as pd -from typing import Dict, Optional +from typing import Dict, Optional, Callable from valentine import valentine_match from valentine.algorithms import ( SimilarityFlooding, @@ -10,6 +10,7 @@ BaseMatcher, ) from valentine.algorithms.matcher_results import MatcherResults +from valentine.algorithms.jaccard_distance import StringDistanceFunction from openai import OpenAI from bdikit.mapping_algorithms.scope_reducing._algorithms.contrastive_learning.cl_api import ( DEFAULT_CL_MODEL, @@ -48,28 +49,81 @@ def map(self, dataset: pd.DataFrame, global_table: pd.DataFrame) -> Dict[str, st class SimFloodSchemaMatcher(ValentineSchemaMatcher): - def __init__(self): - super().__init__(SimilarityFlooding()) + def __init__( + self, coeff_policy: str = "inverse_average", formula: str = "formula_c" + ): + super().__init__(SimilarityFlooding(coeff_policy=coeff_policy, formula=formula)) class ComaSchemaMatcher(ValentineSchemaMatcher): - def __init__(self): - super().__init__(Coma()) + def __init__( + self, max_n: int = 0, use_instances: bool = False, java_xmx: str = "1024m" + ): + super().__init__( + Coma(max_n=max_n, use_instances=use_instances, java_xmx=java_xmx) + ) class CupidSchemaMatcher(ValentineSchemaMatcher): - def __init__(self): - super().__init__(Cupid()) + def __init__( + self, + leaf_w_struct: float = 0.2, + w_struct: float = 0.2, + th_accept: float = 0.7, + th_high: float = 0.6, + th_low: float = 0.35, + c_inc: float = 1.2, + c_dec: float = 0.9, + th_ns: float = 0.7, + parallelism: int = 1, + ): + super().__init__( + Cupid( + leaf_w_struct=leaf_w_struct, + w_struct=w_struct, + th_accept=th_accept, + th_high=th_high, + th_low=th_low, + c_inc=c_inc, + c_dec=c_dec, 
+ th_ns=th_ns, + parallelism=parallelism, + ) + ) class DistributionBasedSchemaMatcher(ValentineSchemaMatcher): - def __init__(self): - super().__init__(DistributionBased()) + def __init__( + self, + threshold1: float = 0.15, + threshold2: float = 0.15, + quantiles: int = 256, + process_num: int = 1, + ): + super().__init__( + DistributionBased( + threshold1=threshold1, + threshold2=threshold2, + quantiles=quantiles, + process_num=process_num, + ) + ) class JaccardSchemaMatcher(ValentineSchemaMatcher): - def __init__(self): - super().__init__(JaccardDistanceMatcher()) + def __init__( + self, + threshold_dist: float = 0.8, + distance_fun: StringDistanceFunction = StringDistanceFunction.Levenshtein, + process_num: int = 1, + ): + super().__init__( + JaccardDistanceMatcher( + threshold_dist=threshold_dist, + distance_fun=distance_fun, + process_num=process_num, + ) + ) class GPTSchemaMatcher(BaseSchemaMatcher): diff --git a/bdikit/mapping_algorithms/value_mapping/algorithms.py b/bdikit/mapping_algorithms/value_mapping/algorithms.py index df8af7cb..f225e1a0 100644 --- a/bdikit/mapping_algorithms/value_mapping/algorithms.py +++ b/bdikit/mapping_algorithms/value_mapping/algorithms.py @@ -1,4 +1,4 @@ -from typing import List, NamedTuple, Callable +from typing import List, NamedTuple, Callable, Tuple import ast from openai import OpenAI from polyfuzz import PolyFuzz @@ -68,8 +68,26 @@ class TFIDFValueMatcher(PolyFuzzValueMatcher): Value matching algorithm based on the TF-IDF similarity between values. 
""" - def __init__(self): - super().__init__(PolyFuzz(method=TFIDF(n_gram_range=(1, 3), min_similarity=0))) + def __init__( + self, + n_gram_range: Tuple[int, int] = (1, 3), + clean_string: bool = True, + min_similarity: float = 0.0, + top_n: int = 1, + cosine_method: str = "sparse", + ): + + super().__init__( + PolyFuzz( + method=TFIDF( + n_gram_range=n_gram_range, + clean_string=clean_string, + min_similarity=min_similarity, + top_n=top_n, + cosine_method=cosine_method, + ) + ) + ) class EditDistanceValueMatcher(PolyFuzzValueMatcher): @@ -77,13 +95,15 @@ class EditDistanceValueMatcher(PolyFuzzValueMatcher): Value matching algorithm based on the edit distance between values. """ - def __init__(self, scorer: Callable[[str, str], float] = fuzz.ratio): + def __init__( + self, scorer: Callable[[str, str], float] = fuzz.ratio, n_jobs: int = -1 + ): # Return scores between 0 and 1 normalized_scorer = lambda str1, str2: scorer(str1, str2) / 100.0 super().__init__( PolyFuzz( method=EditDistance( - n_jobs=-1, scorer=normalized_scorer, normalize=False + n_jobs=n_jobs, scorer=normalized_scorer, normalize=False ) ) ) @@ -94,9 +114,20 @@ class EmbeddingValueMatcher(PolyFuzzValueMatcher): Value matching algorithm based on the cosine similarity of value embeddings. """ - def __init__(self, model_path: str = "bert-base-multilingual-cased"): + def __init__( + self, + model_path: str = "bert-base-multilingual-cased", + min_similarity: float = 0.0, + top_n: int = 1, + cosine_method: str = "sparse", + ): embeddings = TransformerWordEmbeddings(model_path) - method = Embeddings(embeddings, min_similarity=0, model_id="embedding_model") + method = Embeddings( + embeddings, + min_similarity=min_similarity, + top_n=top_n, + cosine_method=cosine_method, + ) super().__init__(PolyFuzz(method)) @@ -105,9 +136,20 @@ class FastTextAlgorithm(PolyFuzzValueMatcher): Value matching algorithm based on the cosine similarity of FastText embeddings. 
""" - def __init__(self, model_name: str = "en-crawl"): + def __init__( + self, + model_name: str = "en-crawl", + min_similarity: float = 0.0, + top_n: int = 1, + cosine_method: str = "sparse", + ): embeddings = WordEmbeddings(model_name) - method = Embeddings(embeddings, min_similarity=0) + method = Embeddings( + embeddings, + min_similarity=min_similarity, + top_n=top_n, + cosine_method=cosine_method, + ) super().__init__(PolyFuzz(method))