class MaxValSimSchemaMatcher(BaseSchemaMatcher):
    """Schema matcher that pairs columns by maximum aggregate value similarity.

    For each source column, every candidate target column is scored by
    running TF-IDF value matching between the two columns' unique values
    and summing the similarities of the returned matches; the source
    column is mapped to the highest-scoring target column.
    """

    def unique_string_values(self, column: pd.Series) -> pd.Series:
        """Return the unique values of ``column`` as a Series of strings.

        String-dtype columns are passed through unchanged; any other dtype
        is cast to ``str`` so the TF-IDF matcher always receives text.
        """
        if pd.api.types.is_string_dtype(column):
            return pd.Series(column.unique(), name=column.name)
        return pd.Series(column.unique().astype(str), name=column.name)

    def map(self, dataset: pd.DataFrame, global_table: pd.DataFrame) -> Dict[str, str]:
        """Map each column of ``dataset`` to its best match in ``global_table``.

        Args:
            dataset: Source table whose columns are to be mapped.
            global_table: Target table providing the candidate columns.

        Returns:
            Mapping of source column name to the best-scoring target column
            name. If ``global_table`` has no columns the result is empty
            (the WIP draft raised ``IndexError`` in that case).
        """
        schema_mapping: Dict[str, str] = {}
        for source_column_name in dataset.columns:
            source_values = self.unique_string_values(
                dataset[source_column_name]
            ).to_list()

            scores = []
            for target_column_name in global_table.columns:
                target_values = self.unique_string_values(
                    global_table[target_column_name]
                ).to_list()

                # NOTE(review): a fresh matcher per pair mirrors the draft;
                # hoist it out of the loops if TFIDFValueMatcher is stateless
                # across match() calls — TODO confirm.
                value_matches = TFIDFValueMatcher().match(source_values, target_values)

                # Aggregate similarity over all matched value pairs: more and
                # stronger value matches -> better column fit.
                score = sum(m.similarity for m in value_matches)
                scores.append((target_column_name, score))

            if scores:  # guard: empty global_table would otherwise crash
                # max() keeps the first-seen column on ties, matching the
                # stable reverse sort used by the draft.
                best_target, _ = max(scores, key=lambda item: item[1])
                schema_mapping[source_column_name] = best_target

        return schema_mapping