class MaxValSimSchemaMatcher(BaseSchemaMatcher):
    """Schema matcher that pairs columns by maximum aggregate value similarity.

    For each source column, every candidate target column is scored by
    running TF-IDF value matching between the two columns' unique values
    and summing the similarities of the returned matches; the source
    column is mapped to the highest-scoring target column.
    """

    def unique_string_values(self, column: pd.Series) -> pd.Series:
        """Return the unique values of ``column`` as a Series of strings.

        String-dtype columns are passed through unchanged; any other dtype
        is cast to ``str`` so the TF-IDF matcher always receives text.
        """
        if pd.api.types.is_string_dtype(column):
            return pd.Series(column.unique(), name=column.name)
        return pd.Series(column.unique().astype(str), name=column.name)

    def map(self, dataset: pd.DataFrame, global_table: pd.DataFrame) -> Dict[str, str]:
        """Map each column of ``dataset`` to its best match in ``global_table``.

        Args:
            dataset: Source table whose columns are to be mapped.
            global_table: Target table providing the candidate columns.

        Returns:
            Mapping of source column name to the best-scoring target column
            name. If ``global_table`` has no columns the result is empty
            (the WIP draft raised ``IndexError`` in that case).
        """
        schema_mapping: Dict[str, str] = {}
        for source_column_name in dataset.columns:
            source_values = self.unique_string_values(
                dataset[source_column_name]
            ).to_list()

            scores = []
            for target_column_name in global_table.columns:
                target_values = self.unique_string_values(
                    global_table[target_column_name]
                ).to_list()

                # NOTE(review): a fresh matcher per pair mirrors the draft;
                # hoist it out of the loops if TFIDFValueMatcher is stateless
                # across match() calls — TODO confirm.
                value_matches = TFIDFValueMatcher().match(source_values, target_values)

                # Aggregate similarity over all matched value pairs: more and
                # stronger value matches -> better column fit.
                score = sum(m.similarity for m in value_matches)
                scores.append((target_column_name, score))

            if scores:  # guard: empty global_table would otherwise crash
                # max() keeps the first-seen column on ties, matching the
                # stable reverse sort used by the draft.
                best_target, _ = max(scores, key=lambda item: item[1])
                schema_mapping[source_column_name] = best_target

        return schema_mapping