Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
aecio committed Jul 26, 2024
1 parent 33b2a6e commit a4ed444
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 4 deletions.
45 changes: 44 additions & 1 deletion bdikit/mapping_algorithms/column_mapping/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@
TopkColumnMatcher,
CLTopkColumnMatcher,
)
from bdikit.mapping_algorithms.value_mapping.algorithms import (
TFIDFValueMatcher,
)


class BaseSchemaMatcher:
Expand Down Expand Up @@ -126,6 +129,46 @@ def __init__(
)


class MaxValSimSchemaMatcher(BaseSchemaMatcher):
def unique_string_values(self, column: pd.Series) -> pd.Series:
if pd.api.types.is_string_dtype(column):
return pd.Series(column.unique(), name=column.name)
else:
return pd.Series(column.unique().astype(str), name=column.name)

def map(self, dataset: pd.DataFrame, global_table: pd.DataFrame) -> Dict[str, str]:
schema_mapping = {}
for source_column_name in dataset.columns:
scores = []

source_column = dataset[source_column_name]
source_values = self.unique_string_values(source_column).to_list()

print("\nSource column: ", source_column_name)
for target_column_name in global_table.columns:

target_column = global_table[target_column_name]
target_values = self.unique_string_values(target_column).to_list()

value_matcher = TFIDFValueMatcher()
value_matches = value_matcher.match(source_values, target_values)

score = sum([m.similarity for m in value_matches])
print("value_matches: ", value_matches)
print(
f"source: {source_column_name} target: {target_column_name} score: {score}"
)

scores.append((source_column_name, target_column_name, score))

sorted_columns = sorted(scores, key=lambda it: it[2], reverse=True)
print("Top k reranked columns: ", sorted_columns)

schema_mapping[source_column_name] = sorted_columns[0][1]

return schema_mapping


class GPTSchemaMatcher(BaseSchemaMatcher):
def __init__(self):
self.client = OpenAI()
Expand Down Expand Up @@ -196,7 +239,7 @@ def __init__(
self,
top_k: int = 20,
top_k_matcher: Optional[TopkColumnMatcher] = None,
schema_matcher: BaseSchemaMatcher = SimFloodSchemaMatcher(),
schema_matcher: BaseSchemaMatcher = MaxValSimSchemaMatcher(),
):
if top_k_matcher is None:
self.api = CLTopkColumnMatcher(DEFAULT_CL_MODEL)
Expand Down
6 changes: 3 additions & 3 deletions bdikit/mapping_algorithms/value_mapping/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def match(
target_value = row["To"]
similarity = row["Similarity"]
if similarity >= self.threshold:
matches.append((source_value, target_value, similarity))
matches.append(ValueMatch(source_value, target_value, similarity))

return matches

Expand Down Expand Up @@ -204,7 +204,7 @@ def match(
target_value = response_dict["term"]
score = float(response_dict["score"])
if target_value in target_values_set and score >= self.threshold:
matches.append((source_value, target_value, score))
matches.append(ValueMatch(source_value, target_value, score))
except:
print(
f'Errors parsing response for "{source_value}": {response_message}'
Expand Down Expand Up @@ -250,7 +250,7 @@ def match(
title_r = row["title_r"]
similarity = ratio(title_l, title_r)
if similarity >= self.threshold:
matches.append((title_l, title_r, similarity))
matches.append(ValueMatch(title_l, title_r, similarity))
except Exception as e:
return matches
return matches

0 comments on commit a4ed444

Please sign in to comment.