Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
aecio committed Jul 18, 2024
1 parent 57cf788 commit 7b395c8
Show file tree
Hide file tree
Showing 6 changed files with 202 additions and 114 deletions.
4 changes: 4 additions & 0 deletions bdikit/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
GPTSchemaMatcher,
ContrastiveLearningSchemaMatcher,
TwoPhaseSchemaMatcher,
SpladeSchemaMatcher,
)
from bdikit.mapping_algorithms.value_mapping.value_mappers import ValueMapper
from bdikit.mapping_algorithms.scope_reducing._algorithms.contrastive_learning.cl_api import (
Expand All @@ -33,6 +34,7 @@
EmbeddingValueMatcher,
AutoFuzzyJoinValueMatcher,
FastTextValueMatcher,
SpladeValueMatcher,
)
from bdikit.mapping_algorithms.value_mapping.value_mappers import (
ValueMapper,
Expand All @@ -55,6 +57,7 @@ class SchemaMatchers(Enum):
JACCARD_DISTANCE = ("jaccard_distance", JaccardSchemaMatcher)
GPT = ("gpt", GPTSchemaMatcher)
CT_LEARGNING = ("ct_learning", ContrastiveLearningSchemaMatcher)
SPLADE = ("splade", SpladeSchemaMatcher)
TWO_PHASE = ("two_phase", TwoPhaseSchemaMatcher)

def __init__(self, method_name: str, method_class: Type[BaseSchemaMatcher]):
Expand Down Expand Up @@ -177,6 +180,7 @@ class ValueMatchers(Enum):
AUTOFJ = ("auto_fuzzy_join", AutoFuzzyJoinValueMatcher)
FASTTEXT = ("fasttext", FastTextValueMatcher)
GPT = ("gpt", GPTValueMatcher)
SPLADE = ("splade", SpladeValueMatcher)

def __init__(self, method_name: str, method_class: Type[BaseValueMatcher]):
self.method_name = method_name
Expand Down
17 changes: 17 additions & 0 deletions bdikit/mapping_algorithms/column_mapping/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from bdikit.mapping_algorithms.column_mapping.topk_matchers import (
TopkColumnMatcher,
CLTopkColumnMatcher,
SpladeTopkColumnMatcher,
)


Expand Down Expand Up @@ -137,6 +138,22 @@ def map(self, dataset: pd.DataFrame, global_table: pd.DataFrame):
return self._fill_missing_matches(dataset, matches)


class SpladeSchemaMatcher(BaseSchemaMatcher):
    """Schema matcher that pairs dataset columns with global-table columns
    using SPLADE sparse-embedding similarity.

    For every source column, the top-1 candidate produced by
    ``SpladeTopkColumnMatcher`` is accepted when it actually exists in the
    global table; any column left unmatched is filled in by the base class.
    """

    def __init__(self, model_name: str = "naver/splade-cocondenser-ensembledistil"):
        # Candidate retrieval is delegated entirely to the SPLADE top-k matcher.
        self.topk_matcher = SpladeTopkColumnMatcher(model_name=model_name)

    def map(self, dataset: pd.DataFrame, global_table: pd.DataFrame):
        """Return a column mapping from *dataset* to *global_table*."""
        recommendations = self.topk_matcher.get_recommendations(
            dataset, global_table, top_k=1
        )
        mapping = {}
        for source_column, recommendation in zip(dataset.columns, recommendations):
            # Each recommendation holds ranked (column, score) pairs; take the best.
            best_candidate = recommendation["top_k_columns"][0][0]
            if best_candidate in global_table.columns:
                mapping[source_column] = best_candidate
        return self._fill_missing_matches(dataset, mapping)


class TwoPhaseSchemaMatcher(BaseSchemaMatcher):

def __init__(
Expand Down
19 changes: 14 additions & 5 deletions bdikit/mapping_algorithms/column_mapping/topk_matchers.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
ContrastiveLearningAPI,
DEFAULT_CL_MODEL,
)
from bdikit.models.splade import SpladeEmbedder


class ColumnScore(NamedTuple):
Expand All @@ -26,11 +27,9 @@ def get_recommendations(
pass


class CLTopkColumnMatcher(TopkColumnMatcher):
def __init__(self, model_name: str = DEFAULT_CL_MODEL):
# TODO: we can generalize this api to accept any embedding model
# and not just our contrastive learning model
self.api = ContrastiveLearningAPI(model_name=model_name)
class EmbeddingSimTopkColumnMatcher(TopkColumnMatcher):
def __init__(self, column_embedder):
self.api = column_embedder

def get_recommendations(
self, source: pd.DataFrame, target: pd.DataFrame, top_k: int = 10
Expand Down Expand Up @@ -59,3 +58,13 @@ def get_recommendations(
)

return top_k_results


class CLTopkColumnMatcher(EmbeddingSimTopkColumnMatcher):
    """Top-k column matcher backed by the contrastive-learning embedding model."""

    def __init__(self, model_name: str = DEFAULT_CL_MODEL):
        # Build the embedder first, then hand it to the generic
        # embedding-similarity matcher.
        embedder = ContrastiveLearningAPI(model_name=model_name)
        super().__init__(column_embedder=embedder)


class SpladeTopkColumnMatcher(EmbeddingSimTopkColumnMatcher):
    """Top-k column matcher backed by SPLADE sparse embeddings."""

    def __init__(self, model_name: str = "naver/splade-cocondenser-ensembledistil"):
        # Note: SpladeEmbedder names its parameter ``model_id`` while this
        # class exposes ``model_name`` for consistency with the other matchers.
        embedder = SpladeEmbedder(model_id=model_name)
        super().__init__(column_embedder=embedder)
38 changes: 10 additions & 28 deletions bdikit/mapping_algorithms/value_mapping/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,13 @@
from autofj import AutoFJ
from Levenshtein import ratio
import pandas as pd
import numpy as np
import flair
import torch
from sklearn.metrics.pairwise import cosine_similarity
from bdikit.config import get_device
from bdikit.models.splade import SpladeEmbedder


flair.device = torch.device(get_device())

Expand Down Expand Up @@ -205,12 +209,6 @@ def match(
return matches


from typing import List, Dict
import numpy as np
from bdikit.models.splade import SpladeEmbedder
from sklearn.metrics.pairwise import cosine_similarity


class SpladeValueMatcher(BaseValueMatcher):
def __init__(self, model_id: str = "naver/splade-cocondenser-ensembledistil"):
self.splade_embedder = SpladeEmbedder(model_id)
Expand All @@ -219,44 +217,28 @@ def match(
self,
current_values: List[str],
target_values: List[str],
threshold: float = 0.8,
threshold: float = 0.25,
) -> List[ValueMatch]:

source_embs = self.splade_embedder.embed_values(current_values)
target_embs = self.splade_embedder.embed_values(target_values)
matches = []

def to_dense_vec(sparse_vec: Dict, dim: int = 30522):
x = np.zeros(dim)
for idx, val in zip(sparse_vec["indexes"], sparse_vec["weights"]):
x[idx] = val
return x

print()

l_source_embeddings = []
l_source_values = []
for value_text, sparse_vec in source_embs["values"].items():
l_source_embeddings.append(to_dense_vec(sparse_vec))
for value_text, value_vector in source_embs["values"].items():
l_source_embeddings.append(value_vector)
l_source_values.append(value_text)

r_target_embeddings = []
r_target_value = []
for value_text, sparse_vec in target_embs["values"].items():
r_target_embeddings.append(to_dense_vec(sparse_vec))
for value_text, value_vector in target_embs["values"].items():
r_target_embeddings.append(value_vector)
r_target_value.append(value_text)

print("l_source_values: ", l_source_values)
print("r_target_value: ", r_target_value)

cosine_sim = cosine_similarity(l_source_embeddings, r_target_embeddings) # type: ignore
print("cosine_sim: ", cosine_sim)

matches = []
for index, similarities in enumerate(cosine_sim):
print("similarities: ", similarities)
print("argmax:", np.argmax(similarities))
print("source: ", l_source_values[index])
print("target: ", r_target_value[np.argmax(similarities)])
similarity = similarities[np.argmax(similarities)]
source_value = l_source_values[index]
target_value = r_target_value[np.argmax(similarities)]
Expand Down
Loading

0 comments on commit 7b395c8

Please sign in to comment.