Skip to content

Commit

Permalink
Refactoring of column mappers to remove code duplication
Browse files: browse the repository at this point in the history
  • Loading branch information
aecio committed May 31, 2024
1 parent 9a42107 commit 50a99ec
Showing 1 changed file with 23 additions and 57 deletions.
80 changes: 23 additions & 57 deletions bdikit/mapping_algorithms/column_mapping/algorithms.py
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,15 @@
import pandas as pd
from typing import Dict
from valentine import valentine_match
from valentine.algorithms import (
SimilarityFlooding,
Coma,
Cupid,
DistributionBased,
JaccardDistanceMatcher,
BaseMatcher,
)
from valentine.algorithms.matcher_results import MatcherResults
from openai import OpenAI


Expand All @@ -14,18 +18,19 @@ def __init__(self, dataset, global_table):
self._dataset = dataset
self._global_table = global_table

def map(self):
def map(self) -> Dict[str, str]:
raise NotImplementedError("Subclasses must implement this method")


class SimFloodAlgorithm(BaseColumnMappingAlgorithm):
def __init__(self, dataset, global_table):
class ValentineColumnMappingAlgorithm(BaseColumnMappingAlgorithm):
def __init__(self, dataset, global_table, matcher: BaseMatcher):
super().__init__(dataset, global_table)
self.matcher = matcher

def map(self):
matcher = SimilarityFlooding()
matches = valentine_match(self._dataset, self._global_table, matcher)

def map(self) -> Dict[str, str]:
matches: MatcherResults = valentine_match(
self._dataset, self._global_table, self.matcher
)
mappings = {}
for match in matches.one_to_one():
dataset_candidate = match[0][1]
Expand All @@ -34,68 +39,29 @@ def map(self):
return mappings


class ComaAlgorithm(BaseColumnMappingAlgorithm):
class SimFloodAlgorithm(ValentineColumnMappingAlgorithm):
def __init__(self, dataset, global_table):
super().__init__(dataset, global_table)

def map(self):
matcher = Coma()
matches = valentine_match(self._dataset, self._global_table, matcher)

mappings = {}
for match in matches.one_to_one():
dataset_candidate = match[0][1]
global_table_candidate = match[1][1]
mappings[dataset_candidate] = global_table_candidate
return mappings
super().__init__(dataset, global_table, SimilarityFlooding())


class CupidAlgorithm(BaseColumnMappingAlgorithm):
class ComaAlgorithm(ValentineColumnMappingAlgorithm):
def __init__(self, dataset, global_table):
super().__init__(dataset, global_table)

def map(self):
matcher = Cupid()
matches = valentine_match(self._dataset, self._global_table, matcher)

mappings = {}
for match in matches.one_to_one():
dataset_candidate = match[0][1]
global_table_candidate = match[1][1]
mappings[dataset_candidate] = global_table_candidate
return mappings
super().__init__(dataset, global_table, Coma())


class DistributionBasedAlgorithm(BaseColumnMappingAlgorithm):
class CupidAlgorithm(ValentineColumnMappingAlgorithm):
def __init__(self, dataset, global_table):
super().__init__(dataset, global_table)

def map(self):
matcher = DistributionBased()
matches = valentine_match(self._dataset, self._global_table, matcher)

mappings = {}
for match in matches.one_to_one():
dataset_candidate = match[0][1]
global_table_candidate = match[1][1]
mappings[dataset_candidate] = global_table_candidate
return mappings
super().__init__(dataset, global_table, Cupid())


class JaccardDistanceAlgorithm(BaseColumnMappingAlgorithm):
class DistributionBasedAlgorithm(ValentineColumnMappingAlgorithm):
def __init__(self, dataset, global_table):
super().__init__(dataset, global_table)
super().__init__(dataset, global_table, DistributionBased())

def map(self):
matcher = JaccardDistanceMatcher()
matches = valentine_match(self._dataset, self._global_table, matcher)

mappings = {}
for match in matches.one_to_one():
dataset_candidate = match[0][1]
global_table_candidate = match[1][1]
mappings[dataset_candidate] = global_table_candidate
return mappings
class JaccardDistanceAlgorithm(ValentineColumnMappingAlgorithm):
    """Column mapping algorithm backed by valentine's ``JaccardDistanceMatcher``.

    Only the constructor is defined here: it creates a matcher with default
    settings and hands it to the parent class.
    """

    def __init__(self, dataset, global_table):
        """Set up the mapper with a default-constructed ``JaccardDistanceMatcher``.

        Args:
            dataset: Forwarded unchanged to the parent constructor.
            global_table: Forwarded unchanged to the parent constructor.
        """
        super().__init__(dataset, global_table, JaccardDistanceMatcher())


class GPTAlgorithm(BaseColumnMappingAlgorithm):
Expand Down

0 comments on commit 50a99ec

Please sign in to comment.