From 0f187ab3acfe58259ceecf6f50045831a1f63601 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?A=C3=A9cio=20Santos?= Date: Fri, 12 Jul 2024 15:22:33 -0400 Subject: [PATCH] refactor: Renamed value matcher classes to use suffix 'ValueMatcher' This makes value matchers more consistent with other operations such as schema matching (SchemaMatchers) and top-k column matching (TopkColumnMatcher). --- bdikit/api.py | 40 +++++++++---------- .../value_mapping/algorithms.py | 16 ++++---- docs/source/api.rst | 2 +- tests/test_value_matching_algorithms.py | 10 ++--- 4 files changed, 33 insertions(+), 35 deletions(-) diff --git a/bdikit/api.py b/bdikit/api.py index 2db3ab28..465d7b97 100644 --- a/bdikit/api.py +++ b/bdikit/api.py @@ -26,13 +26,13 @@ ) from bdikit.mapping_algorithms.value_mapping.algorithms import ( ValueMatch, - BaseAlgorithm, - TFIDFAlgorithm, - LLMAlgorithm, - EditAlgorithm, - EmbeddingAlgorithm, - AutoFuzzyJoinAlgorithm, - FastTextAlgorithm, + BaseValueMatcher, + TFIDFValueMatcher, + GPTValueMatcher, + EditDistanceValueMatcher, + EmbeddingValueMatcher, + AutoFuzzyJoinValueMatcher, + FastTextValueMatcher, ) from bdikit.mapping_algorithms.value_mapping.value_mappers import ( ValueMapper, @@ -170,23 +170,21 @@ def top_matches( return pd.concat(dfs, ignore_index=True) -class ValueMatchingMethod(Enum): - TFIDF = ("tfidf", TFIDFAlgorithm) - EDIT = ("edit_distance", EditAlgorithm) - EMBEDDINGS = ("embedding", EmbeddingAlgorithm) - AUTOFJ = ("auto_fuzzy_join", AutoFuzzyJoinAlgorithm) - FASTTEXT = ("fasttext", FastTextAlgorithm) - GPT = ("gpt", LLMAlgorithm) +class ValueMatchers(Enum): + TFIDF = ("tfidf", TFIDFValueMatcher) + EDIT = ("edit_distance", EditDistanceValueMatcher) + EMBEDDINGS = ("embedding", EmbeddingValueMatcher) + AUTOFJ = ("auto_fuzzy_join", AutoFuzzyJoinValueMatcher) + FASTTEXT = ("fasttext", FastTextValueMatcher) + GPT = ("gpt", GPTValueMatcher) - def __init__(self, method_name: str, method_class: Type[BaseAlgorithm]): + def __init__(self, method_name: str, 
method_class: Type[BaseValueMatcher]): self.method_name = method_name self.method_class = method_class @staticmethod - def get_instance(method_name: str) -> BaseAlgorithm: - methods = { - method.method_name: method.method_class for method in ValueMatchingMethod - } + def get_instance(method_name: str) -> BaseValueMatcher: + methods = {method.method_name: method.method_class for method in ValueMatchers} try: return methods[method_name]() except KeyError: @@ -326,7 +324,7 @@ def match_values( "The target must be a DataFrame or a standard vocabulary name." ) - value_matcher = ValueMatchingMethod.get_instance(method) + value_matcher = ValueMatchers.get_instance(method) matches = _match_values(source, target_domain, column_mapping_dict, value_matcher) return matches @@ -335,7 +333,7 @@ def _match_values( dataset: pd.DataFrame, target_domain: Dict[str, Optional[List[str]]], column_mapping: Dict[str, str], - value_matcher: BaseAlgorithm, + value_matcher: BaseValueMatcher, ) -> List[ValueMatchingResult]: mapping_results: List[ValueMatchingResult] = [] diff --git a/bdikit/mapping_algorithms/value_mapping/algorithms.py b/bdikit/mapping_algorithms/value_mapping/algorithms.py index dd77708d..c7cd55f8 100644 --- a/bdikit/mapping_algorithms/value_mapping/algorithms.py +++ b/bdikit/mapping_algorithms/value_mapping/algorithms.py @@ -26,7 +26,7 @@ class ValueMatch(NamedTuple): similarity: float -class BaseAlgorithm: +class BaseValueMatcher: """ Base class for value matching algorithms, i.e., algorithms that match values from a source (current) domain to values from a target domain. @@ -38,7 +38,7 @@ def match( raise NotImplementedError("Subclasses must implement this method") -class PolyFuzzAlgorithm(BaseAlgorithm): +class PolyFuzzValueMatcher(BaseValueMatcher): """ Base class for value matching algorithms based on the PolyFuzz library. 
""" @@ -68,7 +68,7 @@ def match( return matches -class TFIDFAlgorithm(PolyFuzzAlgorithm): +class TFIDFValueMatcher(PolyFuzzValueMatcher): """ Value matching algorithm based on the TF-IDF similarity between values. """ @@ -77,7 +77,7 @@ def __init__(self): super().__init__(PolyFuzz(method=TFIDF(n_gram_range=(1, 3), min_similarity=0))) -class EditAlgorithm(PolyFuzzAlgorithm): +class EditDistanceValueMatcher(PolyFuzzValueMatcher): """ Value matching algorithm based on the edit distance between values. """ @@ -94,7 +94,7 @@ def __init__(self, scorer: Callable[[str, str], float] = fuzz.ratio): ) -class EmbeddingAlgorithm(PolyFuzzAlgorithm): +class EmbeddingValueMatcher(PolyFuzzValueMatcher): """ Value matching algorithm based on the cosine similarity of value embeddings. """ @@ -105,7 +105,7 @@ def __init__(self, model_path: str = "bert-base-multilingual-cased"): super().__init__(PolyFuzz(method)) -class FastTextAlgorithm(PolyFuzzAlgorithm): +class FastTextValueMatcher(PolyFuzzValueMatcher): """ Value matching algorithm based on the cosine similarity of FastText embeddings. """ @@ -116,7 +116,7 @@ def __init__(self, model_name: str = "en-crawl"): super().__init__(PolyFuzz(method)) -class LLMAlgorithm(BaseAlgorithm): +class GPTValueMatcher(BaseValueMatcher): def __init__(self): self.client = OpenAI() @@ -163,7 +163,7 @@ def match( return matches -class AutoFuzzyJoinAlgorithm(BaseAlgorithm): +class AutoFuzzyJoinValueMatcher(BaseValueMatcher): def __init__(self): pass diff --git a/docs/source/api.rst b/docs/source/api.rst index 48cf7883..ffd584d4 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -4,4 +4,4 @@ API .. 
automodule:: bdikit.api :members: - :exclude-members: SchemaMatchers, ValueMatchingMethod, ValueMatchingResult + :exclude-members: SchemaMatchers, ValueMatchers, ValueMatchingResult diff --git a/tests/test_value_matching_algorithms.py b/tests/test_value_matching_algorithms.py index ee4e6820..6d9ae5d5 100644 --- a/tests/test_value_matching_algorithms.py +++ b/tests/test_value_matching_algorithms.py @@ -1,19 +1,19 @@ import unittest import pandas as pd from bdikit.mapping_algorithms.value_mapping.algorithms import ( - TFIDFAlgorithm, - EditAlgorithm, + TFIDFValueMatcher, + EditDistanceValueMatcher, ) -class ValueMatchingAlgorithmsTest(unittest.TestCase): +class ValueMatchingTest(unittest.TestCase): def test_tfidf_value_matching(self): # given current_values = ["Red Apple", "Banana", "Oorange", "Strawberry"] target_values = ["apple", "banana", "orange", "kiwi"] - tfidf_matcher = TFIDFAlgorithm() + tfidf_matcher = TFIDFValueMatcher() # when matches = tfidf_matcher.match(current_values, target_values) @@ -35,7 +35,7 @@ def test_edit_distance_value_matching(self): current_values = ["Red Apple", "Banana", "Oorange", "Strawberry"] target_values = ["apple", "bananana", "orange", "kiwi"] - edit_distance_matcher = EditAlgorithm() + edit_distance_matcher = EditDistanceValueMatcher() # when matches = edit_distance_matcher.match(