From 0f187ab3acfe58259ceecf6f50045831a1f63601 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?A=C3=A9cio=20Santos?= <aecio.solando@gmail.com>
Date: Fri, 12 Jul 2024 15:22:33 -0400
Subject: [PATCH] refactor: Renamed value matcher classes to use suffix
 'ValueMatcher'

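This makes the names of the value matcher classes consistent with those
used by other operations, such as schema matching (SchemaMatchers) and
top-k column matching (TopkColumnMatcher).

In addition to the suffix change, EditAlgorithm becomes
EditDistanceValueMatcher, LLMAlgorithm becomes GPTValueMatcher, and the
ValueMatchingMethod enum in bdikit/api.py becomes ValueMatchers.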
---
 bdikit/api.py                                 | 40 +++++++++----------
 .../value_mapping/algorithms.py               | 16 ++++----
 docs/source/api.rst                           |  2 +-
 tests/test_value_matching_algorithms.py       | 10 ++---
 4 files changed, 33 insertions(+), 35 deletions(-)

diff --git a/bdikit/api.py b/bdikit/api.py
index 2db3ab28..465d7b97 100644
--- a/bdikit/api.py
+++ b/bdikit/api.py
@@ -26,13 +26,13 @@
 )
 from bdikit.mapping_algorithms.value_mapping.algorithms import (
     ValueMatch,
-    BaseAlgorithm,
-    TFIDFAlgorithm,
-    LLMAlgorithm,
-    EditAlgorithm,
-    EmbeddingAlgorithm,
-    AutoFuzzyJoinAlgorithm,
-    FastTextAlgorithm,
+    BaseValueMatcher,
+    TFIDFValueMatcher,
+    GPTValueMatcher,
+    EditDistanceValueMatcher,
+    EmbeddingValueMatcher,
+    AutoFuzzyJoinValueMatcher,
+    FastTextValueMatcher,
 )
 from bdikit.mapping_algorithms.value_mapping.value_mappers import (
     ValueMapper,
@@ -170,23 +170,21 @@ def top_matches(
     return pd.concat(dfs, ignore_index=True)
 
 
-class ValueMatchingMethod(Enum):
-    TFIDF = ("tfidf", TFIDFAlgorithm)
-    EDIT = ("edit_distance", EditAlgorithm)
-    EMBEDDINGS = ("embedding", EmbeddingAlgorithm)
-    AUTOFJ = ("auto_fuzzy_join", AutoFuzzyJoinAlgorithm)
-    FASTTEXT = ("fasttext", FastTextAlgorithm)
-    GPT = ("gpt", LLMAlgorithm)
+class ValueMatchers(Enum):
+    TFIDF = ("tfidf", TFIDFValueMatcher)
+    EDIT = ("edit_distance", EditDistanceValueMatcher)
+    EMBEDDINGS = ("embedding", EmbeddingValueMatcher)
+    AUTOFJ = ("auto_fuzzy_join", AutoFuzzyJoinValueMatcher)
+    FASTTEXT = ("fasttext", FastTextValueMatcher)
+    GPT = ("gpt", GPTValueMatcher)
 
-    def __init__(self, method_name: str, method_class: Type[BaseAlgorithm]):
+    def __init__(self, method_name: str, method_class: Type[BaseValueMatcher]):
         self.method_name = method_name
         self.method_class = method_class
 
     @staticmethod
-    def get_instance(method_name: str) -> BaseAlgorithm:
-        methods = {
-            method.method_name: method.method_class for method in ValueMatchingMethod
-        }
+    def get_instance(method_name: str) -> BaseValueMatcher:
+        methods = {method.method_name: method.method_class for method in ValueMatchers}
         try:
             return methods[method_name]()
         except KeyError:
@@ -326,7 +324,7 @@ def match_values(
             "The target must be a DataFrame or a standard vocabulary name."
         )
 
-    value_matcher = ValueMatchingMethod.get_instance(method)
+    value_matcher = ValueMatchers.get_instance(method)
     matches = _match_values(source, target_domain, column_mapping_dict, value_matcher)
     return matches
 
@@ -335,7 +333,7 @@ def _match_values(
     dataset: pd.DataFrame,
     target_domain: Dict[str, Optional[List[str]]],
     column_mapping: Dict[str, str],
-    value_matcher: BaseAlgorithm,
+    value_matcher: BaseValueMatcher,
 ) -> List[ValueMatchingResult]:
 
     mapping_results: List[ValueMatchingResult] = []
diff --git a/bdikit/mapping_algorithms/value_mapping/algorithms.py b/bdikit/mapping_algorithms/value_mapping/algorithms.py
index dd77708d..c7cd55f8 100644
--- a/bdikit/mapping_algorithms/value_mapping/algorithms.py
+++ b/bdikit/mapping_algorithms/value_mapping/algorithms.py
@@ -26,7 +26,7 @@ class ValueMatch(NamedTuple):
     similarity: float
 
 
-class BaseAlgorithm:
+class BaseValueMatcher:
     """
     Base class for value matching algorithms, i.e., algorithms that match
     values from a source (current) domain to values from a target domain.
@@ -38,7 +38,7 @@ def match(
         raise NotImplementedError("Subclasses must implement this method")
 
 
-class PolyFuzzAlgorithm(BaseAlgorithm):
+class PolyFuzzValueMatcher(BaseValueMatcher):
     """
     Base class for value matching algorithms based on the PolyFuzz library.
     """
@@ -68,7 +68,7 @@ def match(
         return matches
 
 
-class TFIDFAlgorithm(PolyFuzzAlgorithm):
+class TFIDFValueMatcher(PolyFuzzValueMatcher):
     """
     Value matching algorithm based on the TF-IDF similarity between values.
     """
@@ -77,7 +77,7 @@ def __init__(self):
         super().__init__(PolyFuzz(method=TFIDF(n_gram_range=(1, 3), min_similarity=0)))
 
 
-class EditAlgorithm(PolyFuzzAlgorithm):
+class EditDistanceValueMatcher(PolyFuzzValueMatcher):
     """
     Value matching algorithm based on the edit distance between values.
     """
@@ -94,7 +94,7 @@ def __init__(self, scorer: Callable[[str, str], float] = fuzz.ratio):
         )
 
 
-class EmbeddingAlgorithm(PolyFuzzAlgorithm):
+class EmbeddingValueMatcher(PolyFuzzValueMatcher):
     """
     Value matching algorithm based on the cosine similarity of value embeddings.
     """
@@ -105,7 +105,7 @@ def __init__(self, model_path: str = "bert-base-multilingual-cased"):
         super().__init__(PolyFuzz(method))
 
 
-class FastTextAlgorithm(PolyFuzzAlgorithm):
+class FastTextValueMatcher(PolyFuzzValueMatcher):
     """
     Value matching algorithm based on the cosine similarity of FastText embeddings.
     """
@@ -116,7 +116,7 @@ def __init__(self, model_name: str = "en-crawl"):
         super().__init__(PolyFuzz(method))
 
 
-class LLMAlgorithm(BaseAlgorithm):
+class GPTValueMatcher(BaseValueMatcher):
     def __init__(self):
         self.client = OpenAI()
 
@@ -163,7 +163,7 @@ def match(
         return matches
 
 
-class AutoFuzzyJoinAlgorithm(BaseAlgorithm):
+class AutoFuzzyJoinValueMatcher(BaseValueMatcher):
 
     def __init__(self):
         pass
diff --git a/docs/source/api.rst b/docs/source/api.rst
index 48cf7883..ffd584d4 100644
--- a/docs/source/api.rst
+++ b/docs/source/api.rst
@@ -4,4 +4,4 @@ API
 
 .. automodule:: bdikit.api
     :members:
-    :exclude-members: SchemaMatchers, ValueMatchingMethod, ValueMatchingResult
+    :exclude-members: SchemaMatchers, ValueMatchers, ValueMatchingResult
diff --git a/tests/test_value_matching_algorithms.py b/tests/test_value_matching_algorithms.py
index ee4e6820..6d9ae5d5 100644
--- a/tests/test_value_matching_algorithms.py
+++ b/tests/test_value_matching_algorithms.py
@@ -1,19 +1,19 @@
 import unittest
 import pandas as pd
 from bdikit.mapping_algorithms.value_mapping.algorithms import (
-    TFIDFAlgorithm,
-    EditAlgorithm,
+    TFIDFValueMatcher,
+    EditDistanceValueMatcher,
 )
 
 
-class ValueMatchingAlgorithmsTest(unittest.TestCase):
+class ValueMatchingTest(unittest.TestCase):
 
     def test_tfidf_value_matching(self):
         # given
         current_values = ["Red Apple", "Banana", "Oorange", "Strawberry"]
         target_values = ["apple", "banana", "orange", "kiwi"]
 
-        tfidf_matcher = TFIDFAlgorithm()
+        tfidf_matcher = TFIDFValueMatcher()
 
         # when
         matches = tfidf_matcher.match(current_values, target_values)
@@ -35,7 +35,7 @@ def test_edit_distance_value_matching(self):
         current_values = ["Red Apple", "Banana", "Oorange", "Strawberry"]
         target_values = ["apple", "bananana", "orange", "kiwi"]
 
-        edit_distance_matcher = EditAlgorithm()
+        edit_distance_matcher = EditDistanceValueMatcher()
 
         # when
         matches = edit_distance_matcher.match(