From 09a9f5017b8a581c58cf5959648587c59c8e0a2c Mon Sep 17 00:00:00 2001
From: Adrien Perrin <adrien.perrin@nersc.no>
Date: Thu, 25 Jul 2024 14:01:57 +0200
Subject: [PATCH 01/10] make vocabularies available in pythesint module

---
 pythesint/__init__.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pythesint/__init__.py b/pythesint/__init__.py
index 92fff83..983f667 100644
--- a/pythesint/__init__.py
+++ b/pythesint/__init__.py
@@ -1,3 +1,2 @@
 from __future__ import absolute_import
-from pythesint.pythesint import update_all_vocabularies
-
+from pythesint.pythesint import update_all_vocabularies, vocabularies

From 8412d803acd76cf3523b4890e6e0d9ae90cef9a8 Mon Sep 17 00:00:00 2001
From: Adrien Perrin <adrien.perrin@nersc.no>
Date: Thu, 25 Jul 2024 14:02:57 +0200
Subject: [PATCH 02/10] add rapidfuzz to dependencies

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 8e3974c..5fce0a9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -30,6 +30,6 @@ classifiers = [
     "Topic :: Utilities",
 ]
 keywords = ["metadata", "standards", "thesaurus", "vocabulary"]
-dependencies = ["PyYAML", "requests", "xdg;platform_system != 'Windows'"]
+dependencies = ["PyYAML", "rapidfuzz", "requests", "xdg;platform_system != 'Windows'"]
 urls = {Repository = "https://github.com/nansencenter/py-thesaurus-interface"}
 dynamic = ["version"]

From 8a9bac39d3e74ed638eedfcf5bfaf6638898430b Mon Sep 17 00:00:00 2001
From: Adrien Perrin <adrien.perrin@nersc.no>
Date: Thu, 25 Jul 2024 14:03:27 +0200
Subject: [PATCH 03/10] add fuzzy search function

---
 pythesint/pythesint.py  |  1 +
 pythesint/vocabulary.py | 31 +++++++++++++++++++++++++++++++
 2 files changed, 32 insertions(+)

diff --git a/pythesint/pythesint.py b/pythesint/pythesint.py
index 613b57b..38bb53b 100755
--- a/pythesint/pythesint.py
+++ b/pythesint/pythesint.py
@@ -42,6 +42,7 @@ def _process_config():
                 'search_'+cnf['name']+'_list',
                 vocabulary.search)
         setattr(current_module, 'update_'+cnf['name'], vocabulary.update)
+        setattr(current_module, 'fuzzy_search_' + cnf['name'], vocabulary.fuzzy_search)
 
 vocabularies = {}
 _process_config()
diff --git a/pythesint/vocabulary.py b/pythesint/vocabulary.py
index a4abe78..bc232e9 100644
--- a/pythesint/vocabulary.py
+++ b/pythesint/vocabulary.py
@@ -1,5 +1,10 @@
 from collections import OrderedDict
 
+from rapidfuzz.fuzz import token_set_ratio
+from rapidfuzz.process import extract
+from rapidfuzz.utils import default_process
+
+
 class Vocabulary(object):
     def __init__(self, name, **kwargs):
         self.name = name
@@ -104,3 +109,29 @@ def sort_list(self, list):
     def get_list(self):
         raise NotImplementedError
 
+    def _fuzzy_search(self, search_string, scorer=token_set_ratio, processor=default_process,
+                     results_limit=10, min_score=50.0):
+        """Perform a fuzzy search on the vocabulary.
+        Fully parameterized, meant to be called by self.fuzzy_search()
+        """
+        terms_list = self.get_list()
+        choices = (' '.join(ordered_dict.values()).lower() for ordered_dict in terms_list)
+        # returns a list of tuples (choice, similarity, index)
+        # similarity is a float in [0.0, 100.0], 100.0 meaning the
+        # search string is a subset of the choice string
+        results = extract(search_string.lower(), choices,
+                          scorer=scorer, processor=processor, limit=results_limit)
+
+        # find results matching the minimmum similarity score
+        # the results list is sorted by decreasing similarity score
+        max_index = 0
+        for i, result in enumerate(results):
+            max_index = i
+            if result[1] < min_score:
+                break
+
+        return [terms_list[results[i][2]] for i in range(max_index)]
+
+    def fuzzy_search(self, search_string):
+        """Perform a fuzzy search on the vocabulary terms"""
+        return self._fuzzy_search(search_string)

From 628effc2442527f4323ba6504bd212a90bb6a6fa Mon Sep 17 00:00:00 2001
From: Adrien Perrin <adrien.perrin@nersc.no>
Date: Thu, 25 Jul 2024 14:04:10 +0200
Subject: [PATCH 04/10] override fuzzy search parameters for cf vocabulary

---
 pythesint/cf_vocabulary.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/pythesint/cf_vocabulary.py b/pythesint/cf_vocabulary.py
index 122719a..23d2fcf 100644
--- a/pythesint/cf_vocabulary.py
+++ b/pythesint/cf_vocabulary.py
@@ -1,7 +1,9 @@
 from __future__ import absolute_import
-
 from xml.dom.minidom import parseString
+
 import requests
+from rapidfuzz.fuzz import partial_ratio, ratio
+
 from pythesint.json_vocabulary import JSONVocabulary
 
 
@@ -55,3 +57,6 @@ def _fetch_online_data(self, version=None):
                 cf_list.append(stdname)
         return cf_list
 
+    def fuzzy_search(self, search_string):
+        return self._fuzzy_search(search_string, scorer=ratio,
+                                  results_limit=10, min_score=90.0)

From 9219169f129b48a8fc61ca86038278200bb3fa2e Mon Sep 17 00:00:00 2001
From: Adrien Perrin <adrien.perrin@nersc.no>
Date: Mon, 7 Oct 2024 07:02:07 +0000
Subject: [PATCH 05/10] fix typo

---
 pythesint/vocabulary.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pythesint/vocabulary.py b/pythesint/vocabulary.py
index bc232e9..cb365a3 100644
--- a/pythesint/vocabulary.py
+++ b/pythesint/vocabulary.py
@@ -122,7 +122,7 @@ def _fuzzy_search(self, search_string, scorer=token_set_ratio, processor=default
         results = extract(search_string.lower(), choices,
                           scorer=scorer, processor=processor, limit=results_limit)
 
-        # find results matching the minimmum similarity score
+        # find results matching the minimum similarity score
         # the results list is sorted by decreasing similarity score
         max_index = 0
         for i, result in enumerate(results):

From 675c567b032702ca8061fb319efae0c4db9e4d71 Mon Sep 17 00:00:00 2001
From: Adrien Perrin <adrien.perrin@nersc.no>
Date: Mon, 7 Oct 2024 09:30:11 +0000
Subject: [PATCH 06/10] update cf vocabulary fuzzy search docstring

---
 pythesint/cf_vocabulary.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/pythesint/cf_vocabulary.py b/pythesint/cf_vocabulary.py
index 23d2fcf..c1eaf2b 100644
--- a/pythesint/cf_vocabulary.py
+++ b/pythesint/cf_vocabulary.py
@@ -58,5 +58,11 @@ def _fetch_online_data(self, version=None):
         return cf_list
 
     def fuzzy_search(self, search_string):
+        """Use a simple ratio scorer (Levenshtein distance)
+        We want to find variable names which are close to the search
+        string, but it is rare that one will be a subset of the other,
+        so a simple ratio scorer is more suited than the default token
+        set ratio scorer.
+        """
         return self._fuzzy_search(search_string, scorer=ratio,
                                   results_limit=10, min_score=90.0)

From 0f8a17e19bb3a91f9e9ccc1d1f95c6f393905c13 Mon Sep 17 00:00:00 2001
From: Adrien Perrin <adrien.perrin@nersc.no>
Date: Mon, 7 Oct 2024 09:31:33 +0000
Subject: [PATCH 07/10] simplify _fuzzy_search

there is a parameter in rapidfuzz.process.extract
to set the minimum score
---
 pythesint/vocabulary.py | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/pythesint/vocabulary.py b/pythesint/vocabulary.py
index cb365a3..bb9d8a0 100644
--- a/pythesint/vocabulary.py
+++ b/pythesint/vocabulary.py
@@ -110,7 +110,7 @@ def get_list(self):
         raise NotImplementedError
 
     def _fuzzy_search(self, search_string, scorer=token_set_ratio, processor=default_process,
-                     results_limit=10, min_score=50.0):
+                      results_limit=10, min_score=50.0):
         """Perform a fuzzy search on the vocabulary.
         Fully parameterized, meant to be called by self.fuzzy_search()
         """
@@ -120,17 +120,10 @@ def _fuzzy_search(self, search_string, scorer=token_set_ratio, processor=default
         # similarity is a float in [0.0, 100.0], 100.0 meaning the
         # search string is a subset of the choice string
         results = extract(search_string.lower(), choices,
-                          scorer=scorer, processor=processor, limit=results_limit)
+                          scorer=scorer, processor=processor,
+                          limit=results_limit, score_cutoff=min_score)
 
-        # find results matching the minimum similarity score
-        # the results list is sorted by decreasing similarity score
-        max_index = 0
-        for i, result in enumerate(results):
-            max_index = i
-            if result[1] < min_score:
-                break
-
-        return [terms_list[results[i][2]] for i in range(max_index)]
+        return [terms_list[result[2]] for result in results]
 
     def fuzzy_search(self, search_string):
         """Perform a fuzzy search on the vocabulary terms"""

From f817e09e25b947ab27bf23878e9d4c8b4db04655 Mon Sep 17 00:00:00 2001
From: Adrien Perrin <adrien.perrin@nersc.no>
Date: Mon, 7 Oct 2024 09:31:48 +0000
Subject: [PATCH 08/10] update default fuzzy_search docstring

---
 pythesint/vocabulary.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/pythesint/vocabulary.py b/pythesint/vocabulary.py
index bb9d8a0..80b2fea 100644
--- a/pythesint/vocabulary.py
+++ b/pythesint/vocabulary.py
@@ -126,5 +126,10 @@ def _fuzzy_search(self, search_string, scorer=token_set_ratio, processor=default
         return [terms_list[result[2]] for result in results]
 
     def fuzzy_search(self, search_string):
-        """Perform a fuzzy search on the vocabulary terms"""
+        """Perform a fuzzy search on the vocabulary terms.
+        Uses default parameters, can be overridden in subclasses.
+        The default scorer uses token set ratio, which gives the
+        highest score when one of the strings is a subset of the other.
+        Word order does not matter.
+        """
         return self._fuzzy_search(search_string)

From 971b8eca9402e1f65f7874fe02b7677aa46787e9 Mon Sep 17 00:00:00 2001
From: Adrien Perrin <adrien.perrin@nersc.no>
Date: Mon, 7 Oct 2024 09:32:04 +0000
Subject: [PATCH 09/10] add tests for fuzzy search

---
 pythesint/tests/test_vocabulary.py | 33 +++++++++++++++++++++++++-----
 1 file changed, 28 insertions(+), 5 deletions(-)

diff --git a/pythesint/tests/test_vocabulary.py b/pythesint/tests/test_vocabulary.py
index 3c88448..17dc98b 100644
--- a/pythesint/tests/test_vocabulary.py
+++ b/pythesint/tests/test_vocabulary.py
@@ -8,7 +8,7 @@
 from collections import OrderedDict
 
 import unittest
-from mock.mock import MagicMock
+import unittest.mock as mock
 
 from pythesint.vocabulary import Vocabulary
 
@@ -33,19 +33,19 @@ def test_find_keyword_get_list_not_implemented(self):
 
     def test_find_keyword_not_found(self):
         vocab = Vocabulary('VOCAB MOCK')
-        vocab.get_list = MagicMock(return_value=self.test_list)
+        vocab.get_list = mock.MagicMock(return_value=self.test_list)
         with self.assertRaises(IndexError):
             vocab.find_keyword('Horse')
 
     def test_find_keyword(self):
         vocab = Vocabulary('VOCAB MOCK')
-        vocab.get_list = MagicMock(return_value=self.test_list)
+        vocab.get_list = mock.MagicMock(return_value=self.test_list)
         self.assertEqual(vocab.find_keyword('dog'), self.dog)
         self.assertEqual(vocab.find_keyword('Animal'), self.animal)
 
     def test_search_keyword(self):
         vocab = Vocabulary('VOCAB MOCK')
-        vocab.get_list = MagicMock(return_value=self.test_list)
+        vocab.get_list = mock.MagicMock(return_value=self.test_list)
         self.assertEqual(vocab.search('dog'), [self.dog])
         self.assertEqual(vocab.search('Animal'), [self.cat,
                                                   self.cat2,
@@ -55,7 +55,7 @@ def test_search_keyword(self):
 
     def test_no_duplicate_in_search(self):
         vocab = Vocabulary('VOCAB MOCK')
-        vocab.get_list = MagicMock(return_value=self.test_list)
+        vocab.get_list = mock.MagicMock(return_value=self.test_list)
         self.assertEqual(vocab.search('Cat'), [self.cat, self.cat2])
 
     def test_no_empty_dict_in_sort_output(self):
@@ -94,6 +94,29 @@ def test_sort_list_aliases(self):
                 OrderedDict([('class', 'Construction'), ('kind', 'House'), ('Name', '')]),
             ])
 
+    def test_fuzzy_search(self):
+        """Test that we get results in the right order using fuzzy
+        search
+        """
+        vocabulary = Vocabulary('test')
+        with mock.patch.object(vocabulary, 'get_list') as mock_get_list:
+            d1 = {'a': 'foo', 'b': 'bar'}
+            d2 = {'a': 'baz', 'b': 'qux'}
+            d3 = {'a': 'quz', 'b': 'quux'}
+            mock_get_list.return_value = [d1, d2, d3]
+
+            self.assertEqual(vocabulary._fuzzy_search('baz', min_score=10.), [d2, d1, d3])
+            self.assertEqual(vocabulary._fuzzy_search('baz', min_score=30.), [d2, d1])
+            self.assertEqual(vocabulary._fuzzy_search('baz', min_score=50.), [d2])
+
+    def test_default_fuzzy_search(self):
+        """Test that _fuzzy_search is called with the default parameters
+        """
+        vocabulary = Vocabulary('test')
+        with mock.patch.object(vocabulary, '_fuzzy_search') as mock_fuzzy_search:
+            vocabulary.fuzzy_search('foo')
+        mock_fuzzy_search.assert_called_once_with('foo')
+
 
 if __name__ == "__main__":
     # import sys;sys.argv = ['', 'Test.testName']

From f447e84655e70bc4939f3688a554b29e5554dcda Mon Sep 17 00:00:00 2001
From: Adrien Perrin <adrien.perrin@nersc.no>
Date: Mon, 7 Oct 2024 09:32:18 +0000
Subject: [PATCH 10/10] add test for cf vocabulary fuzzy search

---
 pythesint/tests/test_cf_vocabulary.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/pythesint/tests/test_cf_vocabulary.py b/pythesint/tests/test_cf_vocabulary.py
index 485cde6..dc90aa9 100644
--- a/pythesint/tests/test_cf_vocabulary.py
+++ b/pythesint/tests/test_cf_vocabulary.py
@@ -3,7 +3,7 @@
 import mock.mock as mock
 import requests
 
-from pythesint.cf_vocabulary import CFVocabulary
+from pythesint.cf_vocabulary import CFVocabulary, ratio
 
 class CFVocabularyTest(unittest.TestCase):
     def test_exception_on_unavailable_remote_file(self):
@@ -34,3 +34,12 @@ def test_fetch_version(self):
                 voc._fetch_online_data(version='9.1.5')
             mock_get.assert_called_with(
                 'https://sdfghdfghd.nersc.no', params={'version': '9.1.5'})
+
+    def test_fuzzy_search(self):
+        """Test that _fuzzy_search is called with the altered parameters
+        """
+        vocabulary = CFVocabulary('test')
+        with mock.patch.object(vocabulary, '_fuzzy_search') as mock_fuzzy_search:
+            vocabulary.fuzzy_search('foo')
+        mock_fuzzy_search.assert_called_once_with(
+            'foo', scorer=ratio, results_limit=10, min_score=90.0)