nansencenter · akorosov · Oct 8, 2024 · Jul 25, 2024 · Jul 25, 2024 · Jul 25, 2024
diff --git a/pyproject.toml b/pyproject.toml
@@ -30,6 +30,6 @@ classifiers = [
     "Topic :: Utilities",
 ]
 keywords = ["metadata", "standards", "thesaurus", "vocabulary"]
-dependencies = ["PyYAML", "requests", "xdg;platform_system != 'Windows'"]
+dependencies = ["PyYAML", "rapidfuzz", "requests", "xdg;platform_system != 'Windows'"]
 urls = {Repository = "https://github.com/nansencenter/py-thesaurus-interface"}
 dynamic = ["version"]
diff --git a/pythesint/__init__.py b/pythesint/__init__.py
@@ -1,3 +1,2 @@
 from __future__ import absolute_import
-from pythesint.pythesint import update_all_vocabularies
-
+from pythesint.pythesint import update_all_vocabularies, vocabularies
diff --git a/pythesint/cf_vocabulary.py b/pythesint/cf_vocabulary.py
@@ -1,7 +1,9 @@
 from __future__ import absolute_import
-
 from xml.dom.minidom import parseString
+
 import requests
+from rapidfuzz.fuzz import partial_ratio, ratio
+
 from pythesint.json_vocabulary import JSONVocabulary
 
 
@@ -55,3 +57,12 @@ def _fetch_online_data(self, version=None):
                 cf_list.append(stdname)
         return cf_list
 
+    def fuzzy_search(self, search_string):
+        """Use a simple ratio scorer (Levenshtein distance)
+        We want to find variable names which are close to the search
+        string, but it is rare that one will be a subset of the other,
+        so a simple ratio scorer is more suited than the default token
+        set ratio scorer.
+        """
+        return self._fuzzy_search(search_string, scorer=ratio,
+                                  results_limit=10, min_score=90.0)
diff --git a/pythesint/pythesint.py b/pythesint/pythesint.py
@@ -42,6 +42,7 @@ def _process_config():
                 'search_'+cnf['name']+'_list',
                 vocabulary.search)
         setattr(current_module, 'update_'+cnf['name'], vocabulary.update)
+        setattr(current_module, 'fuzzy_search_' + cnf['name'], vocabulary.fuzzy_search)
 
 vocabularies = {}
 _process_config()
diff --git a/pythesint/tests/test_cf_vocabulary.py b/pythesint/tests/test_cf_vocabulary.py
@@ -3,7 +3,7 @@
 import mock.mock as mock
 import requests
 
-from pythesint.cf_vocabulary import CFVocabulary
+from pythesint.cf_vocabulary import CFVocabulary, ratio
 
 class CFVocabularyTest(unittest.TestCase):
     def test_exception_on_unavailable_remote_file(self):
@@ -34,3 +34,12 @@ def test_fetch_version(self):
                 voc._fetch_online_data(version='9.1.5')
             mock_get.assert_called_with(
                 'https://sdfghdfghd.nersc.no', params={'version': '9.1.5'})
+
+    def test_fuzzy_search(self):
+        """Test that _fuzzy_search is called with the altered parameters
+        """
+        vocabulary = CFVocabulary('test')
+        with mock.patch.object(vocabulary, '_fuzzy_search') as mock_fuzzy_search:
+            vocabulary.fuzzy_search('foo')
+        mock_fuzzy_search.assert_called_once_with(
+            'foo', scorer=ratio, results_limit=10, min_score=90.0)
diff --git a/pythesint/tests/test_vocabulary.py b/pythesint/tests/test_vocabulary.py
@@ -8,7 +8,7 @@
 from collections import OrderedDict
 
 import unittest
-from mock.mock import MagicMock
+import unittest.mock as mock
 
 from pythesint.vocabulary import Vocabulary
 
@@ -33,19 +33,19 @@ def test_find_keyword_get_list_not_implemented(self):
 
     def test_find_keyword_not_found(self):
         vocab = Vocabulary('VOCAB MOCK')
-        vocab.get_list = MagicMock(return_value=self.test_list)
+        vocab.get_list = mock.MagicMock(return_value=self.test_list)
         with self.assertRaises(IndexError):
             vocab.find_keyword('Horse')
 
     def test_find_keyword(self):
         vocab = Vocabulary('VOCAB MOCK')
-        vocab.get_list = MagicMock(return_value=self.test_list)
+        vocab.get_list = mock.MagicMock(return_value=self.test_list)
         self.assertEqual(vocab.find_keyword('dog'), self.dog)
         self.assertEqual(vocab.find_keyword('Animal'), self.animal)
 
     def test_search_keyword(self):
         vocab = Vocabulary('VOCAB MOCK')
-        vocab.get_list = MagicMock(return_value=self.test_list)
+        vocab.get_list = mock.MagicMock(return_value=self.test_list)
         self.assertEqual(vocab.search('dog'), [self.dog])
         self.assertEqual(vocab.search('Animal'), [self.cat,
                                                   self.cat2,
@@ -55,7 +55,7 @@ def test_search_keyword(self):
 
     def test_no_duplicate_in_search(self):
         vocab = Vocabulary('VOCAB MOCK')
-        vocab.get_list = MagicMock(return_value=self.test_list)
+        vocab.get_list = mock.MagicMock(return_value=self.test_list)
         self.assertEqual(vocab.search('Cat'), [self.cat, self.cat2])
 
     def test_no_empty_dict_in_sort_output(self):
@@ -94,6 +94,29 @@ def test_sort_list_aliases(self):
                 OrderedDict([('class', 'Construction'), ('kind', 'House'), ('Name', '')]),
             ])
 
+    def test_fuzzy_search(self):
+        """Test that we get results in the right order using fuzzy
+        search
+        """
+        vocabulary = Vocabulary('test')
+        with mock.patch.object(vocabulary, 'get_list') as mock_get_list:
+            d1 = {'a': 'foo', 'b': 'bar'}
+            d2 = {'a': 'baz', 'b': 'qux'}
+            d3 = {'a': 'quz', 'b': 'quux'}
+            mock_get_list.return_value = [d1, d2, d3]
+
+            self.assertEqual(vocabulary._fuzzy_search('baz', min_score=10.), [d2, d1, d3])
+            self.assertEqual(vocabulary._fuzzy_search('baz', min_score=30.), [d2, d1])
+            self.assertEqual(vocabulary._fuzzy_search('baz', min_score=50.), [d2])
+
+    def test_default_fuzzy_search(self):
+        """Test that _fuzzy_search is called with the default parameters
+        """
+        vocabulary = Vocabulary('test')
+        with mock.patch.object(vocabulary, '_fuzzy_search') as mock_fuzzy_search:
+            vocabulary.fuzzy_search('foo')
+        mock_fuzzy_search.assert_called_once_with('foo')
+
 
 if __name__ == "__main__":
     # import sys;sys.argv = ['', 'Test.testName']

diff --git a/pythesint/vocabulary.py b/pythesint/vocabulary.py
@@ -1,5 +1,10 @@
 from collections import OrderedDict
 
+from rapidfuzz.fuzz import token_set_ratio
+from rapidfuzz.process import extract
+from rapidfuzz.utils import default_process
+
+
 class Vocabulary(object):
     def __init__(self, name, **kwargs):
         self.name = name
@@ -104,3 +109,27 @@ def sort_list(self, list):
     def get_list(self):
         raise NotImplementedError
 
+    def _fuzzy_search(self, search_string, scorer=token_set_ratio, processor=default_process,
+                      results_limit=10, min_score=50.0):
+        """Perform a fuzzy search on the vocabulary.
+        Fully parameterized, meant to be called by self.fuzzy_search()
+        """
+        terms_list = self.get_list()
+        choices = (' '.join(ordered_dict.values()).lower() for ordered_dict in terms_list)
+        # returns a list of tuples (choice, similarity, index)
+        # similarity is a float in [0.0, 100.0]; with the default scorer,
+        # 100.0 means the search string is a subset of the choice string
+        results = extract(search_string.lower(), choices,
+                          scorer=scorer, processor=processor,
+                          limit=results_limit, score_cutoff=min_score)
+
+        return [terms_list[result[2]] for result in results]
+
+    def fuzzy_search(self, search_string):
+        """Perform a fuzzy search on the vocabulary terms.
+        Uses default parameters, can be overridden in subclasses.
+        The default scorer uses token set ratio, which gives the
+        highest score when one of the strings is a subset of the other.
+        Word order does not matter.
+        """
+        return self._fuzzy_search(search_string)