From 09a9f5017b8a581c58cf5959648587c59c8e0a2c Mon Sep 17 00:00:00 2001 From: Adrien Perrin Date: Thu, 25 Jul 2024 14:01:57 +0200 Subject: [PATCH 01/10] make vocabularies available in pythesint module --- pythesint/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pythesint/__init__.py b/pythesint/__init__.py index 92fff83..983f667 100644 --- a/pythesint/__init__.py +++ b/pythesint/__init__.py @@ -1,3 +1,2 @@ from __future__ import absolute_import -from pythesint.pythesint import update_all_vocabularies - +from pythesint.pythesint import update_all_vocabularies, vocabularies From 8412d803acd76cf3523b4890e6e0d9ae90cef9a8 Mon Sep 17 00:00:00 2001 From: Adrien Perrin Date: Thu, 25 Jul 2024 14:02:57 +0200 Subject: [PATCH 02/10] add rapidfuzz to dependencies --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 8e3974c..5fce0a9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,6 +30,6 @@ classifiers = [ "Topic :: Utilities", ] keywords = ["metadata", "standards", "thesaurus", "vocabulary"] -dependencies = ["PyYAML", "requests", "xdg;platform_system != 'Windows'"] +dependencies = ["PyYAML", "rapidfuzz", "requests", "xdg;platform_system != 'Windows'"] urls = {Repository = "https://github.com/nansencenter/py-thesaurus-interface"} dynamic = ["version"] From 8a9bac39d3e74ed638eedfcf5bfaf6638898430b Mon Sep 17 00:00:00 2001 From: Adrien Perrin Date: Thu, 25 Jul 2024 14:03:27 +0200 Subject: [PATCH 03/10] add fuzzy search function --- pythesint/pythesint.py | 1 + pythesint/vocabulary.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/pythesint/pythesint.py b/pythesint/pythesint.py index 613b57b..38bb53b 100755 --- a/pythesint/pythesint.py +++ b/pythesint/pythesint.py @@ -42,6 +42,7 @@ def _process_config(): 'search_'+cnf['name']+'_list', vocabulary.search) setattr(current_module, 'update_'+cnf['name'], vocabulary.update) + 
setattr(current_module, 'fuzzy_search_' + cnf['name'], vocabulary.fuzzy_search) vocabularies = {} _process_config() diff --git a/pythesint/vocabulary.py b/pythesint/vocabulary.py index a4abe78..bc232e9 100644 --- a/pythesint/vocabulary.py +++ b/pythesint/vocabulary.py @@ -1,5 +1,10 @@ from collections import OrderedDict +from rapidfuzz.fuzz import token_set_ratio +from rapidfuzz.process import extract +from rapidfuzz.utils import default_process + + class Vocabulary(object): def __init__(self, name, **kwargs): self.name = name @@ -104,3 +109,29 @@ def sort_list(self, list): def get_list(self): raise NotImplementedError + def _fuzzy_search(self, search_string, scorer=token_set_ratio, processor=default_process, + results_limit=10, min_score=50.0): + """Perform a fuzzy search on the vocabulary. + Fully parameterized, meant to be called by self.fuzzy_search() + """ + terms_list = self.get_list() + choices = (' '.join(ordered_dict.values()).lower() for ordered_dict in terms_list) + # returns a list of tuples (choice, similarity, index) + # similarity is a float in [0.0, 100.0], 100.0 meaning the + # search string is a subset of the choice string + results = extract(search_string.lower(), choices, + scorer=scorer, processor=processor, limit=results_limit) + + # find results matching the minimmum similarity score + # the results list is sorted by decreasing similarity score + max_index = 0 + for i, result in enumerate(results): + max_index = i + if result[1] < min_score: + break + + return [terms_list[results[i][2]] for i in range(max_index)] + + def fuzzy_search(self, search_string): + """Perform a fuzzy search on the vocabulary terms""" + return self._fuzzy_search(search_string) From 628effc2442527f4323ba6504bd212a90bb6a6fa Mon Sep 17 00:00:00 2001 From: Adrien Perrin Date: Thu, 25 Jul 2024 14:04:10 +0200 Subject: [PATCH 04/10] override fuzzy search parameters for cf vocabulary --- pythesint/cf_vocabulary.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) 
diff --git a/pythesint/cf_vocabulary.py b/pythesint/cf_vocabulary.py index 122719a..23d2fcf 100644 --- a/pythesint/cf_vocabulary.py +++ b/pythesint/cf_vocabulary.py @@ -1,7 +1,9 @@ from __future__ import absolute_import - from xml.dom.minidom import parseString + import requests +from rapidfuzz.fuzz import partial_ratio, ratio + from pythesint.json_vocabulary import JSONVocabulary @@ -55,3 +57,6 @@ def _fetch_online_data(self, version=None): cf_list.append(stdname) return cf_list + def fuzzy_search(self, search_string): + return self._fuzzy_search(search_string, scorer=ratio, + results_limit=10, min_score=90.0) From 9219169f129b48a8fc61ca86038278200bb3fa2e Mon Sep 17 00:00:00 2001 From: Adrien Perrin Date: Mon, 7 Oct 2024 07:02:07 +0000 Subject: [PATCH 05/10] fix typo --- pythesint/vocabulary.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pythesint/vocabulary.py b/pythesint/vocabulary.py index bc232e9..cb365a3 100644 --- a/pythesint/vocabulary.py +++ b/pythesint/vocabulary.py @@ -122,7 +122,7 @@ def _fuzzy_search(self, search_string, scorer=token_set_ratio, processor=default results = extract(search_string.lower(), choices, scorer=scorer, processor=processor, limit=results_limit) - # find results matching the minimmum similarity score + # find results matching the minimum similarity score # the results list is sorted by decreasing similarity score max_index = 0 for i, result in enumerate(results): From 675c567b032702ca8061fb319efae0c4db9e4d71 Mon Sep 17 00:00:00 2001 From: Adrien Perrin Date: Mon, 7 Oct 2024 09:30:11 +0000 Subject: [PATCH 06/10] update cf vocabulary fuzzy search docstring --- pythesint/cf_vocabulary.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pythesint/cf_vocabulary.py b/pythesint/cf_vocabulary.py index 23d2fcf..c1eaf2b 100644 --- a/pythesint/cf_vocabulary.py +++ b/pythesint/cf_vocabulary.py @@ -58,5 +58,11 @@ def _fetch_online_data(self, version=None): return cf_list def fuzzy_search(self, search_string): + 
"""Use a simple ratio scorer (Levenshtein distance) + We want to find variable names which are close to the search + string, but it is rare that one will be a subset of the other, + so a simple ratio scorer is more suited than the default token + set ratio scorer. + """ return self._fuzzy_search(search_string, scorer=ratio, results_limit=10, min_score=90.0) From 0f8a17e19bb3a91f9e9ccc1d1f95c6f393905c13 Mon Sep 17 00:00:00 2001 From: Adrien Perrin Date: Mon, 7 Oct 2024 09:31:33 +0000 Subject: [PATCH 07/10] simplify _fuzzy_search there is a parameter in rapidfuzz.process.extract to set the minimum score --- pythesint/vocabulary.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/pythesint/vocabulary.py b/pythesint/vocabulary.py index cb365a3..bb9d8a0 100644 --- a/pythesint/vocabulary.py +++ b/pythesint/vocabulary.py @@ -110,7 +110,7 @@ def get_list(self): raise NotImplementedError def _fuzzy_search(self, search_string, scorer=token_set_ratio, processor=default_process, - results_limit=10, min_score=50.0): + results_limit=10, min_score=50.0): """Perform a fuzzy search on the vocabulary. 
Fully parameterized, meant to be called by self.fuzzy_search() + """ @@ -120,17 +120,10 @@ def _fuzzy_search(self, search_string, scorer=token_set_ratio, processor=default # similarity is a float in [0.0, 100.0], 100.0 meaning the # search string is a subset of the choice string results = extract(search_string.lower(), choices, - scorer=scorer, processor=processor, limit=results_limit) + scorer=scorer, processor=processor, + limit=results_limit, score_cutoff=min_score) - # find results matching the minimum similarity score - # the results list is sorted by decreasing similarity score - max_index = 0 - for i, result in enumerate(results): - max_index = i - if result[1] < min_score: - break - - return [terms_list[results[i][2]] for i in range(max_index)] + return [terms_list[result[2]] for result in results] def fuzzy_search(self, search_string): """Perform a fuzzy search on the vocabulary terms""" From f817e09e25b947ab27bf23878e9d4c8b4db04655 Mon Sep 17 00:00:00 2001 From: Adrien Perrin Date: Mon, 7 Oct 2024 09:31:48 +0000 Subject: [PATCH 08/10] update default fuzzy_search docstring --- pythesint/vocabulary.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pythesint/vocabulary.py b/pythesint/vocabulary.py index bb9d8a0..80b2fea 100644 --- a/pythesint/vocabulary.py +++ b/pythesint/vocabulary.py @@ -126,5 +126,10 @@ def _fuzzy_search(self, search_string, scorer=token_set_ratio, processor=default return [terms_list[result[2]] for result in results] def fuzzy_search(self, search_string): - """Perform a fuzzy search on the vocabulary terms""" + """Perform a fuzzy search on the vocabulary terms. + Uses default parameters, can be overridden in subclasses. + The default scorer uses token set ratio, which gives the + highest score when one of the strings is a subset of the other. + Word order does not matter. 
+ """ return self._fuzzy_search(search_string) From 971b8eca9402e1f65f7874fe02b7677aa46787e9 Mon Sep 17 00:00:00 2001 From: Adrien Perrin Date: Mon, 7 Oct 2024 09:32:04 +0000 Subject: [PATCH 09/10] add tests for fuzzy search --- pythesint/tests/test_vocabulary.py | 33 +++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/pythesint/tests/test_vocabulary.py b/pythesint/tests/test_vocabulary.py index 3c88448..17dc98b 100644 --- a/pythesint/tests/test_vocabulary.py +++ b/pythesint/tests/test_vocabulary.py @@ -8,7 +8,7 @@ from collections import OrderedDict import unittest -from mock.mock import MagicMock +import unittest.mock as mock from pythesint.vocabulary import Vocabulary @@ -33,19 +33,19 @@ def test_find_keyword_get_list_not_implemented(self): def test_find_keyword_not_found(self): vocab = Vocabulary('VOCAB MOCK') - vocab.get_list = MagicMock(return_value=self.test_list) + vocab.get_list = mock.MagicMock(return_value=self.test_list) with self.assertRaises(IndexError): vocab.find_keyword('Horse') def test_find_keyword(self): vocab = Vocabulary('VOCAB MOCK') - vocab.get_list = MagicMock(return_value=self.test_list) + vocab.get_list = mock.MagicMock(return_value=self.test_list) self.assertEqual(vocab.find_keyword('dog'), self.dog) self.assertEqual(vocab.find_keyword('Animal'), self.animal) def test_search_keyword(self): vocab = Vocabulary('VOCAB MOCK') - vocab.get_list = MagicMock(return_value=self.test_list) + vocab.get_list = mock.MagicMock(return_value=self.test_list) self.assertEqual(vocab.search('dog'), [self.dog]) self.assertEqual(vocab.search('Animal'), [self.cat, self.cat2, @@ -55,7 +55,7 @@ def test_search_keyword(self): def test_no_duplicate_in_search(self): vocab = Vocabulary('VOCAB MOCK') - vocab.get_list = MagicMock(return_value=self.test_list) + vocab.get_list = mock.MagicMock(return_value=self.test_list) self.assertEqual(vocab.search('Cat'), [self.cat, self.cat2]) def test_no_empty_dict_in_sort_output(self): @@ 
-94,6 +94,29 @@ def test_sort_list_aliases(self): OrderedDict([('class', 'Construction'), ('kind', 'House'), ('Name', '')]), ]) + def test_fuzzy_search(self): + """Test that we get results in the right order using fuzzy + search + """ + vocabulary = Vocabulary('test') + with mock.patch.object(vocabulary, 'get_list') as mock_get_list: + d1 = {'a': 'foo', 'b': 'bar'} + d2 = {'a': 'baz', 'b': 'qux'} + d3 = {'a': 'quz', 'b': 'quux'} + mock_get_list.return_value = [d1, d2, d3] + + self.assertEqual(vocabulary._fuzzy_search('baz', min_score=10.), [d2, d1, d3]) + self.assertEqual(vocabulary._fuzzy_search('baz', min_score=30.), [d2, d1]) + self.assertEqual(vocabulary._fuzzy_search('baz', min_score=50.), [d2]) + + def test_default_fuzzy_search(self): + """Test that _fuzzy_search is called with the default parameters + """ + vocabulary = Vocabulary('test') + with mock.patch.object(vocabulary, '_fuzzy_search') as mock_fuzzy_search: + vocabulary.fuzzy_search('foo') + mock_fuzzy_search.assert_called_once_with('foo') + if __name__ == "__main__": # import sys;sys.argv = ['', 'Test.testName'] From f447e84655e70bc4939f3688a554b29e5554dcda Mon Sep 17 00:00:00 2001 From: Adrien Perrin Date: Mon, 7 Oct 2024 09:32:18 +0000 Subject: [PATCH 10/10] add test for cf vocabulary fuzzy search --- pythesint/tests/test_cf_vocabulary.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/pythesint/tests/test_cf_vocabulary.py b/pythesint/tests/test_cf_vocabulary.py index 485cde6..dc90aa9 100644 --- a/pythesint/tests/test_cf_vocabulary.py +++ b/pythesint/tests/test_cf_vocabulary.py @@ -3,7 +3,7 @@ import mock.mock as mock import requests -from pythesint.cf_vocabulary import CFVocabulary +from pythesint.cf_vocabulary import CFVocabulary, ratio class CFVocabularyTest(unittest.TestCase): def test_exception_on_unavailable_remote_file(self): @@ -34,3 +34,12 @@ def test_fetch_version(self): voc._fetch_online_data(version='9.1.5') mock_get.assert_called_with( 
'https://sdfghdfghd.nersc.no', params={'version': '9.1.5'}) + + def test_fuzzy_search(self): + """Test that _fuzzy_search is called with the altered parameters + """ + vocabulary = CFVocabulary('test') + with mock.patch.object(vocabulary, '_fuzzy_search') as mock_fuzzy_search: + vocabulary.fuzzy_search('foo') + mock_fuzzy_search.assert_called_once_with( + 'foo', scorer=ratio, results_limit=10, min_score=90.0)