Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add fuzzy search function #74

Merged
merged 10 commits into from
Oct 8, 2024
Merged
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,6 @@ classifiers = [
"Topic :: Utilities",
]
keywords = ["metadata", "standards", "thesaurus", "vocabulary"]
dependencies = ["PyYAML", "requests", "xdg;platform_system != 'Windows'"]
dependencies = ["PyYAML", "rapidfuzz", "requests", "xdg;platform_system != 'Windows'"]
urls = {Repository = "https://github.com/nansencenter/py-thesaurus-interface"}
dynamic = ["version"]
3 changes: 1 addition & 2 deletions pythesint/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,2 @@
from __future__ import absolute_import
from pythesint.pythesint import update_all_vocabularies

from pythesint.pythesint import update_all_vocabularies, vocabularies
13 changes: 12 additions & 1 deletion pythesint/cf_vocabulary.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from __future__ import absolute_import

from xml.dom.minidom import parseString

import requests
from rapidfuzz.fuzz import partial_ratio, ratio

from pythesint.json_vocabulary import JSONVocabulary


Expand Down Expand Up @@ -55,3 +57,12 @@ def _fetch_online_data(self, version=None):
cf_list.append(stdname)
return cf_list

def fuzzy_search(self, search_string):
    """Fuzzy-search this vocabulary with a simple ratio scorer.

    We want variable names which are close to the search string as a
    whole. It is rare that one will be a subset of the other, so a
    simple ratio scorer is better suited than the default token set
    ratio scorer.

    :param search_string: text to look for
    :return: whatever ``self._fuzzy_search()`` returns for these
        settings
    """
    # Keep at most 10 results, each scoring 90.0 or more.
    search_options = {
        'scorer': ratio,
        'results_limit': 10,
        'min_score': 90.0,
    }
    return self._fuzzy_search(search_string, **search_options)
1 change: 1 addition & 0 deletions pythesint/pythesint.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ def _process_config():
'search_'+cnf['name']+'_list',
vocabulary.search)
setattr(current_module, 'update_'+cnf['name'], vocabulary.update)
setattr(current_module, 'fuzzy_search_' + cnf['name'], vocabulary.fuzzy_search)

vocabularies = {}
_process_config()
11 changes: 10 additions & 1 deletion pythesint/tests/test_cf_vocabulary.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import mock.mock as mock
import requests

from pythesint.cf_vocabulary import CFVocabulary
from pythesint.cf_vocabulary import CFVocabulary, ratio

class CFVocabularyTest(unittest.TestCase):
def test_exception_on_unavailable_remote_file(self):
Expand Down Expand Up @@ -34,3 +34,12 @@ def test_fetch_version(self):
voc._fetch_online_data(version='9.1.5')
mock_get.assert_called_with(
'https://sdfghdfghd.nersc.no', params={'version': '9.1.5'})

def test_fuzzy_search(self):
    """CFVocabulary.fuzzy_search() must hand its own scorer, result
    limit and minimum score over to _fuzzy_search()
    """
    cf_vocabulary = CFVocabulary('test')
    expected_options = {
        'scorer': ratio,
        'results_limit': 10,
        'min_score': 90.0,
    }
    with mock.patch.object(cf_vocabulary, '_fuzzy_search') as patched_search:
        cf_vocabulary.fuzzy_search('foo')
    # The mock keeps its call record after the patch is undone.
    patched_search.assert_called_once_with('foo', **expected_options)
33 changes: 28 additions & 5 deletions pythesint/tests/test_vocabulary.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from collections import OrderedDict

import unittest
from mock.mock import MagicMock
import unittest.mock as mock

from pythesint.vocabulary import Vocabulary

Expand All @@ -33,19 +33,19 @@ def test_find_keyword_get_list_not_implemented(self):

def test_find_keyword_not_found(self):
# Checks that find_keyword('Horse') raises IndexError when get_list()
# is replaced by a mock returning self.test_list.
vocab = Vocabulary('VOCAB MOCK')
# NOTE(review): the assignment below appears twice; this looks like the
# removed (MagicMock) and added (mock.MagicMock) sides of a diff shown
# together. Presumably only the mock.MagicMock line is live - confirm.
vocab.get_list = MagicMock(return_value=self.test_list)
vocab.get_list = mock.MagicMock(return_value=self.test_list)
with self.assertRaises(IndexError):
vocab.find_keyword('Horse')

def test_find_keyword(self):
# Checks that find_keyword() returns self.dog for 'dog' and self.animal
# for 'Animal' when get_list() is mocked to return self.test_list.
vocab = Vocabulary('VOCAB MOCK')
# NOTE(review): the assignment below appears twice; this looks like the
# removed (MagicMock) and added (mock.MagicMock) sides of a diff shown
# together. Presumably only the mock.MagicMock line is live - confirm.
vocab.get_list = MagicMock(return_value=self.test_list)
vocab.get_list = mock.MagicMock(return_value=self.test_list)
self.assertEqual(vocab.find_keyword('dog'), self.dog)
self.assertEqual(vocab.find_keyword('Animal'), self.animal)

def test_search_keyword(self):
vocab = Vocabulary('VOCAB MOCK')
vocab.get_list = MagicMock(return_value=self.test_list)
vocab.get_list = mock.MagicMock(return_value=self.test_list)
self.assertEqual(vocab.search('dog'), [self.dog])
self.assertEqual(vocab.search('Animal'), [self.cat,
self.cat2,
Expand All @@ -55,7 +55,7 @@ def test_search_keyword(self):

def test_no_duplicate_in_search(self):
# Checks that search('Cat') returns exactly [self.cat, self.cat2] when
# get_list() is mocked to return self.test_list.
vocab = Vocabulary('VOCAB MOCK')
# NOTE(review): the assignment below appears twice; this looks like the
# removed (MagicMock) and added (mock.MagicMock) sides of a diff shown
# together. Presumably only the mock.MagicMock line is live - confirm.
vocab.get_list = MagicMock(return_value=self.test_list)
vocab.get_list = mock.MagicMock(return_value=self.test_list)
self.assertEqual(vocab.search('Cat'), [self.cat, self.cat2])

def test_no_empty_dict_in_sort_output(self):
Expand Down Expand Up @@ -94,6 +94,29 @@ def test_sort_list_aliases(self):
OrderedDict([('class', 'Construction'), ('kind', 'House'), ('Name', '')]),
])

def test_fuzzy_search(self):
    """Check the order of fuzzy search results, and that raising
    min_score trims the weaker matches from the end of the list
    """
    vocabulary = Vocabulary('test')
    foo_entry = {'a': 'foo', 'b': 'bar'}
    baz_entry = {'a': 'baz', 'b': 'qux'}
    quz_entry = {'a': 'quz', 'b': 'quux'}
    entries = [foo_entry, baz_entry, quz_entry]

    # (min_score, expected results) pairs, checked in this order
    expected_by_cutoff = (
        (10., [baz_entry, foo_entry, quz_entry]),
        (30., [baz_entry, foo_entry]),
        (50., [baz_entry]),
    )

    with mock.patch.object(vocabulary, 'get_list', return_value=entries):
        for cutoff, expected in expected_by_cutoff:
            self.assertEqual(
                vocabulary._fuzzy_search('baz', min_score=cutoff), expected)

def test_default_fuzzy_search(self):
    """fuzzy_search() must pass only the search string on to
    _fuzzy_search(), leaving every other parameter at its default
    """
    vocabulary = Vocabulary('test')
    patcher = mock.patch.object(vocabulary, '_fuzzy_search')
    with patcher as patched_search:
        vocabulary.fuzzy_search('foo')
    # The mock keeps its call record after the patch is undone.
    patched_search.assert_called_once_with('foo')


if __name__ == "__main__":
# import sys;sys.argv = ['', 'Test.testName']
Expand Down
29 changes: 29 additions & 0 deletions pythesint/vocabulary.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
from collections import OrderedDict

from rapidfuzz.fuzz import token_set_ratio
from rapidfuzz.process import extract
from rapidfuzz.utils import default_process


class Vocabulary(object):
def __init__(self, name, **kwargs):
self.name = name
Expand Down Expand Up @@ -104,3 +109,27 @@ def sort_list(self, list):
def get_list(self):
    """Return the terms of the vocabulary.

    Not implemented here: this version always raises
    NotImplementedError.
    """
    raise NotImplementedError()

def _fuzzy_search(self, search_string, scorer=token_set_ratio, processor=default_process,
                  results_limit=10, min_score=50.0):
    """Perform a fuzzy search on the vocabulary.

    Fully parameterized, meant to be called by self.fuzzy_search().

    :param search_string: text to look for; it is lower-cased before
        being compared
    :param scorer: similarity function handed to rapidfuzz's
        ``extract``
    :param processor: preprocessing function handed to ``extract``
    :param results_limit: passed to ``extract`` as ``limit``
    :param min_score: passed to ``extract`` as ``score_cutoff``
    :return: list of entries taken from ``self.get_list()``, in the
        order in which ``extract`` returned the matches
    """
    entries = self.get_list()
    # One lower-cased string per entry, made of all its values joined
    # with spaces. This is what the search string is compared to.
    searchable_texts = (' '.join(entry.values()).lower() for entry in entries)
    matches = extract(
        search_string.lower(),
        searchable_texts,
        scorer=scorer,
        processor=processor,
        limit=results_limit,
        score_cutoff=min_score,
    )
    # Each match is a (choice, similarity, index) tuple, where
    # similarity is a float in [0.0, 100.0]. Only the index is needed,
    # to map the match back to the original entry.
    return [entries[index] for _text, _similarity, index in matches]

def fuzzy_search(self, search_string):
    """Perform a fuzzy search on the vocabulary terms.

    Uses the default parameters of ``self._fuzzy_search()``; can be
    overridden in subclasses. The default scorer uses token set ratio,
    which gives the highest score when one of the strings is a subset
    of the other. Word order does not matter.

    :param search_string: text to look for
    :return: whatever ``self._fuzzy_search()`` returns
    """
    matching_terms = self._fuzzy_search(search_string)
    return matching_terms