Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix two issues with fuzzy matching #970

Merged
merged 2 commits into from
Feb 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 36 additions & 7 deletions babel/messages/catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,9 @@
from collections import OrderedDict
from collections.abc import Iterable, Iterator
from copy import copy
from difflib import get_close_matches
from difflib import SequenceMatcher
from email import message_from_string
from heapq import nlargest
from typing import TYPE_CHECKING

from babel import __version__ as VERSION
Expand All @@ -31,6 +32,31 @@

__all__ = ['Message', 'Catalog', 'TranslationError']

def get_close_matches(word, possibilities, n=3, cutoff=0.6):
    """Return the best "good enough" matches for ``word``.

    A modified version of ``difflib.get_close_matches``. The only functional
    difference is that ``autojunk=False`` is passed to ``SequenceMatcher``,
    to work around https://github.com/python/cpython/issues/90825.

    :param word: the sequence for which close matches are desired
    :param possibilities: an iterable of sequences to match ``word`` against
    :param n: the maximum number of matches to return; must be > 0
    :param cutoff: the minimum similarity ratio, in ``[0.0, 1.0]``, that a
                   candidate must reach to be returned
    :return: a list of at most ``n`` candidates, most similar first
    :raises ValueError: if ``n`` is not positive or ``cutoff`` is outside
                        ``[0.0, 1.0]``
    """
    if not n > 0:
        raise ValueError(f"n must be > 0: {n!r}")
    if not 0.0 <= cutoff <= 1.0:
        raise ValueError(f"cutoff must be in [0.0, 1.0]: {cutoff!r}")

    result = []
    s = SequenceMatcher(autojunk=False)  # only change from difflib.py
    s.set_seq2(word)
    for x in possibilities:
        s.set_seq1(x)
        # The two quick ratios are cheap upper bounds; they let us skip the
        # expensive full ratio for candidates that cannot reach the cutoff.
        if s.real_quick_ratio() >= cutoff and s.quick_ratio() >= cutoff:
            score = s.ratio()
            if score >= cutoff:
                result.append((score, x))

    # Keep the n best scorers (best first) and strip the scores.
    return [x for _score, x in nlargest(n, result)]


PYTHON_FORMAT = re.compile(r'''
\%
Expand Down Expand Up @@ -803,10 +829,13 @@ def update(
# Prepare for fuzzy matching
fuzzy_candidates = []
if not no_fuzzy_matching:
fuzzy_candidates = {
self._key_for(msgid): messages[msgid].context
for msgid in messages if msgid and messages[msgid].string
}
fuzzy_candidates = {}
for msgid in messages:
if msgid and messages[msgid].string:
key = self._key_for(msgid)
ctxt = messages[msgid].context
modified_key = key.lower().strip()
fuzzy_candidates[modified_key] = (key, ctxt)
fuzzy_matches = set()

def _merge(message: Message, oldkey: tuple[str, str] | str, newkey: tuple[str, str] | str) -> None:
Expand Down Expand Up @@ -861,8 +890,8 @@ def _merge(message: Message, oldkey: tuple[str, str] | str, newkey: tuple[str, s
matches = get_close_matches(matchkey.lower().strip(),
fuzzy_candidates.keys(), 1)
if matches:
newkey = matches[0]
newctxt = fuzzy_candidates[newkey]
modified_key = matches[0]
newkey, newctxt = fuzzy_candidates[modified_key]
if newctxt is not None:
newkey = newkey, newctxt
_merge(message, newkey, key)
Expand Down
29 changes: 24 additions & 5 deletions tests/messages/test_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,16 +121,16 @@ def test_update_message_updates_comments(self):

def test_update_fuzzy_matching_with_case_change(self):
    """A msgid that differs only in letter case is merged as a fuzzy match.

    The existing catalog holds the upper-case ``'FOO'`` while the template
    holds the lower-case ``'foo'``; after the update the translation must be
    carried over to ``'foo'`` and flagged fuzzy.
    """
    cat = catalog.Catalog()
    cat.add('FOO', 'Voh')
    cat.add('bar', 'Bahr')
    tmpl = catalog.Catalog()
    tmpl.add('foo')
    cat.update(tmpl)
    # 'bar' is absent from the template, so it is the one obsolete entry.
    assert len(cat.obsolete) == 1
    assert 'FOO' not in cat

    assert cat['foo'].string == 'Voh'
    assert cat['foo'].fuzzy is True

def test_update_fuzzy_matching_with_char_change(self):
cat = catalog.Catalog()
Expand Down Expand Up @@ -209,6 +209,25 @@ def test_update_fuzzy_matching_no_cascading(self):
assert cat['fooo'].string == 'Vohe'
assert cat['fooo'].fuzzy is True

def test_update_fuzzy_matching_long_string(self):
    """Long msgids that differ only by a moved marker still fuzzy-match."""
    lipsum = (
        "Lorem Ipsum is simply dummy text of the printing and typesetting "
        "industry. Lorem Ipsum has been the industry's standard dummy text ever "
        "since the 1500s, when an unknown printer took a galley of type and "
        "scrambled it to make a type specimen book. It has survived not only "
        "five centuries, but also the leap into electronic typesetting, "
        "remaining essentially unchanged. It was popularised in the 1960s with "
        "the release of Letraset sheets containing Lorem Ipsum passages, and "
        "more recently with desktop publishing software like Aldus PageMaker "
        "including versions of Lorem Ipsum."
    )
    old_msgid = "ZZZZZZ " + lipsum
    new_msgid = lipsum + " ZZZZZZ"

    cat = catalog.Catalog()
    cat.add(old_msgid, "foo")

    tmpl = catalog.Catalog()
    tmpl.add(new_msgid)

    cat.update(tmpl)

    # The old entry must be merged into the new one, not dropped as obsolete.
    assert cat[new_msgid].fuzzy is True
    assert len(cat.obsolete) == 0

def test_update_without_fuzzy_matching(self):
cat = catalog.Catalog()
cat.add('fo', 'Voh')
Expand Down