Skip to content

Commit

Permalink
Merge pull request #970 from jeanas/autojunk
Browse files Browse the repository at this point in the history
Fix two issues with fuzzy matching
  • Loading branch information
akx committed Feb 20, 2023
2 parents 08af5e2 + c8b7ac5 commit c76f1d4
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 12 deletions.
43 changes: 36 additions & 7 deletions babel/messages/catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,9 @@
from collections import OrderedDict
from collections.abc import Iterable, Iterator
from copy import copy
from difflib import get_close_matches
from difflib import SequenceMatcher
from email import message_from_string
from heapq import nlargest
from typing import TYPE_CHECKING

from babel import __version__ as VERSION
Expand All @@ -31,6 +32,31 @@

__all__ = ['Message', 'Catalog', 'TranslationError']

def get_close_matches(word, possibilities, n=3, cutoff=0.6):
    """Return the candidates from ``possibilities`` most similar to ``word``.

    A modified version of ``difflib.get_close_matches``. The only
    functional difference is that ``autojunk=False`` is passed to the
    ``SequenceMatcher``, to work around
    https://github.com/python/cpython/issues/90825.

    :param word: the sequence (typically a string) to find matches for
    :param possibilities: iterable of candidate sequences to compare
                          against ``word``
    :param n: maximum number of matches to return; must be greater than 0
    :param cutoff: minimum similarity score, in ``[0.0, 1.0]``, a candidate
                   needs in order to be considered a match
    :return: list of at most ``n`` candidates scoring at least ``cutoff``,
             ordered from most to least similar
    :raises ValueError: if ``n`` is not greater than 0 or ``cutoff`` is
                        outside ``[0.0, 1.0]``
    """
    if not n > 0:
        raise ValueError("n must be > 0: %r" % (n,))
    if not 0.0 <= cutoff <= 1.0:
        raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,))

    matcher = SequenceMatcher(autojunk=False)  # the deviation from difflib.py
    matcher.set_seq2(word)

    scored = []
    for candidate in possibilities:
        matcher.set_seq1(candidate)
        # Try the two inexpensive estimates before paying for the full ratio.
        if matcher.real_quick_ratio() < cutoff:
            continue
        if matcher.quick_ratio() < cutoff:
            continue
        # Compute the full ratio once and reuse it for both the threshold
        # check and the ranking key.
        score = matcher.ratio()
        if score >= cutoff:
            scored.append((score, candidate))

    # Keep the n best scorers (best first) and strip the scores.
    return [candidate for _score, candidate in nlargest(n, scored)]


PYTHON_FORMAT = re.compile(r'''
\%
Expand Down Expand Up @@ -803,10 +829,13 @@ def update(
# Prepare for fuzzy matching
fuzzy_candidates = []
if not no_fuzzy_matching:
fuzzy_candidates = {
self._key_for(msgid): messages[msgid].context
for msgid in messages if msgid and messages[msgid].string
}
fuzzy_candidates = {}
for msgid in messages:
if msgid and messages[msgid].string:
key = self._key_for(msgid)
ctxt = messages[msgid].context
modified_key = key.lower().strip()
fuzzy_candidates[modified_key] = (key, ctxt)
fuzzy_matches = set()

def _merge(message: Message, oldkey: tuple[str, str] | str, newkey: tuple[str, str] | str) -> None:
Expand Down Expand Up @@ -861,8 +890,8 @@ def _merge(message: Message, oldkey: tuple[str, str] | str, newkey: tuple[str, s
matches = get_close_matches(matchkey.lower().strip(),
fuzzy_candidates.keys(), 1)
if matches:
newkey = matches[0]
newctxt = fuzzy_candidates[newkey]
modified_key = matches[0]
newkey, newctxt = fuzzy_candidates[modified_key]
if newctxt is not None:
newkey = newkey, newctxt
_merge(message, newkey, key)
Expand Down
29 changes: 24 additions & 5 deletions tests/messages/test_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,16 +121,16 @@ def test_update_message_updates_comments(self):

def test_update_fuzzy_matching_with_case_change(self):
    # A msgid that differs from an existing one only by letter case must
    # be picked up as a fuzzy match and keyed by the template's spelling.
    cat = catalog.Catalog()
    cat.add('FOO', 'Voh')
    cat.add('bar', 'Bahr')
    tmpl = catalog.Catalog()
    tmpl.add('foo')
    cat.update(tmpl)
    # Exactly one entry is expected to end up obsolete after the update.
    assert len(cat.obsolete) == 1
    # The old upper-case key must no longer be present ...
    assert 'FOO' not in cat

    # ... and its translation is carried over, flagged fuzzy, under the
    # new key.
    assert cat['foo'].string == 'Voh'
    assert cat['foo'].fuzzy is True

def test_update_fuzzy_matching_with_char_change(self):
cat = catalog.Catalog()
Expand Down Expand Up @@ -209,6 +209,25 @@ def test_update_fuzzy_matching_no_cascading(self):
assert cat['fooo'].string == 'Vohe'
assert cat['fooo'].fuzzy is True

def test_update_fuzzy_matching_long_string(self):
    # Long msgids that differ only by a short prefix/suffix must still be
    # recognised as fuzzy matches of one another.
    lipsum = (
        "Lorem Ipsum is simply dummy text of the printing and typesetting "
        "industry. Lorem Ipsum has been the industry's standard dummy text ever "
        "since the 1500s, when an unknown printer took a galley of type and "
        "scrambled it to make a type specimen book. It has survived not only "
        "five centuries, but also the leap into electronic typesetting, "
        "remaining essentially unchanged. It was popularised in the 1960s with "
        "the release of Letraset sheets containing Lorem Ipsum passages, and "
        "more recently with desktop publishing software like Aldus PageMaker "
        "including versions of Lorem Ipsum."
    )
    old_msgid = "ZZZZZZ " + lipsum
    new_msgid = lipsum + " ZZZZZZ"

    existing = catalog.Catalog()
    existing.add(old_msgid, "foo")
    template = catalog.Catalog()
    template.add(new_msgid)

    existing.update(template)

    # The new msgid is marked fuzzy and nothing is left behind as obsolete.
    assert existing[new_msgid].fuzzy is True
    assert len(existing.obsolete) == 0

def test_update_without_fuzzy_matching(self):
cat = catalog.Catalog()
cat.add('fo', 'Voh')
Expand Down

0 comments on commit c76f1d4

Please sign in to comment.