From 63bb71a5acd0d49a2ceee15098485bc34b0e8864 Mon Sep 17 00:00:00 2001 From: Jean Abou Samra Date: Sun, 12 Feb 2023 02:46:30 +0100 Subject: [PATCH 1/2] In fuzzy matching, also .lower().strip() fuzzy candidates This seems intended at easing fuzzy matching with trivial edits in the msgstr (changing case and adding whitespace), but it was only done on the new msgstr, not on the old msgstr candidates, so it was possible for merging catalogs to miss messages. --- babel/messages/catalog.py | 15 +++++++++------ tests/messages/test_catalog.py | 10 +++++----- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/babel/messages/catalog.py b/babel/messages/catalog.py index dead4aac7..190264346 100644 --- a/babel/messages/catalog.py +++ b/babel/messages/catalog.py @@ -803,10 +803,13 @@ def update( # Prepare for fuzzy matching fuzzy_candidates = [] if not no_fuzzy_matching: - fuzzy_candidates = { - self._key_for(msgid): messages[msgid].context - for msgid in messages if msgid and messages[msgid].string - } + fuzzy_candidates = {} + for msgid in messages: + if msgid and messages[msgid].string: + key = self._key_for(msgid) + ctxt = messages[msgid].context + modified_key = key.lower().strip() + fuzzy_candidates[modified_key] = (key, ctxt) fuzzy_matches = set() def _merge(message: Message, oldkey: tuple[str, str] | str, newkey: tuple[str, str] | str) -> None: @@ -861,8 +864,8 @@ def _merge(message: Message, oldkey: tuple[str, str] | str, newkey: tuple[str, s matches = get_close_matches(matchkey.lower().strip(), fuzzy_candidates.keys(), 1) if matches: - newkey = matches[0] - newctxt = fuzzy_candidates[newkey] + modified_key = matches[0] + newkey, newctxt = fuzzy_candidates[modified_key] if newctxt is not None: newkey = newkey, newctxt _merge(message, newkey, key) diff --git a/tests/messages/test_catalog.py b/tests/messages/test_catalog.py index 273c83f51..c2e7aeda0 100644 --- a/tests/messages/test_catalog.py +++ b/tests/messages/test_catalog.py @@ -121,16 +121,16 @@ def 
test_update_message_updates_comments(self): def test_update_fuzzy_matching_with_case_change(self): cat = catalog.Catalog() - cat.add('foo', 'Voh') + cat.add('FOO', 'Voh') cat.add('bar', 'Bahr') tmpl = catalog.Catalog() - tmpl.add('Foo') + tmpl.add('foo') cat.update(tmpl) assert len(cat.obsolete) == 1 - assert 'foo' not in cat + assert 'FOO' not in cat - assert cat['Foo'].string == 'Voh' - assert cat['Foo'].fuzzy is True + assert cat['foo'].string == 'Voh' + assert cat['foo'].fuzzy is True def test_update_fuzzy_matching_with_char_change(self): cat = catalog.Catalog() From c8b7ac5b7ae1428aa6315abb6b90af0c03ab4e30 Mon Sep 17 00:00:00 2001 From: Jean Abou Samra Date: Sun, 12 Feb 2023 02:49:28 +0100 Subject: [PATCH 2/2] Turn off difflib "autojunk" heuristic in fuzzy matching difflib has a heuristic that used to make fuzzy matching unreliable for >200char strings. See https://github.com/python/cpython/issues/90825 Fixes #969 --- babel/messages/catalog.py | 28 +++++++++++++++++++++++++++- tests/messages/test_catalog.py | 19 +++++++++++++++++++ 2 files changed, 46 insertions(+), 1 deletion(-) diff --git a/babel/messages/catalog.py b/babel/messages/catalog.py index 190264346..a500e77b7 100644 --- a/babel/messages/catalog.py +++ b/babel/messages/catalog.py @@ -14,8 +14,9 @@ from collections import OrderedDict from collections.abc import Iterable, Iterator from copy import copy -from difflib import get_close_matches +from difflib import SequenceMatcher from email import message_from_string +from heapq import nlargest from typing import TYPE_CHECKING from babel import __version__ as VERSION @@ -31,6 +32,31 @@ __all__ = ['Message', 'Catalog', 'TranslationError'] +def get_close_matches(word, possibilities, n=3, cutoff=0.6): + """A modified version of ``difflib.get_close_matches``. + + It just passes ``autojunk=False`` to the ``SequenceMatcher``, to work + around https://github.com/python/cpython/issues/90825. 
+ """ + if not n > 0: + raise ValueError("n must be > 0: %r" % (n,)) + if not 0.0 <= cutoff <= 1.0: + raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,)) + result = [] + s = SequenceMatcher(autojunk=False) # only line changed from difflib.py + s.set_seq2(word) + for x in possibilities: + s.set_seq1(x) + if s.real_quick_ratio() >= cutoff and \ + s.quick_ratio() >= cutoff and \ + s.ratio() >= cutoff: + result.append((s.ratio(), x)) + + # Move the best scorers to head of list + result = nlargest(n, result) + # Strip scores for the best n matches + return [x for score, x in result] + PYTHON_FORMAT = re.compile(r''' \% diff --git a/tests/messages/test_catalog.py b/tests/messages/test_catalog.py index c2e7aeda0..b9d72bc39 100644 --- a/tests/messages/test_catalog.py +++ b/tests/messages/test_catalog.py @@ -209,6 +209,25 @@ def test_update_fuzzy_matching_no_cascading(self): assert cat['fooo'].string == 'Vohe' assert cat['fooo'].fuzzy is True + def test_update_fuzzy_matching_long_string(self): + lipsum = "\ +Lorem Ipsum is simply dummy text of the printing and typesetting \ +industry. Lorem Ipsum has been the industry's standard dummy text ever \ +since the 1500s, when an unknown printer took a galley of type and \ +scrambled it to make a type specimen book. It has survived not only \ +five centuries, but also the leap into electronic typesetting, \ +remaining essentially unchanged. It was popularised in the 1960s with \ +the release of Letraset sheets containing Lorem Ipsum passages, and \ +more recently with desktop publishing software like Aldus PageMaker \ +including versions of Lorem Ipsum." + cat = catalog.Catalog() + cat.add("ZZZZZZ " + lipsum, "foo") + tmpl = catalog.Catalog() + tmpl.add(lipsum + " ZZZZZZ") + cat.update(tmpl) + assert cat[lipsum + " ZZZZZZ"].fuzzy is True + assert len(cat.obsolete) == 0 + def test_update_without_fuzzy_matching(self): cat = catalog.Catalog() cat.add('fo', 'Voh')