Skip to content

Commit

Permalink
Renamed the default heuristic comparator to highlight it's far from perfect (it sucks actually, lots of false positives).
Browse files Browse the repository at this point in the history
  • Loading branch information
blais committed Jun 1, 2024
1 parent ebf1910 commit bd91be5
Show file tree
Hide file tree
Showing 4 changed files with 11 additions and 7 deletions.
2 changes: 1 addition & 1 deletion beangulp/extract_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def test_mark_duplicate_entries(self):
1970-01-02 * "Test"
Assets:Tests 20.00 USD
'''))
compare = similar.comparator()
compare = similar.heuristic_comparator()
extract.mark_duplicate_entries(entries, entries[:1], timedelta(days=2), compare)
self.assertTrue(entries[0].meta[extract.DUPLICATE])
self.assertNotIn(extract.DUPLICATE, entries[1].meta)
Expand Down
2 changes: 1 addition & 1 deletion beangulp/importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ def extract(self, filepath: str, existing: data.Entries) -> data.Entries:
"""
return []

cmp = staticmethod(similar.comparator())
cmp = staticmethod(similar.heuristic_comparator())

def deduplicate(self, entries: data.Entries, existing: data.Entries) -> None:
"""Mark duplicates in extracted entries.
Expand Down
12 changes: 8 additions & 4 deletions beangulp/similar.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def find_similar_entries(entries, existing_entries, cmp=None, window_days=2):
Args:
entries: The list of entries to classify as duplicate or not.
existing_entries: The list of entries against which to match.
comparator: A functor used to establish the similarity of two entries.
cmp: A functor used to establish the similarity of two entries.
window_days: The number of days (inclusive) before or after to scan the
entries to classify against.
Expand All @@ -50,7 +50,7 @@ def find_similar_entries(entries, existing_entries, cmp=None, window_days=2):
window_tail = datetime.timedelta(days=window_days + 1)

if cmp is None:
cmp = comparator()
cmp = heuristic_comparator()

# For each of the new entries, look at existing entries at a nearby date.
duplicates = []
Expand Down Expand Up @@ -81,7 +81,7 @@ def __getattr__(self, name):
Comparator = Callable[[data.Directive, data.Directive], bool]


def comparator(
def heuristic_comparator(
max_date_delta: datetime.timedelta | None = None, epsilon: Decimal | None = None
) -> Comparator:
"""Generic comparison function generator.
Expand All @@ -103,7 +103,7 @@ def comparator(
epsilon: A Decimal fraction representing how close the amounts are
required to be of each other. For example, Decimal("0.01") for 1%.
Returns:
A comparator predicte accepting two directives and returning a bool.
A comparator predicate accepting two directives and returning a bool.
"""

if epsilon is None:
Expand Down Expand Up @@ -173,6 +173,10 @@ def cmp(entry1: data.Directive, entry2: data.Directive) -> bool:
return cmp


# Old alias to the heuristic comparator kept for backwards compatibility.
comparator = heuristic_comparator


def amounts_map(entry):
"""Compute a mapping of (account, currency) -> Decimal balances.
Expand Down
2 changes: 1 addition & 1 deletion beangulp/similar_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ def test_amounts_map(self, entries, _, __):
class TestSimilarityComparator(cmptest.TestCase):

def setUp(self):
self.comparator = similar.comparator(datetime.timedelta(days=2))
self.comparator = similar.heuristic_comparator(datetime.timedelta(days=2))

@loader.load_doc()
def test_simple(self, entries, _, __):
Expand Down

0 comments on commit bd91be5

Please sign in to comment.