From 44b8a638ae4d1777416cda58e9378a1388391e3b Mon Sep 17 00:00:00 2001 From: Hugo Perrier Date: Fri, 4 Oct 2024 17:28:19 +0200 Subject: [PATCH 1/6] :sparkle: Add hooks to MelusineRegex --- melusine/base.py | 61 +++++++++++++++++++++++++++++++ tests/base/test_melusine_regex.py | 53 ++++++++++++++++++++++++--- 2 files changed, 108 insertions(+), 6 deletions(-) diff --git a/melusine/base.py b/melusine/base.py index 18eb785..9776863 100644 --- a/melusine/base.py +++ b/melusine/base.py @@ -308,12 +308,16 @@ class MissingFieldError(Exception): """ +MatchData = dict[str, list[dict[str, Any]]] + + class MelusineRegex(ABC): """ Class to standardise text pattern detection using regex. """ REGEX_FLAGS: re.RegexFlag = re.IGNORECASE | re.MULTILINE + PAIRED_MATCHING_PREFIX: str = "_" # Match fields MATCH_RESULT: str = "match_result" @@ -563,6 +567,63 @@ def _describe_match_field(match_field_data: dict[str, list[dict[str, Any]]]) -> print("The following text matched positively:") _describe_match_field(match_data[self.POSITIVE_MATCH_FIELD]) + def apply_paired_matching(self, negative_match_data: MatchData, positive_match_data: MatchData) -> bool: + """ + Check if negative match is effective in the case of paired matching. + + Args: + negative_match_data: negative_match_data + positive_match_data: positive_match_data + + Returns: + effective_negative_match: negative_match adapted for paired matching + """ + effective_negative_match = False + if positive_match_data and negative_match_data: + positive_match_keys = set(positive_match_data.keys()) + + for key in negative_match_data: + if key.startswith(self.PAIRED_MATCHING_PREFIX): + if key[1:] in positive_match_keys: + effective_negative_match = True + else: + effective_negative_match = True + + return effective_negative_match + + def pre_match_hook(self, text: str) -> str: + """ + Hook to run before the Melusine regex match. + + Args: + text: input text. + + Returns: + _: Modified text. + """ + return text + + def post_match_hook(self, match_dict: dict[str, Any]) -> dict[str, Any]: + """ + Hook to run after the Melusine regex match. + + Args: + match_dict: Match results. + + Returns: + _: Modified match results. + """ + + # Paired matching + negative_match = self.apply_paired_matching( + match_dict[self.NEGATIVE_MATCH_FIELD], match_dict[self.POSITIVE_MATCH_FIELD] + ) + positive_match = bool(match_dict[self.POSITIVE_MATCH_FIELD]) + + match_dict[self.MATCH_RESULT] = positive_match and not negative_match + + return match_dict + def test(self) -> None: """ Test the MelusineRegex on the match_list and no_match_list. diff --git a/tests/base/test_melusine_regex.py b/tests/base/test_melusine_regex.py index 6235bc1..56fb1ea 100644 --- a/tests/base/test_melusine_regex.py +++ b/tests/base/test_melusine_regex.py @@ -46,13 +46,12 @@ def no_match_list(self) -> List[str]: def test_erroneous_substitution_pattern(): with pytest.raises(ValueError): - regex = VirusRegex(substitution_pattern="12345") + _ = VirusRegex(substitution_pattern="12345") def test_method_test(): regex = VirusRegex() regex.test() - assert True def test_match_method(): @@ -94,7 +93,7 @@ def test_describe_method(capfd): # Negative match on bug (group NEGATIVE_BUG) and ignore ladybug and corona virus regex.describe("The computer virus in the ladybug software caused a bug in the corona virus dashboard") - out, err = capfd.readouterr() + out, _ = capfd.readouterr() assert "NEGATIVE_BUG" in out assert "start" not in out @@ -103,18 +102,18 @@ def test_describe_method(capfd): "The computer virus in the ladybug software caused a bug in the corona virus dashboard", position=True, ) - out, err = capfd.readouterr() + out, _ = capfd.readouterr() assert "match result is : NEGATIVE" in out assert "NEGATIVE_BUG" in out assert "start" in out regex.describe("This is a dangerous virus") - out, err = capfd.readouterr() + out, _ = capfd.readouterr() assert "match result is : POSITIVE" in out assert "start" not in out regex.describe("Nada") - out, err = capfd.readouterr() + out, _ = capfd.readouterr() assert "The input text did not match anything" in out @@ -151,3 +150,45 @@ def no_match_list(self): regex = SomeRegex() assert regex.neutral is None assert regex.negative is None + + +class PairedMatchRegex(MelusineRegex): + """ + Test paired matching. + """ + + @property + def positive(self) -> Union[str, Dict[str, str]]: + return { + "test_1": r"pos_pattern_1", + "test_2": r"pos_pattern_2", + } + + @property + def negative(self) -> Optional[Union[str, Dict[str, str]]]: + return { + "_test_1": r"neg_pattern_1", + "generic": r"neg_pattern_2", + } + + @property + def match_list(self) -> List[str]: + return [ + "Test pos_pattern_1", + "pos_pattern_2", + "pos_pattern_2 and neg_pattern_1", + ] + + @property + def no_match_list(self) -> List[str]: + return [ + "test", + "Test pos_pattern_1 and neg_pattern_1", + "pos_pattern_2 and neg_pattern_2", + "pos_pattern_1 and neg_pattern_2", + ] + + +def test_paired_matching_test(): + regex = PairedMatchRegex() + regex.test() From bdacb74fcd616dbeee76b1ac7a70f30b74191ec8 Mon Sep 17 00:00:00 2001 From: Hugo Perrier Date: Fri, 4 Oct 2024 17:39:58 +0200 Subject: [PATCH 2/6] :sparkle: Add hooks to MelusineRegex --- tests/base/test_melusine_regex.py | 38 ++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/tests/base/test_melusine_regex.py b/tests/base/test_melusine_regex.py index 56fb1ea..1816fac 100644 --- a/tests/base/test_melusine_regex.py +++ b/tests/base/test_melusine_regex.py @@ -1,4 +1,4 @@ -from typing import Dict, List, Optional, Union +from typing import Dict, List, Optional, Union, Any import pytest @@ -152,6 +152,42 @@ def no_match_list(self): assert regex.negative is None +class PreMatchHookVirusRegex(VirusRegex): + + def pre_match_hook(self, text: str) -> str: + text = text.replace("virrrrus", "virus") + return text + + +def test_pre_match_hook(): + reg = PreMatchHookVirusRegex() + + bool_match_result = reg.get_match_result("I see a virrrrus !") + + assert bool_match_result is True + + +class PostMatchHookVirusRegex(VirusRegex): + + def post_match_hook(self, match_dict: dict[str, Any]) -> dict[str, Any]: + """Test custom post processing of match data""" + if match_dict[self.MATCH_RESULT] is True: + if "NEUTRAL_MEDICAL_VIRUS" in match_dict[self.NEUTRAL_MATCH_FIELD] and "NEUTRAL_INSECT" in match_dict[self.NEUTRAL_MATCH_FIELD]: + match_dict[self.MATCH_RESULT] = False + + return match_dict + + +def test_post_match_hook(): + reg = PostMatchHookVirusRegex() + + bool_match_result = reg.get_match_result("I see a virus, a corona virus and a ladybug") + assert bool_match_result is False + + bool_match_result = reg.get_match_result("I see a virus and a ladybug") + assert bool_match_result is True + + class PairedMatchRegex(MelusineRegex): """ Test paired matching. From 223ff6bc61787c8a9e8b5420bc941dc94099b5ec Mon Sep 17 00:00:00 2001 From: Hugo Perrier Date: Fri, 4 Oct 2024 17:44:50 +0200 Subject: [PATCH 3/6] :sparkle: Add hooks to MelusineRegex --- melusine/base.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/melusine/base.py b/melusine/base.py index 9776863..98e5b68 100644 --- a/melusine/base.py +++ b/melusine/base.py @@ -487,6 +487,9 @@ def __call__(self, text: str) -> dict[str, Any]: Returns: _: Regex match results. """ + # Apply pre match hook + text = self.pre_match_hook(text) + match_dict = { self.MATCH_RESULT: False, self.NEUTRAL_MATCH_FIELD: {}, @@ -513,6 +516,9 @@ def __call__(self, text: str) -> dict[str, Any]: match_dict[self.MATCH_RESULT] = positive_match and not negative_match + # Apply post match hook + match_dict = self.post_match_hook(match_dict) + return match_dict def describe(self, text: str, position: bool = False) -> None: From e1d0ca875f348f51ca6e4cbc80649bbeabdb946a Mon Sep 17 00:00:00 2001 From: Hugo Perrier Date: Fri, 4 Oct 2024 17:49:00 +0200 Subject: [PATCH 4/6] :sparkle: Add hooks to MelusineRegex --- melusine/base.py | 8 ++++---- tests/base/test_melusine_regex.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/melusine/base.py b/melusine/base.py index 98e5b68..751266a 100644 --- a/melusine/base.py +++ b/melusine/base.py @@ -19,7 +19,7 @@ import logging import re from abc import ABC, abstractmethod -from typing import Any, Callable, Dict, Iterable, TypeVar, Union +from typing import Any, Callable, Dict, Iterable, List, TypeVar, Union import pandas as pd from sklearn.base import BaseEstimator, TransformerMixin @@ -308,7 +308,7 @@ class MissingFieldError(Exception): """ -MatchData = dict[str, list[dict[str, Any]]] +MatchData = Dict[str, List[Dict[str, Any]]] class MelusineRegex(ABC): @@ -530,7 +530,7 @@ def describe(self, text: str, position: bool = False) -> None: position: If True, print regex match start and stop positions. """ - def _describe_match_field(match_field_data: dict[str, list[dict[str, Any]]]) -> None: + def _describe_match_field(match_field_data: Dict[str, List[Dict[str, Any]]]) -> None: """ Format and print result description text. @@ -609,7 +609,7 @@ def pre_match_hook(self, text: str) -> str: """ return text - def post_match_hook(self, match_dict: dict[str, Any]) -> dict[str, Any]: + def post_match_hook(self, match_dict: Dict[str, Any]) -> Dict[str, Any]: """ Hook to run after the Melusine regex match. diff --git a/tests/base/test_melusine_regex.py b/tests/base/test_melusine_regex.py index 1816fac..03918b8 100644 --- a/tests/base/test_melusine_regex.py +++ b/tests/base/test_melusine_regex.py @@ -1,4 +1,4 @@ -from typing import Dict, List, Optional, Union, Any +from typing import Any, Dict, List, Optional, Union import pytest @@ -169,7 +169,7 @@ def test_pre_match_hook(): class PostMatchHookVirusRegex(VirusRegex): - def post_match_hook(self, match_dict: dict[str, Any]) -> dict[str, Any]: + def post_match_hook(self, match_dict: Dict[str, Any]) -> Dict[str, Any]: """Test custom post processing of match data""" if match_dict[self.MATCH_RESULT] is True: if "NEUTRAL_MEDICAL_VIRUS" in match_dict[self.NEUTRAL_MATCH_FIELD] and "NEUTRAL_INSECT" in match_dict[self.NEUTRAL_MATCH_FIELD]: From e115ce8197d1bca8a4d84967970717aff2dfbfcc Mon Sep 17 00:00:00 2001 From: Hugo Perrier Date: Fri, 4 Oct 2024 18:01:18 +0200 Subject: [PATCH 5/6] :sparkle: Add hooks to MelusineRegex --- melusine/base.py | 4 ++-- tests/base/test_melusine_regex.py | 11 ++++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/melusine/base.py b/melusine/base.py index 751266a..1f299e2 100644 --- a/melusine/base.py +++ b/melusine/base.py @@ -530,7 +530,7 @@ def describe(self, text: str, position: bool = False) -> None: position: If True, print regex match start and stop positions. """ - def _describe_match_field(match_field_data: Dict[str, List[Dict[str, Any]]]) -> None: + def _describe_match_field(match_field_data: dict[str, list[dict[str, Any]]]) -> None: """ Format and print result description text. @@ -609,7 +609,7 @@ def pre_match_hook(self, text: str) -> str: """ return text - def post_match_hook(self, match_dict: Dict[str, Any]) -> Dict[str, Any]: + def post_match_hook(self, match_dict: dict[str, Any]) -> dict[str, Any]: """ Hook to run after the Melusine regex match. diff --git a/tests/base/test_melusine_regex.py b/tests/base/test_melusine_regex.py index 03918b8..721412d 100644 --- a/tests/base/test_melusine_regex.py +++ b/tests/base/test_melusine_regex.py @@ -153,7 +153,6 @@ def no_match_list(self): class PreMatchHookVirusRegex(VirusRegex): - def pre_match_hook(self, text: str) -> str: text = text.replace("virrrrus", "virus") return text @@ -168,12 +167,14 @@ def test_pre_match_hook(): class PostMatchHookVirusRegex(VirusRegex): - def post_match_hook(self, match_dict: Dict[str, Any]) -> Dict[str, Any]: """Test custom post processing of match data""" - if match_dict[self.MATCH_RESULT] is True: - if "NEUTRAL_MEDICAL_VIRUS" in match_dict[self.NEUTRAL_MATCH_FIELD] and "NEUTRAL_INSECT" in match_dict[self.NEUTRAL_MATCH_FIELD]: - match_dict[self.MATCH_RESULT] = False + if ( + match_dict[self.MATCH_RESULT] is True + and "NEUTRAL_MEDICAL_VIRUS" in match_dict[self.NEUTRAL_MATCH_FIELD] + and "NEUTRAL_INSECT" in match_dict[self.NEUTRAL_MATCH_FIELD] + ): + match_dict[self.MATCH_RESULT] = False return match_dict From 5028d892fd1d561bce92f9f44f00712b2e1dd623 Mon Sep 17 00:00:00 2001 From: Hugo Perrier Date: Fri, 4 Oct 2024 18:04:30 +0200 Subject: [PATCH 6/6] :sparkle: Add hooks to MelusineRegex --- tests/base/test_melusine_regex.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/base/test_melusine_regex.py b/tests/base/test_melusine_regex.py index 721412d..f056fd6 100644 --- a/tests/base/test_melusine_regex.py +++ b/tests/base/test_melusine_regex.py @@ -170,9 +170,9 @@ class PostMatchHookVirusRegex(VirusRegex): def post_match_hook(self, match_dict: Dict[str, Any]) -> Dict[str, Any]: """Test custom post processing of match data""" if ( - match_dict[self.MATCH_RESULT] is True - and "NEUTRAL_MEDICAL_VIRUS" in match_dict[self.NEUTRAL_MATCH_FIELD] - and "NEUTRAL_INSECT" in match_dict[self.NEUTRAL_MATCH_FIELD] + match_dict[self.MATCH_RESULT] is True + and "NEUTRAL_MEDICAL_VIRUS" in match_dict[self.NEUTRAL_MATCH_FIELD] + and "NEUTRAL_INSECT" in match_dict[self.NEUTRAL_MATCH_FIELD] ): match_dict[self.MATCH_RESULT] = False