diff --git a/README.md b/README.md index 78e51f9..7ec5a75 100644 --- a/README.md +++ b/README.md @@ -33,19 +33,19 @@ trie.add("abxy") assert trie.pattern() == "(?:ab(?:c|s(?:olute)?|xy?)|foo)" ``` -A Trie may be populated with zero or more strings at instantiation or via `.add`, from which method chaining is possible. Two Trie may be merged with the `+` and `+=` operators and will compare equal if their data dictionaries are equal. +A `Trie` may be populated with zero or more strings at instantiation or via `Trie.add`, from which method chaining is possible. Two instances can be merged with the `+` (new instance) and `+=` (in-place update) operators. Instances will compare equal if their data dictionaries are equal. ```py - trie = Trie() - trie += Trie("abc") - assert ( - trie + Trie().add("foo") - == Trie("abc", "foo") - == Trie(*["abc", "foo"]) - == Trie().add(*["abc", "foo"]) - == Trie().add("abc", "foo") - == Trie().add("abc").add("foo") - ) +trie = Trie() +trie += Trie("abc") +assert ( + trie + Trie().add("foo") + == Trie("abc", "foo") + == Trie(*["abc", "foo"]) + == Trie().add(*["abc", "foo"]) + == Trie().add("abc", "foo") + == Trie().add("abc").add("foo") +) ``` diff --git a/src/retrie/retrie.py b/src/retrie/retrie.py index 01c25a1..66de657 100644 --- a/src/retrie/retrie.py +++ b/src/retrie/retrie.py @@ -139,7 +139,11 @@ def parse_re_flags( cls, re_flags, # type: re_flag_type ): # type: (...) -> int - """Convert re_flags to integer.""" + """Convert re_flags to integer. + + Args: + re_flags (re.RegexFlag | int | None): The flags to cast to integer. + """ return int(re_flags) if re_flags else 0 def pattern(self): # type: (...) -> Text @@ -208,7 +212,8 @@ def __init__( Retrie.__init__(self, word_boundary=word_boundary, re_flags=re_flags) - self.trie.add(*keys) + for key in keys: # lazy exhaust in case keys is a huge generator + self.trie.add(key) @cached_property def compiled(self): # type: (...) 
-> Pattern[Text] @@ -218,13 +223,21 @@ def compiled(self): # type: (...) -> Pattern[Text] def is_listed( self, term # type: Text ): # type: (...) -> bool - """Return True if Pattern is found in term.""" + """Return True if Pattern is found in term. + + Args: + term (str): The string to search. + """ return bool(self.compiled.search(term)) def not_listed( self, term # type: Text ): # type: (...) -> bool - """Return True if Pattern is not found in term.""" + """Return True if Pattern is not found in term. + + Args: + term (str): The string to search. + """ return not self.is_listed(term) @@ -260,20 +273,32 @@ def __init__( def is_blacklisted( self, term # type: Text ): # type: (...) -> bool - """Return True if Pattern is found in term.""" + """Return True if Pattern is found in term. + + Args: + term (str): The string to search. + """ return self.is_listed(term) def filter( # noqa:A003 self, sequence, # type: Sequence[Text] ): # type: (...) -> Iterator[Text] - """Construct an iterator from those elements of sequence not blacklisted.""" + """Construct an iterator from those elements of sequence not blacklisted. + + Args: + sequence (Sequence): The sequence of strings to filter. + """ return filter(self.not_listed, sequence) def cleanse_text( self, term # type: Text ): # type: (...) -> Text - """Return text, removing all blacklisted terms.""" + """Return text, removing all blacklisted terms. + + Args: + term (str): The string to search. + """ return self.compiled.sub("", term) @@ -309,20 +334,32 @@ def __init__( def is_whitelisted( self, term # type: Text ): # type: (...) -> bool - """Return True if Pattern is found in term.""" + """Return True if Pattern is found in term. + + Args: + term (str): The string to search. + """ return self.is_listed(term) def filter( # noqa:A003 self, sequence, # type: Sequence[Text] ): # type: (...) 
-> Iterator[Text] - """Construct an iterator from whitelisted elements of sequence.""" + """Construct an iterator from whitelisted elements of sequence. + + Args: + sequence (Sequence): The sequence of strings to filter. + """ return filter(self.is_listed, sequence) def cleanse_text( self, term # type: Text ): # type: (...) -> Text - """Return text, only keeping whitelisted terms.""" + """Return text, only keeping whitelisted terms. + + Args: + term (str): The string to search. + """ return "".join(self.compiled.findall(term)) diff --git a/src/retrie/trie.py b/src/retrie/trie.py index 8c9261e..ea945fe 100644 --- a/src/retrie/trie.py +++ b/src/retrie/trie.py @@ -19,9 +19,9 @@ trie.add("abxy") assert trie.pattern() == "(?:ab(?:c|s(?:olute)?|xy?)|foo)" -A Trie may be populated with zero or more strings at instantiation or via `.add`, from -which method chaining is possible. Two Trie may be merged with the `+` and `+=` -operators and will compare equal if their data dictionaries are equal. +A :class:`Trie` may be populated with zero or more strings at instantiation or via :meth:`Trie.add`, from +which method chaining is possible. Two instances can be merged with the ``+`` (new instance) and +``+=`` (in-place update) operators. Instances will compare equal if their data dictionaries are equal. :: trie = Trie() @@ -44,11 +44,17 @@ class Trie: - """Create a Trie for a sequence of strings. + """Create a Trie with zero or more words at instantiation (or later via :meth:`Trie.add`). - The Trie can be exported to a Regex pattern, which should match much faster than a - simple Regex union. + The Trie can be exported to a Regex pattern via :meth:`Trie.pattern`, which should match + much faster than a simple Regex union. For best performance, pass the pattern to :func:`re.compile` + and cache it to avoid recompiling for every search. See also :attr:`retrie.retrie.Checklist.compiled`.
+ Two instances can be merged with the ``+`` (new instance) and ``+=`` (in-place update) operators. + Instances will compare equal if their data dictionaries are equal. + + Args: + *word (str): Zero or more strings to add to the Trie at instantiation. """ __slots__ = "data" @@ -105,7 +111,11 @@ def _merge_subtrie( def add( self, *word # type: Text ): # type: (...) -> "Trie" - """Add one or more words to the current Trie.""" + """Add one or more words to the current Trie. + + Args: + *word (str): One or more strings to add to the Trie. + """ for word in word: ref = self.data for char in word: