From 1bcf3fc3ecca7ec9a07124af2ff41ef277ad9685 Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Wed, 9 Oct 2024 00:05:33 +0200 Subject: [PATCH] Update required phrase generation * This update decouples the creation of is_required_phrase rules from updating existing rules in a separate CLI. This makes it easier to control which rules are used as required phrases. * This now skips more rules when adding required phrases to existing rules: any rule that cannot be matched approximately is skipped — not only tiny rules, but also many other rules. * This checks that no rule gets a required phrase added that would break in the middle of a URL, email, or copyright. This is done by checking that no required phrase injection changes the set of ignorables of a rule, which could break a URL, making it no longer a proper URL. Same for emails or copyrights. * This extends "skipping" the collection of required phrases to skip a rule from both required phrases collection for generating new rules AND injection of new required phrases in rule text. This allows handling exceptions more easily. * The "is_required_phrase" rules creation now creates rules using improved content: the case and punctuation of the phrase text are preserved; the rule is created as "is_license_reference" which is going to be correct in the vast majority of the cases. * When matched, the "is_required_phrase" rules are treated the same as continuous rules and can only be matched exactly. * The "is_required_phrase" rules are now validated extensively to ensure that there is no conflict with other rule flags. * The code to "trace" the source of a required_phrase injection now uses the new standard "source" rule field, and the code related to handling this field has been simplified. * Required phrases injection has not yet been tested as working. 
Signed-off-by: Philippe Ombredanne --- setup.cfg | 1 + src/licensedcode/match.py | 10 +- src/licensedcode/required_phrases.py | 1281 ++++++++++--------- tests/licensedcode/test_required_phrases.py | 220 ++-- 4 files changed, 807 insertions(+), 705 deletions(-) diff --git a/setup.cfg b/setup.cfg index 5aa773107b..1c8df20664 100644 --- a/setup.cfg +++ b/setup.cfg @@ -159,6 +159,7 @@ console_scripts = scancode-license-data = licensedcode.license_db:dump_scancode_license_data regen-package-docs = packagedcode.regen_package_docs:regen_package_docs add-required-phrases = licensedcode.required_phrases:add_required_phrases + gen-new-required-phrases-rules = licensedcode.required_phrases:gen_required_phrases_rules # These are configurations for ScanCode plugins as setuptools entry points. # Each plugin entry hast this form: diff --git a/src/licensedcode/match.py b/src/licensedcode/match.py index b27e7503c5..7866d56632 100644 --- a/src/licensedcode/match.py +++ b/src/licensedcode/match.py @@ -2129,12 +2129,14 @@ def filter_matches_missing_required_phrases( A required phrase must be matched exactly without gaps or unknown words. A rule with "is_continuous" set to True is the same as if its whole text - was defined as a keyphrase and is processed here too. + was defined as a required phrase and is processed here too. + Same for a rule with "is_required_phrase" set to True. 
+ """ - # never discard a solo match, unless matched to "is_continuous" rule + # never discard a solo match, unless matched to "is_continuous" or "is_required_phrase" rule if len(matches) == 1: rule = matches[0] - if not rule.is_continuous: + if not (rule.is_continuous or rule.is_required_phrase): return matches, [] kept = [] @@ -2149,7 +2151,7 @@ def filter_matches_missing_required_phrases( if trace: logger_debug(' CHECKING KEY PHRASES for:', match) - is_continuous = match.rule.is_continuous + is_continuous = match.rule.is_continuous or match.rule.is_required_phrase ikey_spans = match.rule.required_phrase_spans if not (ikey_spans or is_continuous): diff --git a/src/licensedcode/required_phrases.py b/src/licensedcode/required_phrases.py index ccc96f0ddf..32035bb060 100644 --- a/src/licensedcode/required_phrases.py +++ b/src/licensedcode/required_phrases.py @@ -8,183 +8,84 @@ # See https://aboutcode.org for more information about nexB OSS projects. # +import re + +from collections import defaultdict + import attr -import os import click +from commoncode.cliutils import PluggableCommandLineOption from license_expression import Licensing -from licensedcode import TINY_RULE -from commoncode.cliutils import PluggableCommandLineOption +from licensedcode.cache import build_index +from licensedcode.cache import get_index +from licensedcode.cache import get_licenses_db +from licensedcode.models import find_rule_base_location +from licensedcode.models import get_ignorables +from licensedcode.models import get_normalized_ignorables from licensedcode.models import get_rules_by_expression -from licensedcode.models import load_licenses from licensedcode.models import load_rules -from licensedcode.models import InvalidRule from licensedcode.models import rules_data_dir from licensedcode.models import Rule from licensedcode.models import rule_exists -from licensedcode.models import find_rule_base_location - +from licensedcode.models import update_ignorables from licensedcode.spans 
import Span -from licensedcode.tokenize import required_phrase_tokenizer -from licensedcode.tokenize import index_tokenizer -from licensedcode.tokenize import return_spans_for_required_phrase_in_text -from licensedcode.tokenize import get_ignorable_spans -from licensedcode.tokenize import get_non_overlapping_spans -from licensedcode.tokenize import add_required_phrase_markers -from licensedcode.tokenize import REQUIRED_PHRASE_OPEN +from licensedcode.stopwords import STOPWORDS from licensedcode.tokenize import REQUIRED_PHRASE_CLOSE -from licensedcode.tokenize import get_normalized_tokens - - -# Add the rule identifier here to trace required phrase collection or required -# phrase marking for a specific rule (Example: "mit_12.RULE") -TRACE_REQUIRED_PHRASE_FOR_RULES = [] - - -def get_required_phrase_spans(text): - """ - Return a list of Spans representin required phrase token positions in the text - for each required phrase found in the rule ``text``. +from licensedcode.tokenize import REQUIRED_PHRASE_OPEN +from licensedcode.tokenize import required_phrase_tokenizer +from licensedcode.tokenize import matched_query_text_tokenizer +from licensedcode.tokenize import get_existing_required_phrase_spans - For example: +""" +This is a utility module for "required phrases". +This is a designed to run as a command line tool with extensive debugging and tracing facilitues. - >>> text = 'This is enclosed in {{double curly braces}}' - >>> # 0 1 2 3 4 5 6 - >>> x = get_required_phrase_spans(text) - >>> assert x == [Span(4, 6)], x +Usage: - >>> text = 'This is {{enclosed}} a {{double curly braces}} or not' - >>> # 0 1 2 SW 3 4 5 6 7 - >>> x = get_required_phrase_spans(text) - >>> assert x == [Span(2), Span(3, 5)], x +- start with gen-new-required-phrases-rules: this will create new rules from existing "required +phrases" found in rules. 
- >>> text = 'This {{is}} enclosed a {{double curly braces}} or not' - >>> # 0 1 2 SW 3 4 5 6 7 - >>> x = get_required_phrase_spans(text) - >>> assert x == [Span([1]), Span([3, 4, 5])], x +- regen the index - >>> text = '{{AGPL-3.0 GNU Affero General Public License v3.0}}' - >>> # 0 1 2 3 4 5 6 7 8 9 - >>> x = get_required_phrase_spans(text) - >>> assert x == [Span(0, 9)], x +- then continue with add-required-phrases to update existing rules with required phrases found in +"is_required_phrase" rules and license attributes/fields. - >>> assert get_required_phrase_spans('{This}') == [] +""" - >>> def check_exception(text): - ... try: - ... return get_required_phrase_spans(text) - ... except InvalidRule: - ... pass +# Add rule identifiers here to trace required phrase collection or required +# phrase marking for a specific rule (Example: "mit_12.RULE") +TRACE_REQUIRED_PHRASE_FOR_RULES = [] - >>> check_exception('This {{is') - >>> check_exception('This }}is') - >>> check_exception('{{This }}is{{') - >>> check_exception('This }}is{{') - >>> check_exception('{{}}') - >>> check_exception('{{This is') - >>> check_exception('{{This is{{') - >>> check_exception('{{This is{{ }}') - >>> check_exception('{{{{This}}}}') - >>> check_exception('}}This {{is}}') - >>> check_exception('This }} {{is}}') - >>> check_exception('{{This}}') - [Span(0)] - >>> check_exception('{This}') - [] - >>> check_exception('{{{This}}}') - [Span(0)] - """ - return [ - required_phrase.span - for required_phrase in get_required_phrases(text) - ] +#################################################################################################### +# +# Shared utilities +# +#################################################################################################### -def get_required_phrase_texts(text): +def get_normalized_tokens(text, skip_required_phrase_markers=True, preserve_case=False): """ - Return a list of required phrase texts for each required phrase found - in the rule ``text``. 
- - For example: - - >>> text = 'This is enclosed in {{double curly braces}}' - >>> # 0 1 2 3 4 5 6 - >>> x = get_required_phrase_texts(text=text) - >>> assert x == ['double curly braces'], x + Return a list of normalized token strings in ``text``. """ - return [ - required_phrase.text - for required_phrase in get_required_phrases(text) - ] - - -@attr.s -class RequiredPhraseInText: - - required_phrase_positions = attr.ib( - default=attr.Factory(list), - repr=False, - metadata=dict(help='List of positions of a required phrase in a rule text.') - ) - - required_phrase_tokens = attr.ib( - default=attr.Factory(list), - metadata=dict(help='List of required phrase tokens for this rule.') - ) - - @property - def text(self): - """The full normalized text for this required phrase, built from its tokens.""" - return " ".join(self.required_phrase_tokens) - - @property - def span(self): - """A span representing the position of this required phrase in a rule text.""" - return Span(self.required_phrase_positions) - - def update(self, token, ipos): - self.required_phrase_tokens.append(token) - self.required_phrase_positions.append(ipos) - - -def get_required_phrases(text): - """ - Yield RequiredPhraseInText objects with both required phrase positions - and lists of tokens for each required phrase found in the rule ``text``. - Tokens form a required phrase when enclosed in {{double curly braces}}. 
- """ - ipos = 0 - in_required_phrase = False - required_phrase = RequiredPhraseInText() - for token in required_phrase_tokenizer(text): - if token == REQUIRED_PHRASE_OPEN: - if in_required_phrase: - raise InvalidRule('Invalid rule with nested required phrase {{ {{ braces', text) - in_required_phrase = True - - elif token == REQUIRED_PHRASE_CLOSE: - if in_required_phrase: - if required_phrase.required_phrase_tokens: - yield required_phrase - required_phrase = RequiredPhraseInText() - else: - raise InvalidRule('Invalid rule with empty required phrase {{}} braces', text) - in_required_phrase = False - else: - raise InvalidRule(f'Invalid rule with dangling required phrase missing closing braces', text) - continue - else: - if in_required_phrase: - required_phrase.update(token=token, ipos=ipos) - ipos += 1 - - if required_phrase.required_phrase_tokens or in_required_phrase: - raise InvalidRule(f'Invalid rule with dangling required phrase missing final closing braces', text) + required_phrase_markers = [REQUIRED_PHRASE_CLOSE, REQUIRED_PHRASE_OPEN] + tokens = list(required_phrase_tokenizer(text=text, preserve_case=preserve_case)) + if skip_required_phrase_markers: + tokens = [ + token + for token in tokens + if token not in required_phrase_markers + ] + return tokens def get_normalized_text(text, skip_required_phrase_markers=True): + """ + Return the normalized text for ``text``. Optionally ``skip_required_phrase_markers`` double + {{curly braces}}. 
+ """ return " ".join( get_normalized_tokens( text=text, @@ -193,414 +94,290 @@ def get_normalized_text(text, skip_required_phrase_markers=True): ) -def get_num_tokens(text): - return len(get_normalized_tokens(text)) - -def is_text_license_reference(text): - - tokens = list(index_tokenizer(text=text)) - words_license_reference = ['http', 'https', 'io', 'com', 'txt', 'md', 'file'] - if any( - True - for word in words_license_reference - if word in tokens +def find_phrase_spans_in_text(text, phrase_text, preserve_case=False): + """ + Return a list of Spans where the ``phrase_text`` exists in ``text``, or an empty list. + """ + spans_with_required_phrase = [] + + text_tokens = list(get_normalized_tokens( + text=text, + preserve_case=preserve_case, + skip_required_phrase_markers=True, + )) + required_phrase_tokens = list(get_normalized_tokens( + text=phrase_text, + preserve_case=preserve_case, + skip_required_phrase_markers=True, + )) + required_phrase_first_token = required_phrase_tokens[0] + + # Initial check to see if all tokens in the required phrase are present + if all( + required_phrase_token in text_tokens + for required_phrase_token in required_phrase_tokens ): - return True + start_positions = [ + i + for i, x in enumerate(text_tokens) + if x == required_phrase_first_token + ] - return False + for start_pos in start_positions: + end_pos = start_pos + len(required_phrase_tokens) + if ( + end_pos <= len(text_tokens) + and text_tokens[start_pos:end_pos] == required_phrase_tokens + ): + spans_with_required_phrase.append(Span(start_pos, end_pos - 1)) -@attr.s -class RequiredPhraseDetails: + return spans_with_required_phrase - license_expression = attr.ib( - default=None, - metadata=dict( - help='A license expression string for this particular required phrase.') - ) - rule = attr.ib( - default=None, - metadata=dict( - help='The Rule object for this particular required phrase rule.') - ) +def get_non_overlapping_spans(old_required_phrase_spans, 
new_required_phrase_spans): + """ + Given two list of spans `old_required_phrase_spans` and `new_required_phrase_spans`, + return all the spans in `new_required_phrase_spans` that do not overlap with any + of the spans in `old_required_phrase_spans`. - required_phrase_text = attr.ib( - default=None, - metadata=dict( - help='Normalized required phrase text.') - ) + The list of spans `old_required_phrase_spans` contains all the spans of required + phrases or ignorables already present in a rule text, and the other list of spans + `new_required_phrase_spans` contains the proposed new required phrases. + """ + for new_span in new_required_phrase_spans: + if old_required_phrase_spans: + if any(old_span.overlap(new_span) != 0 for old_span in old_required_phrase_spans): + continue - sources = attr.ib( - default=attr.Factory(list), - metadata=dict( - help='List of all rule identifiers where this required phrase is present.' - ) - ) + yield new_span - length = attr.ib( - default=0, - metadata=dict( - help='Length of text for this required phrase text (used to sort).' - ) - ) - # Generic licenses should not be dumped as required phrase rules - has_generic_license = attr.ib( - default=False, - metadata=dict( - help='Has a generic license key in its license expression' - ) - ) +def add_required_phrase_markers(text, required_phrase_span): + """ + Given a ``text`` and a ``required_phrase_span`` Span, add required phrase + curly brace markers to the ``text`` before the start and after the of the span. + This is taking care of whitespace and stopwords. 
+ """ + tokens_tuples_with_markers = [] + token_index = 0 - @classmethod - def create_required_phrase_details( - cls, - license_expression, - required_phrase_text, - sources, - length, - has_generic_license=False, - ): + for token_tuple in matched_query_text_tokenizer(text): - base_name = f"{license_expression}_required_phrase" - base_loc = find_rule_base_location(name_prefix=base_name) - file_path = f"{base_loc}.RULE" - identifier = file_path.split('/')[-1] + is_word, token = token_tuple - normalized_text = get_normalized_text(required_phrase_text) + if is_word and token.lower() not in STOPWORDS: + if token_index == required_phrase_span.start: + tokens_tuples_with_markers.append((False, REQUIRED_PHRASE_OPEN)) - rule = Rule( - license_expression=license_expression, - identifier=identifier, - text=normalized_text, - is_required_phrase=True, - ) - if is_text_license_reference(required_phrase_text): - rule.is_license_reference = True - else: - rule.is_license_tag = True + token_index += 1 - if not has_generic_license: - rule.dump(rules_data_dir) + tokens_tuples_with_markers.append(token_tuple) - return cls( - license_expression=license_expression, - rule=rule, - required_phrase_text=normalized_text, - sources=sources, - length=length, - has_generic_license=has_generic_license, - ) + if is_word and token.lower() not in STOPWORDS: + if token_index == required_phrase_span.end + 1: + tokens_tuples_with_markers.append((False, REQUIRED_PHRASE_CLOSE)) - def update_sources(self, source_identifier): - if not source_identifier in self.sources: - self.sources.append(source_identifier) + return combine_tokens(tokens_tuples_with_markers) -@attr.s -class ListOfRequiredPhrases: +def combine_tokens(token_tuples): + """ + Returns a string `combined_text` combining token tuples from the list `token_tuples`, + which are token tuples created by the tokenizer functions. 
+ """ + return ''.join(token for _, token in token_tuples) - required_phrases = attr.ib( - default=attr.Factory(list), - metadata=dict( - help='A list of RequiredPhraseDetails objects for all the required phrases.') - ) - def match_required_phrase_present(self, required_phrase_text): - """ - Check if a required_phrase_text is present in the list of required_phrases - or it is a rule in the index. - Note: Order is important, as the list of required_phrases has both new rules which are - not yet in the index and old rules also present in the index. - """ - normalized_text = get_normalized_text(required_phrase_text) +@attr.s +class IsRequiredPhrase: + """ + Represent a required phrase text and rule from an "is_required_phrase" Rule + """ - # check if this required_phrase_text is present in the collected list of required phrases - for required_phrase in self.required_phrases: - if required_phrase.required_phrase_text == normalized_text: - rule = required_phrase.rule - return rule + rule = attr.ib(metadata=dict(help='Rule that contains this phrase')) + required_phrase_text = attr.ib(metadata=dict(help='Normalized required phrase text.')) - # check if this required_phrase_text is present as a rule in the index - rule = rule_exists(text=required_phrase_text) - if rule: - return rule + @property + def license_expression(self): + self.rule.license_expression - def update_required_phrase_sources(self, rule, has_generic_license=False, different_license=False): + @staticmethod + def sorted(isrequiredphrases): """ - Given a rule update the required phrases list with this rule - - Note: this should only be called on a rule that is obtained from the - match_required_phrase_present function so that the rule is present in the - index/required phrases list. + Return an ``isrequiredphrases`` list of IsRequiredPhrase sorted by decreasing text length. 
""" - # if rule is present as a required phrase rule in the list then - # add identifier to sources of the required phrase rule - for required_phrase in self.required_phrases: - if required_phrase.rule.identifier == rule.identifier: - required_phrase.update_sources(rule.identifier) - return - - if rule and (rule.is_license_intro or rule.is_license_clue): - return - - # if rule is present as a rule in the index, set the is_required_phrase flag - # and add to the list of required phrase rules, if it is a non-generic license of - # the same license expression - if not rule.is_required_phrase and not has_generic_license and not different_license: - rule.is_required_phrase = True - rule.dump(rules_data_dir) + sorter = lambda p: (len(p.rule.text), p.required_phrase_text) + return sorted(isrequiredphrases, key=sorter, reverse=True) - normalized_text = get_normalized_text(rule.text) - required_phrase_detail = RequiredPhraseDetails( - license_expression=rule.license_expression, - rule=rule, - required_phrase_text=normalized_text, - sources=[rule.identifier], - length=len(normalized_text), - has_generic_license=has_generic_license, - ) - self.required_phrases.append(required_phrase_detail) - - def sort_required_phrases(self): - self.required_phrases = sorted( - self.required_phrases, - key=lambda x: x.length, - reverse=True, - ) - def add_variations_of_required_phrases(self, licenses_by_key): - - words_to_skip = ["the"] - for required_phrase in self.required_phrases: - required_phrase_tokens = list(index_tokenizer(text=required_phrase.required_phrase_text)) - skip_words_present = [ - skip_word - for skip_word in words_to_skip - if skip_word in required_phrase_tokens - ] - for skip_word in skip_words_present: - required_phrase_tokens.remove(skip_word) - required_phrase_without_skip_word = " ".join(required_phrase_tokens) - matched_rule = self.match_required_phrase_present(required_phrase_without_skip_word) - if matched_rule and matched_rule.skip_collecting_required_phrases: - 
continue - - has_generic_license = does_have_generic_licenses( - license_expression=required_phrase.license_expression, - licenses_by_key=licenses_by_key, - ) - if not matched_rule: - required_phrase_detail = RequiredPhraseDetails.create_required_phrase_details( - license_expression=required_phrase.license_expression, - required_phrase_text=required_phrase_without_skip_word, - sources=[required_phrase.rule.identifier], - length=len(required_phrase_without_skip_word), - has_generic_license=has_generic_license, - ) - self.required_phrases.append(required_phrase_detail) - else: - self.update_required_phrase_sources( - rule=matched_rule, - has_generic_license=has_generic_license, - ) - - -def does_have_generic_licenses(license_expression, licenses_by_key): - licensing = Licensing() - license_keys = licensing.license_keys(license_expression) - has_generic_license = False - for lic_key in license_keys: - lic = licenses_by_key.get(lic_key) - if lic and ( - lic.is_generic or lic.is_unknown - ): - has_generic_license = True - break - - return has_generic_license - - -def collect_required_phrases_in_rules( - rules_by_expression, - licenses_by_key, - license_expression=None, - verbose=False, -): - - # A mapping of {license_expression: ListOfRequiredPhrases} for all applicable - # license_expressions - required_phrases_by_expression = {} - - licensing = Licensing() +def collect_is_required_phrase_from_rules(rules_by_expression, verbose=False): + """ + Return a mapping of ``{license_expression: list of [IsRequiredPhrase, ...]`` collecting the + texts of all rules in the ``rules_by_expression`` mapping if the "is_required_phrase" is True.. 
+ """ + is_required_phrases_by_expression = {} - # collect and create required phrase rules for license_expression, rules in rules_by_expression.items(): - - license_keys = licensing.license_keys(license_expression) - if len(license_keys) != 1: - continue - if verbose: click.echo(f'Collecting required phrases for license_expression: {license_expression}') - required_phrases_list = ListOfRequiredPhrases() + is_required_phrases = [] for rule in rules: - if rule.skip_collecting_required_phrases: + if not rule.is_required_phrase: continue - if rule.is_license_intro or rule.is_license_clue: - continue - - for required_phrase_text in get_required_phrase_texts(rule.text): - if get_num_tokens(required_phrase_text) < 2: - if verbose: - click.echo(f'WARNING: single word required phrases in: {rule.identifier}, skipping.') - continue - - required_phrase_rule = required_phrases_list.match_required_phrase_present( - required_phrase_text=required_phrase_text, - ) - - debug = False - if rule.identifier in TRACE_REQUIRED_PHRASE_FOR_RULES: - debug = True - click.echo( - f"Collecting from rule: {rule.identifier} " - f"Required phrase: '{required_phrase_text}' " - f"Matched rule: {required_phrase_rule}" - ) - - if required_phrase_rule and required_phrase_rule.skip_collecting_required_phrases: - continue - - has_generic_license = does_have_generic_licenses( - license_expression=license_expression, - licenses_by_key=licenses_by_key, - ) - if required_phrase_rule: - different_license = required_phrase_rule.license_expression != license_expression - required_phrases_list.update_required_phrase_sources( - rule=required_phrase_rule, - has_generic_license=has_generic_license, - different_license=different_license, - ) - if debug: - click.echo(f"Old required phrase updated, same license expression") - - elif not is_text_license_reference(required_phrase_text): - required_phrase_detail = RequiredPhraseDetails.create_required_phrase_details( - license_expression=license_expression, - 
required_phrase_text=required_phrase_text, - sources=[rule.identifier], - length=len(required_phrase_text), - has_generic_license=has_generic_license, - ) - required_phrases_list.required_phrases.append(required_phrase_detail) - if debug: - click.echo(f"New required phrase : {required_phrase_detail} ") - elif debug: - is_reference = is_text_license_reference(required_phrase_text) - click.echo(f"is_text_license_reference: {is_reference} ") + if rule.identifier in TRACE_REQUIRED_PHRASE_FOR_RULES: + click.echo(f"Collecting required phrase from rule: {rule.identifier}: {rule.text!r}") - # Add add new variations of the required phrases already present in the list - required_phrases_list.add_variations_of_required_phrases(licenses_by_key) + is_required_phrases.append(IsRequiredPhrase(rule=rule, required_phrase_text=rule.text)) - # We need to sort required phrases by length so we look for and mark the longest possible - # required phrases before the shorter ones contained in the same (substrings) - required_phrases_list.sort_required_phrases() - required_phrases_by_expression[license_expression] = required_phrases_list + # We need to sort required phrases by decreasing length so we look for and mark the longest + # possible required phrases before the shorter ones contained in the same text + is_required_phrases = IsRequiredPhrase.sorted(is_required_phrases) + is_required_phrases_by_expression[license_expression] = is_required_phrases if verbose: - count = len(required_phrases_list.required_phrases) - texts_with_source = { - required_phrase.required_phrase_text: required_phrase.sources - for required_phrase in required_phrases_list.required_phrases - } + count = len(is_required_phrases) click.echo(f'Collected {count} required phrases for license_expression: {license_expression}') click.echo('Collected required phrases texts: ') - for text, sources in texts_with_source.items(): - click.echo(f'{text}: {sources}') + for rqph in is_required_phrases: + click.echo(f' 
{rqph.required_phrase_text!r}: {rqph.rule.identifier}') - return required_phrases_by_expression + return is_required_phrases_by_expression -def update_required_phrases_from_other_rules( +def update_required_phrases_in_rules( required_phrases_by_expression, rules_by_expression, - write_required_phrases=False, + write_phrase_source=False, verbose=False, + dry_run=False, ): - - # add required phrases to rules from other rules + """ + Update the text of rules in a ``rules_by_expression`` mapping with required phrases from the + ``required_phrases_by_expression`` mapping. + If ``write_phrase_source`` is True, include debug information in the saved rule source field. + """ for license_expression, rules in rules_by_expression.items(): - if not license_expression in required_phrases_by_expression: + if license_expression not in required_phrases_by_expression: continue if verbose: click.echo(f'marking required phrases in rule texts for license_expression: {license_expression}') - required_phrases_for_expression = required_phrases_by_expression.get(license_expression) - add_required_phrases_for_required_phrases( + required_phrases = required_phrases_by_expression.get(license_expression) + if not required_phrases: + continue + + add_required_phrases_to_rules_text( + required_phrases=required_phrases, rules=rules, - required_phrases=required_phrases_for_expression.required_phrases, - verbose=verbose, + write_phrase_source=write_phrase_source, + dry_run=dry_run, ) - if write_required_phrases: - for license_expression, required_phrases_list in required_phrases_by_expression.items(): - if verbose: - click.echo(f'Writing required phrases sources for license_expression: {license_expression}') - - for required_phrase_detail in required_phrases_list.required_phrases: - if ( - required_phrase_detail.sources and required_phrase_detail.rule.is_required_phrase - and not required_phrase_detail.has_generic_license - ): - required_phrase_detail.rule.dump( - rules_data_dir=rules_data_dir, - 
sources=required_phrase_detail.sources - ) - -def add_required_phrases_from_other_rules( - licenses_by_key, +def update_rules_using_is_required_phrases_rules( license_expression=None, - write_required_phrases=False, + write_phrase_source=False, verbose=False, - can_mark_required_phrase_test=False, + dry_run=False, ): + """ + Add required phrases to rules using is_required_phrase rules. + Optionally filter rules with ``license_expression``. + """ + rules_by_expression = get_base_rules_by_expression(license_expression=license_expression) - rules_by_expression = get_rules_by_expression() - if license_expression: - rules_by_expression = {license_expression: rules_by_expression[license_expression]} - else: - rules_by_expression = rules_by_expression - - required_phrases_by_expression = collect_required_phrases_in_rules( - license_expression=license_expression, + required_phrases_by_expression = collect_is_required_phrase_from_rules( rules_by_expression=rules_by_expression, verbose=verbose, - licenses_by_key=licenses_by_key, ) + if verbose: + click.echo(f"update_rules_using_is_required_phrases_rules: required_phrases_by_expression # {len(required_phrases_by_expression)}") + + rules_by_expression = get_updatable_rules_by_expression( + license_expression, + simple_expression=False, + ) + if verbose: + click.echo(f"update_rules_using_is_required_phrases_rules: rules_by_expression # {len(rules_by_expression)}") - update_required_phrases_from_other_rules( + update_required_phrases_in_rules( required_phrases_by_expression=required_phrases_by_expression, rules_by_expression=rules_by_expression, - write_required_phrases=write_required_phrases, + write_phrase_source=write_phrase_source, verbose=verbose, + dry_run=dry_run, ) -def add_required_phrases_for_required_phrases(required_phrases, rules, verbose=False): +def get_base_rules_by_expression(license_expression=None): + """ + Return a mapping of rules_by_expression, filtered for an optional ``license_expression``. 
+ """ + rules_by_expression = get_rules_by_expression() + if license_expression: + rules_by_expression = {license_expression: rules_by_expression[license_expression]} - for rule in rules: - # skip small or required phrase rules - if len(rule.text) < TINY_RULE or rule.is_required_phrase: - continue + return rules_by_expression + + +def get_updatable_rules_by_expression(license_expression=None, simple_expression=True): + """ + Return a mapping of rules_by_expression, filtered for an optional ``license_expression``. + The rules are suitable to receive required phrase updates + If simple_expression is True, only consider lincense rules with a single license key. + """ + rules_by_expression = get_base_rules_by_expression() + + index = get_index() + licensing = Licensing() + + updatable_rules_by_expression = {} + + # filter rules to keep only updatable rules + for expression, rules in rules_by_expression.items(): + if simple_expression: + license_keys = licensing.license_keys(license_expression) + if len(license_keys) != 1: + continue + + updatable_rules = [] + for rule in rules: + # skip required phrase, false positive, tiny and and more + if rule.is_required_phrase or not rule.is_approx_matchable: + continue + # skip rules that ask to be skipped + if rule.skip_for_required_phrase_generation: + continue + + # skip non-approx matchable, they will be matche exactly + if not index.is_rule_approx_matchable(rule): + continue + + updatable_rules.append(rule) + + if updatable_rules: + updatable_rules_by_expression[expression] = updatable_rules + + return updatable_rules_by_expression + + +def add_required_phrases_to_rules_text( + required_phrases, + rules, + write_phrase_source=False, + dry_run=False, +): + """ + Add the ``required_phrases`` list of IsRequiredPhrase to each rule in a ``rules`` list of + license Rule. 
+ """ + for rule in rules: for required_phrase in required_phrases: debug = False if rule.identifier in TRACE_REQUIRED_PHRASE_FOR_RULES: @@ -610,47 +387,96 @@ def add_required_phrases_for_required_phrases(required_phrases, rules, verbose=F ) debug = True + source = rule.source or "" + if write_phrase_source: + source += f" {required_phrase.rule.identifier}" + add_required_phrase_to_rule( rule=rule, required_phrase=required_phrase.required_phrase_text, - debug_data=required_phrase.sources, + source=source, debug=debug, + dry_run=dry_run, ) -def add_required_phrases_for_license_fields(licence_object, rules, verbose=False): +def add_license_attributes_as_required_phrases_to_rules_text( + license_object, + rules, + write_phrase_source=False, + dry_run=False, +): + """ + Add new required phrases to the ``rules`` list of Rule using the ``license_object`` License + fields for required phrases. + """ license_fields_mapping_by_order = { - "name": licence_object.name, - "short_name": licence_object.short_name, - #"key", - #"spdx_license_key" + "name": license_object.name, + "short_name": license_object.short_name, + # "key", + # "spdx_license_key", } for rule in rules: - # skip small rules - if len(rule.text) < TINY_RULE: - continue + for field_name, required_phrase_text in license_fields_mapping_by_order.values(): + debug = False + if rule.identifier in TRACE_REQUIRED_PHRASE_FOR_RULES: + click.echo( + f"Updating rule: {rule.identifier} " + f"with required phrase from license: {field_name!r}: {required_phrase_text!r}." 
+ ) + debug = True - for license_field_value in license_fields_mapping_by_order.values(): - add_required_phrase_to_rule(rule=rule, required_phrase=license_field_value) + source = rule.source or "" + if write_phrase_source: + source += f" {license_object.key}.LICENSE : {field_name}" + add_required_phrase_to_rule( + rule=rule, + required_phrase=required_phrase_text, + source=source, + debug=debug, + dry_run=dry_run, + ) + + +def get_ignorable_spans(rule): + """ + Return a list of ignorable Spans for the ``rule``. + Ignorable spans are for URLs and referenced filenames present in a rule text. These should not + be messed up with when injecting new required phrases in a rule text. + """ + ignorable_spans = [] + ignorables = rule.referenced_filenames + rule.ignorable_urls + for ignorable in ignorables: + ignorable_spans.extend( + find_phrase_spans_in_text( + text=rule.text, + required_phrase=ignorable, + preserve_case=True, + ) + ) -def add_required_phrase_to_rule(rule, required_phrase, debug_data=None, debug=False): + return ignorable_spans - # Reload from file as there could be changes from other license fields - rule_file = os.path.join(rules_data_dir, rule.identifier) - reloaded_rule = Rule.from_file(rule_file) - # we get spans for name/short_name if they exist - new_required_phrase_spans = return_spans_for_required_phrase_in_text( - text=reloaded_rule.text, +def add_required_phrase_to_rule(rule, required_phrase, source, debug=False, dry_run=False): + """ + Update and save the ``rule`` Rule tagging the text with the ``required_phrase`` text. Skip + updating and saving the rule to disk under some conditions, like if ignorables would be changed. + Return True if the rule was updated and False otherwise. 
+ """
+
+ # These are candidate spans for new required_phrases, if they exist
+ new_required_phrase_spans = find_phrase_spans_in_text(
+ text=rule.text,
 required_phrase=required_phrase,
 )
 # we get spans for already existing required phrases and ignorables
- ignorable_spans = get_ignorable_spans(reloaded_rule)
- old_required_phrase_spans = get_required_phrase_spans(reloaded_rule.text)
+ ignorable_spans = get_ignorable_spans(rule)
+ old_required_phrase_spans = get_existing_required_phrase_spans(rule.text)
 # we verify whether there are spans which overlap with the
 # already present required phrases or ignorables
@@ -669,120 +495,165 @@ def add_required_phrase_to_rule(rule, required_phrase, debug_data=None, debug=Fa
 ignorable_debug = rule.referenced_filenames + rule.ignorable_urls
 click.echo(f"debug ignorables: {ignorable_debug}")
- text_rule = reloaded_rule.text
- # we add required phrase markers for the non-overlapping spans
+ new_rule_text = rule.text
 for span_to_add in spans_to_add:
- text_rule = add_required_phrase_markers(
- text=text_rule,
+ new_rule_text = add_required_phrase_markers(
+ text=new_rule_text,
 required_phrase_span=span_to_add,
 )
 # write the rule on disk if there are any updates
- if text_rule != reloaded_rule.text:
+ if new_rule_text == rule.text:
+ return False
+
+ if has_ignorable_changes(rule=rule, updated_text=new_rule_text):
 if debug:
 click.echo(
- f"Updating rule: {reloaded_rule.identifier} "
+ f"NOT Updating rule: {rule.identifier} "
+ f"because IGNORABLES would change "
 f"with required phrase: {required_phrase} "
- f"debug data: {debug_data} /n"
 )
- reloaded_rule.text = text_rule
- reloaded_rule.dump(rules_data_dir)
+ return False
-def add_required_phrases_from_license_fields(
- licenses_by_key,
- license_expression=None,
- verbose=False,
- can_mark_required_phrase_test=False,
-):
+ rule.source = source or None
+ rule.text = new_rule_text
+ if not dry_run:
+ if debug:
+ click.echo(
+ f"UPDATE: Updating rule: {rule.identifier} "
+ f"with
required phrase: {required_phrase!r} " + f"source: {source!r}" + ) + rule.dump(rules_data_dir) + return True + + +def has_ignorable_changes(rule, updated_text): """ - For all rules with the `license_expression`, add required phrases from the - license fields. + Return True if there would be changes in the "ignorable_*" attributes of a ``rule`` Rule if its + text was to be updated with a new ``updated_text``. """ - rules_by_expression = get_rules_by_expression() + existing_ignorables = get_normalized_ignorables(rule) + updated_ignorables = get_ignorables(updated_text) + return existing_ignorables != updated_ignorables - if license_expression: - rules_by_expression_to_update = {license_expression: rules_by_expression[license_expression]} - else: - rules_by_expression_to_update = rules_by_expression - licensing = Licensing() +def update_rules_using_license_attributes( + license_expression=None, + write_phrase_source=False, + verbose=False, + dry_run=False, +): + """ + Add required phrases found in the license fields. - for license_expression, rules in rules_by_expression_to_update.items(): + Iterate rules by license key, collect required phrases from the license attributes like name and + short name. Add those as required phrases in all selected rules that are using the + ``license_expression``. 
+ """
+ rules_by_expression = get_updatable_rules_by_expression(license_expression, simple_expression=True)
- license_keys = licensing.license_keys(license_expression)
- if len(license_keys) != 1:
- continue
+ licenses_by_key = get_licenses_db()
- license_key = license_keys.pop()
+ # license expression is always a single key here
+ for license_key, rules in rules_by_expression.items():
 licence_object = licenses_by_key[license_key]
-
 if verbose:
 click.echo(f'Updating rules with required phrases for license_expression: {license_key}')
- add_required_phrases_for_license_fields(licence_object=licence_object, rules=rules, verbose=verbose)
+ add_license_attributes_as_required_phrases_to_rules_text(
+ license_object=licence_object,
+ rules=rules,
+ write_phrase_source=write_phrase_source,
+ dry_run=dry_run,
+ )
+
+####################################################################################################
+#
+# Inject new required phrase in rules
+#
+####################################################################################################
-def delete_required_phrase_rules_debug(rules_data_dir):
- required_phrase_rules = [
- rule
- for rule in load_rules(rules_data_dir=rules_data_dir)
- if rule.is_required_phrase
- ]
- for rule in required_phrase_rules:
- rule.dump(rules_data_dir)
+def delete_required_phrase_rules_source_debug(rules_data_dir):
+ """
+ Remove the "source" attribute from all rules.
+ """
+ for rule in load_rules(rules_data_dir=rules_data_dir):
+ if rule.source:
+ rule.source = None
+ rule.dump(rules_data_dir)
 @click.command(name='add-required-phrases')
+@click.option(
+ "-o",
+ "--from-other-rules",
+ is_flag=True,
+ default=False,
+ help="Propagate existing required phrases from other rules to all selected rules.
" + "Mutually exclusive with --from-license-attributes.", + cls=PluggableCommandLineOption, +) +@click.option( + "-a", + "--from-license-attributes", + is_flag=True, + default=False, + help="Propagate license attributes as required phrases to all selected rules. " + "Mutually exclusive with --from-other-rule.", + cls=PluggableCommandLineOption, +) @click.option( "-l", "--license-expression", type=str, default=None, metavar="STRING", - help="The license expression, for which the rules will be updated with required phrases. " - "Example STRING: `mit`. If this option is not used, add required_phrases for all rules.", + help="Optional license expression filter. If provided, only consider the rules that are using " + "this expression. Otherwise, process all rules. Example: `apache-2.0`.", cls=PluggableCommandLineOption, ) @click.option( - "-r", - "--reindex", + "--validate", is_flag=True, default=False, - help="Also reindex the license/rules to check for inconsistencies.", + help="Validate that all rules and licenses and rules are consistent, for all rule languages. " + "For this validation, run a mock indexing. The regenerated index is not saved to disk.", cls=PluggableCommandLineOption, ) @click.option( - "-w", - "--write-required-phrase-origins", + "-r", + "--reindex", is_flag=True, default=False, - help="Write into the rule file the sources for all required phrase rules. Deletes the temporary rule origins used to debug.", + help="Recreate and cache the licenses index with updated rules add the end.", cls=PluggableCommandLineOption, ) @click.option( - "-d", - "--delete-required-phrase-origins", + "-w", + "--write-phrase-source", is_flag=True, default=False, - help="Delete the sources for all required phrase rules and exit. 
This is a debug option.", + help="In modified rule files, write the source field to trace the source of required phrases " + "applied to that rule.", cls=PluggableCommandLineOption, ) @click.option( - "-o", - "--from-other-rules", + "-d", + "--delete-phrase-source", is_flag=True, default=False, - help="Propagate required phrases from already marked required phrases in other rules.", + help="In rule files, delete the source extra debug data used to trace source of phrases.", cls=PluggableCommandLineOption, ) @click.option( - "-a", - "--from-license-attributes", + "--dry-run", is_flag=True, default=False, - help="Mark required phrases from license attributes.", + help="Do not save rules.", cls=PluggableCommandLineOption, ) @click.option( @@ -790,50 +661,300 @@ def delete_required_phrase_rules_debug(rules_data_dir): "--verbose", is_flag=True, default=False, - help="Print logging information.", + help="Print verbose logging information.", cls=PluggableCommandLineOption, ) @click.help_option("-h", "--help") def add_required_phrases( - license_expression, - verbose, - reindex, from_other_rules, from_license_attributes, - delete_required_phrase_origins, - write_required_phrase_origins, + license_expression, + validate, + reindex, + delete_phrase_source, + write_phrase_source, + dry_run, + verbose, ): """ - For all rules with the `license_expression`, add required phrases from the - license fields. + Update license detection rules with new "required phrases" to improve rules detection accuracy. 
""" - licenses_by_key = load_licenses() - if delete_required_phrase_origins: - delete_required_phrase_rules_debug(rules_data_dir) + if delete_phrase_source: + click.echo('Deleting rules phrase source debug data.') + delete_required_phrase_rules_source_debug(rules_data_dir) return - # create a list of all required phrases from existing rules, add - # rule files for them and mark those required phrases if present in other rules - if from_other_rules: - add_required_phrases_from_other_rules( + elif from_other_rules: + click.echo('Updating rules from is_required_phrase rules.') + update_rules_using_is_required_phrases_rules( license_expression=license_expression, - write_required_phrases=write_required_phrase_origins, + write_phrase_source=write_phrase_source, + dry_run=dry_run, verbose=verbose, - licenses_by_key=licenses_by_key, ) - # marks required phrases in existing rules from license attributes like name, - # short name and optionally license keys - if from_license_attributes: - add_required_phrases_from_license_fields( + elif from_license_attributes: + click.echo('Updating rules from license attributes.') + update_rules_using_license_attributes( license_expression=license_expression, + write_phrase_source=write_phrase_source, + dry_run=dry_run, verbose=verbose, - licenses_by_key=licenses_by_key, ) + validate_and_reindex(validate, reindex, verbose) + + +def validate_and_reindex(validate, reindex, verbose): + if validate: + if verbose: + click.echo('Validate all rules and licenses for all languages...') + build_index(index_all_languages=True) + if reindex: - from licensedcode.cache import get_index if verbose: - click.echo('Rebuilding the license index...') + click.echo('Rebuilding and caching the license index...') get_index(force=True) + +#################################################################################################### +# +# Generate new required phrase rules from existing tagged required phrases +# 
+####################################################################################################
+
+
+@click.command(name='gen-new-required-phrases-rules')
+@click.option(
+ "-l",
+ "--license-expression",
+ type=str,
+ default=None,
+ metavar="STRING",
+ help="Optional license expression filter. If provided, only consider the rules that are using "
+ "this expression. Otherwise, process all rules. Example: `apache-2.0`.",
+ cls=PluggableCommandLineOption,
+)
+@click.option(
+ "-r",
+ "--reindex",
+ is_flag=True,
+ default=False,
+ help="Recreate and cache the licenses index with updated rules add the end.",
+ cls=PluggableCommandLineOption,
+)
+@click.option(
+ "--validate",
+ is_flag=True,
+ default=False,
+ help="Validate that all rules and licenses and rules are consistent, for all rule languages. "
+ "For this validation, run a mock indexing. The regenerated index is not saved to disk.",
+ cls=PluggableCommandLineOption,
+)
+@click.option(
+ "-v",
+ "--verbose",
+ is_flag=True,
+ default=False,
+ help="Print verbose logging information.",
+ cls=PluggableCommandLineOption,
+)
+@click.help_option("-h", "--help")
+def gen_required_phrases_rules(
+ license_expression,
+ validate,
+ reindex,
+ verbose,
+):
+ """
+ Create new license detection rules from "required phrases" in existing rules.
+ """
+ generate_new_required_phrase_rules(license_expression=license_expression, verbose=verbose)
+ validate_and_reindex(validate, reindex, verbose)
+
+
+def generate_new_required_phrase_rules(license_expression=None, verbose=False):
+ """
+ Create new rules created from collecting unique required phrases across all rules.
+
+ As a side effect, also update existing rules matched to a required phrase text with the
+ "is_required_phrase" flag.
+
+ Consider only rules with the optional ``license_expression`` if provided.
+ """ + if verbose: + lex = license_expression or "all" + click.echo(f'Collecting required phrases for {lex} license_expression.') + + index = get_index() + licenses_by_key = get_licenses_db() + + # track text -> expressions to keep only a text that uniquely identifies a single expression + phrases_by_normalized_phrase = defaultdict(list) + + for rule in index.rules_by_rid: + if rule.license_expression != license_expression: + continue + + if ( + rule.is_required_phrase + or rule.skip_for_required_phrase_generation + or rule.is_license_intro + or rule.is_license_clue + or rule.is_false_positive + or rule.is_generic(licenses_by_key) + ): + continue + + for required_phrase_text in get_required_phrase_verbatim(rule.text): + phrase = RequiredPhraseRuleCandidate.create(license_expression=license_expression, text=required_phrase_text) + if phrase.is_good(rule): + phrases_by_normalized_phrase[phrase.normalized_text].append(phrase) + + # Add new variations of the required phrases already present in the list + for variation in generate_required_phrase_variations(required_phrase_text): + phrase = RequiredPhraseRuleCandidate.create(license_expression=license_expression, text=variation) + if phrase.is_good(rule): + phrases_by_normalized_phrase[phrase.normalized_text].append(phrase) + + for phrases in phrases_by_normalized_phrase.values(): + # keep only phrases pointing used for the same expression + if len(set(p.license_expression for p in phrases)) == 1: + # keep the first one + phrase = phrases[0] + else: + continue + + # check if we already have a rule we can match for this required phrase tag if needed + matched_rule = rule_exists(text=phrase.raw_text) + if matched_rule: + if matched_rule.skip_for_required_phrase_generation: + if verbose: + click.echo( + f'WARNING: Skipping pre-existing required phrase rule ' + f'"skip_for_required_phrase_generation": {matched_rule.identifier}.' 
+ )
+ continue
+
+ modified = False
+
+ if not matched_rule.is_required_phrase:
+ matched_rule.is_required_phrase = True
+ modified = True
+
+ if matched_rule.text.strip() != phrase.raw_text:
+ matched_rule.text = phrase.raw_text
+ modified = True
+
+ if matched_rule.is_continuous:
+ matched_rule.is_continuous = False
+ modified = True
+
+ if modified:
+ matched_rule.dump(rules_data_dir)
+ if verbose:
+ click.echo(f'WARNING: Updating existing rule with is_required flag and more: {matched_rule.identifier}.')
+ else:
+ if verbose:
+ click.echo(f'WARNING: Skipping pre-existing required phrase rule: {matched_rule.identifier}.')
+
+ continue
+
+ # at last create a new rule
+ rule = phrase.create_rule()
+ if verbose:
+ click.echo(f'Creating required phrase new rule: {rule.identifier}.')
+
+
+@attr.s
+class RequiredPhraseRuleCandidate:
+ """
+ A candidate phrase object with its license expression, raw text and normalized text. Used when
+ generating new rules for required phrases.
+ """
+ license_expression = attr.ib(metadata=dict(help='A license expression string.'))
+ raw_text = attr.ib(metadata=dict(help='Raw, original required phrase text.'))
+ normalized_text = attr.ib(metadata=dict(help='Normalized required phrase text.'))
+
+ def is_good(self, rule):
+ """
+ Return True if this phrase is minimally suitable to use as a required phrase
+ """
+ # long enough
+ num_tokens = len(get_normalized_tokens(self.normalized_text))
+ if num_tokens <= 1:
+ return False
+
+ to_ignore = set()
+ # not a referenced filename
+ to_ignore.update(map(get_normalized_text, rule.referenced_filenames))
+ if self.normalized_text in to_ignore:
+ return False
+
+ return True
+
+ @classmethod
+ def create(cls, license_expression, text):
+ return cls(
+ license_expression=license_expression,
+ raw_text=text,
+ normalized_text=get_normalized_text(text),
+ )
+
+ def create_rule(self):
+ """
+ Create, save and return a new "required_phrase" Rule from this phrase.
+ """
+ base_name = f"{self.license_expression}_required_phrase"
+ base_loc = find_rule_base_location(name_prefix=base_name)
+ file_path = f"{base_loc}.RULE"
+ identifier = file_path.split('/')[-1]
+
+ rule = Rule(
+ license_expression=self.license_expression,
+ identifier=identifier,
+ text=self.raw_text,
+ is_required_phrase=True,
+ is_license_reference=True,
+ )
+ update_ignorables(licensish=rule)
+ rule.dump(rules_data_dir)
+ return rule
+
+
+_verbatim_required_phrase = r'{{([^}]+)}}'
+collect_verbatim_required_phrase = re.compile(_verbatim_required_phrase, re.UNICODE).findall
+
+
+def get_required_phrase_verbatim(text):
+ """
+ Yield required_phrase strings from a rule ``text`` excluding required phrases {{brace}} markers.
+
+ This tokenizer behaves the same as the ``index_tokenizer`` returning also
+ REQUIRED_PHRASE_OPEN and REQUIRED_PHRASE_CLOSE as separate tokens so that they can be
+ used to parse required phrases.
+
+ >>> x = list(get_required_phrase_verbatim('bar {{ AGPL-3.0 GNU Affero License v3.0 }} foo'))
+ >>> assert x == ['AGPL-3.0 GNU Affero License v3.0'], x
+
+ >>> x = list(get_required_phrase_verbatim(' + {{ ++ AGPL-3.0/}} and {{ GNU Affero License v3.0 }} '))
+ >>> assert x == ['++ AGPL-3.0/', 'GNU Affero License v3.0'], x
+ """
+ if not text:
+ return
+ for phrase in collect_verbatim_required_phrase(text):
+ phrase = phrase.strip()
+ if phrase:
+ yield phrase
+
+
+def generate_required_phrase_variations(text):
+ """
+ Yield strings that are useful variations of the ``text``, used to generate rule variants.
+ """ + words_to_skip = ["the"] + required_phrase_words = text.split() + for skip_word in words_to_skip: + variant = [w for w in required_phrase_words if w.lower() != skip_word] + yield " ".join(variant) + diff --git a/tests/licensedcode/test_required_phrases.py b/tests/licensedcode/test_required_phrases.py index 973294c69f..266ca8e2ff 100644 --- a/tests/licensedcode/test_required_phrases.py +++ b/tests/licensedcode/test_required_phrases.py @@ -7,108 +7,22 @@ # See https://aboutcode.org for more information about nexB OSS projects. # -import os from unittest import TestCase as TestCaseClass import pytest -from licensedcode.required_phrases import get_required_phrases -from licensedcode.required_phrases import get_required_phrase_spans -from licensedcode.required_phrases import get_required_phrase_texts -from licensedcode.required_phrases import add_required_phrases_from_other_rules -from licensedcode.required_phrases import add_required_phrases_from_license_fields -from licensedcode.required_phrases import ListOfRequiredPhrases -from licensedcode.required_phrases import RequiredPhraseDetails -from licensedcode.required_phrases import return_spans_for_required_phrase_in_text -from licensedcode.required_phrases import add_required_phrase_markers -from licensedcode.tokenize import get_normalized_tokens -from licensedcode.tokenize import matched_query_text_tokenizer -from licensedcode.stopwords import STOPWORDS from licensedcode.models import InvalidRule from licensedcode.models import Rule +from licensedcode.required_phrases import update_rules_using_is_required_phrases_rules +from licensedcode.required_phrases import update_rules_using_license_attributes +from licensedcode.required_phrases import IsRequiredPhrase +from licensedcode.required_phrases import add_required_phrase_markers from licensedcode.spans import Span +from licensedcode.required_phrases import find_phrase_spans_in_text +from licensedcode.tokenize import get_existing_required_phrase_spans -class 
TestGetKeyPhrases(TestCaseClass): - text = ( - 'This released software is {{released}} by under {{the MIT license}}. ' - 'Which is a license originating at Massachusetts Institute of Technology (MIT).' - ) - - def test_get_required_phrases_yields_spans(self): - required_phrase_spans = get_required_phrase_spans(self.text) - assert required_phrase_spans == [Span(4), Span(7, 9)] - - def test_get_required_phrases_yields_tokens(self): - required_phrase_tokens = [ - required_phrase.required_phrase_tokens - for required_phrase in get_required_phrases(text=self.text) - ] - assert required_phrase_tokens == [['released'], ['the', 'mit', 'license']] - - def test_get_required_phrase_texts(self): - required_phrase_texts = get_required_phrase_texts(text=self.text) - assert required_phrase_texts == ['released', 'the mit license'] - - def test_get_required_phrases_raises_exception_required_phrase_markup_is_not_closed(self): - text = 'This software is {{released by under the MIT license.' - try: - list(get_required_phrase_spans(text)) - raise Exception('Exception should be raised') - except InvalidRule: - pass - - def test_get_required_phrases_ignores_stopwords_in_positions(self): - text = 'The word comma is a stop word so comma does not increase the span position {{MIT license}}.' - required_phrase_spans = get_required_phrase_spans(text) - assert required_phrase_spans == [Span(11, 12)] - - def test_get_required_phrases_yields_spans_without_stop_words(self): - text = 'This released software is {{released span}} by under {{the MIT quot license}}.' - required_phrase_spans = get_required_phrase_spans(text) - assert required_phrase_spans == [Span(4), Span(7, 9)] - - def test_get_required_phrases_does_not_yield_empty_spans(self): - text = 'This released software {{comma}} is {{}} by under {{the MIT license}}.' 
- try: - list(get_required_phrase_spans(text)) - raise Exception('Exception should be raised') - except InvalidRule: - pass - - def test_get_required_phrases_only_considers_outer_required_phrase_markup(self): - text = 'This released {{{software under the MIT}}} license.' - required_phrase_spans = get_required_phrase_spans(text) - assert required_phrase_spans == [Span(2, 5)] - - def test_get_required_phrases_ignores_nested_required_phrase_markup(self): - text = 'This released {{software {{under the}} MIT}} license.' - try: - list(get_required_phrase_spans(text)) - raise Exception('Exception should be raised') - except InvalidRule: - pass - - def test_get_required_phrase_texts_with_markup(self): - text = ( - "Lua is free software distributed under the terms of the" - "{{MIT license}}" - "reproduced below;" - ) - required_phrase_texts = get_required_phrase_texts(text=text) - assert required_phrase_texts == ['mit license'] - - def test_get_required_phrase_spans_with_markup(self): - text = ( - "Lua is free software distributed under the terms of the" - "{{MIT license}}" - "reproduced below;" - ) - required_phrase_spans = get_required_phrase_spans(text=text) - assert required_phrase_spans == [Span(18, 19)] - - -class TestListOfRequiredPhrases(TestCaseClass): +class TestIsRequiredPhraseCanSort(TestCaseClass): required_phrase_texts = [ "mit", @@ -117,40 +31,35 @@ class TestListOfRequiredPhrases(TestCaseClass): "licenses: mit", "MIT license", ] - required_phrases = [ - RequiredPhraseDetails( + is_required_phrases = [ + IsRequiredPhrase( required_phrase_text=text, - license_expression="mit", - length=len(text), rule=Rule( license_expression="mit", identifier="mit_231.RULE", text=text, is_required_phrase=True, is_license_tag=True, - ), - sources=["mit_231.RULE"], + ) ) for text in required_phrase_texts ] - required_phrases_list = ListOfRequiredPhrases(required_phrases=required_phrases) - def test_sort_required_phrases_works(self): - 
self.required_phrases_list.sort_required_phrases() - expected_sorted_texts = [ + def test_sort_is_required_phrases_works(self): + srps = IsRequiredPhrase.sorted(self.is_required_phrases) + results = [srp.required_phrase_text for srp in srps] + + expected = [ "MIT License with Disclaimer", "the MIT License", "licenses: mit", "MIT license", "mit", ] - assert [ - required_phrase.required_phrase_text - for required_phrase in self.required_phrases_list.required_phrases - ] == expected_sorted_texts + assert results == expected -class TestRequiredPhraseSpansinText: +class TestFindPhraseInText: text_with_stopwords = ( "A copy of the GNU General Public License is available as " @@ -166,26 +75,27 @@ class TestRequiredPhraseSpansinText: "/{{usr/share/common-licenses/GPL-2}} in the Debian GNU/Linux distribution." ) - def test_get_required_phrase_spans_with_or_without_specified_texts_is_same(self): - required_phrase_spans_specified = return_spans_for_required_phrase_in_text( + def test_find_phrase_spans_in_text_with_behaves_same_as_get_existing_required_phrase_spans(self): + spans_with_phrase = find_phrase_spans_in_text( text=self.text_with_stopwords, - required_phrase="usr share common licenses gpl 2", + phrase_text="usr share common licenses gpl 2", ) - required_phrase_spans_unspecified = get_required_phrase_spans( + spans_with_find = get_existing_required_phrase_spans( text=self.text_with_stopwords_and_marked_required_phrases, ) - assert required_phrase_spans_specified == required_phrase_spans_unspecified - def test_get_required_phrase_and_add_required_phrase_matches(self): + assert spans_with_phrase == spans_with_find + + def test_find_phrase_spans_in_text_and_add_required_phrase_matches(self): - required_phrase_spans_specified = return_spans_for_required_phrase_in_text( + spans = find_phrase_spans_in_text( text=self.text_with_stopwords, - required_phrase="usr share common licenses gpl 2", + phrase_text="usr share common licenses gpl 2", ) text = self.text_with_stopwords - 
for span in required_phrase_spans_specified: + for span in spans: text = add_required_phrase_markers( text=text, required_phrase_span=span, @@ -193,12 +103,80 @@ def test_get_required_phrase_and_add_required_phrase_matches(self): assert text == self.text_with_stopwords_and_marked_required_phrases + +class TestFindSpansInText: + + text_with_articles = ( + "A copy of the GNU General Public License is available as " + "/usr/share/common-licenses/GPL-2 in the Debian GNU/Linux distribution. " + "A copy of the GNU General Public License is available as " + "/usr/share/common-licenses/GPL-2 in the Debian GNU/Linux distribution." + ) + + text_with_articles_and_marked_required_phrases = ( + "A copy of the GNU General Public License is available as " + "/{{usr/share/common-licenses/GPL-2}} in the Debian GNU/Linux distribution. " + "A copy of the GNU General Public License is available as " + "/{{usr/share/common-licenses/GPL-2}} in the Debian GNU/Linux distribution." + ) + + text_with_extra_characters = ( + "This is the http://www.opensource.org/licenses/mit-license.php MIT " + "Software License which is OSI-certified, and GPL-compatible." + ) + + text_with_extra_characters_and_marked_required_phrases = ( + "This is the http://www.opensource.org/licenses/mit-license.php {{MIT " + "Software License}} which is OSI-certified, and GPL-compatible." + ) + + def test_find_phrase_spans_in_text(self): + text = "is released under the MIT license. 
See the LICENSE" + spans = find_phrase_spans_in_text(text=text, phrase_text="mit license") + assert spans == [Span(4, 5)] + + def test_find_phrase_spans_in_text_multiple(self): + spans = find_phrase_spans_in_text( + text=self.text_with_articles, + phrase_text="usr share common licenses gpl 2", + ) + assert spans == [Span(10, 15), Span(32, 37)] + + def test_find_phrase_spans_in_text_then_add_with_multiple_spans(self): + spans = find_phrase_spans_in_text( + text=self.text_with_articles, + phrase_text="usr share common licenses gpl 2", + ) + text = self.text_with_articles + for span in spans: + text = add_required_phrase_markers( + text=text, + required_phrase_span=span, + ) + + assert text == self.text_with_articles_and_marked_required_phrases + + def test_add_required_phrase_markers_in_text_with_extra_characters(self): + spans = find_phrase_spans_in_text( + text=self.text_with_extra_characters, + phrase_text="mit software license", + ) + text = self.text_with_extra_characters + for span in spans: + text = add_required_phrase_markers( + text=text, + required_phrase_span=span, + ) + + assert text == self.text_with_extra_characters_and_marked_required_phrases + + class TestKeyPhrasesCanBeMarked(TestCaseClass): @pytest.mark.scanslow - def can_more_key_phrases_be_marked_from_other_rules(self): - add_required_phrases_from_other_rules(can_mark_required_phrase_test=True) + def test_update_rules_using_is_required_phrases_rules(self): + update_rules_using_is_required_phrases_rules(verbose=True, _dry_run=True) @pytest.mark.scanslow - def can_more_key_phrases_be_marked_from_license_attribtues(self): - add_required_phrases_from_license_fields(can_mark_required_phrase_test=True) + def test_update_rules_using_license_attributes(self): + update_rules_using_license_attributes(verbose=True, _dry_run=True)