Add script for adding required phrases automatically

Add a script that automatically adds required phrases to existing rules, using the required phrases already present in other rules and the license field names. This can be done one license expression at a time. Signed-off-by: Ayan Sinha Mahapatra <[email protected]>
aboutcode-org · Sep 2, 2024 · e3b7f3b · e3b7f3b
1 parent 745ba9c
commit e3b7f3b
Show file tree

Hide file tree

Showing 34 changed files with 1,599 additions and 330 deletions.
diff --git a/docs/source/how-to-guides/add_new_license_detection_rule.rst b/docs/source/how-to-guides/add_new_license_detection_rule.rst
@@ -73,7 +73,7 @@ More (advanced) rules options:
   be present in the result license detections. These just have the license text and a
   `is_false_positive` flag set to True.
 
-- you can specify key phrases by surrounding one or more words between the `{{`
+- you can specify required phrases by surrounding one or more words between the `{{`
   and `}}` tags. Required phrases are words that **must** be matched/present in order
   for a RULE to be considered a match.
 

diff --git a/etc/scripts/licenses/buildrules.py b/etc/scripts/licenses/buildrules.py
@@ -16,6 +16,7 @@
 from licensedcode import models
 from licensedcode import match_hash
 from licensedcode import frontmatter
+from licensedcode.models import get_rule_id_for_text
 from license_expression import Licensing
 
 """
@@ -129,23 +130,6 @@ def load_data(location="00-new-licenses.txt"):
     return rules
 
 
-def rule_exists(text):
-    """
-    Return the matched rule identifier if the text is an existing rule matched
-    exactly, False otherwise.
-    """
-    idx = cache.get_index()
-
-    matches = idx.match(query_string=text)
-    if not matches:
-        return False
-    if len(matches) > 1:
-        return False
-    match = matches[0]
-    if match.matcher == match_hash.MATCH_HASH and match.score() == 100:
-        return match.rule.identifier
-
-
 def all_rule_by_tokens():
     """
     Return a mapping of {tuples of tokens: rule id}, with one item for each
@@ -346,7 +330,7 @@ def cli(licenses_file, dump_to_file_on_errors=False):
 
         text = rule.text
 
-        existing_rule = rule_exists(text)
+        existing_rule = get_rule_id_for_text(text)
         skinny_text = " ".join(text[:80].split()).replace("{", " ").replace("}", " ")
 
         existing_msg = (

diff --git a/etc/scripts/licenses/report_license_rules.py b/etc/scripts/licenses/report_license_rules.py
@@ -62,6 +62,8 @@
     "is_license_reference",
     "is_license_intro",
     "is_license_clue",
+    "is_required_phrase",
+    "skip_collecting_required_phrases",
     "is_deprecated",
     "has_unknown",
     "only_known_words",

diff --git a/setup-mini.cfg b/setup-mini.cfg
@@ -158,6 +158,7 @@ console_scripts =
     scancode-reindex-licenses = licensedcode.reindex:reindex_licenses
     scancode-license-data = licensedcode.license_db:dump_scancode_license_data
     regen-package-docs = packagedcode.regen_package_docs:regen_package_docs
+    add-required-phrases = licensedcode.required_phrases:add_required_phrases
 
 # These are configurations for ScanCode plugins as setuptools entry points.
 # Each plugin entry has this form:

diff --git a/setup.cfg b/setup.cfg
@@ -158,6 +158,7 @@ console_scripts =
     scancode-reindex-licenses = licensedcode.reindex:reindex_licenses
     scancode-license-data = licensedcode.license_db:dump_scancode_license_data
     regen-package-docs = packagedcode.regen_package_docs:regen_package_docs
+    add-required-phrases = licensedcode.required_phrases:add_required_phrases
 
 # These are configurations for ScanCode plugins as setuptools entry points.
 # Each plugin entry has this form:

diff --git a/src/licensedcode/data/rules/cclrc_1.RULE b/src/licensedcode/data/rules/cclrc_1.RULE
@@ -0,0 +1,10 @@
+---
+license_expression: cclrc
+is_license_notice: yes
+referenced_filenames:
+    - External_License/CCLRC_CDAT_License.txt
+---
+
+*    This software may be distributed under the terms of the
+ *    {{CCLRC Licence}} for CCLRC Software
+ *    <CDATDIR>/External_License/CCLRC_CDAT_License.txt
diff --git a/src/licensedcode/data/rules/cclrc_2.RULE b/src/licensedcode/data/rules/cclrc_2.RULE
@@ -0,0 +1,7 @@
+---
+license_expression: cclrc
+is_license_notice: yes
+---
+
+*    This software may be distributed under the terms of the
+ *    {{CCLRC Licence}} for CCLRC Software
diff --git a/src/licensedcode/data/rules/mit_1155.RULE b/src/licensedcode/data/rules/mit_1155.RULE
@@ -2,6 +2,7 @@
 license_expression: mit
 is_license_notice: yes
 relevance: 100
+skip_collecting_required_phrases: yes
 ---
 
 For license terms see {{SLF4J}}
diff --git a/src/licensedcode/data/rules/mit_1302.RULE b/src/licensedcode/data/rules/mit_1302.RULE
@@ -5,4 +5,4 @@ referenced_filenames:
     - LICENSE
 ---
 
-is free software: you can redistribute it and/or modify it under the terms of {{the MIT License}} as published by the Open Source Initiative. See the {{ LICENSE file }} for more details.
+is free software: you can redistribute it and/or modify it under the terms of {{the MIT License}} as published by the Open Source Initiative. See the LICENSE file for more details.
diff --git a/src/licensedcode/data/rules/mit_397.RULE b/src/licensedcode/data/rules/mit_397.RULE
@@ -5,8 +5,8 @@ relevance: 100
 referenced_filenames:
     - COPYING
 ignorable_urls:
-    - http://www.opensource.org/licenses/mit-license.php
+    - https://www.opensource.org/licenses/mit-license.php
 ---
 
 // Distributed under the MIT software license, see the accompanying
-// file COPYING or shttp://www.opensource.org/licenses/mit-license.php.
+// file COPYING or https://www.opensource.org/licenses/mit-license.php.
diff --git a/src/licensedcode/data/rules/mit_9.RULE b/src/licensedcode/data/rules/mit_9.RULE
@@ -6,4 +6,4 @@ ignorable_urls:
     - http://en.wikipedia.org/wiki/MIT_License
 ---
 
-Licensed under the MIT (http://en.wikipedia.org/wiki/MIT_License) license.
+{{Licensed under the MIT}} (http://en.wikipedia.org/wiki/MIT_License) license.
diff --git a/src/licensedcode/match.py b/src/licensedcode/match.py
@@ -55,7 +55,7 @@
 TRACE_FILTER_BELOW_MIN_SCORE = False
 TRACE_FILTER_SINGLE_WORD_GIBBERISH = False
 TRACE_SET_LINES = False
-TRACE_KEY_PHRASES = False
+TRACE_REQUIRED_PHRASES = False
 TRACE_REGIONS = False
 TRACE_FILTER_LICENSE_LIST = False
 TRACE_FILTER_LICENSE_LIST_DETAILED = False
@@ -91,7 +91,7 @@ def logger_debug(*args): pass
     or TRACE_MATCHED_TEXT_DETAILS
     or TRACE_HIGHLIGHTED_TEXT
     or TRACE_FILTER_SINGLE_WORD_GIBBERISH
-    or TRACE_KEY_PHRASES
+    or TRACE_REQUIRED_PHRASES
     or TRACE_REGIONS
     or TRACE_FILTER_LICENSE_LIST
     or TRACE_FILTER_LICENSE_LIST_DETAILED
@@ -133,7 +133,7 @@ def _debug_print_matched_query_text(match, extras=5):
 
 class DiscardReason(IntEnum):
     NOT_DISCARDED = 0
-    MISSING_KEY_PHRASES = 1
+    MISSING_REQUIRED_PHRASES = 1
     BELOW_MIN_COVERAGE = 2
     SPURIOUS_SINGLE_TOKEN = 3
     TOO_SHORT = 4
@@ -634,15 +634,15 @@ def combine(self, other):
             discard_reason = DiscardReason.NOT_DISCARDED
 
         elif (
-            self.discard_reason == DiscardReason.MISSING_KEY_PHRASES
-            and other.discard_reason == DiscardReason.MISSING_KEY_PHRASES
+            self.discard_reason == DiscardReason.MISSING_REQUIRED_PHRASES
+            and other.discard_reason == DiscardReason.MISSING_REQUIRED_PHRASES
         ):
-            discard_reason = DiscardReason.MISSING_KEY_PHRASES
+            discard_reason = DiscardReason.MISSING_REQUIRED_PHRASES
 
-        elif self.discard_reason == DiscardReason.MISSING_KEY_PHRASES:
+        elif self.discard_reason == DiscardReason.MISSING_REQUIRED_PHRASES:
             discard_reason = other.discard_reason
 
-        elif other.discard_reason == DiscardReason.MISSING_KEY_PHRASES:
+        elif other.discard_reason == DiscardReason.MISSING_REQUIRED_PHRASES:
             discard_reason = self.discard_reason
 
         else:
@@ -2116,17 +2116,17 @@ def filter_false_positive_matches(
     return kept, discarded
 
 
-def filter_matches_missing_key_phrases(
+def filter_matches_missing_required_phrases(
     matches,
-    trace=TRACE_KEY_PHRASES,
-    reason=DiscardReason.MISSING_KEY_PHRASES,
+    trace=TRACE_REQUIRED_PHRASES,
+    reason=DiscardReason.MISSING_REQUIRED_PHRASES,
 ):
     """
     Return a filtered list of kept LicenseMatch matches and a list of
     discardable matches  given a ``matches`` list of LicenseMatch by removing
-    all ``matches`` that do not contain all key phrases defined in their matched
+    all ``matches`` that do not contain all required phrases defined in their matched
     rule.
-    A key phrase must be matched exactly without gaps or unknown words.
+    A required phrase must be matched exactly without gaps or unknown words.
 
     A rule with "is_continuous" set to True is the same as if its whole text
     was defined as a required phrase and is processed here too.
@@ -2143,14 +2143,14 @@ def filter_matches_missing_key_phrases(
     discarded_append = discarded.append
 
     if trace:
-        logger_debug('filter_matches_missing_key_phrases')
+        logger_debug('filter_matches_missing_required_phrases')
 
     for match in matches:
         if trace:
             logger_debug('  CHECKING KEY PHRASES for:', match)
 
         is_continuous = match.rule.is_continuous
-        ikey_spans = match.rule.key_phrase_spans
+        ikey_spans = match.rule.required_phrase_spans
 
         if not (ikey_spans or is_continuous):
             kept_append(match)
@@ -2180,11 +2180,11 @@ def filter_matches_missing_key_phrases(
             # use whole ispan in this case
             ikey_spans = [match.ispan]
 
-        # keep matches as candidate if they contain all key phrase positions in the ispan
+        # keep matches as candidate if they contain all required phrase positions in the ispan
         if trace:
             print('    CANDIDATE TO KEEP: all ikey_span in match.ispan:', ikey_spans, ispan)
 
-        # discard matches that contain key phrases, but interrupted by
+        # discard matches that contain required phrases, but interrupted by
         # unknown or stop words.
 
         unknown_by_pos = match.query.unknowns_by_pos
@@ -2195,7 +2195,7 @@ def filter_matches_missing_key_phrases(
         istopwords_by_pos = match.rule.stopwords_by_pos
         istopwords_by_pos_get = istopwords_by_pos.get
 
-        # iterate on each key phrase span to ensure that they are continuous
+        # iterate on each required phrase span to ensure that they are continuous
         # and contain no unknown words on the query side
 
         is_valid = True
@@ -2204,7 +2204,7 @@ def filter_matches_missing_key_phrases(
 
         for ikey_span in ikey_spans:
 
-            # check that are no gaps in the key phrase span on the query side
+            # check that there are no gaps in the required phrase span on the query side
             # BUT, do not redo the check for is_continuous already checked above
             if is_continuous:
                 qkey_span = qspan
@@ -2225,13 +2225,13 @@ def filter_matches_missing_key_phrases(
                     is_valid = False
                     break
 
-            # check that key phrase spans does not contain stop words and does
+            # check that required phrase spans do not contain stop words and do
             # not contain unknown words
 
-            # NOTE: we do not check the last qkey_span position of a key phrase
+            # NOTE: we do not check the last qkey_span position of a required phrase
             # since unknown is a number of words after a given span position:
             # these are pinned to the last position and we would not care for
-            # what unknown or stop words show up after a key phrase ends.
+            # what unknown or stop words show up after a required phrase ends.
 
             qkey_span_end = qkey_span.end
             contains_unknown = any(
@@ -2694,7 +2694,7 @@ def _log(_matches, _discarded, msg):
     # FIXME: we should have only a single loop on all the matches at once!!
     # and not 10's of loops!!!
 
-    matches, discarded = filter_matches_missing_key_phrases(matches)
+    matches, discarded = filter_matches_missing_required_phrases(matches)
     all_discarded_extend(discarded)
     _log(matches, discarded, 'HAS KEY PHRASES')