diff --git a/setup.cfg b/setup.cfg
index 5aa773107b..1c8df20664 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -159,6 +159,7 @@ console_scripts =
scancode-license-data = licensedcode.license_db:dump_scancode_license_data
regen-package-docs = packagedcode.regen_package_docs:regen_package_docs
add-required-phrases = licensedcode.required_phrases:add_required_phrases
+ gen-new-required-phrases-rules = licensedcode.required_phrases:gen_required_phrases_rules
# These are configurations for ScanCode plugins as setuptools entry points.
# Each plugin entry hast this form:
diff --git a/src/licensedcode/match.py b/src/licensedcode/match.py
index b27e7503c5..7866d56632 100644
--- a/src/licensedcode/match.py
+++ b/src/licensedcode/match.py
@@ -2129,12 +2129,14 @@ def filter_matches_missing_required_phrases(
A required phrase must be matched exactly without gaps or unknown words.
A rule with "is_continuous" set to True is the same as if its whole text
- was defined as a keyphrase and is processed here too.
+ was defined as a required phrase and is processed here too.
+ Same for a rule with "is_required_phrase" set to True.
+
"""
- # never discard a solo match, unless matched to "is_continuous" rule
+ # never discard a solo match, unless matched to "is_continuous" or "is_required_phrase" rule
if len(matches) == 1:
rule = matches[0]
- if not rule.is_continuous:
+ if not (rule.is_continuous or rule.is_required_phrase):
return matches, []
kept = []
@@ -2149,7 +2151,7 @@ def filter_matches_missing_required_phrases(
if trace:
logger_debug(' CHECKING KEY PHRASES for:', match)
- is_continuous = match.rule.is_continuous
+ is_continuous = match.rule.is_continuous or match.rule.is_required_phrase
ikey_spans = match.rule.required_phrase_spans
if not (ikey_spans or is_continuous):
diff --git a/src/licensedcode/required_phrases.py b/src/licensedcode/required_phrases.py
index ccc96f0ddf..32035bb060 100644
--- a/src/licensedcode/required_phrases.py
+++ b/src/licensedcode/required_phrases.py
@@ -8,183 +8,84 @@
# See https://aboutcode.org for more information about nexB OSS projects.
#
+import re
+
+from collections import defaultdict
+
import attr
-import os
import click
+from commoncode.cliutils import PluggableCommandLineOption
from license_expression import Licensing
-from licensedcode import TINY_RULE
-from commoncode.cliutils import PluggableCommandLineOption
+from licensedcode.cache import build_index
+from licensedcode.cache import get_index
+from licensedcode.cache import get_licenses_db
+from licensedcode.models import find_rule_base_location
+from licensedcode.models import get_ignorables
+from licensedcode.models import get_normalized_ignorables
from licensedcode.models import get_rules_by_expression
-from licensedcode.models import load_licenses
from licensedcode.models import load_rules
-from licensedcode.models import InvalidRule
from licensedcode.models import rules_data_dir
from licensedcode.models import Rule
from licensedcode.models import rule_exists
-from licensedcode.models import find_rule_base_location
-
+from licensedcode.models import update_ignorables
from licensedcode.spans import Span
-from licensedcode.tokenize import required_phrase_tokenizer
-from licensedcode.tokenize import index_tokenizer
-from licensedcode.tokenize import return_spans_for_required_phrase_in_text
-from licensedcode.tokenize import get_ignorable_spans
-from licensedcode.tokenize import get_non_overlapping_spans
-from licensedcode.tokenize import add_required_phrase_markers
-from licensedcode.tokenize import REQUIRED_PHRASE_OPEN
+from licensedcode.stopwords import STOPWORDS
from licensedcode.tokenize import REQUIRED_PHRASE_CLOSE
-from licensedcode.tokenize import get_normalized_tokens
-
-
-# Add the rule identifier here to trace required phrase collection or required
-# phrase marking for a specific rule (Example: "mit_12.RULE")
-TRACE_REQUIRED_PHRASE_FOR_RULES = []
-
-
-def get_required_phrase_spans(text):
- """
- Return a list of Spans representin required phrase token positions in the text
- for each required phrase found in the rule ``text``.
+from licensedcode.tokenize import REQUIRED_PHRASE_OPEN
+from licensedcode.tokenize import required_phrase_tokenizer
+from licensedcode.tokenize import matched_query_text_tokenizer
+from licensedcode.tokenize import get_existing_required_phrase_spans
- For example:
+"""
+This is a utility module for "required phrases".
+This is designed to run as a command line tool with extensive debugging and tracing facilities.
- >>> text = 'This is enclosed in {{double curly braces}}'
- >>> # 0 1 2 3 4 5 6
- >>> x = get_required_phrase_spans(text)
- >>> assert x == [Span(4, 6)], x
+Usage:
- >>> text = 'This is {{enclosed}} a {{double curly braces}} or not'
- >>> # 0 1 2 SW 3 4 5 6 7
- >>> x = get_required_phrase_spans(text)
- >>> assert x == [Span(2), Span(3, 5)], x
+- start with gen-new-required-phrases-rules: this will create new rules from existing "required
+phrases" found in rules.
- >>> text = 'This {{is}} enclosed a {{double curly braces}} or not'
- >>> # 0 1 2 SW 3 4 5 6 7
- >>> x = get_required_phrase_spans(text)
- >>> assert x == [Span([1]), Span([3, 4, 5])], x
+- regen the index
- >>> text = '{{AGPL-3.0 GNU Affero General Public License v3.0}}'
- >>> # 0 1 2 3 4 5 6 7 8 9
- >>> x = get_required_phrase_spans(text)
- >>> assert x == [Span(0, 9)], x
+- then continue with add-required-phrases to update existing rules with required phrases found in
+"is_required_phrase" rules and license attributes/fields.
- >>> assert get_required_phrase_spans('{This}') == []
+"""
- >>> def check_exception(text):
- ... try:
- ... return get_required_phrase_spans(text)
- ... except InvalidRule:
- ... pass
+# Add rule identifiers here to trace required phrase collection or required
+# phrase marking for a specific rule (Example: "mit_12.RULE")
+TRACE_REQUIRED_PHRASE_FOR_RULES = []
- >>> check_exception('This {{is')
- >>> check_exception('This }}is')
- >>> check_exception('{{This }}is{{')
- >>> check_exception('This }}is{{')
- >>> check_exception('{{}}')
- >>> check_exception('{{This is')
- >>> check_exception('{{This is{{')
- >>> check_exception('{{This is{{ }}')
- >>> check_exception('{{{{This}}}}')
- >>> check_exception('}}This {{is}}')
- >>> check_exception('This }} {{is}}')
- >>> check_exception('{{This}}')
- [Span(0)]
- >>> check_exception('{This}')
- []
- >>> check_exception('{{{This}}}')
- [Span(0)]
- """
- return [
- required_phrase.span
- for required_phrase in get_required_phrases(text)
- ]
+####################################################################################################
+#
+# Shared utilities
+#
+####################################################################################################
-def get_required_phrase_texts(text):
+def get_normalized_tokens(text, skip_required_phrase_markers=True, preserve_case=False):
"""
- Return a list of required phrase texts for each required phrase found
- in the rule ``text``.
-
- For example:
-
- >>> text = 'This is enclosed in {{double curly braces}}'
- >>> # 0 1 2 3 4 5 6
- >>> x = get_required_phrase_texts(text=text)
- >>> assert x == ['double curly braces'], x
+ Return a list of normalized token strings in ``text``.
"""
- return [
- required_phrase.text
- for required_phrase in get_required_phrases(text)
- ]
-
-
-@attr.s
-class RequiredPhraseInText:
-
- required_phrase_positions = attr.ib(
- default=attr.Factory(list),
- repr=False,
- metadata=dict(help='List of positions of a required phrase in a rule text.')
- )
-
- required_phrase_tokens = attr.ib(
- default=attr.Factory(list),
- metadata=dict(help='List of required phrase tokens for this rule.')
- )
-
- @property
- def text(self):
- """The full normalized text for this required phrase, built from its tokens."""
- return " ".join(self.required_phrase_tokens)
-
- @property
- def span(self):
- """A span representing the position of this required phrase in a rule text."""
- return Span(self.required_phrase_positions)
-
- def update(self, token, ipos):
- self.required_phrase_tokens.append(token)
- self.required_phrase_positions.append(ipos)
-
-
-def get_required_phrases(text):
- """
- Yield RequiredPhraseInText objects with both required phrase positions
- and lists of tokens for each required phrase found in the rule ``text``.
- Tokens form a required phrase when enclosed in {{double curly braces}}.
- """
- ipos = 0
- in_required_phrase = False
- required_phrase = RequiredPhraseInText()
- for token in required_phrase_tokenizer(text):
- if token == REQUIRED_PHRASE_OPEN:
- if in_required_phrase:
- raise InvalidRule('Invalid rule with nested required phrase {{ {{ braces', text)
- in_required_phrase = True
-
- elif token == REQUIRED_PHRASE_CLOSE:
- if in_required_phrase:
- if required_phrase.required_phrase_tokens:
- yield required_phrase
- required_phrase = RequiredPhraseInText()
- else:
- raise InvalidRule('Invalid rule with empty required phrase {{}} braces', text)
- in_required_phrase = False
- else:
- raise InvalidRule(f'Invalid rule with dangling required phrase missing closing braces', text)
- continue
- else:
- if in_required_phrase:
- required_phrase.update(token=token, ipos=ipos)
- ipos += 1
-
- if required_phrase.required_phrase_tokens or in_required_phrase:
- raise InvalidRule(f'Invalid rule with dangling required phrase missing final closing braces', text)
+ required_phrase_markers = [REQUIRED_PHRASE_CLOSE, REQUIRED_PHRASE_OPEN]
+ tokens = list(required_phrase_tokenizer(text=text, preserve_case=preserve_case))
+ if skip_required_phrase_markers:
+ tokens = [
+ token
+ for token in tokens
+ if token not in required_phrase_markers
+ ]
+ return tokens
def get_normalized_text(text, skip_required_phrase_markers=True):
+ """
+ Return the normalized text for ``text``. Optionally skip the double {{curly braces}}
+ required phrase markers when ``skip_required_phrase_markers`` is True.
+ """
return " ".join(
get_normalized_tokens(
text=text,
@@ -193,414 +94,290 @@ def get_normalized_text(text, skip_required_phrase_markers=True):
)
-def get_num_tokens(text):
- return len(get_normalized_tokens(text))
-
-def is_text_license_reference(text):
-
- tokens = list(index_tokenizer(text=text))
- words_license_reference = ['http', 'https', 'io', 'com', 'txt', 'md', 'file']
- if any(
- True
- for word in words_license_reference
- if word in tokens
+def find_phrase_spans_in_text(text, phrase_text, preserve_case=False):
+ """
+ Return a list of Spans where the ``phrase_text`` exists in ``text``, or an empty list.
+ """
+ spans_with_required_phrase = []
+
+ text_tokens = list(get_normalized_tokens(
+ text=text,
+ preserve_case=preserve_case,
+ skip_required_phrase_markers=True,
+ ))
+ required_phrase_tokens = list(get_normalized_tokens(
+ text=phrase_text,
+ preserve_case=preserve_case,
+ skip_required_phrase_markers=True,
+ ))
+ required_phrase_first_token = required_phrase_tokens[0]
+
+ # Initial check to see if all tokens in the required phrase are present
+ if all(
+ required_phrase_token in text_tokens
+ for required_phrase_token in required_phrase_tokens
):
- return True
+ start_positions = [
+ i
+ for i, x in enumerate(text_tokens)
+ if x == required_phrase_first_token
+ ]
- return False
+ for start_pos in start_positions:
+ end_pos = start_pos + len(required_phrase_tokens)
+ if (
+ end_pos <= len(text_tokens)
+ and text_tokens[start_pos:end_pos] == required_phrase_tokens
+ ):
+ spans_with_required_phrase.append(Span(start_pos, end_pos - 1))
-@attr.s
-class RequiredPhraseDetails:
+ return spans_with_required_phrase
- license_expression = attr.ib(
- default=None,
- metadata=dict(
- help='A license expression string for this particular required phrase.')
- )
- rule = attr.ib(
- default=None,
- metadata=dict(
- help='The Rule object for this particular required phrase rule.')
- )
+def get_non_overlapping_spans(old_required_phrase_spans, new_required_phrase_spans):
+ """
+ Given two list of spans `old_required_phrase_spans` and `new_required_phrase_spans`,
+ return all the spans in `new_required_phrase_spans` that do not overlap with any
+ of the spans in `old_required_phrase_spans`.
- required_phrase_text = attr.ib(
- default=None,
- metadata=dict(
- help='Normalized required phrase text.')
- )
+ The list of spans `old_required_phrase_spans` contains all the spans of required
+ phrases or ignorables already present in a rule text, and the other list of spans
+ `new_required_phrase_spans` contains the proposed new required phrases.
+ """
+ for new_span in new_required_phrase_spans:
+ if old_required_phrase_spans:
+ if any(old_span.overlap(new_span) != 0 for old_span in old_required_phrase_spans):
+ continue
- sources = attr.ib(
- default=attr.Factory(list),
- metadata=dict(
- help='List of all rule identifiers where this required phrase is present.'
- )
- )
+ yield new_span
- length = attr.ib(
- default=0,
- metadata=dict(
- help='Length of text for this required phrase text (used to sort).'
- )
- )
- # Generic licenses should not be dumped as required phrase rules
- has_generic_license = attr.ib(
- default=False,
- metadata=dict(
- help='Has a generic license key in its license expression'
- )
- )
+def add_required_phrase_markers(text, required_phrase_span):
+ """
+ Given a ``text`` and a ``required_phrase_span`` Span, add required phrase
+ curly brace markers to the ``text`` before the start and after the end of the span.
+ This takes whitespace and stopwords into account.
+ """
+ tokens_tuples_with_markers = []
+ token_index = 0
- @classmethod
- def create_required_phrase_details(
- cls,
- license_expression,
- required_phrase_text,
- sources,
- length,
- has_generic_license=False,
- ):
+ for token_tuple in matched_query_text_tokenizer(text):
- base_name = f"{license_expression}_required_phrase"
- base_loc = find_rule_base_location(name_prefix=base_name)
- file_path = f"{base_loc}.RULE"
- identifier = file_path.split('/')[-1]
+ is_word, token = token_tuple
- normalized_text = get_normalized_text(required_phrase_text)
+ if is_word and token.lower() not in STOPWORDS:
+ if token_index == required_phrase_span.start:
+ tokens_tuples_with_markers.append((False, REQUIRED_PHRASE_OPEN))
- rule = Rule(
- license_expression=license_expression,
- identifier=identifier,
- text=normalized_text,
- is_required_phrase=True,
- )
- if is_text_license_reference(required_phrase_text):
- rule.is_license_reference = True
- else:
- rule.is_license_tag = True
+ token_index += 1
- if not has_generic_license:
- rule.dump(rules_data_dir)
+ tokens_tuples_with_markers.append(token_tuple)
- return cls(
- license_expression=license_expression,
- rule=rule,
- required_phrase_text=normalized_text,
- sources=sources,
- length=length,
- has_generic_license=has_generic_license,
- )
+ if is_word and token.lower() not in STOPWORDS:
+ if token_index == required_phrase_span.end + 1:
+ tokens_tuples_with_markers.append((False, REQUIRED_PHRASE_CLOSE))
- def update_sources(self, source_identifier):
- if not source_identifier in self.sources:
- self.sources.append(source_identifier)
+ return combine_tokens(tokens_tuples_with_markers)
-@attr.s
-class ListOfRequiredPhrases:
+def combine_tokens(token_tuples):
+ """
+ Return a string combining the token strings from the ``token_tuples`` list of
+ (is_word, token) tuples created by the tokenizer functions.
+ """
+ return ''.join(token for _, token in token_tuples)
- required_phrases = attr.ib(
- default=attr.Factory(list),
- metadata=dict(
- help='A list of RequiredPhraseDetails objects for all the required phrases.')
- )
- def match_required_phrase_present(self, required_phrase_text):
- """
- Check if a required_phrase_text is present in the list of required_phrases
- or it is a rule in the index.
- Note: Order is important, as the list of required_phrases has both new rules which are
- not yet in the index and old rules also present in the index.
- """
- normalized_text = get_normalized_text(required_phrase_text)
+@attr.s
+class IsRequiredPhrase:
+ """
+ Represent a required phrase text and rule from an "is_required_phrase" Rule.
+ """
- # check if this required_phrase_text is present in the collected list of required phrases
- for required_phrase in self.required_phrases:
- if required_phrase.required_phrase_text == normalized_text:
- rule = required_phrase.rule
- return rule
+ rule = attr.ib(metadata=dict(help='Rule that contains this phrase'))
+ required_phrase_text = attr.ib(metadata=dict(help='Normalized required phrase text.'))
- # check if this required_phrase_text is present as a rule in the index
- rule = rule_exists(text=required_phrase_text)
- if rule:
- return rule
+ @property
+ def license_expression(self):
+ self.rule.license_expression
- def update_required_phrase_sources(self, rule, has_generic_license=False, different_license=False):
+ @staticmethod
+ def sorted(isrequiredphrases):
"""
- Given a rule update the required phrases list with this rule
-
- Note: this should only be called on a rule that is obtained from the
- match_required_phrase_present function so that the rule is present in the
- index/required phrases list.
+ Return an ``isrequiredphrases`` list of IsRequiredPhrase sorted by decreasing text length.
"""
- # if rule is present as a required phrase rule in the list then
- # add identifier to sources of the required phrase rule
- for required_phrase in self.required_phrases:
- if required_phrase.rule.identifier == rule.identifier:
- required_phrase.update_sources(rule.identifier)
- return
-
- if rule and (rule.is_license_intro or rule.is_license_clue):
- return
-
- # if rule is present as a rule in the index, set the is_required_phrase flag
- # and add to the list of required phrase rules, if it is a non-generic license of
- # the same license expression
- if not rule.is_required_phrase and not has_generic_license and not different_license:
- rule.is_required_phrase = True
- rule.dump(rules_data_dir)
+ sorter = lambda p: (len(p.rule.text), p.required_phrase_text)
+ return sorted(isrequiredphrases, key=sorter, reverse=True)
- normalized_text = get_normalized_text(rule.text)
- required_phrase_detail = RequiredPhraseDetails(
- license_expression=rule.license_expression,
- rule=rule,
- required_phrase_text=normalized_text,
- sources=[rule.identifier],
- length=len(normalized_text),
- has_generic_license=has_generic_license,
- )
- self.required_phrases.append(required_phrase_detail)
-
- def sort_required_phrases(self):
- self.required_phrases = sorted(
- self.required_phrases,
- key=lambda x: x.length,
- reverse=True,
- )
- def add_variations_of_required_phrases(self, licenses_by_key):
-
- words_to_skip = ["the"]
- for required_phrase in self.required_phrases:
- required_phrase_tokens = list(index_tokenizer(text=required_phrase.required_phrase_text))
- skip_words_present = [
- skip_word
- for skip_word in words_to_skip
- if skip_word in required_phrase_tokens
- ]
- for skip_word in skip_words_present:
- required_phrase_tokens.remove(skip_word)
- required_phrase_without_skip_word = " ".join(required_phrase_tokens)
- matched_rule = self.match_required_phrase_present(required_phrase_without_skip_word)
- if matched_rule and matched_rule.skip_collecting_required_phrases:
- continue
-
- has_generic_license = does_have_generic_licenses(
- license_expression=required_phrase.license_expression,
- licenses_by_key=licenses_by_key,
- )
- if not matched_rule:
- required_phrase_detail = RequiredPhraseDetails.create_required_phrase_details(
- license_expression=required_phrase.license_expression,
- required_phrase_text=required_phrase_without_skip_word,
- sources=[required_phrase.rule.identifier],
- length=len(required_phrase_without_skip_word),
- has_generic_license=has_generic_license,
- )
- self.required_phrases.append(required_phrase_detail)
- else:
- self.update_required_phrase_sources(
- rule=matched_rule,
- has_generic_license=has_generic_license,
- )
-
-
-def does_have_generic_licenses(license_expression, licenses_by_key):
- licensing = Licensing()
- license_keys = licensing.license_keys(license_expression)
- has_generic_license = False
- for lic_key in license_keys:
- lic = licenses_by_key.get(lic_key)
- if lic and (
- lic.is_generic or lic.is_unknown
- ):
- has_generic_license = True
- break
-
- return has_generic_license
-
-
-def collect_required_phrases_in_rules(
- rules_by_expression,
- licenses_by_key,
- license_expression=None,
- verbose=False,
-):
-
- # A mapping of {license_expression: ListOfRequiredPhrases} for all applicable
- # license_expressions
- required_phrases_by_expression = {}
-
- licensing = Licensing()
+def collect_is_required_phrase_from_rules(rules_by_expression, verbose=False):
+ """
+ Return a mapping of ``{license_expression: [IsRequiredPhrase, ...]}`` collecting the
+ texts of all rules in the ``rules_by_expression`` mapping if the "is_required_phrase" is True.
+ """
+ is_required_phrases_by_expression = {}
- # collect and create required phrase rules
for license_expression, rules in rules_by_expression.items():
-
- license_keys = licensing.license_keys(license_expression)
- if len(license_keys) != 1:
- continue
-
if verbose:
click.echo(f'Collecting required phrases for license_expression: {license_expression}')
- required_phrases_list = ListOfRequiredPhrases()
+ is_required_phrases = []
for rule in rules:
- if rule.skip_collecting_required_phrases:
+ if not rule.is_required_phrase:
continue
- if rule.is_license_intro or rule.is_license_clue:
- continue
-
- for required_phrase_text in get_required_phrase_texts(rule.text):
- if get_num_tokens(required_phrase_text) < 2:
- if verbose:
- click.echo(f'WARNING: single word required phrases in: {rule.identifier}, skipping.')
- continue
-
- required_phrase_rule = required_phrases_list.match_required_phrase_present(
- required_phrase_text=required_phrase_text,
- )
-
- debug = False
- if rule.identifier in TRACE_REQUIRED_PHRASE_FOR_RULES:
- debug = True
- click.echo(
- f"Collecting from rule: {rule.identifier} "
- f"Required phrase: '{required_phrase_text}' "
- f"Matched rule: {required_phrase_rule}"
- )
-
- if required_phrase_rule and required_phrase_rule.skip_collecting_required_phrases:
- continue
-
- has_generic_license = does_have_generic_licenses(
- license_expression=license_expression,
- licenses_by_key=licenses_by_key,
- )
- if required_phrase_rule:
- different_license = required_phrase_rule.license_expression != license_expression
- required_phrases_list.update_required_phrase_sources(
- rule=required_phrase_rule,
- has_generic_license=has_generic_license,
- different_license=different_license,
- )
- if debug:
- click.echo(f"Old required phrase updated, same license expression")
-
- elif not is_text_license_reference(required_phrase_text):
- required_phrase_detail = RequiredPhraseDetails.create_required_phrase_details(
- license_expression=license_expression,
- required_phrase_text=required_phrase_text,
- sources=[rule.identifier],
- length=len(required_phrase_text),
- has_generic_license=has_generic_license,
- )
- required_phrases_list.required_phrases.append(required_phrase_detail)
- if debug:
- click.echo(f"New required phrase : {required_phrase_detail} ")
- elif debug:
- is_reference = is_text_license_reference(required_phrase_text)
- click.echo(f"is_text_license_reference: {is_reference} ")
+ if rule.identifier in TRACE_REQUIRED_PHRASE_FOR_RULES:
+ click.echo(f"Collecting required phrase from rule: {rule.identifier}: {rule.text!r}")
- # Add add new variations of the required phrases already present in the list
- required_phrases_list.add_variations_of_required_phrases(licenses_by_key)
+ is_required_phrases.append(IsRequiredPhrase(rule=rule, required_phrase_text=rule.text))
- # We need to sort required phrases by length so we look for and mark the longest possible
- # required phrases before the shorter ones contained in the same (substrings)
- required_phrases_list.sort_required_phrases()
- required_phrases_by_expression[license_expression] = required_phrases_list
+ # We need to sort required phrases by decreasing length so we look for and mark the longest
+ # possible required phrases before the shorter ones contained in the same text
+ is_required_phrases = IsRequiredPhrase.sorted(is_required_phrases)
+ is_required_phrases_by_expression[license_expression] = is_required_phrases
if verbose:
- count = len(required_phrases_list.required_phrases)
- texts_with_source = {
- required_phrase.required_phrase_text: required_phrase.sources
- for required_phrase in required_phrases_list.required_phrases
- }
+ count = len(is_required_phrases)
click.echo(f'Collected {count} required phrases for license_expression: {license_expression}')
click.echo('Collected required phrases texts: ')
- for text, sources in texts_with_source.items():
- click.echo(f'{text}: {sources}')
+ for rqph in is_required_phrases:
+ click.echo(f' {rqph.required_phrase_text!r}: {rqph.rule.identifier}')
- return required_phrases_by_expression
+ return is_required_phrases_by_expression
-def update_required_phrases_from_other_rules(
+def update_required_phrases_in_rules(
required_phrases_by_expression,
rules_by_expression,
- write_required_phrases=False,
+ write_phrase_source=False,
verbose=False,
+ dry_run=False,
):
-
- # add required phrases to rules from other rules
+ """
+ Update the text of rules in a ``rules_by_expression`` mapping with required phrases from the
+ ``required_phrases_by_expression`` mapping.
+ If ``write_phrase_source`` is True, include debug information in the saved rule source field.
+ """
for license_expression, rules in rules_by_expression.items():
- if not license_expression in required_phrases_by_expression:
+ if license_expression not in required_phrases_by_expression:
continue
if verbose:
click.echo(f'marking required phrases in rule texts for license_expression: {license_expression}')
- required_phrases_for_expression = required_phrases_by_expression.get(license_expression)
- add_required_phrases_for_required_phrases(
+ required_phrases = required_phrases_by_expression.get(license_expression)
+ if not required_phrases:
+ continue
+
+ add_required_phrases_to_rules_text(
+ required_phrases=required_phrases,
rules=rules,
- required_phrases=required_phrases_for_expression.required_phrases,
- verbose=verbose,
+ write_phrase_source=write_phrase_source,
+ dry_run=dry_run,
)
- if write_required_phrases:
- for license_expression, required_phrases_list in required_phrases_by_expression.items():
- if verbose:
- click.echo(f'Writing required phrases sources for license_expression: {license_expression}')
-
- for required_phrase_detail in required_phrases_list.required_phrases:
- if (
- required_phrase_detail.sources and required_phrase_detail.rule.is_required_phrase
- and not required_phrase_detail.has_generic_license
- ):
- required_phrase_detail.rule.dump(
- rules_data_dir=rules_data_dir,
- sources=required_phrase_detail.sources
- )
-
-def add_required_phrases_from_other_rules(
- licenses_by_key,
+def update_rules_using_is_required_phrases_rules(
license_expression=None,
- write_required_phrases=False,
+ write_phrase_source=False,
verbose=False,
- can_mark_required_phrase_test=False,
+ dry_run=False,
):
+ """
+ Add required phrases to rules using is_required_phrase rules.
+ Optionally filter rules with ``license_expression``.
+ """
+ rules_by_expression = get_base_rules_by_expression(license_expression=license_expression)
- rules_by_expression = get_rules_by_expression()
- if license_expression:
- rules_by_expression = {license_expression: rules_by_expression[license_expression]}
- else:
- rules_by_expression = rules_by_expression
-
- required_phrases_by_expression = collect_required_phrases_in_rules(
- license_expression=license_expression,
+ required_phrases_by_expression = collect_is_required_phrase_from_rules(
rules_by_expression=rules_by_expression,
verbose=verbose,
- licenses_by_key=licenses_by_key,
)
+ if verbose:
+ click.echo(f"update_rules_using_is_required_phrases_rules: required_phrases_by_expression # {len(required_phrases_by_expression)}")
+
+ rules_by_expression = get_updatable_rules_by_expression(
+ license_expression,
+ simple_expression=False,
+ )
+ if verbose:
+ click.echo(f"update_rules_using_is_required_phrases_rules: rules_by_expression # {len(rules_by_expression)}")
- update_required_phrases_from_other_rules(
+ update_required_phrases_in_rules(
required_phrases_by_expression=required_phrases_by_expression,
rules_by_expression=rules_by_expression,
- write_required_phrases=write_required_phrases,
+ write_phrase_source=write_phrase_source,
verbose=verbose,
+ dry_run=dry_run,
)
-def add_required_phrases_for_required_phrases(required_phrases, rules, verbose=False):
+def get_base_rules_by_expression(license_expression=None):
+ """
+ Return a mapping of rules_by_expression, filtered for an optional ``license_expression``.
+ """
+ rules_by_expression = get_rules_by_expression()
+ if license_expression:
+ rules_by_expression = {license_expression: rules_by_expression[license_expression]}
- for rule in rules:
- # skip small or required phrase rules
- if len(rule.text) < TINY_RULE or rule.is_required_phrase:
- continue
+ return rules_by_expression
+
+
+def get_updatable_rules_by_expression(license_expression=None, simple_expression=True):
+ """
+ Return a mapping of rules_by_expression, filtered for an optional ``license_expression``.
+ The rules are suitable to receive required phrase updates.
+ If simple_expression is True, only consider license rules with a single license key.
+ """
+ rules_by_expression = get_base_rules_by_expression()
+
+ index = get_index()
+ licensing = Licensing()
+
+ updatable_rules_by_expression = {}
+
+ # filter rules to keep only updatable rules
+ for expression, rules in rules_by_expression.items():
+ if simple_expression:
+ license_keys = licensing.license_keys(license_expression)
+ if len(license_keys) != 1:
+ continue
+
+ updatable_rules = []
+ for rule in rules:
+ # skip required phrase, false positive, tiny rules and more
+ if rule.is_required_phrase or not rule.is_approx_matchable:
+ continue
+ # skip rules that ask to be skipped
+ if rule.skip_for_required_phrase_generation:
+ continue
+
+ # skip non-approx matchable, they will be matched exactly
+ if not index.is_rule_approx_matchable(rule):
+ continue
+
+ updatable_rules.append(rule)
+
+ if updatable_rules:
+ updatable_rules_by_expression[expression] = updatable_rules
+
+ return updatable_rules_by_expression
+
+
+def add_required_phrases_to_rules_text(
+ required_phrases,
+ rules,
+ write_phrase_source=False,
+ dry_run=False,
+):
+ """
+ Add the ``required_phrases`` list of IsRequiredPhrase to each rule in a ``rules`` list of
+ license Rule.
+ """
+ for rule in rules:
for required_phrase in required_phrases:
debug = False
if rule.identifier in TRACE_REQUIRED_PHRASE_FOR_RULES:
@@ -610,47 +387,96 @@ def add_required_phrases_for_required_phrases(required_phrases, rules, verbose=F
)
debug = True
+ source = rule.source or ""
+ if write_phrase_source:
+ source += f" {required_phrase.rule.identifier}"
+
add_required_phrase_to_rule(
rule=rule,
required_phrase=required_phrase.required_phrase_text,
- debug_data=required_phrase.sources,
+ source=source,
debug=debug,
+ dry_run=dry_run,
)
-def add_required_phrases_for_license_fields(licence_object, rules, verbose=False):
+def add_license_attributes_as_required_phrases_to_rules_text(
+    license_object,
+    rules,
+    write_phrase_source=False,
+    dry_run=False,
+):
+    """
+    Add new required phrases to the ``rules`` list of Rule using the ``license_object`` License
+    fields for required phrases.
+    """
     license_fields_mapping_by_order = {
-        "name": licence_object.name,
-        "short_name": licence_object.short_name,
-        #"key",
-        #"spdx_license_key"
+        "name": license_object.name,
+        "short_name": license_object.short_name,
+        # "key",
+        # "spdx_license_key",
     }

     for rule in rules:
-        # skip small rules
-        if len(rule.text) < TINY_RULE:
-            continue
+        # iterate field name/value pairs: .items(), not .values(), since we unpack two names
+        for field_name, required_phrase_text in license_fields_mapping_by_order.items():
+            debug = False
+            if rule.identifier in TRACE_REQUIRED_PHRASE_FOR_RULES:
+                click.echo(
+                    f"Updating rule: {rule.identifier} "
+                    f"with required phrase from license: {field_name!r}: {required_phrase_text!r}."
+                )
+                debug = True

-        for license_field_value in license_fields_mapping_by_order.values():
-            add_required_phrase_to_rule(rule=rule, required_phrase=license_field_value)
+            source = rule.source or ""
+            if write_phrase_source:
+                source += f" {license_object.key}.LICENSE : {field_name}"
+            add_required_phrase_to_rule(
+                rule=rule,
+                required_phrase=required_phrase_text,
+                source=source,
+                debug=debug,
+                dry_run=dry_run,
+            )
+
+
+def get_ignorable_spans(rule):
+ """
+ Return a list of ignorable Spans for the ``rule``.
+ Ignorable spans are for URLs and referenced filenames present in a rule text. These should not
+ be messed up with when injecting new required phrases in a rule text.
+ """
+ ignorable_spans = []
+ ignorables = rule.referenced_filenames + rule.ignorable_urls
+ for ignorable in ignorables:
+ ignorable_spans.extend(
+ find_phrase_spans_in_text(
+ text=rule.text,
+ required_phrase=ignorable,
+ preserve_case=True,
+ )
+ )
-def add_required_phrase_to_rule(rule, required_phrase, debug_data=None, debug=False):
+ return ignorable_spans
- # Reload from file as there could be changes from other license fields
- rule_file = os.path.join(rules_data_dir, rule.identifier)
- reloaded_rule = Rule.from_file(rule_file)
- # we get spans for name/short_name if they exist
- new_required_phrase_spans = return_spans_for_required_phrase_in_text(
- text=reloaded_rule.text,
+def add_required_phrase_to_rule(rule, required_phrase, source, debug=False, dry_run=False):
+ """
+ Update and save the ``rule`` Rule tagging the text with the ``required_phrase`` text. Skip
+ updating and saving the rule to disk under some conditions, like if ignorables would be changed.
+ Return True if the rule was updated and False otherwise.
+ """
+
+    # These are candidate spans for new required phrases, if they exist
+ new_required_phrase_spans = find_phrase_spans_in_text(
+ text=rule.text,
required_phrase=required_phrase,
)
# we get spans for already existing required phrases and ignorables
- ignorable_spans = get_ignorable_spans(reloaded_rule)
- old_required_phrase_spans = get_required_phrase_spans(reloaded_rule.text)
+ ignorable_spans = get_ignorable_spans(rule)
+ old_required_phrase_spans = get_existing_required_phrase_spans(rule.text)
# we verify whether there are spans which overlap with the
# already present required phrases or ignorables
@@ -669,120 +495,165 @@ def add_required_phrase_to_rule(rule, required_phrase, debug_data=None, debug=Fa
ignorable_debug = rule.referenced_filenames + rule.ignorable_urls
click.echo(f"debug ignorables: {ignorable_debug}")
- text_rule = reloaded_rule.text
-
# we add required phrase markers for the non-overlapping spans
+ new_rule_text = rule.text
for span_to_add in spans_to_add:
- text_rule = add_required_phrase_markers(
- text=text_rule,
+ new_rule_text = add_required_phrase_markers(
+ text=new_rule_text,
required_phrase_span=span_to_add,
)
# write the rule on disk if there are any updates
- if text_rule != reloaded_rule.text:
+ if new_rule_text == rule.text:
+ return False
+
+ if has_ignorable_changes(rule=rule, updated_text=new_rule_text):
if debug:
click.echo(
- f"Updating rule: {reloaded_rule.identifier} "
+ f"NOT Updating rule: {rule.identifier} "
+ f"because IGNORABLES would change "
f"with required phrase: {required_phrase} "
- f"debug data: {debug_data} /n"
)
- reloaded_rule.text = text_rule
- reloaded_rule.dump(rules_data_dir)
+ return False
-def add_required_phrases_from_license_fields(
- licenses_by_key,
- license_expression=None,
- verbose=False,
- can_mark_required_phrase_test=False,
-):
+ rule.source = source or None
+ rule.text = new_rule_text
+ if not dry_run:
+ if debug:
+ click.echo(
+ f"UPDATE: Updating rule: {rule.identifier} "
+ f"with required phrase: {required_phrase!r} "
+ f"source: {source!r}"
+ )
+ rule.dump(rules_data_dir)
+ return True
+
+
+def has_ignorable_changes(rule, updated_text):
"""
- For all rules with the `license_expression`, add required phrases from the
- license fields.
+ Return True if there would be changes in the "ignorable_*" attributes of a ``rule`` Rule if its
+ text was to be updated with a new ``updated_text``.
"""
- rules_by_expression = get_rules_by_expression()
+ existing_ignorables = get_normalized_ignorables(rule)
+ updated_ignorables = get_ignorables(updated_text)
+ return existing_ignorables != updated_ignorables
- if license_expression:
- rules_by_expression_to_update = {license_expression: rules_by_expression[license_expression]}
- else:
- rules_by_expression_to_update = rules_by_expression
- licensing = Licensing()
+def update_rules_using_license_attributes(
+ license_expression=None,
+ write_phrase_source=False,
+ verbose=False,
+ dry_run=False,
+):
+ """
+ Add required phrases found in the license fields.
- for license_expression, rules in rules_by_expression_to_update.items():
+ Iterate rules by license key, collect required phrases from the license attributes like name and
+ short name. Add those as required phrases in all selected rules that are using the
+ ``license_expression``.
+ """
+ rules_by_expression = get_updatable_rules_by_expression(license_expression, simple_expression=True)
- license_keys = licensing.license_keys(license_expression)
- if len(license_keys) != 1:
- continue
+ licenses_by_key = get_licenses_db()
- license_key = license_keys.pop()
+    # license expression is always a single key here
+ for license_key, rules in rules_by_expression.items():
licence_object = licenses_by_key[license_key]
-
if verbose:
click.echo(f'Updating rules with required phrases for license_expression: {license_key}')
- add_required_phrases_for_license_fields(licence_object=licence_object, rules=rules, verbose=verbose)
+ add_license_attributes_as_required_phrases_to_rules_text(
+ license_object=licence_object,
+ rules=rules,
+ write_phrase_source=write_phrase_source,
+ dry_run=dry_run,
+ )
+
+####################################################################################################
+#
+# Inject new required phrase in rules
+#
+####################################################################################################
-def delete_required_phrase_rules_debug(rules_data_dir):
- required_phrase_rules = [
- rule
- for rule in load_rules(rules_data_dir=rules_data_dir)
- if rule.is_required_phrase
- ]
- for rule in required_phrase_rules:
- rule.dump(rules_data_dir)
+def delete_required_phrase_rules_source_debug(rules_data_dir):
+ """
+ Remove the "source" attribute from all rules.
+ """
+ for rule in load_rules(rules_data_dir=rules_data_dir):
+ if rule.source:
+ rule.source = None
+ rule.dump(rules_data_dir)
@click.command(name='add-required-phrases')
+@click.option(
+ "-o",
+ "--from-other-rules",
+ is_flag=True,
+ default=False,
+ help="Propagate existing required phrases from other rules to all selected rules. "
+ "Mutually exclusive with --from-license-attributes.",
+ cls=PluggableCommandLineOption,
+)
+@click.option(
+ "-a",
+ "--from-license-attributes",
+ is_flag=True,
+ default=False,
+ help="Propagate license attributes as required phrases to all selected rules. "
+ "Mutually exclusive with --from-other-rule.",
+ cls=PluggableCommandLineOption,
+)
@click.option(
"-l",
"--license-expression",
type=str,
default=None,
metavar="STRING",
- help="The license expression, for which the rules will be updated with required phrases. "
- "Example STRING: `mit`. If this option is not used, add required_phrases for all rules.",
+ help="Optional license expression filter. If provided, only consider the rules that are using "
+ "this expression. Otherwise, process all rules. Example: `apache-2.0`.",
cls=PluggableCommandLineOption,
)
@click.option(
- "-r",
- "--reindex",
+ "--validate",
is_flag=True,
default=False,
- help="Also reindex the license/rules to check for inconsistencies.",
+ help="Validate that all rules and licenses and rules are consistent, for all rule languages. "
+ "For this validation, run a mock indexing. The regenerated index is not saved to disk.",
cls=PluggableCommandLineOption,
)
@click.option(
- "-w",
- "--write-required-phrase-origins",
+ "-r",
+ "--reindex",
is_flag=True,
default=False,
- help="Write into the rule file the sources for all required phrase rules. Deletes the temporary rule origins used to debug.",
+ help="Recreate and cache the licenses index with updated rules add the end.",
cls=PluggableCommandLineOption,
)
@click.option(
- "-d",
- "--delete-required-phrase-origins",
+ "-w",
+ "--write-phrase-source",
is_flag=True,
default=False,
- help="Delete the sources for all required phrase rules and exit. This is a debug option.",
+ help="In modified rule files, write the source field to trace the source of required phrases "
+ "applied to that rule.",
cls=PluggableCommandLineOption,
)
@click.option(
- "-o",
- "--from-other-rules",
+ "-d",
+ "--delete-phrase-source",
is_flag=True,
default=False,
- help="Propagate required phrases from already marked required phrases in other rules.",
+ help="In rule files, delete the source extra debug data used to trace source of phrases.",
cls=PluggableCommandLineOption,
)
@click.option(
- "-a",
- "--from-license-attributes",
+ "--dry-run",
is_flag=True,
default=False,
- help="Mark required phrases from license attributes.",
+ help="Do not save rules.",
cls=PluggableCommandLineOption,
)
@click.option(
@@ -790,50 +661,300 @@ def delete_required_phrase_rules_debug(rules_data_dir):
"--verbose",
is_flag=True,
default=False,
- help="Print logging information.",
+ help="Print verbose logging information.",
cls=PluggableCommandLineOption,
)
@click.help_option("-h", "--help")
def add_required_phrases(
- license_expression,
- verbose,
- reindex,
from_other_rules,
from_license_attributes,
- delete_required_phrase_origins,
- write_required_phrase_origins,
+ license_expression,
+ validate,
+ reindex,
+ delete_phrase_source,
+ write_phrase_source,
+ dry_run,
+ verbose,
):
"""
- For all rules with the `license_expression`, add required phrases from the
- license fields.
+ Update license detection rules with new "required phrases" to improve rules detection accuracy.
"""
- licenses_by_key = load_licenses()
- if delete_required_phrase_origins:
- delete_required_phrase_rules_debug(rules_data_dir)
+ if delete_phrase_source:
+ click.echo('Deleting rules phrase source debug data.')
+ delete_required_phrase_rules_source_debug(rules_data_dir)
return
- # create a list of all required phrases from existing rules, add
- # rule files for them and mark those required phrases if present in other rules
- if from_other_rules:
- add_required_phrases_from_other_rules(
+ elif from_other_rules:
+ click.echo('Updating rules from is_required_phrase rules.')
+ update_rules_using_is_required_phrases_rules(
license_expression=license_expression,
- write_required_phrases=write_required_phrase_origins,
+ write_phrase_source=write_phrase_source,
+ dry_run=dry_run,
verbose=verbose,
- licenses_by_key=licenses_by_key,
)
- # marks required phrases in existing rules from license attributes like name,
- # short name and optionally license keys
- if from_license_attributes:
- add_required_phrases_from_license_fields(
+ elif from_license_attributes:
+ click.echo('Updating rules from license attributes.')
+ update_rules_using_license_attributes(
license_expression=license_expression,
+ write_phrase_source=write_phrase_source,
+ dry_run=dry_run,
verbose=verbose,
- licenses_by_key=licenses_by_key,
)
+ validate_and_reindex(validate, reindex, verbose)
+
+
+def validate_and_reindex(validate, reindex, verbose):
+ if validate:
+ if verbose:
+ click.echo('Validate all rules and licenses for all languages...')
+ build_index(index_all_languages=True)
+
if reindex:
- from licensedcode.cache import get_index
if verbose:
- click.echo('Rebuilding the license index...')
+ click.echo('Rebuilding and caching the license index...')
get_index(force=True)
+
+####################################################################################################
+#
+# Generate new required phrase rules from existing tagged required phrases
+#
+####################################################################################################
+
+
+@click.command(name='gen-new-required-phrases-rules')
+@click.option(
+ "-l",
+ "--license-expression",
+ type=str,
+ default=None,
+ metavar="STRING",
+ help="Optional license expression filter. If provided, only consider the rules that are using "
+ "this expression. Otherwise, process all rules. Example: `apache-2.0`.",
+ cls=PluggableCommandLineOption,
+)
+@click.option(
+ "-r",
+ "--reindex",
+ is_flag=True,
+ default=False,
+ help="Recreate and cache the licenses index with updated rules add the end.",
+ cls=PluggableCommandLineOption,
+)
+@click.option(
+ "--validate",
+ is_flag=True,
+ default=False,
+ help="Validate that all rules and licenses and rules are consistent, for all rule languages. "
+ "For this validation, run a mock indexing. The regenerated index is not saved to disk.",
+ cls=PluggableCommandLineOption,
+)
+@click.option(
+ "-v",
+ "--verbose",
+ is_flag=True,
+ default=False,
+ help="Print verbose logging information.",
+ cls=PluggableCommandLineOption,
+)
+@click.help_option("-h", "--help")
+def gen_required_phrases_rules(
+ license_expression,
+ validate,
+ reindex,
+ verbose,
+):
+ """
+ Create new license detection rules from "required phrases" in existing rules.
+ """
+ generate_new_required_phrase_rules(license_expression=license_expression, verbose=verbose)
+ validate_and_reindex(validate, reindex, verbose)
+
+
+def generate_new_required_phrase_rules(license_expression=None, verbose=False):
+    """
+    Create new rules from collecting unique required phrases across all rules.
+
+    As a side effect, also update existing rules matched to a required phrase text with the
+    "is_required_phrase" flag.
+
+    Consider only rules with the optional ``license_expression`` if provided.
+    """
+    if verbose:
+        lex = license_expression or "all"
+        click.echo(f'Collecting required phrases for {lex} license_expression.')
+
+    index = get_index()
+    licenses_by_key = get_licenses_db()
+
+    # track text -> expressions to keep only a text that uniquely identifies a single expression
+    phrases_by_normalized_phrase = defaultdict(list)
+
+    for rule in index.rules_by_rid:
+        # the license_expression filter is optional: only apply it when provided
+        if license_expression and rule.license_expression != license_expression:
+            continue
+
+        if (
+            rule.is_required_phrase
+            or rule.skip_for_required_phrase_generation
+            or rule.is_license_intro
+            or rule.is_license_clue
+            or rule.is_false_positive
+            or rule.is_generic(licenses_by_key)
+        ):
+            continue
+
+        for required_phrase_text in get_required_phrase_verbatim(rule.text):
+            phrase = RequiredPhraseRuleCandidate.create(license_expression=rule.license_expression, text=required_phrase_text)
+            if phrase.is_good(rule):
+                phrases_by_normalized_phrase[phrase.normalized_text].append(phrase)
+
+            # Add new variations of the required phrases already present in the list
+            for variation in generate_required_phrase_variations(required_phrase_text):
+                phrase = RequiredPhraseRuleCandidate.create(license_expression=rule.license_expression, text=variation)
+                if phrase.is_good(rule):
+                    phrases_by_normalized_phrase[phrase.normalized_text].append(phrase)
+
+    for phrases in phrases_by_normalized_phrase.values():
+        # keep only phrases used for one single expression
+        if len(set(p.license_expression for p in phrases)) == 1:
+            # keep the first one
+            phrase = phrases[0]
+        else:
+            continue
+
+        # check if we already have a rule we can match for this required phrase tag if needed
+        matched_rule = rule_exists(text=phrase.raw_text)
+        if matched_rule:
+            if matched_rule.skip_for_required_phrase_generation:
+                if verbose:
+                    click.echo(
+                        f'WARNING: Skipping pre-existing required phrase rule '
+                        f'"skip_for_required_phrase_generation": {matched_rule.identifier}.'
+                    )
+                continue
+
+            modified = False
+
+            if not matched_rule.is_required_phrase:
+                matched_rule.is_required_phrase = True
+                modified = True
+
+            if matched_rule.text.strip() != phrase.raw_text:
+                matched_rule.text = phrase.raw_text
+                modified = True
+
+            if matched_rule.is_continuous:
+                matched_rule.is_continuous = False
+                modified = True
+
+            if modified:
+                matched_rule.dump(rules_data_dir)
+                if verbose:
+                    click.echo(f'WARNING: Updating existing rule with is_required flag and more: {matched_rule.identifier}.')
+            else:
+                if verbose:
+                    click.echo(f'WARNING: Skipping pre-existing required phrase rule: {matched_rule.identifier}.')
+
+            continue
+
+        # at last create a new rule
+        rule = phrase.create_rule()
+        if verbose:
+            click.echo(f'Creating required phrase new rule: {rule.identifier}.')
+
+
+@attr.s
+class RequiredPhraseRuleCandidate:
+ """
+ A candidate phrase object with its license expression, raw text and normalized text. Used when
+    generating new rules for required phrases.
+ """
+ license_expression = attr.ib(metadata=dict(help='A license expression string.'))
+ raw_text = attr.ib(metadata=dict(help='Raw, original required phrase text.'))
+ normalized_text = attr.ib(metadata=dict(help='Normalized required phrase text.'))
+
+ def is_good(self, rule):
+ """
+        Return True if this phrase is minimally suitable to use as a required phrase.
+ """
+ # long enough
+ num_tokens = len(get_normalized_tokens(self.normalized_text))
+ if num_tokens <= 1:
+ return False
+
+ to_ignore = set()
+ # not a referenced filename
+ to_ignore.update(map(get_normalized_text, rule.referenced_filenames))
+ if self.normalized_text in to_ignore:
+ return False
+
+ return True
+
+ @classmethod
+ def create(cls, license_expression, text):
+ return cls(
+ license_expression=license_expression,
+ raw_text=text,
+ normalized_text=get_normalized_text(text),
+ )
+
+ def create_rule(self):
+ """
+ Create, save and return a new "required_phrase" Rule from this phrase.
+ """
+ base_name = f"{self.license_expression}_required_phrase"
+ base_loc = find_rule_base_location(name_prefix=base_name)
+ file_path = f"{base_loc}.RULE"
+ identifier = file_path.split('/')[-1]
+
+ rule = Rule(
+ license_expression=self.license_expression,
+ identifier=identifier,
+ text=self.raw_text,
+ is_required_phrase=True,
+ is_license_reference=True,
+ )
+ update_ignorables(licensish=rule)
+ rule.dump(rules_data_dir)
+ return rule
+
+
+_verbatim_required_phrase = r'{{([^}]+)}}'
+collect_verbatim_required_phrase = re.compile(_verbatim_required_phrase, re.UNICODE).findall
+
+
+def get_required_phrase_verbatim(text):
+ """
+ Yield required_phrase strings from a rule ``text`` excluding required phrases {{brace}} markers.
+
+    Phrases are extracted with a simple regex over the {{...}} markers; the markers are
+    not included in the yielded strings, surrounding whitespace is stripped from each
+    phrase, and empty phrases are skipped.
+
+ >>> x = list(get_required_phrase_verbatim('bar {{ AGPL-3.0 GNU Affero License v3.0 }} foo'))
+ >>> assert x == ['AGPL-3.0 GNU Affero License v3.0'], x
+
+ >>> x = list(get_required_phrase_verbatim(' + {{ ++ AGPL-3.0/}} and {{ GNU Affero License v3.0 }} '))
+ >>> assert x == ['++ AGPL-3.0/', 'GNU Affero License v3.0'], x
+ """
+ if not text:
+ return
+ for phrase in collect_verbatim_required_phrase(text):
+ phrase = phrase.strip()
+ if phrase:
+ yield phrase
+
+
+def generate_required_phrase_variations(text):
+ """
+ Yield strings that are useful variations of the ``text``, used to generate rule variants.
+ """
+ words_to_skip = ["the"]
+ required_phrase_words = text.split()
+ for skip_word in words_to_skip:
+ variant = [w for w in required_phrase_words if w.lower() != skip_word]
+ yield " ".join(variant)
+
diff --git a/tests/licensedcode/test_required_phrases.py b/tests/licensedcode/test_required_phrases.py
index 973294c69f..266ca8e2ff 100644
--- a/tests/licensedcode/test_required_phrases.py
+++ b/tests/licensedcode/test_required_phrases.py
@@ -7,108 +7,22 @@
# See https://aboutcode.org for more information about nexB OSS projects.
#
-import os
from unittest import TestCase as TestCaseClass
import pytest
-from licensedcode.required_phrases import get_required_phrases
-from licensedcode.required_phrases import get_required_phrase_spans
-from licensedcode.required_phrases import get_required_phrase_texts
-from licensedcode.required_phrases import add_required_phrases_from_other_rules
-from licensedcode.required_phrases import add_required_phrases_from_license_fields
-from licensedcode.required_phrases import ListOfRequiredPhrases
-from licensedcode.required_phrases import RequiredPhraseDetails
-from licensedcode.required_phrases import return_spans_for_required_phrase_in_text
-from licensedcode.required_phrases import add_required_phrase_markers
-from licensedcode.tokenize import get_normalized_tokens
-from licensedcode.tokenize import matched_query_text_tokenizer
-from licensedcode.stopwords import STOPWORDS
from licensedcode.models import InvalidRule
from licensedcode.models import Rule
+from licensedcode.required_phrases import update_rules_using_is_required_phrases_rules
+from licensedcode.required_phrases import update_rules_using_license_attributes
+from licensedcode.required_phrases import IsRequiredPhrase
+from licensedcode.required_phrases import add_required_phrase_markers
from licensedcode.spans import Span
+from licensedcode.required_phrases import find_phrase_spans_in_text
+from licensedcode.tokenize import get_existing_required_phrase_spans
-class TestGetKeyPhrases(TestCaseClass):
- text = (
- 'This released software is {{released}} by under {{the MIT license}}. '
- 'Which is a license originating at Massachusetts Institute of Technology (MIT).'
- )
-
- def test_get_required_phrases_yields_spans(self):
- required_phrase_spans = get_required_phrase_spans(self.text)
- assert required_phrase_spans == [Span(4), Span(7, 9)]
-
- def test_get_required_phrases_yields_tokens(self):
- required_phrase_tokens = [
- required_phrase.required_phrase_tokens
- for required_phrase in get_required_phrases(text=self.text)
- ]
- assert required_phrase_tokens == [['released'], ['the', 'mit', 'license']]
-
- def test_get_required_phrase_texts(self):
- required_phrase_texts = get_required_phrase_texts(text=self.text)
- assert required_phrase_texts == ['released', 'the mit license']
-
- def test_get_required_phrases_raises_exception_required_phrase_markup_is_not_closed(self):
- text = 'This software is {{released by under the MIT license.'
- try:
- list(get_required_phrase_spans(text))
- raise Exception('Exception should be raised')
- except InvalidRule:
- pass
-
- def test_get_required_phrases_ignores_stopwords_in_positions(self):
- text = 'The word comma is a stop word so comma does not increase the span position {{MIT license}}.'
- required_phrase_spans = get_required_phrase_spans(text)
- assert required_phrase_spans == [Span(11, 12)]
-
- def test_get_required_phrases_yields_spans_without_stop_words(self):
- text = 'This released software is {{released span}} by under {{the MIT quot license}}.'
- required_phrase_spans = get_required_phrase_spans(text)
- assert required_phrase_spans == [Span(4), Span(7, 9)]
-
- def test_get_required_phrases_does_not_yield_empty_spans(self):
- text = 'This released software {{comma}} is {{}} by under {{the MIT license}}.'
- try:
- list(get_required_phrase_spans(text))
- raise Exception('Exception should be raised')
- except InvalidRule:
- pass
-
- def test_get_required_phrases_only_considers_outer_required_phrase_markup(self):
- text = 'This released {{{software under the MIT}}} license.'
- required_phrase_spans = get_required_phrase_spans(text)
- assert required_phrase_spans == [Span(2, 5)]
-
- def test_get_required_phrases_ignores_nested_required_phrase_markup(self):
- text = 'This released {{software {{under the}} MIT}} license.'
- try:
- list(get_required_phrase_spans(text))
- raise Exception('Exception should be raised')
- except InvalidRule:
- pass
-
- def test_get_required_phrase_texts_with_markup(self):
- text = (
- "Lua is free software distributed under the terms of the"
- "{{MIT license}}"
- "reproduced below;"
- )
- required_phrase_texts = get_required_phrase_texts(text=text)
- assert required_phrase_texts == ['mit license']
-
- def test_get_required_phrase_spans_with_markup(self):
- text = (
- "Lua is free software distributed under the terms of the"
- "{{MIT license}}"
- "reproduced below;"
- )
- required_phrase_spans = get_required_phrase_spans(text=text)
- assert required_phrase_spans == [Span(18, 19)]
-
-
-class TestListOfRequiredPhrases(TestCaseClass):
+class TestIsRequiredPhraseCanSort(TestCaseClass):
required_phrase_texts = [
"mit",
@@ -117,40 +31,35 @@ class TestListOfRequiredPhrases(TestCaseClass):
"licenses: mit",
"MIT license",
]
- required_phrases = [
- RequiredPhraseDetails(
+ is_required_phrases = [
+ IsRequiredPhrase(
required_phrase_text=text,
- license_expression="mit",
- length=len(text),
rule=Rule(
license_expression="mit",
identifier="mit_231.RULE",
text=text,
is_required_phrase=True,
is_license_tag=True,
- ),
- sources=["mit_231.RULE"],
+ )
)
for text in required_phrase_texts
]
- required_phrases_list = ListOfRequiredPhrases(required_phrases=required_phrases)
- def test_sort_required_phrases_works(self):
- self.required_phrases_list.sort_required_phrases()
- expected_sorted_texts = [
+ def test_sort_is_required_phrases_works(self):
+ srps = IsRequiredPhrase.sorted(self.is_required_phrases)
+ results = [srp.required_phrase_text for srp in srps]
+
+ expected = [
"MIT License with Disclaimer",
"the MIT License",
"licenses: mit",
"MIT license",
"mit",
]
- assert [
- required_phrase.required_phrase_text
- for required_phrase in self.required_phrases_list.required_phrases
- ] == expected_sorted_texts
+ assert results == expected
-class TestRequiredPhraseSpansinText:
+class TestFindPhraseInText:
text_with_stopwords = (
"A copy of the GNU General Public License is available as "
@@ -166,26 +75,27 @@ class TestRequiredPhraseSpansinText:
"/{{usr/share/common-licenses/GPL-2}} in the Debian GNU/Linux distribution."
)
- def test_get_required_phrase_spans_with_or_without_specified_texts_is_same(self):
- required_phrase_spans_specified = return_spans_for_required_phrase_in_text(
+ def test_find_phrase_spans_in_text_with_behaves_same_as_get_existing_required_phrase_spans(self):
+ spans_with_phrase = find_phrase_spans_in_text(
text=self.text_with_stopwords,
- required_phrase="usr share common licenses gpl 2",
+ phrase_text="usr share common licenses gpl 2",
)
- required_phrase_spans_unspecified = get_required_phrase_spans(
+ spans_with_find = get_existing_required_phrase_spans(
text=self.text_with_stopwords_and_marked_required_phrases,
)
- assert required_phrase_spans_specified == required_phrase_spans_unspecified
- def test_get_required_phrase_and_add_required_phrase_matches(self):
+ assert spans_with_phrase == spans_with_find
+
+ def test_find_phrase_spans_in_text_and_add_required_phrase_matches(self):
- required_phrase_spans_specified = return_spans_for_required_phrase_in_text(
+ spans = find_phrase_spans_in_text(
text=self.text_with_stopwords,
- required_phrase="usr share common licenses gpl 2",
+ phrase_text="usr share common licenses gpl 2",
)
text = self.text_with_stopwords
- for span in required_phrase_spans_specified:
+ for span in spans:
text = add_required_phrase_markers(
text=text,
required_phrase_span=span,
@@ -193,12 +103,80 @@ def test_get_required_phrase_and_add_required_phrase_matches(self):
assert text == self.text_with_stopwords_and_marked_required_phrases
+
+class TestFindSpansInText:
+
+ text_with_articles = (
+ "A copy of the GNU General Public License is available as "
+ "/usr/share/common-licenses/GPL-2 in the Debian GNU/Linux distribution. "
+ "A copy of the GNU General Public License is available as "
+ "/usr/share/common-licenses/GPL-2 in the Debian GNU/Linux distribution."
+ )
+
+ text_with_articles_and_marked_required_phrases = (
+ "A copy of the GNU General Public License is available as "
+ "/{{usr/share/common-licenses/GPL-2}} in the Debian GNU/Linux distribution. "
+ "A copy of the GNU General Public License is available as "
+ "/{{usr/share/common-licenses/GPL-2}} in the Debian GNU/Linux distribution."
+ )
+
+ text_with_extra_characters = (
+ "This is the http://www.opensource.org/licenses/mit-license.php MIT "
+ "Software License which is OSI-certified, and GPL-compatible."
+ )
+
+ text_with_extra_characters_and_marked_required_phrases = (
+ "This is the http://www.opensource.org/licenses/mit-license.php {{MIT "
+ "Software License}} which is OSI-certified, and GPL-compatible."
+ )
+
+ def test_find_phrase_spans_in_text(self):
+ text = "is released under the MIT license. See the LICENSE"
+ spans = find_phrase_spans_in_text(text=text, phrase_text="mit license")
+ assert spans == [Span(4, 5)]
+
+ def test_find_phrase_spans_in_text_multiple(self):
+ spans = find_phrase_spans_in_text(
+ text=self.text_with_articles,
+ phrase_text="usr share common licenses gpl 2",
+ )
+ assert spans == [Span(10, 15), Span(32, 37)]
+
+ def test_find_phrase_spans_in_text_then_add_with_multiple_spans(self):
+ spans = find_phrase_spans_in_text(
+ text=self.text_with_articles,
+ phrase_text="usr share common licenses gpl 2",
+ )
+ text = self.text_with_articles
+ for span in spans:
+ text = add_required_phrase_markers(
+ text=text,
+ required_phrase_span=span,
+ )
+
+ assert text == self.text_with_articles_and_marked_required_phrases
+
+ def test_add_required_phrase_markers_in_text_with_extra_characters(self):
+ spans = find_phrase_spans_in_text(
+ text=self.text_with_extra_characters,
+ phrase_text="mit software license",
+ )
+ text = self.text_with_extra_characters
+ for span in spans:
+ text = add_required_phrase_markers(
+ text=text,
+ required_phrase_span=span,
+ )
+
+ assert text == self.text_with_extra_characters_and_marked_required_phrases
+
+
class TestKeyPhrasesCanBeMarked(TestCaseClass):
@pytest.mark.scanslow
- def can_more_key_phrases_be_marked_from_other_rules(self):
- add_required_phrases_from_other_rules(can_mark_required_phrase_test=True)
+ def test_update_rules_using_is_required_phrases_rules(self):
+ update_rules_using_is_required_phrases_rules(verbose=True, _dry_run=True)
@pytest.mark.scanslow
- def can_more_key_phrases_be_marked_from_license_attribtues(self):
- add_required_phrases_from_license_fields(can_mark_required_phrase_test=True)
+ def test_update_rules_using_license_attributes(self):
+ update_rules_using_license_attributes(verbose=True, _dry_run=True)