From 1bcf3fc3ecca7ec9a07124af2ff41ef277ad9685 Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Wed, 9 Oct 2024 00:05:33 +0200 Subject: [PATCH] Update required phrase generation * This update decouples the creation of is_required_phrase rules from updating existing rules in a separate CLI. This makes it easier to control which rules are used as required phrases. * This now skips more rules when adding required phrases to existing rules: any rule that cannot be matched approximately is skipped — not only tiny rules, but also many other rules. * This checks that no rule gets a required phrase added that would break in the middle of a URL, email, or copyright. This is done by checking that no required phrase injection changes the set of ignorables of a rule, which could break a URL, making it no longer a proper URL. Same for emails or copyrights. * This extends "skipping" the collection of required phrases to skip a rule from both required phrases collection for generating new rules AND injection of new required phrases in rule text. This allows handling exceptions more easily. * The "is_required_phrase" rules creation now creates rules using improved content: the case and punctuation of the phrase text are preserved; the rule is created as "is_license_reference" which is going to be correct in the vast majority of the cases. * When matched, the "is_required_phrase" rules are treated the same as continuous rules and can only be matched exactly. * The "is_required_phrase" rules are now validated extensively to ensure that there is no conflict with other rule flags. * The code to "trace" the source of a required_phrase injection now uses the new standard "source" rule field, and the code related to handling this field has been simplified. * Required phrases injection has not yet been tested as working. 
Signed-off-by: Philippe Ombredanne --- setup.cfg | 1 + src/licensedcode/match.py | 10 +- src/licensedcode/required_phrases.py | 1281 ++++++++++--------- tests/licensedcode/test_required_phrases.py | 220 ++-- 4 files changed, 807 insertions(+), 705 deletions(-) diff --git a/setup.cfg b/setup.cfg index 5aa773107b..1c8df20664 100644 --- a/setup.cfg +++ b/setup.cfg @@ -159,6 +159,7 @@ console_scripts = scancode-license-data = licensedcode.license_db:dump_scancode_license_data regen-package-docs = packagedcode.regen_package_docs:regen_package_docs add-required-phrases = licensedcode.required_phrases:add_required_phrases + gen-new-required-phrases-rules = licensedcode.required_phrases:gen_required_phrases_rules # These are configurations for ScanCode plugins as setuptools entry points. # Each plugin entry hast this form: diff --git a/src/licensedcode/match.py b/src/licensedcode/match.py index b27e7503c5..7866d56632 100644 --- a/src/licensedcode/match.py +++ b/src/licensedcode/match.py @@ -2129,12 +2129,14 @@ def filter_matches_missing_required_phrases( A required phrase must be matched exactly without gaps or unknown words. A rule with "is_continuous" set to True is the same as if its whole text - was defined as a keyphrase and is processed here too. + was defined as a required phrase and is processed here too. + Same for a rule with "is_required_phrase" set to True. 
+ """ - # never discard a solo match, unless matched to "is_continuous" rule + # never discard a solo match, unless matched to "is_continuous" or "is_required_phrase" rule if len(matches) == 1: rule = matches[0] - if not rule.is_continuous: + if not (rule.is_continuous or rule.is_required_phrase): return matches, [] kept = [] @@ -2149,7 +2151,7 @@ def filter_matches_missing_required_phrases( if trace: logger_debug(' CHECKING KEY PHRASES for:', match) - is_continuous = match.rule.is_continuous + is_continuous = match.rule.is_continuous or match.rule.is_required_phrase ikey_spans = match.rule.required_phrase_spans if not (ikey_spans or is_continuous): diff --git a/src/licensedcode/required_phrases.py b/src/licensedcode/required_phrases.py index ccc96f0ddf..32035bb060 100644 --- a/src/licensedcode/required_phrases.py +++ b/src/licensedcode/required_phrases.py @@ -8,183 +8,84 @@ # See https://aboutcode.org for more information about nexB OSS projects. # +import re + +from collections import defaultdict + import attr -import os import click +from commoncode.cliutils import PluggableCommandLineOption from license_expression import Licensing -from licensedcode import TINY_RULE -from commoncode.cliutils import PluggableCommandLineOption +from licensedcode.cache import build_index +from licensedcode.cache import get_index +from licensedcode.cache import get_licenses_db +from licensedcode.models import find_rule_base_location +from licensedcode.models import get_ignorables +from licensedcode.models import get_normalized_ignorables from licensedcode.models import get_rules_by_expression -from licensedcode.models import load_licenses from licensedcode.models import load_rules -from licensedcode.models import InvalidRule from licensedcode.models import rules_data_dir from licensedcode.models import Rule from licensedcode.models import rule_exists -from licensedcode.models import find_rule_base_location - +from licensedcode.models import update_ignorables from licensedcode.spans 
import Span -from licensedcode.tokenize import required_phrase_tokenizer -from licensedcode.tokenize import index_tokenizer -from licensedcode.tokenize import return_spans_for_required_phrase_in_text -from licensedcode.tokenize import get_ignorable_spans -from licensedcode.tokenize import get_non_overlapping_spans -from licensedcode.tokenize import add_required_phrase_markers -from licensedcode.tokenize import REQUIRED_PHRASE_OPEN +from licensedcode.stopwords import STOPWORDS from licensedcode.tokenize import REQUIRED_PHRASE_CLOSE -from licensedcode.tokenize import get_normalized_tokens - - -# Add the rule identifier here to trace required phrase collection or required -# phrase marking for a specific rule (Example: "mit_12.RULE") -TRACE_REQUIRED_PHRASE_FOR_RULES = [] - - -def get_required_phrase_spans(text): - """ - Return a list of Spans representin required phrase token positions in the text - for each required phrase found in the rule ``text``. +from licensedcode.tokenize import REQUIRED_PHRASE_OPEN +from licensedcode.tokenize import required_phrase_tokenizer +from licensedcode.tokenize import matched_query_text_tokenizer +from licensedcode.tokenize import get_existing_required_phrase_spans - For example: +""" +This is a utility module for "required phrases". +This is a designed to run as a command line tool with extensive debugging and tracing facilitues. - >>> text = 'This is enclosed in {{double curly braces}}' - >>> # 0 1 2 3 4 5 6 - >>> x = get_required_phrase_spans(text) - >>> assert x == [Span(4, 6)], x +Usage: - >>> text = 'This is {{enclosed}} a {{double curly braces}} or not' - >>> # 0 1 2 SW 3 4 5 6 7 - >>> x = get_required_phrase_spans(text) - >>> assert x == [Span(2), Span(3, 5)], x +- start with gen-new-required-phrases-rules: this will create new rules from existing "required +phrases" found in rules. 
- >>> text = 'This {{is}} enclosed a {{double curly braces}} or not' - >>> # 0 1 2 SW 3 4 5 6 7 - >>> x = get_required_phrase_spans(text) - >>> assert x == [Span([1]), Span([3, 4, 5])], x +- regen the index - >>> text = '{{AGPL-3.0 GNU Affero General Public License v3.0}}' - >>> # 0 1 2 3 4 5 6 7 8 9 - >>> x = get_required_phrase_spans(text) - >>> assert x == [Span(0, 9)], x +- then continue with add-required-phrases to update existing rules with required phrases found in +"is_required_phrase" rules and license attributes/fields. - >>> assert get_required_phrase_spans('{This}') == [] +""" - >>> def check_exception(text): - ... try: - ... return get_required_phrase_spans(text) - ... except InvalidRule: - ... pass +# Add rule identifiers here to trace required phrase collection or required +# phrase marking for a specific rule (Example: "mit_12.RULE") +TRACE_REQUIRED_PHRASE_FOR_RULES = [] - >>> check_exception('This {{is') - >>> check_exception('This }}is') - >>> check_exception('{{This }}is{{') - >>> check_exception('This }}is{{') - >>> check_exception('{{}}') - >>> check_exception('{{This is') - >>> check_exception('{{This is{{') - >>> check_exception('{{This is{{ }}') - >>> check_exception('{{{{This}}}}') - >>> check_exception('}}This {{is}}') - >>> check_exception('This }} {{is}}') - >>> check_exception('{{This}}') - [Span(0)] - >>> check_exception('{This}') - [] - >>> check_exception('{{{This}}}') - [Span(0)] - """ - return [ - required_phrase.span - for required_phrase in get_required_phrases(text) - ] +#################################################################################################### +# +# Shared utilities +# +#################################################################################################### -def get_required_phrase_texts(text): +def get_normalized_tokens(text, skip_required_phrase_markers=True, preserve_case=False): """ - Return a list of required phrase texts for each required phrase found - in the rule ``text``. 
- - For example: - - >>> text = 'This is enclosed in {{double curly braces}}' - >>> # 0 1 2 3 4 5 6 - >>> x = get_required_phrase_texts(text=text) - >>> assert x == ['double curly braces'], x + Return a list of normalized token strings in ``text``. """ - return [ - required_phrase.text - for required_phrase in get_required_phrases(text) - ] - - -@attr.s -class RequiredPhraseInText: - - required_phrase_positions = attr.ib( - default=attr.Factory(list), - repr=False, - metadata=dict(help='List of positions of a required phrase in a rule text.') - ) - - required_phrase_tokens = attr.ib( - default=attr.Factory(list), - metadata=dict(help='List of required phrase tokens for this rule.') - ) - - @property - def text(self): - """The full normalized text for this required phrase, built from its tokens.""" - return " ".join(self.required_phrase_tokens) - - @property - def span(self): - """A span representing the position of this required phrase in a rule text.""" - return Span(self.required_phrase_positions) - - def update(self, token, ipos): - self.required_phrase_tokens.append(token) - self.required_phrase_positions.append(ipos) - - -def get_required_phrases(text): - """ - Yield RequiredPhraseInText objects with both required phrase positions - and lists of tokens for each required phrase found in the rule ``text``. - Tokens form a required phrase when enclosed in {{double curly braces}}. 
- """ - ipos = 0 - in_required_phrase = False - required_phrase = RequiredPhraseInText() - for token in required_phrase_tokenizer(text): - if token == REQUIRED_PHRASE_OPEN: - if in_required_phrase: - raise InvalidRule('Invalid rule with nested required phrase {{ {{ braces', text) - in_required_phrase = True - - elif token == REQUIRED_PHRASE_CLOSE: - if in_required_phrase: - if required_phrase.required_phrase_tokens: - yield required_phrase - required_phrase = RequiredPhraseInText() - else: - raise InvalidRule('Invalid rule with empty required phrase {{}} braces', text) - in_required_phrase = False - else: - raise InvalidRule(f'Invalid rule with dangling required phrase missing closing braces', text) - continue - else: - if in_required_phrase: - required_phrase.update(token=token, ipos=ipos) - ipos += 1 - - if required_phrase.required_phrase_tokens or in_required_phrase: - raise InvalidRule(f'Invalid rule with dangling required phrase missing final closing braces', text) + required_phrase_markers = [REQUIRED_PHRASE_CLOSE, REQUIRED_PHRASE_OPEN] + tokens = list(required_phrase_tokenizer(text=text, preserve_case=preserve_case)) + if skip_required_phrase_markers: + tokens = [ + token + for token in tokens + if token not in required_phrase_markers + ] + return tokens def get_normalized_text(text, skip_required_phrase_markers=True): + """ + Return the normalized text for ``text``. Optionally ``skip_required_phrase_markers`` double + {{curly braces}}. 
+ """ return " ".join( get_normalized_tokens( text=text, @@ -193,414 +94,290 @@ def get_normalized_text(text, skip_required_phrase_markers=True): ) -def get_num_tokens(text): - return len(get_normalized_tokens(text)) - -def is_text_license_reference(text): - - tokens = list(index_tokenizer(text=text)) - words_license_reference = ['http', 'https', 'io', 'com', 'txt', 'md', 'file'] - if any( - True - for word in words_license_reference - if word in tokens +def find_phrase_spans_in_text(text, phrase_text, preserve_case=False): + """ + Return a list of Spans where the ``phrase_text`` exists in ``text``, or an empty list. + """ + spans_with_required_phrase = [] + + text_tokens = list(get_normalized_tokens( + text=text, + preserve_case=preserve_case, + skip_required_phrase_markers=True, + )) + required_phrase_tokens = list(get_normalized_tokens( + text=phrase_text, + preserve_case=preserve_case, + skip_required_phrase_markers=True, + )) + required_phrase_first_token = required_phrase_tokens[0] + + # Initial check to see if all tokens in the required phrase are present + if all( + required_phrase_token in text_tokens + for required_phrase_token in required_phrase_tokens ): - return True + start_positions = [ + i + for i, x in enumerate(text_tokens) + if x == required_phrase_first_token + ] - return False + for start_pos in start_positions: + end_pos = start_pos + len(required_phrase_tokens) + if ( + end_pos <= len(text_tokens) + and text_tokens[start_pos:end_pos] == required_phrase_tokens + ): + spans_with_required_phrase.append(Span(start_pos, end_pos - 1)) -@attr.s -class RequiredPhraseDetails: + return spans_with_required_phrase - license_expression = attr.ib( - default=None, - metadata=dict( - help='A license expression string for this particular required phrase.') - ) - rule = attr.ib( - default=None, - metadata=dict( - help='The Rule object for this particular required phrase rule.') - ) +def get_non_overlapping_spans(old_required_phrase_spans, 
new_required_phrase_spans): + """ + Given two list of spans `old_required_phrase_spans` and `new_required_phrase_spans`, + return all the spans in `new_required_phrase_spans` that do not overlap with any + of the spans in `old_required_phrase_spans`. - required_phrase_text = attr.ib( - default=None, - metadata=dict( - help='Normalized required phrase text.') - ) + The list of spans `old_required_phrase_spans` contains all the spans of required + phrases or ignorables already present in a rule text, and the other list of spans + `new_required_phrase_spans` contains the proposed new required phrases. + """ + for new_span in new_required_phrase_spans: + if old_required_phrase_spans: + if any(old_span.overlap(new_span) != 0 for old_span in old_required_phrase_spans): + continue - sources = attr.ib( - default=attr.Factory(list), - metadata=dict( - help='List of all rule identifiers where this required phrase is present.' - ) - ) + yield new_span - length = attr.ib( - default=0, - metadata=dict( - help='Length of text for this required phrase text (used to sort).' - ) - ) - # Generic licenses should not be dumped as required phrase rules - has_generic_license = attr.ib( - default=False, - metadata=dict( - help='Has a generic license key in its license expression' - ) - ) +def add_required_phrase_markers(text, required_phrase_span): + """ + Given a ``text`` and a ``required_phrase_span`` Span, add required phrase + curly brace markers to the ``text`` before the start and after the of the span. + This is taking care of whitespace and stopwords. 
+ """ + tokens_tuples_with_markers = [] + token_index = 0 - @classmethod - def create_required_phrase_details( - cls, - license_expression, - required_phrase_text, - sources, - length, - has_generic_license=False, - ): + for token_tuple in matched_query_text_tokenizer(text): - base_name = f"{license_expression}_required_phrase" - base_loc = find_rule_base_location(name_prefix=base_name) - file_path = f"{base_loc}.RULE" - identifier = file_path.split('/')[-1] + is_word, token = token_tuple - normalized_text = get_normalized_text(required_phrase_text) + if is_word and token.lower() not in STOPWORDS: + if token_index == required_phrase_span.start: + tokens_tuples_with_markers.append((False, REQUIRED_PHRASE_OPEN)) - rule = Rule( - license_expression=license_expression, - identifier=identifier, - text=normalized_text, - is_required_phrase=True, - ) - if is_text_license_reference(required_phrase_text): - rule.is_license_reference = True - else: - rule.is_license_tag = True + token_index += 1 - if not has_generic_license: - rule.dump(rules_data_dir) + tokens_tuples_with_markers.append(token_tuple) - return cls( - license_expression=license_expression, - rule=rule, - required_phrase_text=normalized_text, - sources=sources, - length=length, - has_generic_license=has_generic_license, - ) + if is_word and token.lower() not in STOPWORDS: + if token_index == required_phrase_span.end + 1: + tokens_tuples_with_markers.append((False, REQUIRED_PHRASE_CLOSE)) - def update_sources(self, source_identifier): - if not source_identifier in self.sources: - self.sources.append(source_identifier) + return combine_tokens(tokens_tuples_with_markers) -@attr.s -class ListOfRequiredPhrases: +def combine_tokens(token_tuples): + """ + Returns a string `combined_text` combining token tuples from the list `token_tuples`, + which are token tuples created by the tokenizer functions. 
+ """ + return ''.join(token for _, token in token_tuples) - required_phrases = attr.ib( - default=attr.Factory(list), - metadata=dict( - help='A list of RequiredPhraseDetails objects for all the required phrases.') - ) - def match_required_phrase_present(self, required_phrase_text): - """ - Check if a required_phrase_text is present in the list of required_phrases - or it is a rule in the index. - Note: Order is important, as the list of required_phrases has both new rules which are - not yet in the index and old rules also present in the index. - """ - normalized_text = get_normalized_text(required_phrase_text) +@attr.s +class IsRequiredPhrase: + """ + Represent a required phrase text and rule from an "is_required_phrase" Rule + """ - # check if this required_phrase_text is present in the collected list of required phrases - for required_phrase in self.required_phrases: - if required_phrase.required_phrase_text == normalized_text: - rule = required_phrase.rule - return rule + rule = attr.ib(metadata=dict(help='Rule that contains this phrase')) + required_phrase_text = attr.ib(metadata=dict(help='Normalized required phrase text.')) - # check if this required_phrase_text is present as a rule in the index - rule = rule_exists(text=required_phrase_text) - if rule: - return rule + @property + def license_expression(self): + self.rule.license_expression - def update_required_phrase_sources(self, rule, has_generic_license=False, different_license=False): + @staticmethod + def sorted(isrequiredphrases): """ - Given a rule update the required phrases list with this rule - - Note: this should only be called on a rule that is obtained from the - match_required_phrase_present function so that the rule is present in the - index/required phrases list. + Return an ``isrequiredphrases`` list of IsRequiredPhrase sorted by decreasing text length. 
""" - # if rule is present as a required phrase rule in the list then - # add identifier to sources of the required phrase rule - for required_phrase in self.required_phrases: - if required_phrase.rule.identifier == rule.identifier: - required_phrase.update_sources(rule.identifier) - return - - if rule and (rule.is_license_intro or rule.is_license_clue): - return - - # if rule is present as a rule in the index, set the is_required_phrase flag - # and add to the list of required phrase rules, if it is a non-generic license of - # the same license expression - if not rule.is_required_phrase and not has_generic_license and not different_license: - rule.is_required_phrase = True - rule.dump(rules_data_dir) + sorter = lambda p: (len(p.rule.text), p.required_phrase_text) + return sorted(isrequiredphrases, key=sorter, reverse=True) - normalized_text = get_normalized_text(rule.text) - required_phrase_detail = RequiredPhraseDetails( - license_expression=rule.license_expression, - rule=rule, - required_phrase_text=normalized_text, - sources=[rule.identifier], - length=len(normalized_text), - has_generic_license=has_generic_license, - ) - self.required_phrases.append(required_phrase_detail) - - def sort_required_phrases(self): - self.required_phrases = sorted( - self.required_phrases, - key=lambda x: x.length, - reverse=True, - ) - def add_variations_of_required_phrases(self, licenses_by_key): - - words_to_skip = ["the"] - for required_phrase in self.required_phrases: - required_phrase_tokens = list(index_tokenizer(text=required_phrase.required_phrase_text)) - skip_words_present = [ - skip_word - for skip_word in words_to_skip - if skip_word in required_phrase_tokens - ] - for skip_word in skip_words_present: - required_phrase_tokens.remove(skip_word) - required_phrase_without_skip_word = " ".join(required_phrase_tokens) - matched_rule = self.match_required_phrase_present(required_phrase_without_skip_word) - if matched_rule and matched_rule.skip_collecting_required_phrases: - 
continue - - has_generic_license = does_have_generic_licenses( - license_expression=required_phrase.license_expression, - licenses_by_key=licenses_by_key, - ) - if not matched_rule: - required_phrase_detail = RequiredPhraseDetails.create_required_phrase_details( - license_expression=required_phrase.license_expression, - required_phrase_text=required_phrase_without_skip_word, - sources=[required_phrase.rule.identifier], - length=len(required_phrase_without_skip_word), - has_generic_license=has_generic_license, - ) - self.required_phrases.append(required_phrase_detail) - else: - self.update_required_phrase_sources( - rule=matched_rule, - has_generic_license=has_generic_license, - ) - - -def does_have_generic_licenses(license_expression, licenses_by_key): - licensing = Licensing() - license_keys = licensing.license_keys(license_expression) - has_generic_license = False - for lic_key in license_keys: - lic = licenses_by_key.get(lic_key) - if lic and ( - lic.is_generic or lic.is_unknown - ): - has_generic_license = True - break - - return has_generic_license - - -def collect_required_phrases_in_rules( - rules_by_expression, - licenses_by_key, - license_expression=None, - verbose=False, -): - - # A mapping of {license_expression: ListOfRequiredPhrases} for all applicable - # license_expressions - required_phrases_by_expression = {} - - licensing = Licensing() +def collect_is_required_phrase_from_rules(rules_by_expression, verbose=False): + """ + Return a mapping of ``{license_expression: list of [IsRequiredPhrase, ...]`` collecting the + texts of all rules in the ``rules_by_expression`` mapping if the "is_required_phrase" is True.. 
+ """ + is_required_phrases_by_expression = {} - # collect and create required phrase rules for license_expression, rules in rules_by_expression.items(): - - license_keys = licensing.license_keys(license_expression) - if len(license_keys) != 1: - continue - if verbose: click.echo(f'Collecting required phrases for license_expression: {license_expression}') - required_phrases_list = ListOfRequiredPhrases() + is_required_phrases = [] for rule in rules: - if rule.skip_collecting_required_phrases: + if not rule.is_required_phrase: continue - if rule.is_license_intro or rule.is_license_clue: - continue - - for required_phrase_text in get_required_phrase_texts(rule.text): - if get_num_tokens(required_phrase_text) < 2: - if verbose: - click.echo(f'WARNING: single word required phrases in: {rule.identifier}, skipping.') - continue - - required_phrase_rule = required_phrases_list.match_required_phrase_present( - required_phrase_text=required_phrase_text, - ) - - debug = False - if rule.identifier in TRACE_REQUIRED_PHRASE_FOR_RULES: - debug = True - click.echo( - f"Collecting from rule: {rule.identifier} " - f"Required phrase: '{required_phrase_text}' " - f"Matched rule: {required_phrase_rule}" - ) - - if required_phrase_rule and required_phrase_rule.skip_collecting_required_phrases: - continue - - has_generic_license = does_have_generic_licenses( - license_expression=license_expression, - licenses_by_key=licenses_by_key, - ) - if required_phrase_rule: - different_license = required_phrase_rule.license_expression != license_expression - required_phrases_list.update_required_phrase_sources( - rule=required_phrase_rule, - has_generic_license=has_generic_license, - different_license=different_license, - ) - if debug: - click.echo(f"Old required phrase updated, same license expression") - - elif not is_text_license_reference(required_phrase_text): - required_phrase_detail = RequiredPhraseDetails.create_required_phrase_details( - license_expression=license_expression, - 
required_phrase_text=required_phrase_text, - sources=[rule.identifier], - length=len(required_phrase_text), - has_generic_license=has_generic_license, - ) - required_phrases_list.required_phrases.append(required_phrase_detail) - if debug: - click.echo(f"New required phrase : {required_phrase_detail} ") - elif debug: - is_reference = is_text_license_reference(required_phrase_text) - click.echo(f"is_text_license_reference: {is_reference} ") + if rule.identifier in TRACE_REQUIRED_PHRASE_FOR_RULES: + click.echo(f"Collecting required phrase from rule: {rule.identifier}: {rule.text!r}") - # Add add new variations of the required phrases already present in the list - required_phrases_list.add_variations_of_required_phrases(licenses_by_key) + is_required_phrases.append(IsRequiredPhrase(rule=rule, required_phrase_text=rule.text)) - # We need to sort required phrases by length so we look for and mark the longest possible - # required phrases before the shorter ones contained in the same (substrings) - required_phrases_list.sort_required_phrases() - required_phrases_by_expression[license_expression] = required_phrases_list + # We need to sort required phrases by decreasing length so we look for and mark the longest + # possible required phrases before the shorter ones contained in the same text + is_required_phrases = IsRequiredPhrase.sorted(is_required_phrases) + is_required_phrases_by_expression[license_expression] = is_required_phrases if verbose: - count = len(required_phrases_list.required_phrases) - texts_with_source = { - required_phrase.required_phrase_text: required_phrase.sources - for required_phrase in required_phrases_list.required_phrases - } + count = len(is_required_phrases) click.echo(f'Collected {count} required phrases for license_expression: {license_expression}') click.echo('Collected required phrases texts: ') - for text, sources in texts_with_source.items(): - click.echo(f'{text}: {sources}') + for rqph in is_required_phrases: + click.echo(f' 
{rqph.required_phrase_text!r}: {rqph.rule.identifier}') - return required_phrases_by_expression + return is_required_phrases_by_expression -def update_required_phrases_from_other_rules( +def update_required_phrases_in_rules( required_phrases_by_expression, rules_by_expression, - write_required_phrases=False, + write_phrase_source=False, verbose=False, + dry_run=False, ): - - # add required phrases to rules from other rules + """ + Update the text of rules in a ``rules_by_expression`` mapping with required phrases from the + ``required_phrases_by_expression`` mapping. + If ``write_phrase_source`` is True, include debug information in the saved rule source field. + """ for license_expression, rules in rules_by_expression.items(): - if not license_expression in required_phrases_by_expression: + if license_expression not in required_phrases_by_expression: continue if verbose: click.echo(f'marking required phrases in rule texts for license_expression: {license_expression}') - required_phrases_for_expression = required_phrases_by_expression.get(license_expression) - add_required_phrases_for_required_phrases( + required_phrases = required_phrases_by_expression.get(license_expression) + if not required_phrases: + continue + + add_required_phrases_to_rules_text( + required_phrases=required_phrases, rules=rules, - required_phrases=required_phrases_for_expression.required_phrases, - verbose=verbose, + write_phrase_source=write_phrase_source, + dry_run=dry_run, ) - if write_required_phrases: - for license_expression, required_phrases_list in required_phrases_by_expression.items(): - if verbose: - click.echo(f'Writing required phrases sources for license_expression: {license_expression}') - - for required_phrase_detail in required_phrases_list.required_phrases: - if ( - required_phrase_detail.sources and required_phrase_detail.rule.is_required_phrase - and not required_phrase_detail.has_generic_license - ): - required_phrase_detail.rule.dump( - rules_data_dir=rules_data_dir, - 
sources=required_phrase_detail.sources - ) - -def add_required_phrases_from_other_rules( - licenses_by_key, +def update_rules_using_is_required_phrases_rules( license_expression=None, - write_required_phrases=False, + write_phrase_source=False, verbose=False, - can_mark_required_phrase_test=False, + dry_run=False, ): + """ + Add required phrases to rules using is_required_phrase rules. + Optionally filter rules with ``license_expression``. + """ + rules_by_expression = get_base_rules_by_expression(license_expression=license_expression) - rules_by_expression = get_rules_by_expression() - if license_expression: - rules_by_expression = {license_expression: rules_by_expression[license_expression]} - else: - rules_by_expression = rules_by_expression - - required_phrases_by_expression = collect_required_phrases_in_rules( - license_expression=license_expression, + required_phrases_by_expression = collect_is_required_phrase_from_rules( rules_by_expression=rules_by_expression, verbose=verbose, - licenses_by_key=licenses_by_key, ) + if verbose: + click.echo(f"update_rules_using_is_required_phrases_rules: required_phrases_by_expression # {len(required_phrases_by_expression)}") + + rules_by_expression = get_updatable_rules_by_expression( + license_expression, + simple_expression=False, + ) + if verbose: + click.echo(f"update_rules_using_is_required_phrases_rules: rules_by_expression # {len(rules_by_expression)}") - update_required_phrases_from_other_rules( + update_required_phrases_in_rules( required_phrases_by_expression=required_phrases_by_expression, rules_by_expression=rules_by_expression, - write_required_phrases=write_required_phrases, + write_phrase_source=write_phrase_source, verbose=verbose, + dry_run=dry_run, ) -def add_required_phrases_for_required_phrases(required_phrases, rules, verbose=False): +def get_base_rules_by_expression(license_expression=None): + """ + Return a mapping of rules_by_expression, filtered for an optional ``license_expression``. 
+ """ + rules_by_expression = get_rules_by_expression() + if license_expression: + rules_by_expression = {license_expression: rules_by_expression[license_expression]} - for rule in rules: - # skip small or required phrase rules - if len(rule.text) < TINY_RULE or rule.is_required_phrase: - continue + return rules_by_expression + + +def get_updatable_rules_by_expression(license_expression=None, simple_expression=True): + """ + Return a mapping of rules_by_expression, filtered for an optional ``license_expression``. + The rules are suitable to receive required phrase updates + If simple_expression is True, only consider lincense rules with a single license key. + """ + rules_by_expression = get_base_rules_by_expression() + + index = get_index() + licensing = Licensing() + + updatable_rules_by_expression = {} + + # filter rules to keep only updatable rules + for expression, rules in rules_by_expression.items(): + if simple_expression: + license_keys = licensing.license_keys(license_expression) + if len(license_keys) != 1: + continue + + updatable_rules = [] + for rule in rules: + # skip required phrase, false positive, tiny and and more + if rule.is_required_phrase or not rule.is_approx_matchable: + continue + # skip rules that ask to be skipped + if rule.skip_for_required_phrase_generation: + continue + + # skip non-approx matchable, they will be matche exactly + if not index.is_rule_approx_matchable(rule): + continue + + updatable_rules.append(rule) + + if updatable_rules: + updatable_rules_by_expression[expression] = updatable_rules + + return updatable_rules_by_expression + + +def add_required_phrases_to_rules_text( + required_phrases, + rules, + write_phrase_source=False, + dry_run=False, +): + """ + Add the ``required_phrases`` list of IsRequiredPhrase to each rule in a ``rules`` list of + license Rule. 
+ """ + for rule in rules: for required_phrase in required_phrases: debug = False if rule.identifier in TRACE_REQUIRED_PHRASE_FOR_RULES: @@ -610,47 +387,96 @@ def add_required_phrases_for_required_phrases(required_phrases, rules, verbose=F ) debug = True + source = rule.source or "" + if write_phrase_source: + source += f" {required_phrase.rule.identifier}" + add_required_phrase_to_rule( rule=rule, required_phrase=required_phrase.required_phrase_text, - debug_data=required_phrase.sources, + source=source, debug=debug, + dry_run=dry_run, ) -def add_required_phrases_for_license_fields(licence_object, rules, verbose=False): +def add_license_attributes_as_required_phrases_to_rules_text( + license_object, + rules, + write_phrase_source=False, + dry_run=False, +): + """ + Add new required phrases to the ``rules`` list of Rule using the ``license_object`` License + fields for required phrases. + """ license_fields_mapping_by_order = { - "name": licence_object.name, - "short_name": licence_object.short_name, - #"key", - #"spdx_license_key" + "name": license_object.name, + "short_name": license_object.short_name, + # "key", + # "spdx_license_key", } for rule in rules: - # skip small rules - if len(rule.text) < TINY_RULE: - continue + for field_name, required_phrase_text in license_fields_mapping_by_order.values(): + debug = False + if rule.identifier in TRACE_REQUIRED_PHRASE_FOR_RULES: + click.echo( + f"Updating rule: {rule.identifier} " + f"with required phrase from license: {field_name!r}: {required_phrase_text!r}." 
+ ) + debug = True - for license_field_value in license_fields_mapping_by_order.values(): - add_required_phrase_to_rule(rule=rule, required_phrase=license_field_value) + source = rule.source or "" + if write_phrase_source: + source += f" {license_object.key}.LICENSE : {field_name}" + add_required_phrase_to_rule( + rule=rule, + required_phrase=required_phrase_text, + source=source, + debug=debug, + dry_run=dry_run, + ) + + +def get_ignorable_spans(rule): + """ + Return a list of ignorable Spans for the ``rule``. + Ignorable spans are for URLs and referenced filenames present in a rule text. These should not + be messed up with when injecting new required phrases in a rule text. + """ + ignorable_spans = [] + ignorables = rule.referenced_filenames + rule.ignorable_urls + for ignorable in ignorables: + ignorable_spans.extend( + find_phrase_spans_in_text( + text=rule.text, + required_phrase=ignorable, + preserve_case=True, + ) + ) -def add_required_phrase_to_rule(rule, required_phrase, debug_data=None, debug=False): + return ignorable_spans - # Reload from file as there could be changes from other license fields - rule_file = os.path.join(rules_data_dir, rule.identifier) - reloaded_rule = Rule.from_file(rule_file) - # we get spans for name/short_name if they exist - new_required_phrase_spans = return_spans_for_required_phrase_in_text( - text=reloaded_rule.text, +def add_required_phrase_to_rule(rule, required_phrase, source, debug=False, dry_run=False): + """ + Update and save the ``rule`` Rule tagging the text with the ``required_phrase`` text. Skip + updating and saving the rule to disk under some conditions, like if ignorables would be changed. + Return True if the rule was updated and False otherwise. 
+ """
+
+ # These are candidate spans for new required_phrases, if they exist
+ new_required_phrase_spans = find_phrase_spans_in_text(
+ text=rule.text,
 required_phrase=required_phrase,
 )
 # we get spans for already existing required phrases and ignorables
- ignorable_spans = get_ignorable_spans(reloaded_rule)
- old_required_phrase_spans = get_required_phrase_spans(reloaded_rule.text)
+ ignorable_spans = get_ignorable_spans(rule)
+ old_required_phrase_spans = get_existing_required_phrase_spans(rule.text)
 # we verify whether there are spans which overlap with the
 # already present required phrases or ignorables
@@ -669,120 +495,165 @@ def add_required_phrase_to_rule(rule, required_phrase, debug_data=None, debug=Fa
 ignorable_debug = rule.referenced_filenames + rule.ignorable_urls
 click.echo(f"debug ignorables: {ignorable_debug}")
- text_rule = reloaded_rule.text
- # we add required phrase markers for the non-overlapping spans
+ new_rule_text = rule.text
 for span_to_add in spans_to_add:
- text_rule = add_required_phrase_markers(
- text=text_rule,
+ new_rule_text = add_required_phrase_markers(
+ text=new_rule_text,
 required_phrase_span=span_to_add,
 )
 # write the rule on disk if there are any updates
- if text_rule != reloaded_rule.text:
+ if new_rule_text == rule.text:
+ return False
+
+ if has_ignorable_changes(rule=rule, updated_text=new_rule_text):
 if debug:
 click.echo(
- f"Updating rule: {reloaded_rule.identifier} "
+ f"NOT Updating rule: {rule.identifier} "
+ f"because IGNORABLES would change "
 f"with required phrase: {required_phrase} "
- f"debug data: {debug_data} /n"
 )
- reloaded_rule.text = text_rule
- reloaded_rule.dump(rules_data_dir)
+ return False
-def add_required_phrases_from_license_fields(
- licenses_by_key,
- license_expression=None,
- verbose=False,
- can_mark_required_phrase_test=False,
-):
+ rule.source = source or None
+ rule.text = new_rule_text
+ if not dry_run:
+ if debug:
+ click.echo(
+ f"UPDATE: Updating rule: {rule.identifier} "
+ f"with
required phrase: {required_phrase!r} " + f"source: {source!r}" + ) + rule.dump(rules_data_dir) + return True + + +def has_ignorable_changes(rule, updated_text): """ - For all rules with the `license_expression`, add required phrases from the - license fields. + Return True if there would be changes in the "ignorable_*" attributes of a ``rule`` Rule if its + text was to be updated with a new ``updated_text``. """ - rules_by_expression = get_rules_by_expression() + existing_ignorables = get_normalized_ignorables(rule) + updated_ignorables = get_ignorables(updated_text) + return existing_ignorables != updated_ignorables - if license_expression: - rules_by_expression_to_update = {license_expression: rules_by_expression[license_expression]} - else: - rules_by_expression_to_update = rules_by_expression - licensing = Licensing() +def update_rules_using_license_attributes( + license_expression=None, + write_phrase_source=False, + verbose=False, + dry_run=False, +): + """ + Add required phrases found in the license fields. - for license_expression, rules in rules_by_expression_to_update.items(): + Iterate rules by license key, collect required phrases from the license attributes like name and + short name. Add those as required phrases in all selected rules that are using the + ``license_expression``. 
+ """
+ rules_by_expression = get_updatable_rules_by_expression(license_expression, simple_expression=True)
- license_keys = licensing.license_keys(license_expression)
- if len(license_keys) != 1:
- continue
+ licenses_by_key = get_licenses_db()
- license_key = license_keys.pop()
+ # license expression is always a single key here
+ for license_key, rules in rules_by_expression.items():
 licence_object = licenses_by_key[license_key]
-
 if verbose:
 click.echo(f'Updating rules with required phrases for license_expression: {license_key}')
- add_required_phrases_for_license_fields(licence_object=licence_object, rules=rules, verbose=verbose)
+ add_license_attributes_as_required_phrases_to_rules_text(
+ license_object=licence_object,
+ rules=rules,
+ write_phrase_source=write_phrase_source,
+ dry_run=dry_run,
+ )
+
+####################################################################################################
+#
+# Inject new required phrase in rules
+#
+####################################################################################################
-def delete_required_phrase_rules_debug(rules_data_dir):
- required_phrase_rules = [
- rule
- for rule in load_rules(rules_data_dir=rules_data_dir)
- if rule.is_required_phrase
- ]
- for rule in required_phrase_rules:
- rule.dump(rules_data_dir)
+def delete_required_phrase_rules_source_debug(rules_data_dir):
+ """
+ Remove the "source" attribute from all rules.
+ """
+ for rule in load_rules(rules_data_dir=rules_data_dir):
+ if rule.source:
+ rule.source = None
+ rule.dump(rules_data_dir)
 @click.command(name='add-required-phrases')
+@click.option(
+ "-o",
+ "--from-other-rules",
+ is_flag=True,
+ default=False,
+ help="Propagate existing required phrases from other rules to all selected rules.
" + "Mutually exclusive with --from-license-attributes.", + cls=PluggableCommandLineOption, +) +@click.option( + "-a", + "--from-license-attributes", + is_flag=True, + default=False, + help="Propagate license attributes as required phrases to all selected rules. " + "Mutually exclusive with --from-other-rule.", + cls=PluggableCommandLineOption, +) @click.option( "-l", "--license-expression", type=str, default=None, metavar="STRING", - help="The license expression, for which the rules will be updated with required phrases. " - "Example STRING: `mit`. If this option is not used, add required_phrases for all rules.", + help="Optional license expression filter. If provided, only consider the rules that are using " + "this expression. Otherwise, process all rules. Example: `apache-2.0`.", cls=PluggableCommandLineOption, ) @click.option( - "-r", - "--reindex", + "--validate", is_flag=True, default=False, - help="Also reindex the license/rules to check for inconsistencies.", + help="Validate that all rules and licenses and rules are consistent, for all rule languages. " + "For this validation, run a mock indexing. The regenerated index is not saved to disk.", cls=PluggableCommandLineOption, ) @click.option( - "-w", - "--write-required-phrase-origins", + "-r", + "--reindex", is_flag=True, default=False, - help="Write into the rule file the sources for all required phrase rules. Deletes the temporary rule origins used to debug.", + help="Recreate and cache the licenses index with updated rules add the end.", cls=PluggableCommandLineOption, ) @click.option( - "-d", - "--delete-required-phrase-origins", + "-w", + "--write-phrase-source", is_flag=True, default=False, - help="Delete the sources for all required phrase rules and exit. 
This is a debug option.", + help="In modified rule files, write the source field to trace the source of required phrases " + "applied to that rule.", cls=PluggableCommandLineOption, ) @click.option( - "-o", - "--from-other-rules", + "-d", + "--delete-phrase-source", is_flag=True, default=False, - help="Propagate required phrases from already marked required phrases in other rules.", + help="In rule files, delete the source extra debug data used to trace source of phrases.", cls=PluggableCommandLineOption, ) @click.option( - "-a", - "--from-license-attributes", + "--dry-run", is_flag=True, default=False, - help="Mark required phrases from license attributes.", + help="Do not save rules.", cls=PluggableCommandLineOption, ) @click.option( @@ -790,50 +661,300 @@ def delete_required_phrase_rules_debug(rules_data_dir): "--verbose", is_flag=True, default=False, - help="Print logging information.", + help="Print verbose logging information.", cls=PluggableCommandLineOption, ) @click.help_option("-h", "--help") def add_required_phrases( - license_expression, - verbose, - reindex, from_other_rules, from_license_attributes, - delete_required_phrase_origins, - write_required_phrase_origins, + license_expression, + validate, + reindex, + delete_phrase_source, + write_phrase_source, + dry_run, + verbose, ): """ - For all rules with the `license_expression`, add required phrases from the - license fields. + Update license detection rules with new "required phrases" to improve rules detection accuracy. 
""" - licenses_by_key = load_licenses() - if delete_required_phrase_origins: - delete_required_phrase_rules_debug(rules_data_dir) + if delete_phrase_source: + click.echo('Deleting rules phrase source debug data.') + delete_required_phrase_rules_source_debug(rules_data_dir) return - # create a list of all required phrases from existing rules, add - # rule files for them and mark those required phrases if present in other rules - if from_other_rules: - add_required_phrases_from_other_rules( + elif from_other_rules: + click.echo('Updating rules from is_required_phrase rules.') + update_rules_using_is_required_phrases_rules( license_expression=license_expression, - write_required_phrases=write_required_phrase_origins, + write_phrase_source=write_phrase_source, + dry_run=dry_run, verbose=verbose, - licenses_by_key=licenses_by_key, ) - # marks required phrases in existing rules from license attributes like name, - # short name and optionally license keys - if from_license_attributes: - add_required_phrases_from_license_fields( + elif from_license_attributes: + click.echo('Updating rules from license attributes.') + update_rules_using_license_attributes( license_expression=license_expression, + write_phrase_source=write_phrase_source, + dry_run=dry_run, verbose=verbose, - licenses_by_key=licenses_by_key, ) + validate_and_reindex(validate, reindex, verbose) + + +def validate_and_reindex(validate, reindex, verbose): + if validate: + if verbose: + click.echo('Validate all rules and licenses for all languages...') + build_index(index_all_languages=True) + if reindex: - from licensedcode.cache import get_index if verbose: - click.echo('Rebuilding the license index...') + click.echo('Rebuilding and caching the license index...') get_index(force=True) + +#################################################################################################### +# +# Generate new required phrase rules from existing tagged required phrases +# 
+####################################################################################################
+
+
+@click.command(name='gen-new-required-phrases-rules')
+@click.option(
+ "-l",
+ "--license-expression",
+ type=str,
+ default=None,
+ metavar="STRING",
+ help="Optional license expression filter. If provided, only consider the rules that are using "
+ "this expression. Otherwise, process all rules. Example: `apache-2.0`.",
+ cls=PluggableCommandLineOption,
+)
+@click.option(
+ "-r",
+ "--reindex",
+ is_flag=True,
+ default=False,
+ help="Recreate and cache the licenses index with updated rules add the end.",
+ cls=PluggableCommandLineOption,
+)
+@click.option(
+ "--validate",
+ is_flag=True,
+ default=False,
+ help="Validate that all rules and licenses and rules are consistent, for all rule languages. "
+ "For this validation, run a mock indexing. The regenerated index is not saved to disk.",
+ cls=PluggableCommandLineOption,
+)
+@click.option(
+ "-v",
+ "--verbose",
+ is_flag=True,
+ default=False,
+ help="Print verbose logging information.",
+ cls=PluggableCommandLineOption,
+)
+@click.help_option("-h", "--help")
+def gen_required_phrases_rules(
+ license_expression,
+ validate,
+ reindex,
+ verbose,
+):
+ """
+ Create new license detection rules from "required phrases" in existing rules.
+ """
+ generate_new_required_phrase_rules(license_expression=license_expression, verbose=verbose)
+ validate_and_reindex(validate, reindex, verbose)
+
+
+def generate_new_required_phrase_rules(license_expression=None, verbose=False):
+ """
+ Create new rules created from collecting unique required phrases across all rules.
+
+ As a side effect, also update existing rules matched to a required phrase text with the
+ "is_required_phrase" flag.
+
+ Consider only rules with the optional ``license_expression`` if provided.
+ """ + if verbose: + lex = license_expression or "all" + click.echo(f'Collecting required phrases for {lex} license_expression.') + + index = get_index() + licenses_by_key = get_licenses_db() + + # track text -> expressions to keep only a text that uniquely identifies a single expression + phrases_by_normalized_phrase = defaultdict(list) + + for rule in index.rules_by_rid: + if rule.license_expression != license_expression: + continue + + if ( + rule.is_required_phrase + or rule.skip_for_required_phrase_generation + or rule.is_license_intro + or rule.is_license_clue + or rule.is_false_positive + or rule.is_generic(licenses_by_key) + ): + continue + + for required_phrase_text in get_required_phrase_verbatim(rule.text): + phrase = RequiredPhraseRuleCandidate.create(license_expression=license_expression, text=required_phrase_text) + if phrase.is_good(rule): + phrases_by_normalized_phrase[phrase.normalized_text].append(phrase) + + # Add new variations of the required phrases already present in the list + for variation in generate_required_phrase_variations(required_phrase_text): + phrase = RequiredPhraseRuleCandidate.create(license_expression=license_expression, text=variation) + if phrase.is_good(rule): + phrases_by_normalized_phrase[phrase.normalized_text].append(phrase) + + for phrases in phrases_by_normalized_phrase.values(): + # keep only phrases pointing used for the same expression + if len(set(p.license_expression for p in phrases)) == 1: + # keep the first one + phrase = phrases[0] + else: + continue + + # check if we already have a rule we can match for this required phrase tag if needed + matched_rule = rule_exists(text=phrase.raw_text) + if matched_rule: + if matched_rule.skip_for_required_phrase_generation: + if verbose: + click.echo( + f'WARNING: Skipping pre-existing required phrase rule ' + f'"skip_for_required_phrase_generation": {matched_rule.identifier}.' 
+ )
+ continue
+
+ modified = False
+
+ if not matched_rule.is_required_phrase:
+ matched_rule.is_required_phrase = True
+ modified = True
+
+ if matched_rule.text.strip() != phrase.raw_text:
+ matched_rule.text = phrase.raw_text
+ modified = True
+
+ if matched_rule.is_continuous:
+ matched_rule.is_continuous = False
+ modified = True
+
+ if modified:
+ matched_rule.dump(rules_data_dir)
+ if verbose:
+ click.echo(f'WARNING: Updating existing rule with is_required flag and more: {matched_rule.identifier}.')
+ else:
+ if verbose:
+ click.echo(f'WARNING: Skipping pre-existing required phrase rule: {matched_rule.identifier}.')
+
+ continue
+
+ # at last create a new rule
+ rule = phrase.create_rule()
+ if verbose:
+ click.echo(f'Creating required phrase new rule: {rule.identifier}.')
+
+
+@attr.s
+class RequiredPhraseRuleCandidate:
+ """
+ A candidate phrase object with its license expression, raw text and normalized text. Used when
+ generating new rules for required phrases.
+ """
+ license_expression = attr.ib(metadata=dict(help='A license expression string.'))
+ raw_text = attr.ib(metadata=dict(help='Raw, original required phrase text.'))
+ normalized_text = attr.ib(metadata=dict(help='Normalized required phrase text.'))
+
+ def is_good(self, rule):
+ """
+ Return True if this phrase is minimally suitable to use as a required phrase
+ """
+ # long enough
+ num_tokens = len(get_normalized_tokens(self.normalized_text))
+ if num_tokens <= 1:
+ return False
+
+ to_ignore = set()
+ # not a referenced filename
+ to_ignore.update(map(get_normalized_text, rule.referenced_filenames))
+ if self.normalized_text in to_ignore:
+ return False
+
+ return True
+
+ @classmethod
+ def create(cls, license_expression, text):
+ return cls(
+ license_expression=license_expression,
+ raw_text=text,
+ normalized_text=get_normalized_text(text),
+ )
+
+ def create_rule(self):
+ """
+ Create, save and return a new "required_phrase" Rule from this phrase.
+ """
+ base_name = f"{self.license_expression}_required_phrase"
+ base_loc = find_rule_base_location(name_prefix=base_name)
+ file_path = f"{base_loc}.RULE"
+ identifier = file_path.split('/')[-1]
+
+ rule = Rule(
+ license_expression=self.license_expression,
+ identifier=identifier,
+ text=self.raw_text,
+ is_required_phrase=True,
+ is_license_reference=True,
+ )
+ update_ignorables(licensish=rule)
+ rule.dump(rules_data_dir)
+ return rule
+
+
+_verbatim_required_phrase = r'{{([^}]+)}}'
+collect_verbatim_required_phrase = re.compile(_verbatim_required_phrase, re.UNICODE).findall
+
+
+def get_required_phrase_verbatim(text):
+ """
+ Yield required_phrase strings from a rule ``text`` excluding required phrases {{brace}} markers.
+
+ This tokenizer behaves the same as the ``index_tokenizer`` returning also
+ REQUIRED_PHRASE_OPEN and REQUIRED_PHRASE_CLOSE as separate tokens so that they can be
+ used to parse required phrases.
+
+ >>> x = list(get_required_phrase_verbatim('bar {{ AGPL-3.0 GNU Affero License v3.0 }} foo'))
+ >>> assert x == ['AGPL-3.0 GNU Affero License v3.0'], x
+
+ >>> x = list(get_required_phrase_verbatim(' + {{ ++ AGPL-3.0/}} and {{ GNU Affero License v3.0 }} '))
+ >>> assert x == ['++ AGPL-3.0/', 'GNU Affero License v3.0'], x
+ """
+ if not text:
+ return
+ for phrase in collect_verbatim_required_phrase(text):
+ phrase = phrase.strip()
+ if phrase:
+ yield phrase
+
+
+def generate_required_phrase_variations(text):
+ """
+ Yield strings that are useful variations of the ``text``, used to generate rule variants.
+ """ + words_to_skip = ["the"] + required_phrase_words = text.split() + for skip_word in words_to_skip: + variant = [w for w in required_phrase_words if w.lower() != skip_word] + yield " ".join(variant) + diff --git a/tests/licensedcode/test_required_phrases.py b/tests/licensedcode/test_required_phrases.py index 973294c69f..266ca8e2ff 100644 --- a/tests/licensedcode/test_required_phrases.py +++ b/tests/licensedcode/test_required_phrases.py @@ -7,108 +7,22 @@ # See https://aboutcode.org for more information about nexB OSS projects. # -import os from unittest import TestCase as TestCaseClass import pytest -from licensedcode.required_phrases import get_required_phrases -from licensedcode.required_phrases import get_required_phrase_spans -from licensedcode.required_phrases import get_required_phrase_texts -from licensedcode.required_phrases import add_required_phrases_from_other_rules -from licensedcode.required_phrases import add_required_phrases_from_license_fields -from licensedcode.required_phrases import ListOfRequiredPhrases -from licensedcode.required_phrases import RequiredPhraseDetails -from licensedcode.required_phrases import return_spans_for_required_phrase_in_text -from licensedcode.required_phrases import add_required_phrase_markers -from licensedcode.tokenize import get_normalized_tokens -from licensedcode.tokenize import matched_query_text_tokenizer -from licensedcode.stopwords import STOPWORDS from licensedcode.models import InvalidRule from licensedcode.models import Rule +from licensedcode.required_phrases import update_rules_using_is_required_phrases_rules +from licensedcode.required_phrases import update_rules_using_license_attributes +from licensedcode.required_phrases import IsRequiredPhrase +from licensedcode.required_phrases import add_required_phrase_markers from licensedcode.spans import Span +from licensedcode.required_phrases import find_phrase_spans_in_text +from licensedcode.tokenize import get_existing_required_phrase_spans -class 
TestGetKeyPhrases(TestCaseClass): - text = ( - 'This released software is {{released}} by under {{the MIT license}}. ' - 'Which is a license originating at Massachusetts Institute of Technology (MIT).' - ) - - def test_get_required_phrases_yields_spans(self): - required_phrase_spans = get_required_phrase_spans(self.text) - assert required_phrase_spans == [Span(4), Span(7, 9)] - - def test_get_required_phrases_yields_tokens(self): - required_phrase_tokens = [ - required_phrase.required_phrase_tokens - for required_phrase in get_required_phrases(text=self.text) - ] - assert required_phrase_tokens == [['released'], ['the', 'mit', 'license']] - - def test_get_required_phrase_texts(self): - required_phrase_texts = get_required_phrase_texts(text=self.text) - assert required_phrase_texts == ['released', 'the mit license'] - - def test_get_required_phrases_raises_exception_required_phrase_markup_is_not_closed(self): - text = 'This software is {{released by under the MIT license.' - try: - list(get_required_phrase_spans(text)) - raise Exception('Exception should be raised') - except InvalidRule: - pass - - def test_get_required_phrases_ignores_stopwords_in_positions(self): - text = 'The word comma is a stop word so comma does not increase the span position {{MIT license}}.' - required_phrase_spans = get_required_phrase_spans(text) - assert required_phrase_spans == [Span(11, 12)] - - def test_get_required_phrases_yields_spans_without_stop_words(self): - text = 'This released software is {{released span}} by under {{the MIT quot license}}.' - required_phrase_spans = get_required_phrase_spans(text) - assert required_phrase_spans == [Span(4), Span(7, 9)] - - def test_get_required_phrases_does_not_yield_empty_spans(self): - text = 'This released software {{comma}} is {{}} by under {{the MIT license}}.' 
- try: - list(get_required_phrase_spans(text)) - raise Exception('Exception should be raised') - except InvalidRule: - pass - - def test_get_required_phrases_only_considers_outer_required_phrase_markup(self): - text = 'This released {{{software under the MIT}}} license.' - required_phrase_spans = get_required_phrase_spans(text) - assert required_phrase_spans == [Span(2, 5)] - - def test_get_required_phrases_ignores_nested_required_phrase_markup(self): - text = 'This released {{software {{under the}} MIT}} license.' - try: - list(get_required_phrase_spans(text)) - raise Exception('Exception should be raised') - except InvalidRule: - pass - - def test_get_required_phrase_texts_with_markup(self): - text = ( - "Lua is free software distributed under the terms of the" - "{{MIT license}}" - "reproduced below;" - ) - required_phrase_texts = get_required_phrase_texts(text=text) - assert required_phrase_texts == ['mit license'] - - def test_get_required_phrase_spans_with_markup(self): - text = ( - "Lua is free software distributed under the terms of the" - "{{MIT license}}" - "reproduced below;" - ) - required_phrase_spans = get_required_phrase_spans(text=text) - assert required_phrase_spans == [Span(18, 19)] - - -class TestListOfRequiredPhrases(TestCaseClass): +class TestIsRequiredPhraseCanSort(TestCaseClass): required_phrase_texts = [ "mit", @@ -117,40 +31,35 @@ class TestListOfRequiredPhrases(TestCaseClass): "licenses: mit", "MIT license", ] - required_phrases = [ - RequiredPhraseDetails( + is_required_phrases = [ + IsRequiredPhrase( required_phrase_text=text, - license_expression="mit", - length=len(text), rule=Rule( license_expression="mit", identifier="mit_231.RULE", text=text, is_required_phrase=True, is_license_tag=True, - ), - sources=["mit_231.RULE"], + ) ) for text in required_phrase_texts ] - required_phrases_list = ListOfRequiredPhrases(required_phrases=required_phrases) - def test_sort_required_phrases_works(self): - 
self.required_phrases_list.sort_required_phrases() - expected_sorted_texts = [ + def test_sort_is_required_phrases_works(self): + srps = IsRequiredPhrase.sorted(self.is_required_phrases) + results = [srp.required_phrase_text for srp in srps] + + expected = [ "MIT License with Disclaimer", "the MIT License", "licenses: mit", "MIT license", "mit", ] - assert [ - required_phrase.required_phrase_text - for required_phrase in self.required_phrases_list.required_phrases - ] == expected_sorted_texts + assert results == expected -class TestRequiredPhraseSpansinText: +class TestFindPhraseInText: text_with_stopwords = ( "A copy of the GNU General Public License is available as " @@ -166,26 +75,27 @@ class TestRequiredPhraseSpansinText: "/{{usr/share/common-licenses/GPL-2}} in the Debian GNU/Linux distribution." ) - def test_get_required_phrase_spans_with_or_without_specified_texts_is_same(self): - required_phrase_spans_specified = return_spans_for_required_phrase_in_text( + def test_find_phrase_spans_in_text_with_behaves_same_as_get_existing_required_phrase_spans(self): + spans_with_phrase = find_phrase_spans_in_text( text=self.text_with_stopwords, - required_phrase="usr share common licenses gpl 2", + phrase_text="usr share common licenses gpl 2", ) - required_phrase_spans_unspecified = get_required_phrase_spans( + spans_with_find = get_existing_required_phrase_spans( text=self.text_with_stopwords_and_marked_required_phrases, ) - assert required_phrase_spans_specified == required_phrase_spans_unspecified - def test_get_required_phrase_and_add_required_phrase_matches(self): + assert spans_with_phrase == spans_with_find + + def test_find_phrase_spans_in_text_and_add_required_phrase_matches(self): - required_phrase_spans_specified = return_spans_for_required_phrase_in_text( + spans = find_phrase_spans_in_text( text=self.text_with_stopwords, - required_phrase="usr share common licenses gpl 2", + phrase_text="usr share common licenses gpl 2", ) text = self.text_with_stopwords - 
for span in required_phrase_spans_specified: + for span in spans: text = add_required_phrase_markers( text=text, required_phrase_span=span, @@ -193,12 +103,80 @@ def test_get_required_phrase_and_add_required_phrase_matches(self): assert text == self.text_with_stopwords_and_marked_required_phrases + +class TestFindSpansInText: + + text_with_articles = ( + "A copy of the GNU General Public License is available as " + "/usr/share/common-licenses/GPL-2 in the Debian GNU/Linux distribution. " + "A copy of the GNU General Public License is available as " + "/usr/share/common-licenses/GPL-2 in the Debian GNU/Linux distribution." + ) + + text_with_articles_and_marked_required_phrases = ( + "A copy of the GNU General Public License is available as " + "/{{usr/share/common-licenses/GPL-2}} in the Debian GNU/Linux distribution. " + "A copy of the GNU General Public License is available as " + "/{{usr/share/common-licenses/GPL-2}} in the Debian GNU/Linux distribution." + ) + + text_with_extra_characters = ( + "This is the http://www.opensource.org/licenses/mit-license.php MIT " + "Software License which is OSI-certified, and GPL-compatible." + ) + + text_with_extra_characters_and_marked_required_phrases = ( + "This is the http://www.opensource.org/licenses/mit-license.php {{MIT " + "Software License}} which is OSI-certified, and GPL-compatible." + ) + + def test_find_phrase_spans_in_text(self): + text = "is released under the MIT license. 
See the LICENSE" + spans = find_phrase_spans_in_text(text=text, phrase_text="mit license") + assert spans == [Span(4, 5)] + + def test_find_phrase_spans_in_text_multiple(self): + spans = find_phrase_spans_in_text( + text=self.text_with_articles, + phrase_text="usr share common licenses gpl 2", + ) + assert spans == [Span(10, 15), Span(32, 37)] + + def test_find_phrase_spans_in_text_then_add_with_multiple_spans(self): + spans = find_phrase_spans_in_text( + text=self.text_with_articles, + phrase_text="usr share common licenses gpl 2", + ) + text = self.text_with_articles + for span in spans: + text = add_required_phrase_markers( + text=text, + required_phrase_span=span, + ) + + assert text == self.text_with_articles_and_marked_required_phrases + + def test_add_required_phrase_markers_in_text_with_extra_characters(self): + spans = find_phrase_spans_in_text( + text=self.text_with_extra_characters, + phrase_text="mit software license", + ) + text = self.text_with_extra_characters + for span in spans: + text = add_required_phrase_markers( + text=text, + required_phrase_span=span, + ) + + assert text == self.text_with_extra_characters_and_marked_required_phrases + + class TestKeyPhrasesCanBeMarked(TestCaseClass): @pytest.mark.scanslow - def can_more_key_phrases_be_marked_from_other_rules(self): - add_required_phrases_from_other_rules(can_mark_required_phrase_test=True) + def test_update_rules_using_is_required_phrases_rules(self): + update_rules_using_is_required_phrases_rules(verbose=True, _dry_run=True) @pytest.mark.scanslow - def can_more_key_phrases_be_marked_from_license_attribtues(self): - add_required_phrases_from_license_fields(can_mark_required_phrase_test=True) + def test_update_rules_using_license_attributes(self): + update_rules_using_license_attributes(verbose=True, _dry_run=True)