Skip to content

Commit

Permalink
Add script for adding required phrases automatically
Browse files Browse the repository at this point in the history
Add a script which can add required phrases in already existing rules
automatically from required phrases already present in other rules and
license field names. This can be done one license expression at a time.

Signed-off-by: Ayan Sinha Mahapatra <[email protected]>
  • Loading branch information
AyanSinhaMahapatra committed Sep 2, 2024
1 parent 745ba9c commit e3b7f3b
Show file tree
Hide file tree
Showing 34 changed files with 1,599 additions and 330 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ More (advanced) rules options:
be present in the result license detections. These just have the license text and a
`is_false_positive` flag set to True.

- you can specify key phrases by surrounding one or more words between the `{{`
- you can specify required phrases by surrounding one or more words between the `{{`
and `}}` tags. Required phrases are words that **must** be matched/present in order
for a RULE to be considered a match.

Expand Down
20 changes: 2 additions & 18 deletions etc/scripts/licenses/buildrules.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from licensedcode import models
from licensedcode import match_hash
from licensedcode import frontmatter
from licensedcode.models import get_rule_id_for_text
from license_expression import Licensing

"""
Expand Down Expand Up @@ -129,23 +130,6 @@ def load_data(location="00-new-licenses.txt"):
return rules


def rule_exists(text):
    """
    Return the matched rule identifier if the ``text`` is an existing rule
    matched exactly, False otherwise.

    The ``text`` is matched against the license index obtained from
    ``cache.get_index()``. It is considered an existing rule only when there
    is exactly one match, that match was produced by the hash matcher
    (``match_hash.MATCH_HASH``) and its score is 100.
    """
    idx = cache.get_index()

    matches = idx.match(query_string=text)
    # No match, or more than one match, cannot identify a single existing rule.
    if not matches or len(matches) > 1:
        return False

    match = matches[0]
    if match.matcher == match_hash.MATCH_HASH and match.score() == 100:
        return match.rule.identifier

    # A single match that is not an exact hash match: return False explicitly,
    # as documented, rather than falling through and returning None.
    return False


def all_rule_by_tokens():
"""
Return a mapping of {tuples of tokens: rule id}, with one item for each
Expand Down Expand Up @@ -346,7 +330,7 @@ def cli(licenses_file, dump_to_file_on_errors=False):

text = rule.text

existing_rule = rule_exists(text)
existing_rule = get_rule_id_for_text(text)
skinny_text = " ".join(text[:80].split()).replace("{", " ").replace("}", " ")

existing_msg = (
Expand Down
2 changes: 2 additions & 0 deletions etc/scripts/licenses/report_license_rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@
"is_license_reference",
"is_license_intro",
"is_license_clue",
"is_required_phrase",
"skip_collecting_required_phrases",
"is_deprecated",
"has_unknown",
"only_known_words",
Expand Down
1 change: 1 addition & 0 deletions setup-mini.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@ console_scripts =
scancode-reindex-licenses = licensedcode.reindex:reindex_licenses
scancode-license-data = licensedcode.license_db:dump_scancode_license_data
regen-package-docs = packagedcode.regen_package_docs:regen_package_docs
add-required-phrases = licensedcode.required_phrases:add_required_phrases

# These are configurations for ScanCode plugins as setuptools entry points.
# Each plugin entry has this form:
Expand Down
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@ console_scripts =
scancode-reindex-licenses = licensedcode.reindex:reindex_licenses
scancode-license-data = licensedcode.license_db:dump_scancode_license_data
regen-package-docs = packagedcode.regen_package_docs:regen_package_docs
add-required-phrases = licensedcode.required_phrases:add_required_phrases

# These are configurations for ScanCode plugins as setuptools entry points.
# Each plugin entry has this form:
Expand Down
10 changes: 10 additions & 0 deletions src/licensedcode/data/rules/cclrc_1.RULE
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
---
license_expression: cclrc
is_license_notice: yes
referenced_filenames:
- External_License/CCLRC_CDAT_License.txt
---

* This software may be distributed under the terms of the
* {{CCLRC Licence}} for CCLRC Software
* <CDATDIR>/External_License/CCLRC_CDAT_License.txt
7 changes: 7 additions & 0 deletions src/licensedcode/data/rules/cclrc_2.RULE
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
license_expression: cclrc
is_license_notice: yes
---

* This software may be distributed under the terms of the
* {{CCLRC Licence}} for CCLRC Software
1 change: 1 addition & 0 deletions src/licensedcode/data/rules/mit_1155.RULE
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
license_expression: mit
is_license_notice: yes
relevance: 100
skip_collecting_required_phrases: yes
---

For license terms see {{SLF4J}}
2 changes: 1 addition & 1 deletion src/licensedcode/data/rules/mit_1302.RULE
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,4 @@ referenced_filenames:
- LICENSE
---

is free software: you can redistribute it and/or modify it under the terms of {{the MIT License}} as published by the Open Source Initiative. See the {{ LICENSE file }} for more details.
is free software: you can redistribute it and/or modify it under the terms of {{the MIT License}} as published by the Open Source Initiative. See the LICENSE file for more details.
4 changes: 2 additions & 2 deletions src/licensedcode/data/rules/mit_397.RULE
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ relevance: 100
referenced_filenames:
- COPYING
ignorable_urls:
- http://www.opensource.org/licenses/mit-license.php
- https://www.opensource.org/licenses/mit-license.php
---

// Distributed under the MIT software license, see the accompanying
// file COPYING or shttp://www.opensource.org/licenses/mit-license.php.
// file COPYING or https://www.opensource.org/licenses/mit-license.php.
2 changes: 1 addition & 1 deletion src/licensedcode/data/rules/mit_9.RULE
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@ ignorable_urls:
- http://en.wikipedia.org/wiki/MIT_License
---

Licensed under the MIT (http://en.wikipedia.org/wiki/MIT_License) license.
{{Licensed under the MIT}} (http://en.wikipedia.org/wiki/MIT_License) license.
46 changes: 23 additions & 23 deletions src/licensedcode/match.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@
TRACE_FILTER_BELOW_MIN_SCORE = False
TRACE_FILTER_SINGLE_WORD_GIBBERISH = False
TRACE_SET_LINES = False
TRACE_KEY_PHRASES = False
TRACE_REQUIRED_PHRASES = False
TRACE_REGIONS = False
TRACE_FILTER_LICENSE_LIST = False
TRACE_FILTER_LICENSE_LIST_DETAILED = False
Expand Down Expand Up @@ -91,7 +91,7 @@ def logger_debug(*args): pass
or TRACE_MATCHED_TEXT_DETAILS
or TRACE_HIGHLIGHTED_TEXT
or TRACE_FILTER_SINGLE_WORD_GIBBERISH
or TRACE_KEY_PHRASES
or TRACE_REQUIRED_PHRASES
or TRACE_REGIONS
or TRACE_FILTER_LICENSE_LIST
or TRACE_FILTER_LICENSE_LIST_DETAILED
Expand Down Expand Up @@ -133,7 +133,7 @@ def _debug_print_matched_query_text(match, extras=5):

class DiscardReason(IntEnum):
NOT_DISCARDED = 0
MISSING_KEY_PHRASES = 1
MISSING_REQUIRED_PHRASES = 1
BELOW_MIN_COVERAGE = 2
SPURIOUS_SINGLE_TOKEN = 3
TOO_SHORT = 4
Expand Down Expand Up @@ -634,15 +634,15 @@ def combine(self, other):
discard_reason = DiscardReason.NOT_DISCARDED

elif (
self.discard_reason == DiscardReason.MISSING_KEY_PHRASES
and other.discard_reason == DiscardReason.MISSING_KEY_PHRASES
self.discard_reason == DiscardReason.MISSING_REQUIRED_PHRASES
and other.discard_reason == DiscardReason.MISSING_REQUIRED_PHRASES
):
discard_reason = DiscardReason.MISSING_KEY_PHRASES
discard_reason = DiscardReason.MISSING_REQUIRED_PHRASES

elif self.discard_reason == DiscardReason.MISSING_KEY_PHRASES:
elif self.discard_reason == DiscardReason.MISSING_REQUIRED_PHRASES:
discard_reason = other.discard_reason

elif other.discard_reason == DiscardReason.MISSING_KEY_PHRASES:
elif other.discard_reason == DiscardReason.MISSING_REQUIRED_PHRASES:
discard_reason = self.discard_reason

else:
Expand Down Expand Up @@ -2116,17 +2116,17 @@ def filter_false_positive_matches(
return kept, discarded


def filter_matches_missing_key_phrases(
def filter_matches_missing_required_phrases(
matches,
trace=TRACE_KEY_PHRASES,
reason=DiscardReason.MISSING_KEY_PHRASES,
trace=TRACE_REQUIRED_PHRASES,
reason=DiscardReason.MISSING_REQUIRED_PHRASES,
):
"""
Return a filtered list of kept LicenseMatch matches and a list of
discardable matches given a ``matches`` list of LicenseMatch by removing
all ``matches`` that do not contain all key phrases defined in their matched
all ``matches`` that do not contain all required phrases defined in their matched
rule.
A key phrase must be matched exactly without gaps or unknown words.
A required phrase must be matched exactly without gaps or unknown words.
A rule with "is_continuous" set to True is the same as if its whole text
was defined as a required phrase and is processed here too.
Expand All @@ -2143,14 +2143,14 @@ def filter_matches_missing_key_phrases(
discarded_append = discarded.append

if trace:
logger_debug('filter_matches_missing_key_phrases')
logger_debug('filter_matches_missing_required_phrases')

for match in matches:
if trace:
logger_debug(' CHECKING KEY PHRASES for:', match)

is_continuous = match.rule.is_continuous
ikey_spans = match.rule.key_phrase_spans
ikey_spans = match.rule.required_phrase_spans

if not (ikey_spans or is_continuous):
kept_append(match)
Expand Down Expand Up @@ -2180,11 +2180,11 @@ def filter_matches_missing_key_phrases(
# use whole ispan in this case
ikey_spans = [match.ispan]

# keep matches as candidate if they contain all key phrase positions in the ispan
# keep matches as candidate if they contain all required phrase positions in the ispan
if trace:
print(' CANDIDATE TO KEEP: all ikey_span in match.ispan:', ikey_spans, ispan)

# discard matches that contain key phrases, but interrupted by
# discard matches that contain required phrases, but interrupted by
# unknown or stop words.

unknown_by_pos = match.query.unknowns_by_pos
Expand All @@ -2195,7 +2195,7 @@ def filter_matches_missing_key_phrases(
istopwords_by_pos = match.rule.stopwords_by_pos
istopwords_by_pos_get = istopwords_by_pos.get

# iterate on each key phrase span to ensure that they are continuous
# iterate on each required phrase span to ensure that they are continuous
# and contain no unknown words on the query side

is_valid = True
Expand All @@ -2204,7 +2204,7 @@ def filter_matches_missing_key_phrases(

for ikey_span in ikey_spans:

# check that are no gaps in the key phrase span on the query side
# check that there are no gaps in the required phrase span on the query side
# BUT, do not redo the check for is_continuous already checked above
if is_continuous:
qkey_span = qspan
Expand All @@ -2225,13 +2225,13 @@ def filter_matches_missing_key_phrases(
is_valid = False
break

# check that key phrase spans does not contain stop words and does
# check that required phrase spans do not contain stop words and do
# not contain unknown words

# NOTE: we do not check the last qkey_span position of a key phrase
# NOTE: we do not check the last qkey_span position of a required phrase
# since unknown is a number of words after a given span position:
# these are pinned to the last position and we would not care for
# what unknown or stop words show up after a key phrase ends.
# what unknown or stop words show up after a required phrase ends.

qkey_span_end = qkey_span.end
contains_unknown = any(
Expand Down Expand Up @@ -2694,7 +2694,7 @@ def _log(_matches, _discarded, msg):
# FIXME: we should have only a single loop on all the matches at once!!
# and not 10's of loops!!!

matches, discarded = filter_matches_missing_key_phrases(matches)
matches, discarded = filter_matches_missing_required_phrases(matches)
all_discarded_extend(discarded)
_log(matches, discarded, 'HAS KEY PHRASES')

Expand Down
Loading

0 comments on commit e3b7f3b

Please sign in to comment.