Skip to content

Commit

Permalink
Fix license detection bugs
Browse files Browse the repository at this point in the history
* Top level packages were inconsistent because of license plugin post
  processing running after the package plugin post processing.

* Adjust license clues/false-positives/bad matches heuristics.

Signed-off-by: Ayan Sinha Mahapatra <[email protected]>
  • Loading branch information
AyanSinhaMahapatra committed Jul 21, 2023
1 parent 79871d3 commit cfa072b
Show file tree
Hide file tree
Showing 8 changed files with 283 additions and 651 deletions.
65 changes: 56 additions & 9 deletions src/licensedcode/detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from commoncode.text import python_safe_name
from licensedcode.cache import get_index
from licensedcode.cache import get_cache
from licensedcode.cache import build_spdx_license_expression
from licensedcode.match import LicenseMatch
from licensedcode.match import set_matched_lines
from licensedcode.models import UnDetectedRule
Expand Down Expand Up @@ -105,6 +106,7 @@ class DetectionCategory(Enum):
EXTRA_WORDS = 'extra-words'
UNKNOWN_MATCH = 'unknown-match'
LICENSE_CLUES = 'license-clues'
LOW_QUALITY_MATCHES = 'license-clues'
IMPERFECT_COVERAGE = 'imperfect-match-coverage'
FALSE_POSITVE = 'possible-false-positive'
UNDETECTED_LICENSE = 'undetected-license'
Expand Down Expand Up @@ -730,6 +732,10 @@ def collect_license_detections(codebase, include_license_clues=True):
)
if not detection_is_same:
package["declared_license_expression"] = license_expression
package["declared_license_expression_spdx"] = str(build_spdx_license_expression(
license_expression=license_expression,
licensing=get_cache().licensing,
))
modified = True

other_license_detections = package["other_license_detections"]
Expand All @@ -741,6 +747,10 @@ def collect_license_detections(codebase, include_license_clues=True):
)
if not detection_is_same:
package["other_license_expression"] = license_expression
package["other_license_expression_spdx"] = str(build_spdx_license_expression(
license_expression=license_expression,
licensing=get_cache().licensing,
))
modified = True

if modified:
Expand All @@ -753,6 +763,30 @@ def collect_license_detections(codebase, include_license_clues=True):
)
all_license_detections.extend(package_license_detection_objects)

if has_packages and has_licenses:
for package in getattr(codebase.attributes, 'packages', []):
license_expression_package = package["declared_license_expression"]
if not license_expression_package:
continue

resource_paths = package["datafile_paths"]
if len(resource_paths) == 1:
resource_path = resource_paths[0]
else:
#TODO: implement the correct consistency check
# based on which datafile path the license came from
resource_path = resource_paths[0]
resource = codebase.get_resource(path=resource_path)
resource_packages = getattr(resource, 'package_data', None)
if not resource_packages or len(resource_packages) > 1:
continue

resource_package = resource_packages[0]
if license_expression_package != resource_package["declared_license_expression"]:
package["license_detections"] = resource_package["license_detections"]
package["declared_license_expression"] = resource_package["declared_license_expression"]
package["declared_license_expression_spdx"] = resource_package["declared_license_expression_spdx"]

return all_license_detections


Expand Down Expand Up @@ -1107,7 +1141,7 @@ def has_correct_license_clue_matches(license_matches):
return is_correct_detection(license_matches) and all(match.rule.is_license_clue for match in license_matches)


def is_license_clues(license_matches):
def is_low_quality_matches(license_matches):
"""
Return True if the license_matches are not part of a correct
license detection and are mere license clues.
Expand Down Expand Up @@ -1329,6 +1363,14 @@ def get_detected_license_expression(
detection_log.append(DetectionRule.LICENSE_CLUES.value)
return detection_log, combined_expression

elif analysis == DetectionCategory.LOW_QUALITY_MATCHES.value:
if TRACE_ANALYSIS:
logger_debug(f'analysis {DetectionCategory.LICENSE_CLUES.value}')
# TODO: we are temporarily returning these as license clues, and not
# in detections but ideally we should return synthetic unknowns for these
detection_log.append(DetectionRule.LOW_QUALITY_MATCHES.value)
return detection_log, combined_expression

else:
if TRACE_ANALYSIS:
logger_debug(f'analysis not-combined')
Expand Down Expand Up @@ -1510,8 +1552,8 @@ def analyze_detection(license_matches, package_license=False):
elif has_unknown_matches(license_matches=license_matches):
return DetectionCategory.UNKNOWN_MATCH.value

elif is_license_clues(license_matches=license_matches):
return DetectionCategory.LICENSE_CLUES.value
elif not package_license and is_low_quality_matches(license_matches=license_matches):
return DetectionCategory.LOW_QUALITY_MATCHES.value

# Case where at least one of the matches have `match_coverage`
# below IMPERFECT_MATCH_COVERAGE_THR
Expand Down Expand Up @@ -1644,19 +1686,24 @@ def process_detections(detections, licensing=Licensing()):

for detection in detections:
if detection.license_expression == None:
license_keys = licensing.license_keys(expression=detection.license_expression)
if all(
key in detected_license_keys
for key in license_keys
):
detection.license_expression = str(combine_expressions(
if has_correct_license_clue_matches(detection.matches):
continue

license_expression = str(combine_expressions(
expressions=[
match.rule.license_expression
for match in detection.matches
],
unique=True,
licensing=licensing,
))
license_keys = licensing.license_keys(expression=license_expression)

if all(
key in detected_license_keys
for key in license_keys
):
detection.license_expression = license_expression
detection.detection_log.append(DetectionRule.NOT_LICENSE_CLUES.value)
detection.identifier = detection.identifier_with_expression

Expand Down
70 changes: 49 additions & 21 deletions tests/formattedcode/data/yaml/package-and-licenses-expected.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -90,21 +90,21 @@ packages:
- license_expression: apache-2.0
matches:
- score: '100.0'
start_line: 4
end_line: 4
matched_length: 4
start_line: 1
end_line: 1
matched_length: 3
match_coverage: '100.0'
matcher: 2-aho
matcher: 1-hash
license_expression: apache-2.0
rule_identifier: apache-2.0_65.RULE
rule_identifier: spdx_license_id_apache-2.0_for_apache-2.0.RULE
rule_relevance: 100
rule_url: https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/apache-2.0_65.RULE
matched_text: license = Apache-2.0
identifier: apache_2_0-ec759ae0-ea5a-f138-793e-388520e080c0
rule_url: https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/spdx_license_id_apache-2.0_for_apache-2.0.RULE
matched_text: Apache-2.0
identifier: apache_2_0-d66ab77d-a5cc-7104-e702-dc7df61fe9e8
other_license_expression:
other_license_expression_spdx:
other_license_detections: []
extracted_license_statement:
extracted_license_statement: Apache-2.0
notice_text:
source_packages: []
extra_data: {}
Expand All @@ -119,12 +119,15 @@ packages:
purl: pkg:pypi/codebase
dependencies: []
license_detections:
- identifier: apache_2_0-ec759ae0-ea5a-f138-793e-388520e080c0
license_expression: apache-2.0
detection_count: 2
- identifier: apache_2_0-ab23f79b-ec38-9a8a-9b23-85059407f34d
license_expression: apache-2.0
detection_count: 1
- identifier: apache_2_0-d66ab77d-a5cc-7104-e702-dc7df61fe9e8
license_expression: apache-2.0
detection_count: 1
- identifier: apache_2_0-ec759ae0-ea5a-f138-793e-388520e080c0
license_expression: apache-2.0
detection_count: 1
- identifier: apache_2_0_and__apache_2_0_or_mit-9b638e72-e872-a67f-3447-eec297ef7b39
license_expression: apache-2.0 AND (apache-2.0 OR mit)
detection_count: 1
Expand Down Expand Up @@ -706,6 +709,31 @@ license_rule_references:
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- license_expression: apache-2.0
identifier: spdx_license_id_apache-2.0_for_apache-2.0.RULE
language: en
rule_url: https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/spdx_license_id_apache-2.0_for_apache-2.0.RULE
is_license_text: no
is_license_notice: no
is_license_reference: yes
is_license_tag: no
is_license_intro: no
is_license_clue: no
is_continuous: no
is_builtin: yes
is_from_license: no
is_synthetic: no
length: 3
relevance: 100
minimum_coverage: 100
referenced_filenames: []
notes: Used to detect a bare SPDX license id
ignorable_copyrights: []
ignorable_holders: []
ignorable_authors: []
ignorable_urls: []
ignorable_emails: []
text: apache-2.0
files:
- path: package-and-licenses
type: directory
Expand Down Expand Up @@ -1148,21 +1176,21 @@ files:
- license_expression: apache-2.0
matches:
- score: '100.0'
start_line: 4
end_line: 4
matched_length: 4
start_line: 1
end_line: 1
matched_length: 3
match_coverage: '100.0'
matcher: 2-aho
matcher: 1-hash
license_expression: apache-2.0
rule_identifier: apache-2.0_65.RULE
rule_identifier: spdx_license_id_apache-2.0_for_apache-2.0.RULE
rule_relevance: 100
rule_url: https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/apache-2.0_65.RULE
matched_text: license = Apache-2.0
identifier: apache_2_0-ec759ae0-ea5a-f138-793e-388520e080c0
rule_url: https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/spdx_license_id_apache-2.0_for_apache-2.0.RULE
matched_text: Apache-2.0
identifier: apache_2_0-d66ab77d-a5cc-7104-e702-dc7df61fe9e8
other_license_expression:
other_license_expression_spdx:
other_license_detections: []
extracted_license_statement:
extracted_license_statement: Apache-2.0
notice_text:
source_packages: []
file_references: []
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,6 @@
"detection_count": 1,
"detection_log": []
},
{
"identifier": "bsd_simplified-7517fbd6-3fa4-e9f9-2167-c65251d77656",
"license_expression": "bsd-simplified",
"detection_count": 1,
"detection_log": [
"license-clues",
"not-license-clues-as-more-detections-present"
]
},
{
"identifier": "bzip2_libbzip_2010-7158bcb2-a4d7-9815-17d2-1b1d0a6d5de2",
"license_expression": "bzip2-libbzip-2010",
Expand Down Expand Up @@ -1194,8 +1185,8 @@
{
"path": "python.LICENSE",
"type": "file",
"detected_license_expression": "python AND (other-copyleft AND gpl-1.0-plus) AND (python AND python-cwi) AND bzip2-libbzip-2010 AND sleepycat AND bsd-simplified AND bsd-new AND openssl-ssleay AND openssl AND ssleay-windows AND tcl",
"detected_license_expression_spdx": "Python-2.0 AND (LicenseRef-scancode-other-copyleft AND GPL-1.0-or-later) AND (Python-2.0 AND LicenseRef-scancode-python-cwi) AND bzip2-1.0.6 AND Sleepycat AND BSD-2-Clause AND BSD-3-Clause AND OpenSSL AND LicenseRef-scancode-openssl AND LicenseRef-scancode-ssleay-windows AND TCL",
"detected_license_expression": "python AND (other-copyleft AND gpl-1.0-plus) AND (python AND python-cwi) AND bzip2-libbzip-2010 AND sleepycat AND bsd-new AND openssl-ssleay AND openssl AND ssleay-windows AND tcl",
"detected_license_expression_spdx": "Python-2.0 AND (LicenseRef-scancode-other-copyleft AND GPL-1.0-or-later) AND (Python-2.0 AND LicenseRef-scancode-python-cwi) AND bzip2-1.0.6 AND Sleepycat AND BSD-3-Clause AND OpenSSL AND LicenseRef-scancode-openssl AND LicenseRef-scancode-ssleay-windows AND TCL",
"license_detections": [
{
"license_expression": "python",
Expand Down Expand Up @@ -1431,29 +1422,6 @@
],
"identifier": "sleepycat-a7cd8833-ecc2-8ade-54d7-392befcce801"
},
{
"license_expression": "bsd-simplified",
"matches": [
{
"score": 33.71,
"start_line": 358,
"end_line": 363,
"matched_length": 59,
"match_coverage": 33.71,
"matcher": "3-seq",
"license_expression": "bsd-simplified",
"rule_identifier": "bsd-simplified_242.RULE",
"rule_relevance": 100,
"rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/bsd-simplified_242.RULE",
"matched_text": "INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF\n * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS\n * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN\n * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)\n * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF\n * THE POSSIBILITY OF SUCH DAMAGE."
}
],
"detection_log": [
"license-clues",
"not-license-clues-as-more-detections-present"
],
"identifier": "bsd_simplified-7517fbd6-3fa4-e9f9-2167-c65251d77656"
},
{
"license_expression": "bsd-new",
"matches": [
Expand Down Expand Up @@ -1653,7 +1621,21 @@
"identifier": "tcl-75d8de8c-9cf0-d604-4b99-e03436ebfcd3"
}
],
"license_clues": [],
"license_clues": [
{
"score": 33.71,
"start_line": 358,
"end_line": 363,
"matched_length": 59,
"match_coverage": 33.71,
"matcher": "3-seq",
"license_expression": "bsd-simplified",
"rule_identifier": "bsd-simplified_242.RULE",
"rule_relevance": 100,
"rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/bsd-simplified_242.RULE",
"matched_text": "INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF\n * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS\n * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN\n * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)\n * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF\n * THE POSSIBILITY OF SUCH DAMAGE."
}
],
"percentage_of_license_text": 83.64,
"scan_errors": []
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -545,7 +545,7 @@ other_license_detections:
2. Redistributions in binary form must the following disclaimer in
the documentation and/or other materials provided with the
distribution.
identifier:
identifier: bsd_new-7c8321ea-5f82-974c-692b-936bcaabf520
- license_expression: bsd-new
matches:
- score: '100.0'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3337,7 +3337,7 @@ other_license_detections:
matched_text: |
license notice, but it is assumed that it
is licensed under the same terms as
identifier:
identifier: artistic_perl_1_0_or_gpl_1_0_plus-f8a67153-d3ca-59da-be8b-68b1535b0862
- license_expression: artistic-perl-1.0 OR gpl-1.0-plus
matches:
- score: '100.0'
Expand Down
Loading

0 comments on commit cfa072b

Please sign in to comment.