Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor license_matches in summary following ScanCode-toolkit upgrade #808

Merged
merged 3 commits into from
Jul 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 34 additions & 34 deletions scanpipe/pipes/scancode.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
# Visit https://github.com/nexB/scancode.io for support and download.

import concurrent.futures
import hashlib
import json
import logging
import multiprocessing
Expand Down Expand Up @@ -549,43 +548,44 @@ def set_codebase_resource_for_package(codebase_resource, discovered_package):
codebase_resource.update(status=flag.APPLICATION_PACKAGE)


def _get_license_matches_grouped(project):
def get_detection_data(detection_entry):
    """
    Return a reduced version of a license ``detection_entry`` mapping, keeping
    only its ``license_expression``, its ``identifier``, and for each retained
    match, the match ``license_expression`` and ``matched_text``.

    A match is retained only when its ``license_expression`` is contained in the
    ``license_expression`` of the detection entry. Matches without a
    ``license_expression``, and entries without a ``license_expression`` or
    ``matches`` value, are handled gracefully and yield no retained matches.
    """
    license_expression = detection_entry.get("license_expression")
    identifier = detection_entry.get("identifier")
    # A missing or null detection expression cannot contain any match
    # expression; fall back to an empty string so the containment test below
    # never raises a TypeError.
    searchable_expression = license_expression or ""
    matches = []

    # ``or []`` also covers the key being present with an explicit null value.
    for match in detection_entry.get("matches") or []:
        match_license_expression = match.get("license_expression")
        if match_license_expression is None:
            continue

        # Do not include those match.expression when not part of this detection
        # entry license_expression as those are not counted in the summary
        if match_license_expression in searchable_expression:
            matches.append(
                {
                    "license_expression": match_license_expression,
                    "matched_text": match.get("matched_text"),
                }
            )

    return {
        "license_expression": license_expression,
        "identifier": identifier,
        "matches": matches,
    }


def get_license_matches_grouped(project):
    """
    Return a dictionary of all license_matches of a given ``project`` grouped by
    ``resource.detected_license_expression``.

    The returned structure maps each detected license expression to a dictionary
    of resource paths, each path holding the list of detection data built by
    ``get_detection_data`` from the ``license_detections`` of that resource::

        {detected_license_expression: {resource_path: [detection_data, ...]}}
    """
    resources_with_license = project.codebaseresources.has_license_detections()
    # Nested mapping: the outer key is the resource-level expression, the inner
    # key is the resource path.
    license_matches = defaultdict(dict)

    for resource in resources_with_license:
        matches = [
            get_detection_data(detection_entry)
            for detection_entry in resource.license_detections
        ]
        license_matches[resource.detected_license_expression][resource.path] = matches

    # Return a plain dict so callers do not get implicit key creation on lookup.
    return dict(license_matches)

Expand All @@ -606,7 +606,7 @@ def make_results_summary(project, scan_results_location):

# Inject the generated `license_matches` in the summary from the project
# codebase resources.
summary["license_matches"] = _get_license_matches_grouped(project)
summary["license_matches"] = get_license_matches_grouped(project)

# Inject the `key_files` and their file content in the summary
key_files = []
Expand Down
36 changes: 26 additions & 10 deletions scanpipe/tests/data/is-npm-1.0.0_scan_package_summary.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,16 +25,32 @@
],
"other_languages": [],
"license_matches": {
"mit": [
{
"path": "package/package.json",
"matched_text": " \"license\": \"MIT\","
},
{
"path": "package/readme.md",
"matched_text": "## License\n\nMIT \u00a9 [Sindre Sorhus](http://sindresorhus.com)"
}
]
"mit": {
"package/package.json": [
{
"license_expression": "mit",
"identifier": "mit-3fce6ea2-8abd-6c6b-3ede-a37af7c6efee",
"matches": [
{
"license_expression": "mit",
"matched_text": " \"license\": \"MIT\","
}
]
}
],
"package/readme.md": [
{
"license_expression": "mit",
"identifier": "mit-3fce6ea2-8abd-6c6b-3ede-a37af7c6efee",
"matches": [
{
"license_expression": "mit",
"matched_text": "## License\n\nMIT \u00a9 [Sindre Sorhus](http://sindresorhus.com)"
}
]
}
]
}
},
"key_files": [
{
Expand Down
68 changes: 50 additions & 18 deletions scanpipe/tests/data/multiple-is-npm-1.0.0_scan_package_summary.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,24 +25,56 @@
],
"other_languages": [],
"license_matches": {
"mit": [
{
"path": "is-npm/node_modules/is-npm/package.json",
"matched_text": " \"license\": \"MIT\","
},
{
"path": "is-npm/node_modules/is-npm/readme.md",
"matched_text": "## License\n\nMIT \u00a9 [Sindre Sorhus](http://sindresorhus.com)"
},
{
"path": "is-npm/package.json",
"matched_text": " \"license\": \"MIT\","
},
{
"path": "is-npm/readme.md",
"matched_text": "## License\n\nMIT \u00a9 [Sindre Sorhus](http://sindresorhus.com)"
}
]
"mit": {
"is-npm/node_modules/is-npm/package.json": [
{
"license_expression": "mit",
"identifier": "mit-3fce6ea2-8abd-6c6b-3ede-a37af7c6efee",
"matches": [
{
"license_expression": "mit",
"matched_text": " \"license\": \"MIT\","
}
]
}
],
"is-npm/node_modules/is-npm/readme.md": [
{
"license_expression": "mit",
"identifier": "mit-3fce6ea2-8abd-6c6b-3ede-a37af7c6efee",
"matches": [
{
"license_expression": "mit",
"matched_text": "## License\n\nMIT \u00a9 [Sindre Sorhus](http://sindresorhus.com)"
}
]
}
],
"is-npm/package.json": [
{
"license_expression": "mit",
"identifier": "mit-3fce6ea2-8abd-6c6b-3ede-a37af7c6efee",
"matches": [
{
"license_expression": "mit",
"matched_text": " \"license\": \"MIT\","
}
]
}
],
"is-npm/readme.md": [
{
"license_expression": "mit",
"identifier": "mit-3fce6ea2-8abd-6c6b-3ede-a37af7c6efee",
"matches": [
{
"license_expression": "mit",
"matched_text": "## License\n\nMIT \u00a9 [Sindre Sorhus](http://sindresorhus.com)"
}
]
}
]
}
},
"key_files": [
{
Expand Down
36 changes: 26 additions & 10 deletions scanpipe/tests/data/scancode/is-npm-1.0.0_summary.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,16 +25,32 @@
],
"other_languages": [],
"license_matches": {
"mit": [
{
"path": "package/package.json",
"matched_text": " \"license\": \"MIT\","
},
{
"path": "package/readme.md",
"matched_text": "## License\n\nMIT \u00a9 [Sindre Sorhus](http://sindresorhus.com)"
}
]
"mit": {
"package/package.json": [
{
"license_expression": "mit",
"identifier": "mit-3fce6ea2-8abd-6c6b-3ede-a37af7c6efee",
"matches": [
{
"license_expression": "mit",
"matched_text": " \"license\": \"MIT\","
}
]
}
],
"package/readme.md": [
{
"license_expression": "mit",
"identifier": "mit-3fce6ea2-8abd-6c6b-3ede-a37af7c6efee",
"matches": [
{
"license_expression": "mit",
"matched_text": "## License\n\nMIT \u00a9 [Sindre Sorhus](http://sindresorhus.com)"
}
]
}
]
}
},
"key_files": [
{
Expand Down
50 changes: 50 additions & 0 deletions scanpipe/tests/pipes/test_scancode.py
Original file line number Diff line number Diff line change
Expand Up @@ -472,3 +472,53 @@ def test_scanpipe_pipes_scancode_assemble_packages(self):
"test/get_package_resources/this-should-be-returned",
]
self.assertEqual(sorted(expected_resources), sorted(associated_resources))

def test_scanpipe_pipes_scancode_get_detection_data(self):
    """
    Check that ``get_detection_data`` keeps only the license expression, the
    identifier, and the expression and matched text of each match.
    """
    identifier = "cc_by_nc_4_0_and_cc_by_4_0-3e419bd6-97a4-a144-35ab"
    license_expression = "cc-by-nc-4.0 AND cc-by-4.0"

    # Both matches share every field except the text, length, rule and expression.
    common_match_fields = {
        "score": 99.0,
        "matcher": "2-aho",
        "end_line": 76,
        "start_line": 76,
        "match_coverage": 100.0,
        "rule_relevance": 99,
    }
    non_commercial_match = dict(
        common_match_fields,
        matched_text="licensed under CC-BY-NC,",
        matched_length=5,
        rule_identifier="cc-by-nc-4.0_16.RULE",
        license_expression="cc-by-nc-4.0",
    )
    attribution_match = dict(
        common_match_fields,
        matched_text="licensed under CC-BY-",
        matched_length=4,
        rule_identifier="cc-by-4.0_84.RULE",
        license_expression="cc-by-4.0",
    )
    detection_entry = {
        "matches": [non_commercial_match, attribution_match],
        "identifier": identifier,
        "license_expression": license_expression,
    }

    expected = {
        "license_expression": license_expression,
        "identifier": identifier,
        "matches": [
            {
                "license_expression": "cc-by-nc-4.0",
                "matched_text": "licensed under CC-BY-NC,",
            },
            {
                "license_expression": "cc-by-4.0",
                "matched_text": "licensed under CC-BY-",
            },
        ],
    }

    results = scancode.get_detection_data(detection_entry)
    self.assertEqual(expected, results)