Skip to content

Commit

Permalink
Refactor license_matches in summary following ScanCode-toolkit upgrade (#808)
Browse files Browse the repository at this point in the history

Signed-off-by: Thomas Druez <[email protected]>
  • Loading branch information
tdruez authored Jul 13, 2023
1 parent cad7f2e commit b49b444
Show file tree
Hide file tree
Showing 5 changed files with 186 additions and 72 deletions.
68 changes: 34 additions & 34 deletions scanpipe/pipes/scancode.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
# Visit https://github.com/nexB/scancode.io for support and download.

import concurrent.futures
import hashlib
import json
import logging
import multiprocessing
Expand Down Expand Up @@ -549,43 +548,44 @@ def set_codebase_resource_for_package(codebase_resource, discovered_package):
codebase_resource.update(status=flag.APPLICATION_PACKAGE)


def _get_license_matches_grouped(project):
def get_detection_data(detection_entry):
    """
    Return a reduced, summary-ready dict for a single license ``detection_entry``.

    The returned dict holds the entry ``license_expression`` and ``identifier``
    along with a ``matches`` list where each match is trimmed down to its
    ``license_expression`` and ``matched_text``.

    A match is kept only when its ``license_expression`` is contained in the
    entry ``license_expression``. When either expression is missing (``None``),
    the match is left out rather than raising a ``TypeError``. A missing or
    ``None`` ``matches`` value is treated as an empty list.
    """
    license_expression = detection_entry.get("license_expression")
    identifier = detection_entry.get("identifier")
    matches = []

    # ``or []`` also covers an explicit ``"matches": None`` value.
    for match in detection_entry.get("matches") or []:
        match_license_expression = match.get("license_expression")

        # The ``in`` operator below requires two strings: a missing expression
        # on either side cannot be "part of" the other, skip the match.
        if license_expression is None or match_license_expression is None:
            continue

        # Do not include those match.expression when not part of this detection
        # entry license_expression as those are not counted in the summary.
        # NOTE(review): this is a plain substring test, so a key such as "mit"
        # is also considered part of an expression containing "mit-0".
        if match_license_expression in license_expression:
            matches.append(
                {
                    "license_expression": match_license_expression,
                    "matched_text": match.get("matched_text"),
                }
            )

    return {
        "license_expression": license_expression,
        "identifier": identifier,
        "matches": matches,
    }


def get_license_matches_grouped(project):
"""
Return a dictionary of all license_matches of a given `project` grouped by
license_expression.
Return a dictionary of all license_matches of a given ``project`` grouped by
``resource.detected_license_expression``.
"""
license_matches = defaultdict(list)
resources_with_license = project.codebaseresources.has_license_detections()
license_matches = defaultdict(dict)

for resource in resources_with_license:
file_cache = []

for detection_data in resource.license_detections:
detected_license_expression = detection_data.get("license_expression")
for match in detection_data.get("matches", []):
match_license_expression = match.get("license_expression")
# Do not include those match.expression when not part of the main
# detected_license_expression as those are not counted in the summary
if match_license_expression not in detected_license_expression:
continue

matched_text = match.get("matched_text")
# Do not include duplicated matched_text for a given license_expression
# within the same file
cache_key = ":".join(
[match_license_expression, resource.path, matched_text]
)
cache_key = hashlib.md5(cache_key.encode()).hexdigest()
if cache_key in file_cache:
continue
file_cache.append(cache_key)

license_matches[match_license_expression].append(
{
"path": resource.path,
"matched_text": matched_text,
}
)
matches = [
get_detection_data(detection_entry)
for detection_entry in resource.license_detections
]
license_matches[resource.detected_license_expression][resource.path] = matches

return dict(license_matches)

Expand All @@ -606,7 +606,7 @@ def make_results_summary(project, scan_results_location):

# Inject the generated `license_matches` in the summary from the project
# codebase resources.
summary["license_matches"] = _get_license_matches_grouped(project)
summary["license_matches"] = get_license_matches_grouped(project)

# Inject the `key_files` and their file content in the summary
key_files = []
Expand Down
36 changes: 26 additions & 10 deletions scanpipe/tests/data/is-npm-1.0.0_scan_package_summary.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,16 +25,32 @@
],
"other_languages": [],
"license_matches": {
"mit": [
{
"path": "package/package.json",
"matched_text": " \"license\": \"MIT\","
},
{
"path": "package/readme.md",
"matched_text": "## License\n\nMIT \u00a9 [Sindre Sorhus](http://sindresorhus.com)"
}
]
"mit": {
"package/package.json": [
{
"license_expression": "mit",
"identifier": "mit-3fce6ea2-8abd-6c6b-3ede-a37af7c6efee",
"matches": [
{
"license_expression": "mit",
"matched_text": " \"license\": \"MIT\","
}
]
}
],
"package/readme.md": [
{
"license_expression": "mit",
"identifier": "mit-3fce6ea2-8abd-6c6b-3ede-a37af7c6efee",
"matches": [
{
"license_expression": "mit",
"matched_text": "## License\n\nMIT \u00a9 [Sindre Sorhus](http://sindresorhus.com)"
}
]
}
]
}
},
"key_files": [
{
Expand Down
68 changes: 50 additions & 18 deletions scanpipe/tests/data/multiple-is-npm-1.0.0_scan_package_summary.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,24 +25,56 @@
],
"other_languages": [],
"license_matches": {
"mit": [
{
"path": "is-npm/node_modules/is-npm/package.json",
"matched_text": " \"license\": \"MIT\","
},
{
"path": "is-npm/node_modules/is-npm/readme.md",
"matched_text": "## License\n\nMIT \u00a9 [Sindre Sorhus](http://sindresorhus.com)"
},
{
"path": "is-npm/package.json",
"matched_text": " \"license\": \"MIT\","
},
{
"path": "is-npm/readme.md",
"matched_text": "## License\n\nMIT \u00a9 [Sindre Sorhus](http://sindresorhus.com)"
}
]
"mit": {
"is-npm/node_modules/is-npm/package.json": [
{
"license_expression": "mit",
"identifier": "mit-3fce6ea2-8abd-6c6b-3ede-a37af7c6efee",
"matches": [
{
"license_expression": "mit",
"matched_text": " \"license\": \"MIT\","
}
]
}
],
"is-npm/node_modules/is-npm/readme.md": [
{
"license_expression": "mit",
"identifier": "mit-3fce6ea2-8abd-6c6b-3ede-a37af7c6efee",
"matches": [
{
"license_expression": "mit",
"matched_text": "## License\n\nMIT \u00a9 [Sindre Sorhus](http://sindresorhus.com)"
}
]
}
],
"is-npm/package.json": [
{
"license_expression": "mit",
"identifier": "mit-3fce6ea2-8abd-6c6b-3ede-a37af7c6efee",
"matches": [
{
"license_expression": "mit",
"matched_text": " \"license\": \"MIT\","
}
]
}
],
"is-npm/readme.md": [
{
"license_expression": "mit",
"identifier": "mit-3fce6ea2-8abd-6c6b-3ede-a37af7c6efee",
"matches": [
{
"license_expression": "mit",
"matched_text": "## License\n\nMIT \u00a9 [Sindre Sorhus](http://sindresorhus.com)"
}
]
}
]
}
},
"key_files": [
{
Expand Down
36 changes: 26 additions & 10 deletions scanpipe/tests/data/scancode/is-npm-1.0.0_summary.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,16 +25,32 @@
],
"other_languages": [],
"license_matches": {
"mit": [
{
"path": "package/package.json",
"matched_text": " \"license\": \"MIT\","
},
{
"path": "package/readme.md",
"matched_text": "## License\n\nMIT \u00a9 [Sindre Sorhus](http://sindresorhus.com)"
}
]
"mit": {
"package/package.json": [
{
"license_expression": "mit",
"identifier": "mit-3fce6ea2-8abd-6c6b-3ede-a37af7c6efee",
"matches": [
{
"license_expression": "mit",
"matched_text": " \"license\": \"MIT\","
}
]
}
],
"package/readme.md": [
{
"license_expression": "mit",
"identifier": "mit-3fce6ea2-8abd-6c6b-3ede-a37af7c6efee",
"matches": [
{
"license_expression": "mit",
"matched_text": "## License\n\nMIT \u00a9 [Sindre Sorhus](http://sindresorhus.com)"
}
]
}
]
}
},
"key_files": [
{
Expand Down
50 changes: 50 additions & 0 deletions scanpipe/tests/pipes/test_scancode.py
Original file line number Diff line number Diff line change
Expand Up @@ -472,3 +472,53 @@ def test_scanpipe_pipes_scancode_assemble_packages(self):
"test/get_package_resources/this-should-be-returned",
]
self.assertEqual(sorted(expected_resources), sorted(associated_resources))

def test_scanpipe_pipes_scancode_get_detection_data(self):
    """
    ``get_detection_data`` keeps the entry expression and identifier, and
    trims each match down to its ``license_expression`` and ``matched_text``.
    """
    entry_identifier = "cc_by_nc_4_0_and_cc_by_4_0-3e419bd6-97a4-a144-35ab"
    entry_expression = "cc-by-nc-4.0 AND cc-by-4.0"

    # Two matches reported on the same line, both part of the entry expression.
    non_commercial_match = {
        "score": 99.0,
        "matcher": "2-aho",
        "end_line": 76,
        "start_line": 76,
        "matched_text": "licensed under CC-BY-NC,",
        "match_coverage": 100.0,
        "matched_length": 5,
        "rule_relevance": 99,
        "rule_identifier": "cc-by-nc-4.0_16.RULE",
        "license_expression": "cc-by-nc-4.0",
    }
    attribution_match = {
        "score": 99.0,
        "matcher": "2-aho",
        "end_line": 76,
        "start_line": 76,
        "matched_text": "licensed under CC-BY-",
        "match_coverage": 100.0,
        "matched_length": 4,
        "rule_relevance": 99,
        "rule_identifier": "cc-by-4.0_84.RULE",
        "license_expression": "cc-by-4.0",
    }
    detection_entry = {
        "matches": [non_commercial_match, attribution_match],
        "identifier": entry_identifier,
        "license_expression": entry_expression,
    }

    # Only the expression and the matched text survive for each match.
    expected_matches = [
        {
            "license_expression": "cc-by-nc-4.0",
            "matched_text": "licensed under CC-BY-NC,",
        },
        {
            "license_expression": "cc-by-4.0",
            "matched_text": "licensed under CC-BY-",
        },
    ]
    expected = {
        "license_expression": entry_expression,
        "identifier": entry_identifier,
        "matches": expected_matches,
    }

    self.assertEqual(expected, scancode.get_detection_data(detection_entry))

0 comments on commit b49b444

Please sign in to comment.