aboutcode-org · tdruez · Jul 13, 2023 · Jul 6, 2023 · Jul 12, 2023 · Jul 12, 2023
diff --git a/scanpipe/pipes/scancode.py b/scanpipe/pipes/scancode.py
@@ -21,7 +21,6 @@
 # Visit https://github.com/nexB/scancode.io for support and download.
 
 import concurrent.futures
-import hashlib
 import json
 import logging
 import multiprocessing
@@ -549,43 +548,44 @@ def set_codebase_resource_for_package(codebase_resource, discovered_package):
     codebase_resource.update(status=flag.APPLICATION_PACKAGE)
 
 
-def _get_license_matches_grouped(project):
+def get_detection_data(detection_entry):
+    license_expression = detection_entry.get("license_expression")
+    identifier = detection_entry.get("identifier")
+    matches = []
+
+    for match in detection_entry.get("matches", []):
+        match_license_expression = match.get("license_expression")
+        # Skip matches whose expression is not part of this detection entry's
+        # license_expression (substring test): they are not counted in the summary.
+        if match_license_expression in license_expression:
+            matches.append(
+                {
+                    "license_expression": match_license_expression,
+                    "matched_text": match.get("matched_text"),
+                }
+            )
+
+    return {
+        "license_expression": license_expression,
+        "identifier": identifier,
+        "matches": matches,
+    }
+
+
+def get_license_matches_grouped(project):
     """
-    Return a dictionary of all license_matches of a given `project` grouped by
-    license_expression.
+    Return the license matches of ``project`` as a dictionary keyed by each
+    resource's ``detected_license_expression``, then by resource path.
     """
-    license_matches = defaultdict(list)
     resources_with_license = project.codebaseresources.has_license_detections()
+    license_matches = defaultdict(dict)
 
     for resource in resources_with_license:
-        file_cache = []
-
-        for detection_data in resource.license_detections:
-            detected_license_expression = detection_data.get("license_expression")
-            for match in detection_data.get("matches", []):
-                match_license_expression = match.get("license_expression")
-                # Do not include those match.expression when not part of the main
-                # detected_license_expression as those are not counted in the summary
-                if match_license_expression not in detected_license_expression:
-                    continue
-
-                matched_text = match.get("matched_text")
-                # Do not include duplicated matched_text for a given license_expression
-                # within the same file
-                cache_key = ":".join(
-                    [match_license_expression, resource.path, matched_text]
-                )
-                cache_key = hashlib.md5(cache_key.encode()).hexdigest()
-                if cache_key in file_cache:
-                    continue
-                file_cache.append(cache_key)
-
-                license_matches[match_license_expression].append(
-                    {
-                        "path": resource.path,
-                        "matched_text": matched_text,
-                    }
-                )
+        matches = [
+            get_detection_data(detection_entry)
+            for detection_entry in resource.license_detections
+        ]
+        license_matches[resource.detected_license_expression][resource.path] = matches
 
     return dict(license_matches)
 
@@ -606,7 +606,7 @@ def make_results_summary(project, scan_results_location):
 
     # Inject the generated `license_matches` in the summary from the project
     # codebase resources.
-    summary["license_matches"] = _get_license_matches_grouped(project)
+    summary["license_matches"] = get_license_matches_grouped(project)
 
     # Inject the `key_files` and their file content in the summary
     key_files = []

diff --git a/scanpipe/tests/data/is-npm-1.0.0_scan_package_summary.json b/scanpipe/tests/data/is-npm-1.0.0_scan_package_summary.json
@@ -25,16 +25,32 @@
   ],
   "other_languages": [],
   "license_matches": {
-    "mit": [
-      {
-        "path": "package/package.json",
-        "matched_text": "  \"license\": \"MIT\","
-      },
-      {
-        "path": "package/readme.md",
-        "matched_text": "## License\n\nMIT \u00a9 [Sindre Sorhus](http://sindresorhus.com)"
-      }
-    ]
+    "mit": {
+      "package/package.json": [
+        {
+          "license_expression": "mit",
+          "identifier": "mit-3fce6ea2-8abd-6c6b-3ede-a37af7c6efee",
+          "matches": [
+            {
+              "license_expression": "mit",
+              "matched_text": "  \"license\": \"MIT\","
+            }
+          ]
+        }
+      ],
+      "package/readme.md": [
+        {
+          "license_expression": "mit",
+          "identifier": "mit-3fce6ea2-8abd-6c6b-3ede-a37af7c6efee",
+          "matches": [
+            {
+              "license_expression": "mit",
+              "matched_text": "## License\n\nMIT \u00a9 [Sindre Sorhus](http://sindresorhus.com)"
+            }
+          ]
+        }
+      ]
+    }
   },
   "key_files": [
     {

diff --git a/scanpipe/tests/data/multiple-is-npm-1.0.0_scan_package_summary.json b/scanpipe/tests/data/multiple-is-npm-1.0.0_scan_package_summary.json
@@ -25,24 +25,56 @@
   ],
   "other_languages": [],
   "license_matches": {
-    "mit": [
-      {
-        "path": "is-npm/node_modules/is-npm/package.json",
-        "matched_text": "  \"license\": \"MIT\","
-      },
-      {
-        "path": "is-npm/node_modules/is-npm/readme.md",
-        "matched_text": "## License\n\nMIT \u00a9 [Sindre Sorhus](http://sindresorhus.com)"
-      },
-      {
-        "path": "is-npm/package.json",
-        "matched_text": "  \"license\": \"MIT\","
-      },
-      {
-        "path": "is-npm/readme.md",
-        "matched_text": "## License\n\nMIT \u00a9 [Sindre Sorhus](http://sindresorhus.com)"
-      }
-    ]
+    "mit": {
+      "is-npm/node_modules/is-npm/package.json": [
+        {
+          "license_expression": "mit",
+          "identifier": "mit-3fce6ea2-8abd-6c6b-3ede-a37af7c6efee",
+          "matches": [
+            {
+              "license_expression": "mit",
+              "matched_text": "  \"license\": \"MIT\","
+            }
+          ]
+        }
+      ],
+      "is-npm/node_modules/is-npm/readme.md": [
+        {
+          "license_expression": "mit",
+          "identifier": "mit-3fce6ea2-8abd-6c6b-3ede-a37af7c6efee",
+          "matches": [
+            {
+              "license_expression": "mit",
+              "matched_text": "## License\n\nMIT \u00a9 [Sindre Sorhus](http://sindresorhus.com)"
+            }
+          ]
+        }
+      ],
+      "is-npm/package.json": [
+        {
+          "license_expression": "mit",
+          "identifier": "mit-3fce6ea2-8abd-6c6b-3ede-a37af7c6efee",
+          "matches": [
+            {
+              "license_expression": "mit",
+              "matched_text": "  \"license\": \"MIT\","
+            }
+          ]
+        }
+      ],
+      "is-npm/readme.md": [
+        {
+          "license_expression": "mit",
+          "identifier": "mit-3fce6ea2-8abd-6c6b-3ede-a37af7c6efee",
+          "matches": [
+            {
+              "license_expression": "mit",
+              "matched_text": "## License\n\nMIT \u00a9 [Sindre Sorhus](http://sindresorhus.com)"
+            }
+          ]
+        }
+      ]
+    }
   },
   "key_files": [
     {

diff --git a/scanpipe/tests/data/scancode/is-npm-1.0.0_summary.json b/scanpipe/tests/data/scancode/is-npm-1.0.0_summary.json
@@ -25,16 +25,32 @@
   ],
   "other_languages": [],
   "license_matches": {
-    "mit": [
-      {
-        "path": "package/package.json",
-        "matched_text": "  \"license\": \"MIT\","
-      },
-      {
-        "path": "package/readme.md",
-        "matched_text": "## License\n\nMIT \u00a9 [Sindre Sorhus](http://sindresorhus.com)"
-      }
-    ]
+    "mit": {
+      "package/package.json": [
+        {
+          "license_expression": "mit",
+          "identifier": "mit-3fce6ea2-8abd-6c6b-3ede-a37af7c6efee",
+          "matches": [
+            {
+              "license_expression": "mit",
+              "matched_text": "  \"license\": \"MIT\","
+            }
+          ]
+        }
+      ],
+      "package/readme.md": [
+        {
+          "license_expression": "mit",
+          "identifier": "mit-3fce6ea2-8abd-6c6b-3ede-a37af7c6efee",
+          "matches": [
+            {
+              "license_expression": "mit",
+              "matched_text": "## License\n\nMIT \u00a9 [Sindre Sorhus](http://sindresorhus.com)"
+            }
+          ]
+        }
+      ]
+    }
   },
   "key_files": [
     {

diff --git a/scanpipe/tests/pipes/test_scancode.py b/scanpipe/tests/pipes/test_scancode.py
@@ -472,3 +472,53 @@ def test_scanpipe_pipes_scancode_assemble_packages(self):
             "test/get_package_resources/this-should-be-returned",
         ]
         self.assertEqual(sorted(expected_resources), sorted(associated_resources))
+
+    def test_scanpipe_pipes_scancode_get_detection_data(self):
+        detection_entry = {
+            "matches": [
+                {
+                    "score": 99.0,
+                    "matcher": "2-aho",
+                    "end_line": 76,
+                    "start_line": 76,
+                    "matched_text": "licensed under CC-BY-NC,",
+                    "match_coverage": 100.0,
+                    "matched_length": 5,
+                    "rule_relevance": 99,
+                    "rule_identifier": "cc-by-nc-4.0_16.RULE",
+                    "license_expression": "cc-by-nc-4.0",
+                },
+                {
+                    "score": 99.0,
+                    "matcher": "2-aho",
+                    "end_line": 76,
+                    "start_line": 76,
+                    "matched_text": "licensed under CC-BY-",
+                    "match_coverage": 100.0,
+                    "matched_length": 4,
+                    "rule_relevance": 99,
+                    "rule_identifier": "cc-by-4.0_84.RULE",
+                    "license_expression": "cc-by-4.0",
+                },
+            ],
+            "identifier": "cc_by_nc_4_0_and_cc_by_4_0-3e419bd6-97a4-a144-35ab",
+            "license_expression": "cc-by-nc-4.0 AND cc-by-4.0",
+        }
+
+        expected = {
+            "license_expression": "cc-by-nc-4.0 AND cc-by-4.0",
+            "identifier": "cc_by_nc_4_0_and_cc_by_4_0-3e419bd6-97a4-a144-35ab",
+            "matches": [
+                {
+                    "license_expression": "cc-by-nc-4.0",
+                    "matched_text": "licensed under CC-BY-NC,",
+                },
+                {
+                    "license_expression": "cc-by-4.0",
+                    "matched_text": "licensed under CC-BY-",
+                },
+            ],
+        }
+
+        results = scancode.get_detection_data(detection_entry)
+        self.assertEqual(expected, results)