Merge pull request #3752 from nexB/misc-copyrights

* Detect odd name in copyright #3655 Reported-by: Anton Augsburg @vw-anton Reference: #3655 Signed-off-by: Philippe Ombredanne <[email protected]> * Do not detect trailing Distributed in copyright #3735 Reported-by: Dimitris Iliou @dimitris-iliou Reference: #3735 Signed-off-by: Philippe Ombredanne <[email protected]> * Improve misc. copyright detections Spotted in some common python libraries such as numpy and scipy Signed-off-by: Philippe Ombredanne <[email protected]> * Add new script to generate copyright tests Use an input file where each line is either: - a URL to fetch - a text to test Then generate a test data files pair accordingly Signed-off-by: Philippe Ombredanne <[email protected]> * Improve copyright detection - Start detecting "is held by" - Do not include some trailing junk Signed-off-by: Philippe Ombredanne <[email protected]> * Detect NN/EMAIL copyright combo #3764 Reference: #3764 Reported-by: Anton Augsburg @vw-anton Signed-off-by: Philippe Ombredanne <[email protected]> * Detect NN/EMAIL copyright combo #3764 Make detection of copyright with a single lowercase name more specific Reference: #3764 Reported-by: Anton Augsburg @vw-anton Signed-off-by: Philippe Ombredanne <[email protected]> * Align license with improved copyrights Signed-off-by: Philippe Ombredanne <[email protected]> * Improve copyright detection of "distributed" Signed-off-by: Philippe Ombredanne <[email protected]> * Do not detect some words as NNP This makes copyright detection more specific Signed-off-by: Philippe Ombredanne <[email protected]> * Improve copyright tests Signed-off-by: Philippe Ombredanne <[email protected]> * Detect OpenStreetMap correctly Signed-off-by: Philippe Ombredanne <[email protected]> * Add new copyright detection tests Signed-off-by: Philippe Ombredanne <[email protected]> * Improve copyright detection side-effects Signed-off-by: Philippe Ombredanne <[email protected]> * Enable generation of copyright test file Signed-off-by: Philippe Ombredanne 
<[email protected]> * Improve copyright debug tracing Signed-off-by: Philippe Ombredanne <[email protected]> * Detect new form of copyright Signed-off-by: Philippe Ombredanne <[email protected]> * Do not add arbitrary space around markup Signed-off-by: Philippe Ombredanne <[email protected]> * Improve handling of parens in copyright Also improve NOTICEs, and other misc. variants Do not detect "The Initial Developer" Signed-off-by: Philippe Ombredanne <[email protected]> * Correctly filter copyrights in licenses #3797 Reference: #3797 Reported-by: Jörg Arndt @Joerki Signed-off-by: Philippe Ombredanne <[email protected]> * Improve copyright detection Handle corner cases with markup Detect new copyright forms. Signed-off-by: Philippe Ombredanne <[email protected]> * Rename README file Signed-off-by: Philippe Ombredanne <[email protected]> * Improve copyright detection * Handle better various parens, markup and quotes Signed-off-by: Philippe Ombredanne <[email protected]> * Improve copyright detection Signed-off-by: Philippe Ombredanne <[email protected]> * Refine copyright detection Signed-off-by: Philippe Ombredanne <[email protected]> * Use latest commoncode Signed-off-by: Philippe Ombredanne <[email protected]> * Enable generation of copyright test data files Signed-off-by: Philippe Ombredanne <[email protected]> * Do not regen demarkup tests Signed-off-by: Philippe Ombredanne <[email protected]> Co-authored-by: Ayan Sinha Mahapatra <[email protected]> --------- Signed-off-by: Philippe Ombredanne <[email protected]> Co-authored-by: Ayan Sinha Mahapatra <[email protected]>
aboutcode-org · Jun 26, 2024 · 1242518 · 1242518
2 parents e4f6267 + f07eaee
commit 1242518
Show file tree

Hide file tree

Showing 234 changed files with 2,672 additions and 669 deletions.
diff --git a/etc/scripts/gen_copyright_tests.py b/etc/scripts/gen_copyright_tests.py
@@ -0,0 +1,162 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# ScanCode is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/nexB/skeleton for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
+#
+
+import time
+
+from datetime import datetime
+
+import click
+import requests
+
+
+def timestamp():
+    """
+    Return the current UTC date as an ISO 8601 "YYYY-MM-DD" string.
+    """
+    # datetime.utcnow() is deprecated since Python 3.12. Formatting the UTC struct_time
+    # with the already-imported time module yields the exact same text.
+    return time.strftime("%Y-%m-%d", time.gmtime())
+
+
+# Skeleton of the YAML expectation file written next to each generated test data file:
+# the "copyrights" and "holders" entries are left empty.
+EMPTY_COPY_TEST = (
+    "what:\n"
+    "  - copyrights\n"
+    "  - holders\n"
+    "copyrights:\n"
+    "holders:\n"
+)
+
+
+def _to_raw_github_url(url):
+    """
+    Return ``url`` converted to a raw.githubusercontent.com URL when it is a plain GitHub
+    "blob" URL. Return ``url`` unchanged otherwise.
+    """
+    github = "https://github.com/"
+    if not url.startswith(github):
+        return url
+    raw_url = "https://raw.githubusercontent.com/" + url[len(github):]
+    # only the first /blob/ is the GitHub marker: any later one is part of the file path
+    return raw_url.replace("/blob/", "/", 1)
+
+
+def _get_line_number(url):
+    """
+    Return the 1-based line number of a trailing "#L<number>" fragment of ``url``, or None
+    if there is no usable fragment. For a range such as "#L10-L20" return its first line.
+    """
+    _, marker, fragment = url.rpartition("#L")
+    if not marker:
+        return None
+    try:
+        line = int(fragment.partition("-")[0])
+    except ValueError:
+        return None
+    return line if line > 0 else None
+
+
+def _select_lines(content, line, context=5):
+    """
+    Return the text of ``content`` limited to the 1-based ``line`` and up to ``context``
+    lines before and after it.
+    """
+    # do not strip the content before splitting: leading blank lines count as lines
+    lines = content.splitlines()
+    start = max(line - 1 - context, 0)
+    return "\n".join(lines[start:line + context])
+
+
+def _test_file_name(url, index):
+    """
+    Return a test data file name for the ``url`` (or plain text) found at position
+    ``index`` of the URLs file. The GitHub org and repo are included when available.
+    """
+    base = f"copyright-test-{timestamp()}-{index}"
+    if url.startswith("http") and "github" in url:
+        # ignore any fragment so that it does not end up in the file name
+        segs = url.partition("#")[0].split("/")
+        if len(segs) > 4 and segs[3] and segs[4]:
+            base = f"{base}-{segs[3]}-{segs[4]}"
+    return f"{base}.copyright"
+
+
+@click.command()
+@click.option(
+    "-u",
+    "--urls",
+    "urls_file",
+    type=click.Path(exists=True, readable=True, path_type=str, dir_okay=False),
+    metavar="URLS-FILE",
+    multiple=False,
+    required=True,
+    help="Path to URLs file, one per line.",
+)
+@click.help_option("-h", "--help")
+def create_copyright_tests(
+    urls_file,
+):
+    """
+    Download the URLs listed in the URLS-FILE and create a copyright test for each in the
+    current directory.
+
+    If a line number is provided as a URL fragment #L2, uses only 5 lines before and after this
+    line.
+
+    If the URL is a plain GitHub URL, convert the URL to a raw URL.
+    If the URL does not start with http it is treated as a plain copyright text to test
+    """
+
+    with open(urls_file, encoding="utf-8") as urls:
+        for i, url in enumerate(urls):
+            url = url.strip()
+            if not url:
+                continue
+
+            is_url = url.startswith("http")
+            if is_url:
+                print(f"Fetching URL: {url}")
+                url = _to_raw_github_url(url)
+                _header, content = get_remote_file_content(url, as_text=True)
+                # a line fragment only makes sense for a fetched file, not for a plain text
+                line = _get_line_number(url)
+                if line is not None:
+                    content = _select_lines(content, line)
+            else:
+                print(f"Processing test: {url}")
+                content = url
+
+            # every entry gets a file name, including URLs that are not on GitHub
+            name = _test_file_name(url, i)
+
+            # copyright texts commonly contain non-ASCII characters: always write UTF-8
+            with open(name, "w", encoding="utf-8") as out:
+                out.write(content)
+
+            yml = EMPTY_COPY_TEST
+            if is_url:
+                yml = f"{yml}\nnotes: from {url}\n"
+
+            with open(f"{name}.yml", "w", encoding="utf-8") as out:
+                out.write(yml)
+
+            if is_url:
+                # be gentle with the remote servers between two fetches
+                time.sleep(1)
+
+
+class RemoteNotFetchedException(Exception):
+    """
+    Raised when a remote URL cannot be fetched successfully.
+    """
+
+
+def get_remote_file_content(
+    url,
+    as_text=True,
+    headers_only=False,
+    headers=None,
+    _delay=0,
+):
+    """
+    Fetch and return a tuple of (headers, content) at `url`. Return content as a
+    text string if `as_text` is True. Otherwise return the content as bytes.
+
+    If `headers_only` is True, return only (headers, None). Headers is a mapping
+    of HTTP headers. The optional `headers` mapping is sent with each request,
+    including retries.
+
+    Retries multiple times to fetch if there is a HTTP 429 throttling response
+    and this with an increasing delay starting at `_delay` seconds.
+
+    Raise a RemoteNotFetchedException for any other non-OK HTTP status, or when
+    the throttling retries are exhausted.
+    """
+    headers = headers or {}
+    while True:
+        time.sleep(_delay)
+        # using a GET with stream=True ensure we get the final header from
+        # several redirects and that we can ignore content there. A HEAD request may
+        # not get us this last header
+        print(f"    DOWNLOADING: {url}")
+        with requests.get(url, allow_redirects=True, stream=True, headers=headers) as response:
+            status = response.status_code
+            if status == requests.codes.ok:  # NOQA
+                if headers_only:
+                    return response.headers, None
+
+                return response.headers, response.text if as_text else response.content
+
+        # The response is closed at this stage, so no connection is held during the
+        # back-off sleep of the next attempt.
+        if status == 429 and _delay < 20:
+            # too many requests: start some exponential delay
+            _delay = (_delay * 2) or 1
+            continue
+
+        raise RemoteNotFetchedException(f"Failed HTTP request from {url} with {status}")
+
+
+# Run the click command only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    create_copyright_tests()
diff --git a/requirements.txt b/requirements.txt
@@ -10,7 +10,7 @@ chardet==5.0.0
 charset-normalizer==2.1.0
 click==8.1.7
 colorama==0.4.5
-commoncode==31.0.3
+commoncode==31.2.1
 construct==2.10.68
 container-inspector==31.1.0
 cryptography==42.0.5