Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve copyrights detection #3752

Merged
merged 29 commits into from
Jun 26, 2024
Merged
Show file tree
Hide file tree
Changes from 28 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
c206f91
Detect odd name in copyright #3655
pombredanne Apr 8, 2024
54b2309
Do not detect trailing Distributed in copyright #3735
pombredanne Apr 10, 2024
5215ef4
Improve misc. copyright detections
pombredanne Apr 10, 2024
d1cf644
Add new script to generate copyright tests
pombredanne Apr 12, 2024
5294521
Improve copyright detection
pombredanne Apr 12, 2024
ab6699f
Detect NN/EMAIL copyright combo #3764
pombredanne May 6, 2024
753c492
Merge remote-tracking branch 'upstream/develop' into misc-copyrights
pombredanne May 9, 2024
ca8efbd
Detect NN/EMAIL copyright combo #3764
pombredanne May 9, 2024
2354701
Align license with improved copyrights
pombredanne May 9, 2024
00a9abc
Improve copyright detection of "distributed"
pombredanne May 9, 2024
dd5de6e
Do not detect some words as NNP
pombredanne May 9, 2024
97d0bcb
Improve copyright tests
pombredanne May 9, 2024
6a9663e
Detect OpenStreetMap correctly
pombredanne May 9, 2024
12b7ace
Add new copyright detection tests
pombredanne May 9, 2024
0d7df58
Improve copyright detection side-effects
pombredanne May 10, 2024
8b2ddf5
Enable generation of copyright test file
pombredanne May 10, 2024
1fc7cea
Improve copyright debug tracing
pombredanne May 10, 2024
b0a6e26
Detect new form of copyright
pombredanne May 10, 2024
7ff3f8e
Do not add arbitrary space around markup
pombredanne May 12, 2024
850edc1
Improve handle of parens in copyright
pombredanne May 13, 2024
f3f2c78
Correctly filter copyrights in licenses #3797
pombredanne Jun 7, 2024
461fd65
Improve copyright detection
pombredanne Jun 7, 2024
6438377
Rename README file
pombredanne Jun 7, 2024
1f94c9d
Improve copyright detection
pombredanne Jun 7, 2024
3bccb31
Improve copyright detection
pombredanne Jun 21, 2024
412358a
Refine copyright detection
pombredanne Jun 21, 2024
87d5559
Use latest commoncode
pombredanne Jun 22, 2024
35366f0
Enable generation of copyright test data files
pombredanne Jun 22, 2024
f07eaee
Do not regen demarkup tests
pombredanne Jun 26, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
162 changes: 162 additions & 0 deletions etc/scripts/gen_copyright_tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# ScanCode is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/nexB/skeleton for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#

import time

from datetime import datetime

import click
import requests


def timestamp():
    """
    Return the current UTC date as an ISO "YYYY-MM-DD" string.

    This string is used as a component of the generated test file names.
    """
    # time.gmtime() is always UTC; this avoids datetime.utcnow(), which is
    # deprecated since Python 3.12, while producing the same date string.
    return time.strftime("%Y-%m-%d", time.gmtime())


# Skeleton of the YAML file written next to each generated ".copyright" test
# file: it lists what to detect ("copyrights" and "holders") and leaves the
# expected "copyrights" and "holders" values empty.
EMPTY_COPY_TEST = """what:
- copyrights
- holders
copyrights:
holders:
"""


def _test_file_name(url, index):
    """
    Return the name of the test file to create for the `url` entry found at
    position `index` in the URLs file.
    """
    generic_name = f"copyright-test-{timestamp()}-{index}.copyright"
    if not url.startswith("http") or "github" not in url:
        # plain text entries and non-GitHub URLs have no org/repo to use in the name
        return generic_name

    segments = url.split("/")
    if len(segments) < 5:
        # not a "scheme://host/org/repo/..." URL: nothing to extract
        return generic_name

    org, repo = segments[3], segments[4]
    return f"copyright-test-{timestamp()}-{index}-{org}-{repo}.copyright"


def _line_window(url, context=5):
    """
    Return a (start, end) tuple of slice indexes selecting the lines around the
    line number found in the "#L<number>" fragment of `url`, using `context`
    lines on each side. Return None if there is no usable line number.
    """
    if "#L" not in url:
        return None

    fragment = url.rpartition("#L")[2]
    # GitHub also uses line ranges such as "#L10-L20": keep the first line number
    number = fragment.partition("-")[0]
    if not number.isdecimal():
        return None

    line = int(number)
    return max(line - context, 0), line + context


@click.command()
@click.option(
    "-u",
    "--urls",
    "urls_file",
    type=click.Path(exists=True, readable=True, path_type=str, dir_okay=False),
    metavar="URLS-FILE",
    multiple=False,
    required=True,
    help="Path to URLs file, one per line.",
)
@click.help_option("-h", "--help")
def create_copyright_tests(
    urls_file,
):
    """
    Download the URLs listed in the URLS-FILE and create a copyright test for each in the current
    directory.

    If a line number is provided as a URL fragment #L2, uses only 5 lines before and after this
    line.

    If the URL is a plain GitHub URL, convert the URL to a raw URL.
    If the URL does not start with http it is treated as a plain copyright text to test
    """

    with open(urls_file, encoding="utf-8") as urls:
        for i, url in enumerate(urls):
            url = url.strip()
            if not url:
                continue

            is_remote = url.startswith("http")
            if is_remote:
                print(f"Fetching URL: {url}")
                if url.startswith("https://github.com"):
                    # fetch the raw file rather than the GitHub HTML page
                    url = url.replace("https://github.com", "https://raw.githubusercontent.com")
                    url = url.replace("/blob/", "/")
            else:
                print(f"Processing test: {url}")

            # every entry gets a non-empty file name, including non-GitHub URLs
            name = _test_file_name(url, i)

            if is_remote:
                _headers, content = get_remote_file_content(url, as_text=True)
                # a line fragment only makes sense for fetched files, not for plain text
                window = _line_window(url)
                if window:
                    start_line, end_line = window
                    # keep the line endings so that the excerpt lines are not fused together
                    lines = content.strip().splitlines(keepends=True)
                    content = "".join(lines[start_line:end_line])
            else:
                content = url

            with open(name, "w", encoding="utf-8") as out:
                out.write(content)

            yml = EMPTY_COPY_TEST
            if is_remote:
                yml = f"{yml}\nnotes: from {url}\n"

            with open(f"{name}.yml", "w", encoding="utf-8") as out:
                out.write(yml)

            if is_remote:
                # pause between downloads to avoid hammering the remote server
                time.sleep(1)


class RemoteNotFetchedException(Exception):
    """Raised when a remote URL cannot be fetched with a successful HTTP status."""


def get_remote_file_content(
    url,
    as_text=True,
    headers_only=False,
    headers=None,
    _delay=0,
):
    """
    Fetch and return a tuple of (headers, content) at `url`. Return content as a
    text string if `as_text` is True. Otherwise return the content as bytes.

    If `headers_only` is True, return only (headers, None). Headers is a mapping
    of HTTP headers. The optional `headers` argument is a mapping of HTTP
    request headers sent with every attempt, including retries.

    Retries multiple times to fetch if there is a HTTP 429 throttling response
    and this with an increasing delay, starting with a `_delay` in seconds.

    Raise a RemoteNotFetchedException if the final response status is not OK.
    """
    headers = headers or {}
    delay = _delay

    while True:
        time.sleep(delay)
        # using a GET with stream=True ensure we get the final header from
        # several redirects and that we can ignore content there. A HEAD request
        # may not get us this last header
        print(f" DOWNLOADING: {url}")
        with requests.get(url, allow_redirects=True, stream=True, headers=headers) as response:
            status = response.status_code
            if status == requests.codes.ok:  # NOQA
                if headers_only:
                    return response.headers, None

                return response.headers, response.text if as_text else response.content

        # the failed response is closed at this point, before any retry sleep
        if status == 429 and delay < 20:
            # too many requests: start some exponential delay
            delay = (delay * 2) or 1
            continue

        raise RemoteNotFetchedException(f"Failed HTTP request from {url} with {status}")


# Run the click command only when this file is executed as a script.
if __name__ == "__main__":
    create_copyright_tests()
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ chardet==5.0.0
charset-normalizer==2.1.0
click==8.1.7
colorama==0.4.5
-commoncode==31.0.3
+commoncode==31.2.1
construct==2.10.68
container-inspector==31.1.0
cryptography==42.0.5
Expand Down
Loading
Loading