Skip to content

Commit

Permalink
Improve copyright detection
Browse files Browse the repository at this point in the history
* Better handle various parens, markup and quotes

Signed-off-by: Philippe Ombredanne <[email protected]>
  • Loading branch information
pombredanne committed Jun 7, 2024
1 parent 6438377 commit 1f94c9d
Show file tree
Hide file tree
Showing 57 changed files with 647 additions and 37 deletions.
18 changes: 15 additions & 3 deletions src/cluecode/copyrights.py
Original file line number Diff line number Diff line change
Expand Up @@ -2389,7 +2389,7 @@ def build_detection_from_node(
NAME-EMAIL: {<NAME> <EMAIL>} #530
# Project Mayo.
NAME-YEAR: {<YR-RANGE> <NAME-EMAIL|COMPANY>+ <NNP>?} #535
NAME-YEAR: {<PARENS>? <YR-RANGE> <NAME-EMAIL|COMPANY>+ <NNP>? <PARENS>?} #535
NAME-YEAR: {<YR-RANGE> <NAME-EMAIL|COMPANY>+ <CC> <YR-RANGE>} #540
Expand All @@ -2404,7 +2404,6 @@ def build_detection_from_node(
# Copyright 2018, OpenCensus Authors
COPYRIGHT: {<COPY>+ <YR-RANGE> <NNP> <AUTHS>} #1579991
NAME-YEAR: {<YR-RANGE> <NNP>+ <CAPS>?} #5612
#Academy of Motion Picture Arts and Sciences
Expand All @@ -2418,6 +2417,8 @@ def build_detection_from_node(
NAME-YEAR: {<YR-RANGE> <NAME>+ <CONTRIBUTORS>?} #570
URL: {<PARENS> <URL> <PARENS>} #5700
#also accept trailing email and URLs
NAME-YEAR: {<NAME-YEAR> <EMAIL>?<URL>?} #5701
NAME-YEAR: {<NAME-YEAR>+} #5702
Expand Down Expand Up @@ -2470,7 +2471,6 @@ def build_detection_from_node(
# this is catching a wide net by treating any bare URL as a company
COMPANY: {<NNP>? <URL|URL2>} #830
COMPANY: {<COMPANY> <COMP|COMPANY>} #840
# the Software and Component Technologies group of Trimble Navigation, Ltd.
Expand Down Expand Up @@ -2649,6 +2649,9 @@ def build_detection_from_node(
# Copyright (c) Ian F. Darwin 1986, 1987, 1989, 1990, 1991, 1992, 1994, 1995.
COPYRIGHT: {<COPY>+ <NAME|NAME-EMAIL|NAME-YEAR>+ <YR-RANGE>*} #157999
# portions copyright The Internet Society, Tom Tromey and Red Hat, Inc.
COPYRIGHT: {<PORTIONS> <COPY> <NN> <NAME>} #157998
COPYRIGHT: {<COPY>+ <CAPS|NNP>+ <CC> <NN> <COPY> <YR-RANGE>?} #1590
# // (c) (C) → ©
Expand Down Expand Up @@ -2737,6 +2740,9 @@ def build_detection_from_node(
# (c) Copyright 1985-1999 SOME TECHNOLOGY SYSTEMS
COPYRIGHT2: {<COPY> <COPY> <YR-RANGE> <CAPS> <CAPS> <CAPS>? <CAPS>?} #2271
# Minpack Copyright Notice (1999) University of Chicago
COPYRIGHT: {<COPY> <NOTICE> <NAME-YEAR>} #2273.1
# NAME-COPY is a name with a trailing copyright
# Daisy (c) 1998
NAME-COPY: {<NNP> <COPY>} #2272
Expand Down Expand Up @@ -3081,6 +3087,12 @@ def build_detection_from_node(
# Copyright (c) 2014 The Rust Project Developers
COPYRIGHT: {<COPYRIGHT> <MAINT> } #83020
# copyright its authors
COPYRIGHT: {<COPY> <NN> <AUTHS>} #83030
# Copyright: 2004-2007 by Internet Systems Consortium, Inc. ("ISC")
# 1995-2003 by Internet Software Consortium
COPYRIGHT: {<YR-RANGE> <BY> <COMPANY> } #1615
#######################################
# Copyright is held by ....
Expand Down
19 changes: 17 additions & 2 deletions src/textcode/markup.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,22 @@ def demarkup(location):
yield demarkup_text(line)


# Splitter for markup: the pattern is a single capturing group, so the bound
# ``split`` keeps every matched delimiter (tag start, character entity, "href"
# or an optionally quoted "/>" tag end) in the returned list, case-insensitively.
# NOTE(review): in this diff view the one-line form is immediately re-assigned
# by the multi-line definition that follows it.
get_tags_and_entities = re.compile(r'(</?[^\s></]+(?:>|\s)?|&[^\s&]+;|href|[\'"]?\/\>)', re.IGNORECASE).split
# Split text on HTML/XML markup while keeping the delimiters: the whole pattern
# is one capturing group, so ``re.split`` returns the matched tags and entities
# interleaved with the plain text found between them.
get_tags_and_entities = re.compile(
    r'('
    # Start of an opening or closing tag such as "<p>", "</p>" or "<a ": the
    # tag name followed by an optional ">" or a single whitespace character.
    r'</?[^\s></]+(?:>|\s)?'
    r'|'
    # Character entity such as "&amp;" or "&#169;".
    r'&[^\s&]+;'
    r'|'
    r'href'
    r'|'
    # End of a self-closing tag, optionally preceded by the quote that closes
    # the last attribute value: '/>', '"/>' or "'/>". Because the quote is
    # optional this branch also covers a bare "/>", so no separate alternative
    # is needed. Kept as a raw string so the pattern contains no invalid
    # string escape sequences.
    r'[\'"]?/>'
    r')',
    re.IGNORECASE,
).split


def demarkup_text(text):
Expand Down Expand Up @@ -136,7 +151,7 @@ def demarkup_text(text):
cleaned_append = cleaned.append
for token in tags_and_ents:
tlow = token.lower()
if tlow.startswith(('<', '/>', '&', 'href',)) and not any(k in tlow for k in kept_tags):
if tlow.startswith(('<', '/>', '"/>', "'/>", '&', 'href',)) and not any(k in tlow for k in kept_tags):
cleaned_append(' ')
else:
cleaned_append(token)
Expand Down
2 changes: 1 addition & 1 deletion tests/cluecode/data/copyright_fossology/testdata87_raw
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ It was downloaded from http://ftp.isc.org/isc/dhcp/
Upstream <s>Author: Internet Systems Consortium (ISC) <[email protected]></s>

<s>Copyright 2004-2007 by Internet Systems Consortium, Inc.</s> ("ISC")
1995-2003 by Internet Software Consortium
<s>1995-2003 by Internet Software Consortium</s>

License:

Expand Down
8 changes: 4 additions & 4 deletions tests/cluecode/data/copyright_fossology/testdata93_raw
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,10 @@ Copyright:
CTOCWidget.js:
<s>Copyright (c) 2003 The Netscape Corporation</s>.
xbCollapsibleLists.js:
<s>Copyright (c) 1997 Michael Bostock</s> (Netscape Communications).
<s>Copyright (c) 2001 Bob Clary</s> (Netscape Communications).
<s>Copyright (c) 2001 Seth Dillingham</s> (Macrobyte Resources).
<s>Copyright (c) 2002 Mark Filanowicz</s> (Amdahl IT Services).
<s>Copyright (c) 1997 Michael Bostock (Netscape Communications)</s>.
<s>Copyright (c) 2001 Bob Clary (Netscape Communications)</s>.
<s>Copyright (c) 2001 Seth Dillingham (Macrobyte Resources)</s>.
<s>Copyright (c) 2002 Mark Filanowicz (Amdahl IT Services)</s>.

Upstream <s>Author:

Expand Down
2 changes: 1 addition & 1 deletion tests/cluecode/data/copyrights/libcdio10-libcdio.label.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ copyrights:
- Copyright (c) 1985, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1996, 1997, 1998, 1999,
2000 Free Software Foundation, Inc.
- Copyright (c) 2003 Matthias Drochner
- Copyright (c) 1998-2001 VideoLAN ( Johan Bilien <[email protected]> and Gildas Bazin <[email protected]> )
- Copyright (c) 1998-2001 VideoLAN Johan Bilien <[email protected]> and Gildas Bazin <[email protected]>
- Copyright (c) 1992, 1993 Eric Youngdale
- Copyright (c) 2003, 2004, 2005, 2006, 2007, 2008 Rocky Bernstein and Herbert Valerio Riedel
holders:
Expand Down
1 change: 0 additions & 1 deletion tests/cluecode/data/copyrights/misco2/distributed_3.txt

This file was deleted.

11 changes: 0 additions & 11 deletions tests/cluecode/data/copyrights/misco2/distributed_3.txt.yml

This file was deleted.

8 changes: 8 additions & 0 deletions tests/cluecode/data/copyrights/misco2/its-authors.txt.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,11 @@ what:
- holders
- holders_summary
- authors
copyrights:
- copyright its authors
holders:
- its authors
holders_summary:
- value: its authors
count: 1

4 changes: 4 additions & 0 deletions tests/cluecode/data/copyrights/misco3/and-others.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Copyright © 1998-2009 Bill Spitzak
([email protected] ) and others, including:


12 changes: 12 additions & 0 deletions tests/cluecode/data/copyrights/misco3/and-others.txt.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
what:
- copyrights
- holders
- holders_summary
- authors
copyrights:
- Copyright (c) 1998-2009 Bill Spitzak ([email protected] ) and others
holders:
- Bill Spitzak and others
holders_summary:
- value: Bill Spitzak and others
count: 1
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<p class="apache">Copyright 2009 The Apache Software Foundation.<br />
12 changes: 12 additions & 0 deletions tests/cluecode/data/copyrights/misco3/apache-foundation.txt.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
what:
- copyrights
- holders
- holders_summary
- authors
copyrights:
- Copyright 2009 The Apache Software Foundation
holders:
- The Apache Software Foundation
holders_summary:
- value: The Apache Software Foundation
count: 1
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
This project is copyright its authors and licensed under either of

Apache License, Version 2.0, (LICENSE-APACHE or http://www.apache.org/licenses/LICENSE-2.0)
MIT license (LICENSE-MIT or http://opensource.org/licenses/MIT) at your option.

12 changes: 12 additions & 0 deletions tests/cluecode/data/copyrights/misco3/apache-mit-copyright.txt.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
what:
- copyrights
- holders
- holders_summary
- authors
copyrights:
- copyright its authors
holders:
- its authors
holders_summary:
- value: its authors
count: 1
3 changes: 3 additions & 0 deletions tests/cluecode/data/copyrights/misco3/apple-author.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@

'cannot be modified' by Apple's SF Pro
Symbols Viewer v2.0.0
7 changes: 7 additions & 0 deletions tests/cluecode/data/copyrights/misco3/apple-author.txt.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
what:
- copyrights
- holders
- holders_summary
- authors
authors:
- Apple's SF Pro
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
Copyright (c) 2016-2020 Alexey Svetliakov <https://github.com/asvetliakov>,
snerks <https://github.com/snerks>, Krzysztof Cebula
<https://github.com/Havret>, Vitaliy Polyanskiy <https://github.com/alreadyExisted>,
James Lismore <https://github.com/jlismore>, Stack Builders <https://github.com/stackbuilders>,
Esteban Ibarra <https://github.com/ibarrae>, Dominic Lee
<https://github.com/dominictwlee>, Dave Vedder <https://github.com/veddermatic>,
Alec Flett <https://github.com/alecf> and potentially other
DefinitelyTyped contributors.
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
what:
- copyrights
- holders
- holders_summary
- authors
copyrights:
- Copyright (c) 2016-2020 Alexey Svetliakov <https://github.com/asvetliakov> , snerks <https://github.com/snerks>
, Krzysztof Cebula <https://github.com/Havret> , Vitaliy Polyanskiy <https://github.com/alreadyExisted>
, James Lismore <https://github.com/jlismore> , Stack Builders <https://github.com/stackbuilders>
, Esteban Ibarra <https://github.com/ibarrae> , Dominic Lee <https://github.com/dominictwlee>
, Dave Vedder <https://github.com/veddermatic> , Alec Flett <https://github.com/alecf>
holders:
- Alexey Svetliakov , snerks , Krzysztof Cebula , Vitaliy Polyanskiy , James Lismore , Stack
Builders , Esteban Ibarra , Dominic Lee , Dave Vedder , Alec Flett
holders_summary:
- value: Alexey Svetliakov , snerks , Krzysztof Cebula , Vitaliy Polyanskiy , James Lismore
, Stack Builders , Esteban Ibarra , Dominic Lee , Dave Vedder , Alec Flett
count: 1
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
or Kana; or (c) the Republic of Ireland, if a license to the Software
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
what:
- copyrights
- holders
- holders_summary
- authors
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
The copyright of UC Berkeley's Berkeley Software Distribution ("BSD")
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
what:
- copyrights
- holders
- holders_summary
- authors
copyrights:
- copyright of UC Berkeley's Berkeley Software
holders:
- UC Berkeley's Berkeley Software
holders_summary:
- value: UC Berkeley's Berkeley Software
count: 1
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
* The libidn code is copyright Simon Josefsson, with portions copyright
The Internet Society, Tom Tromey and Red Hat, Inc.:
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
what:
- copyrights
- holders
- holders_summary
- authors
copyrights:
- copyright Simon Josefsson
- portions copyright The Internet Society, Tom Tromey and Red Hat, Inc.
holders:
- Simon Josefsson
- The Internet Society, Tom Tromey and Red Hat, Inc.
holders_summary:
- value: Simon Josefsson
count: 1
- value: The Internet Society, Tom Tromey and Red Hat, Inc.
count: 1
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Minpack Copyright Notice (1999) University of Chicago
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
what:
- copyrights
- holders
- holders_summary
- authors
copyrights:
- Copyright Notice (1999) University of Chicago
holders:
- University of Chicago
holders_summary:
- value: University of Chicago
count: 1
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
* Copyright (C) 2010 GSyC/LibreSoft, Universidad Rey Juan Carlos.





Copyright (c) Distributed Management Task Force
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
what:
- copyrights
- holders
- holders_summary
- authors
copyrights:
- Copyright (c) 2010 GSyC/LibreSoft, Universidad Rey Juan Carlos
- Copyright (c) Distributed Management Task Force
holders:
- GSyC/LibreSoft, Universidad Rey Juan Carlos
- Distributed Management Task Force
holders_summary:
- value: Distributed Management Task Force
count: 1
- value: GSyC/LibreSoft, Universidad Rey Juan Carlos
count: 1
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
The MIT License (MIT)

Copyright (c) 2015-2020 Formidable Labs.

Copyright (c) 2016-2020 Alexey Svetliakov <https://github.com/asvetliakov>,
snerks <https://github.com/snerks>, Krzysztof Cebula <https://github.com
Havret>, Vitaliy Polyanskiy <https://github.com/alreadyExisted>, James Lismore
<https://github.com/jlismore>, Stack Builders <https://github.com
stackbuilders>, Esteban Ibarra <https://github.com/ibarrae>, Dominic Lee
<https://github.com/dominictwlee>, Dave Vedder <https://github.com
veddermatic>, Alec Flett <https://github.com/alecf> and potentially other
DefinitelyTyped contributors.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
what:
- copyrights
- holders
- holders_summary
- authors
copyrights:
- Copyright (c) 2015-2020 Formidable Labs
- Copyright (c) 2016-2020 Alexey Svetliakov <https://github.com/asvetliakov> , snerks <https://github.com/snerks>
, Krzysztof Cebula https://github.com
holders:
- Formidable Labs
- Alexey Svetliakov , snerks , Krzysztof Cebula
holders_summary:
- value: Alexey Svetliakov , snerks , Krzysztof Cebula
count: 1
- value: Formidable Labs
count: 1
Loading

0 comments on commit 1f94c9d

Please sign in to comment.