From b1cef69aaced6722a02ff43a16b9976e9f35b66e Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Tue, 23 Jul 2024 14:58:17 +0200 Subject: [PATCH] Improve package datafile handlers * Add new attributes to the DatafileHandler class for a datasource type and the supported operating systems * Add and test new validate() method to validate DatafileHandler collection correctness * Apply minor refactorings and code formatting Signed-off-by: Philippe Ombredanne --- src/packagedcode/__init__.py | 7 ++- src/packagedcode/alpine.py | 1 + src/packagedcode/debian.py | 4 ++ src/packagedcode/maven.py | 24 +++++---- src/packagedcode/models.py | 82 ++++++++++++++++++++++++------- src/packagedcode/msi.py | 1 + src/packagedcode/pypi.py | 2 +- src/packagedcode/rpm.py | 8 +++ src/packagedcode/utils.py | 4 +- src/packagedcode/win_reg.py | 6 +++ tests/packagedcode/test_models.py | 37 ++++++++++++++ 11 files changed, 141 insertions(+), 35 deletions(-) create mode 100644 tests/packagedcode/test_models.py diff --git a/src/packagedcode/__init__.py b/src/packagedcode/__init__.py index 629a1655de2..6ab213eb590 100644 --- a/src/packagedcode/__init__.py +++ b/src/packagedcode/__init__.py @@ -7,8 +7,6 @@ # See https://aboutcode.org for more information about nexB OSS projects. # -import attr - from commoncode.system import on_linux from packagedcode import about from packagedcode import alpine @@ -254,6 +252,7 @@ ] ) +# registry of all handler classes keyed by datasource_id HANDLER_BY_DATASOURCE_ID = {handler.datasource_id: handler for handler in ALL_DATAFILE_HANDLERS} @@ -263,8 +262,8 @@ class UnknownPackageDatasource(Exception): def get_package_handler(package_data): """ - Return the DatafileHandler class that corresponds to a ``package_data`` - PackageData object. Raise a UnknownPackageDatasource error if the + Return the DatafileHandler class for the datasource_id of a ``package_data`` + PackageData object. Raise a UnknownPackageDatasource error if the DatafileHandler is not found.
""" ppc = HANDLER_BY_DATASOURCE_ID.get(package_data.datasource_id) diff --git a/src/packagedcode/alpine.py b/src/packagedcode/alpine.py index 1b155a90c97..8206d8232f1 100644 --- a/src/packagedcode/alpine.py +++ b/src/packagedcode/alpine.py @@ -58,6 +58,7 @@ def get_license_detections_and_expression(package): class AlpineInstalledDatabaseHandler(models.DatafileHandler): datasource_id = 'alpine_installed_db' + datasource_type = 'sys' path_patterns = ('*lib/apk/db/installed',) default_package_type = 'alpine' description = 'Alpine Linux installed package database' diff --git a/src/packagedcode/debian.py b/src/packagedcode/debian.py index ff1d618fd20..173e3342107 100644 --- a/src/packagedcode/debian.py +++ b/src/packagedcode/debian.py @@ -223,6 +223,7 @@ def assign_package_to_resources(cls, package, resource, codebase, package_adder) class DebianInstalledStatusDatabaseHandler(models.DatafileHandler): datasource_id = 'debian_installed_status_db' + datasource_type = 'sys' default_package_type = 'deb' path_patterns = ('*var/lib/dpkg/status',) description = 'Debian installed packages database' @@ -391,6 +392,7 @@ def assemble(cls, package_data, resource, codebase, package_adder): class DebianDistrolessInstalledDatabaseHandler(models.DatafileHandler): datasource_id = 'debian_distroless_installed_db' + datasource_type = 'sys' default_package_type = 'deb' path_patterns = ('*var/lib/dpkg/status.d/*',) description = 'Debian distroless installed database' @@ -474,6 +476,7 @@ class DebianInstalledFilelistHandler(models.DatafileHandler): # seen in installed rootfs in: # - /var/lib/dpkg/info/.list datasource_id = 'debian_installed_files_list' + datasource_type = 'sys' default_package_type = 'deb' path_patterns = ( '*var/lib/dpkg/info/*.list', @@ -499,6 +502,7 @@ class DebianInstalledMd5sumFilelistHandler(models.DatafileHandler): # - /var/lib/dpkg/info/.md5sums # - /var/lib/dpkg/info/.md5sums datasource_id = 'debian_installed_md5sums' + datasource_type = 'sys' default_package_type = 
'deb' path_patterns = ( '*var/lib/dpkg/info/*.md5sums', diff --git a/src/packagedcode/maven.py b/src/packagedcode/maven.py index 1498add8cf0..a60b3c4b3d1 100644 --- a/src/packagedcode/maven.py +++ b/src/packagedcode/maven.py @@ -55,6 +55,7 @@ there is no pom.properties check if there are side-by-side artifacts """ + class MavenBasePackageHandler(models.DatafileHandler): @classmethod @@ -71,7 +72,7 @@ def assemble(cls, package_data, resource, codebase, package_adder=models.add_to_ datafile_path = resource.path # This order is important as we want pom.xml to be used for package - # creation and then to update from MANIFEST later + # creation and then to update from MANIFEST later manifest_path_pattern = '*/META-INF/MANIFEST.MF' nested_pom_xml_path_pattern = '*/META-INF/maven/**/pom.xml' datafile_name_patterns = (nested_pom_xml_path_pattern, manifest_path_pattern) @@ -103,7 +104,7 @@ def assemble(cls, package_data, resource, codebase, package_adder=models.add_to_ return if manifests and pom_xmls: - #raise Exception(resource.path, meta_inf_resource, datafile_name_patterns, package_adder) + # raise Exception(resource.path, meta_inf_resource, datafile_name_patterns, package_adder) parent_resource = meta_inf_resource.parent(codebase) if not parent_resource: parent_resource = meta_inf_resource @@ -272,7 +273,7 @@ def assign_package_to_resources(cls, package, resource, codebase, package_adder) for child in root.walk(codebase): if 'pom.xml' in child.path: number_poms += 1 - + if number_poms > 1: root = resource else: @@ -315,7 +316,7 @@ def parse(cls, location, package_only=False): if TRACE: logger.debug(f'MavenPomPropertiesHandler.parse: properties: {properties!r}') if properties: - yield from cls.parse_pom_properties(properties=properties, package_only=package_only) + yield from cls.parse_pom_properties(properties=properties, package_only=package_only) @classmethod def parse_pom_properties(cls, properties, package_only=False): @@ -1308,11 +1309,14 @@ def _parse( ) return 
MavenPackageData.from_data(package_data, package_only) + class MavenPackageData(models.PackageData): datasource_id = 'maven_pom' + @classmethod def get_license_detections_for_extracted_license_statement( + cls, extracted_license, try_as_expression=True, approximate=True, @@ -1321,16 +1325,16 @@ def get_license_detections_for_extracted_license_statement( from packagedcode.licensing import get_normalized_license_detections from packagedcode.licensing import get_license_detections_for_extracted_license_statement - if not MavenPackageData.check_extracted_license_statement_structure(extracted_license): + if not cls.check_extracted_license_statement_structure(extracted_license): return get_normalized_license_detections( extracted_license=extracted_license, try_as_expression=try_as_expression, approximate=approximate, expression_symbols=expression_symbols, ) - + new_extracted_license = extracted_license.copy() - + for license_entry in new_extracted_license: license_entry.pop("distribution") if not license_entry.get("name"): @@ -1349,8 +1353,8 @@ def get_license_detections_for_extracted_license_statement( expression_symbols=expression_symbols, ) - - def check_extracted_license_statement_structure(extracted_license): + @classmethod + def check_extracted_license_statement_structure(cls, extracted_license): is_list_of_mappings = False if not isinstance(extracted_license, list): @@ -1362,7 +1366,7 @@ def check_extracted_license_statement_structure(extracted_license): if not isinstance(extracted_license_item, dict): is_list_of_mappings = False break - + return is_list_of_mappings diff --git a/src/packagedcode/models.py b/src/packagedcode/models.py index 26c6b6d0c8c..4b3dee40e75 100644 --- a/src/packagedcode/models.py +++ b/src/packagedcode/models.py @@ -7,18 +7,18 @@ # See https://aboutcode.org for more information about nexB OSS projects. 
# +import logging import os import uuid -from fnmatch import fnmatchcase -import logging import sys +from fnmatch import fnmatchcase + import attr -from packageurl import normalize_qualifiers -from packageurl import PackageURL import saneyaml from commoncode import filetype +from commoncode.fileutils import as_posixpath from commoncode.datautils import choices from commoncode.datautils import Boolean from commoncode.datautils import Date @@ -26,10 +26,11 @@ from commoncode.datautils import List from commoncode.datautils import Mapping from commoncode.datautils import String -from commoncode.fileutils import as_posixpath from commoncode.resource import Resource from license_expression import combine_expressions from license_expression import Licensing +from packageurl import normalize_qualifiers +from packageurl import PackageURL try: from typecode import contenttype @@ -41,6 +42,7 @@ except ImportError: licensing = None +# FIXME: what if licensing is not importable? from packagedcode.licensing import get_declared_license_expression_spdx """ @@ -963,7 +965,7 @@ def get_license_detections_and_expression(self): return [], None if self.datasource_id: - default_relation_license=get_default_relation_license( + default_relation_license = get_default_relation_license( datasource_id=self.datasource_id, ) else: @@ -1017,12 +1019,11 @@ def add_to_package(package_uid, resource, codebase): class DatafileHandler: """ - A base handler class to handle any package manifests, lockfiles and data - files. Each subclass handles a package datafile format to parse datafiles - and assemble Package and Depdencies from these: + A base handler class to handle any package manifest, lockfile, package database + and related data files. Each subclass handles a package datafile format to parse + datafiles and assemble Package and Dependencies from these: - parses a datafile format and yields package data. 
- - assembles this datafile package data in top-level packages and dependencies - assigns package files to their package """ @@ -1033,6 +1034,16 @@ class DatafileHandler: # can only contain ASCII letters, digits and underscore. Must be lowercase datasource_id = None + # style of package data processed by this handler, either app for application package like npm, + # sys for system packages like rpm, or info for informational data file that provides hints but + # is not a package manifest, like with a README file + # possible values are app, sys and info + datasource_type = 'app' + + # tuple of specifically supported operating systems. If None or empty, all platforms are supported + # possible values are win, mac, linux, freebsd + supported_oses = tuple() + # Sequence of known fnmatch-style case-insensitive glob patterns (e.g., Unix # shell style patterns) that apply on the whole POSIX path for package # datafiles recognized and parsed by this parser. See fnmatch.fnmatch(). @@ -1053,7 +1064,7 @@ class DatafileHandler: # Informational: Default primary language for this parser. default_primary_language = None - # If the datafilehandler contains only resolved dependencies + # If the handler is for a lockfile that contains locked/pinned, pre-resolved dependencies is_lockfile = False # Informational: Description of this parser @@ -1062,7 +1073,9 @@ class DatafileHandler: # Informational: URL that documents this file format documentation_url = None - # Default Relation between license elements detected in an `extracted_license_statement` + # Default license expression relation between the license detected in an + # `extracted_license_statement` for this data file. + # This may vary for each data file based on conventions and specifications. default_relation_license = None @classmethod @@ -1491,11 +1504,44 @@ def get_top_level_resources(cls, manifest_resource, codebase): """ pass + @classmethod + def validate(cls): + """ + Validate this class. 
+ Raise ImproperlyConfiguredDatafileHandler exception on errors. + """ + + did = cls.datasource_id + if not did: + raise ImproperlyConfiguredDatafileHandler( + f'The handler {cls!r} has an empty datasource_id {did!r}.') + + DATASOURCE_TYPES = 'app', 'sys', 'info', + dfs = cls.datasource_type + if dfs not in DATASOURCE_TYPES: + raise ImproperlyConfiguredDatafileHandler( + f'The handler {did!r} : {cls!r} has an invalid ' + f'datasource_type: {dfs!r}: must be one of {DATASOURCE_TYPES!r}.' + ) + + oses = 'linux', 'win', 'mac', 'freebsd', + soses = cls.supported_oses + if soses and not all(s in oses for s in soses): + raise ImproperlyConfiguredDatafileHandler( + f'The handler {cls.datasource_id!r} : {cls!r} has invalid ' + f'supported_oses: {soses!r}: must be empty or among {oses!r}' + ) + + +class ImproperlyConfiguredDatafileHandler(Exception): + """ScanCode Package Datafile Handler is not properly configured""" + pass + class NonAssemblableDatafileHandler(DatafileHandler): """ - A handler that has no default implmentation for the assemble method, e.g., - it will not alone trigger the creation of a top-level Pacakge. + A handler with a default implementation of an assemble method doing nothing, e.g., + it will not alone trigger the creation of a top-level Package.
""" @classmethod @@ -1528,8 +1574,8 @@ def build_purl(mapping): subpath = mapping.get('subpath') return PackageURL( type=ptype, - name=name, namespace=namespace, + name=name, version=version, qualifiers=qualifiers, subpath=subpath, @@ -1601,10 +1647,10 @@ def from_package_data(cls, package_data, datafile_path, package_only=False): license_match['from_file'] = datafile_path package = cls.from_dict(package_data_mapping) - + if not package.package_uid: package.package_uid = build_package_uid(package.purl) - + if not package_only: package.populate_license_fields() package.populate_holder_field() @@ -1763,7 +1809,7 @@ def refresh_license_expressions(self, default_relation='AND'): self.declared_license_expression_spdx = get_declared_license_expression_spdx( declared_license_expression=self.declared_license_expression, ) - + if self.other_license_detections: self.other_license_expression = str(combine_expressions( expressions=[ diff --git a/src/packagedcode/msi.py b/src/packagedcode/msi.py index 5ce1ba534ed..d36ab1d8ff0 100644 --- a/src/packagedcode/msi.py +++ b/src/packagedcode/msi.py @@ -195,6 +195,7 @@ class MsiInstallerHandler(models.DatafileHandler): default_package_type = 'msi' description = 'Microsoft MSI installer' documentation_url = 'https://docs.microsoft.com/en-us/windows/win32/msi/windows-installer-portal' + supported_oses = ('linux',) @classmethod def parse(cls, location, package_only=False): diff --git a/src/packagedcode/pypi.py b/src/packagedcode/pypi.py index dcfe261d946..3a09662cb11 100644 --- a/src/packagedcode/pypi.py +++ b/src/packagedcode/pypi.py @@ -1093,7 +1093,7 @@ def parse(cls, location, package_only=False): package_only=package_only, ) - +# FIXME: this is NOT used class PypiSdistArchiveHandler(models.DatafileHandler): datasource_id = 'pypi_sdist' path_patterns = ('*.tar.gz', '*.tar.bz2', '*.zip',) diff --git a/src/packagedcode/rpm.py b/src/packagedcode/rpm.py index e24d94c636d..4cce7e8d1f3 100644 --- a/src/packagedcode/rpm.py +++ 
b/src/packagedcode/rpm.py @@ -229,6 +229,8 @@ def assemble(cls, package_data, resource, codebase, package_adder): class RpmInstalledNdbDatabaseHandler(BaseRpmInstalledDatabaseHandler): # used by recent Suse datasource_id = 'rpm_installed_database_ndb' + datasource_type = 'sys' + supported_oses = ('linux',) path_patterns = ('*usr/lib/sysimage/rpm/Packages.db',) default_package_type = 'rpm' default_package_namespace = 'TBD' @@ -243,6 +245,8 @@ class RpmInstalledSqliteDatabaseHandler(BaseRpmInstalledDatabaseHandler): # Mimetype: application/vnd.sqlite3 datasource_id = 'rpm_installed_database_sqlite' + datasource_type = 'sys' + supported_oses = ('linux',) path_patterns = ('*rpm/rpmdb.sqlite',) default_package_type = 'rpm' default_package_namespace = 'TBD' @@ -254,6 +258,8 @@ class RpmInstalledSqliteDatabaseHandler(BaseRpmInstalledDatabaseHandler): class RpmInstalledBdbDatabaseHandler(BaseRpmInstalledDatabaseHandler): # used by legacy RHEL/CentOS/Fedora/Suse datasource_id = 'rpm_installed_database_bdb' + datasource_type = 'sys' + supported_oses = ('linux',) path_patterns = ('*var/lib/rpm/Packages',) filetypes = ('berkeley',) default_package_type = 'rpm' @@ -381,6 +387,7 @@ def parse(cls, location, package_only=False): class RpmMarinerContainerManifestHandler(models.DatafileHandler): datasource_id = 'rpm_mariner_manifest' + datasource_type = 'sys' # container-manifest-1 is more minimal and has the same data path_patterns = ('*var/lib/rpmmanifest/container-manifest-2',) default_package_type = 'rpm' @@ -502,6 +509,7 @@ def clean_mariner_manifest_data(package_data): class RpmLicenseFilesHandler(models.NonAssemblableDatafileHandler): datasource_id = 'rpm_package_licenses' + datasource_type = 'sys' path_patterns = ( '*usr/share/licenses/*/COPYING*', '*usr/share/licenses/*/LICENSE*', diff --git a/src/packagedcode/utils.py b/src/packagedcode/utils.py index 51520002e98..f6d032bed8b 100644 --- a/src/packagedcode/utils.py +++ b/src/packagedcode/utils.py @@ -299,8 +299,8 @@ def 
get_base_purl(purl): def is_simple_path(path): - return '*' not in path + return '*' not in path def is_simple_path_pattern(path): - return path.endswith('*') and path.count('*') == 1 + return path.endswith('*') and path.count('*') == 1 diff --git a/src/packagedcode/win_reg.py b/src/packagedcode/win_reg.py index aac41691f01..8e8327f06e8 100644 --- a/src/packagedcode/win_reg.py +++ b/src/packagedcode/win_reg.py @@ -433,6 +433,8 @@ def assign_package_to_resources(cls, package, resource, codebase, package_adder) class InstalledProgramFromDockerSoftwareDeltaHandler(BaseRegInstalledProgramHandler): datasource_id = 'win_reg_installed_programs_docker_software_delta' + datasource_type = 'sys' + supported_oses = ('linux',) path_patterns = ('*/Hives/Software_Delta',) description = 'Windows Registry Installed Program - Docker Software Delta' root_path_relative_to_datafile_path = '../../Files' @@ -440,6 +442,8 @@ class InstalledProgramFromDockerSoftwareDeltaHandler(BaseRegInstalledProgramHand class InstalledProgramFromDockerFilesSoftwareHandler(BaseRegInstalledProgramHandler): datasource_id = 'win_reg_installed_programs_docker_file_software' + datasource_type = 'sys' + supported_oses = ('linux',) path_patterns = ('*/Files/Windows/System32/config/SOFTWARE',) description = 'Windows Registry Installed Program - Docker SOFTWARE' root_path_relative_to_datafile_path = '../../../..' @@ -447,6 +451,8 @@ class InstalledProgramFromDockerFilesSoftwareHandler(BaseRegInstalledProgramHand class InstalledProgramFromDockerUtilityvmSoftwareHandler(BaseRegInstalledProgramHandler): datasource_id = 'win_reg_installed_programs_docker_utility_software' + datasource_type = 'sys' + supported_oses = ('linux',) path_patterns = ('*/UtilityVM/Files/Windows/System32/config/SOFTWARE',) description = 'Windows Registry Installed Program - Docker UtilityVM SOFTWARE' root_path_relative_to_datafile_path = '../../../..' 
diff --git a/tests/packagedcode/test_models.py b/tests/packagedcode/test_models.py new file mode 100644 index 00000000000..725ba9d5042 --- /dev/null +++ b/tests/packagedcode/test_models.py @@ -0,0 +1,37 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# ScanCode is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/scancode-toolkit for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# + +import pytest + +from packagedcode import ALL_DATAFILE_HANDLERS +from packagedcode import APPLICATION_PACKAGE_DATAFILE_HANDLERS +from packagedcode import SYSTEM_PACKAGE_DATAFILE_HANDLERS +from packagedcode.debian_copyright import DebianCopyrightFileInPackageHandler + + +@pytest.mark.parametrize('datafile_handler', ALL_DATAFILE_HANDLERS) +def test_validate_datafile_handlers(datafile_handler): + datafile_handler.validate() + + +def test_check_datafile_handlers_have_no_duplicate_datasource_id(): + seen_datasource_id = set() + + for dfh in sorted(set(APPLICATION_PACKAGE_DATAFILE_HANDLERS + SYSTEM_PACKAGE_DATAFILE_HANDLERS), key=str): + assert dfh.datasource_id not in seen_datasource_id + seen_datasource_id.add(dfh.datasource_id) + + +def test_check_datafile_handlers_have_no_duplicated_entries(): + app_handlers = set(APPLICATION_PACKAGE_DATAFILE_HANDLERS) + sys_handlers = set(SYSTEM_PACKAGE_DATAFILE_HANDLERS) + + dupes = app_handlers.intersection(sys_handlers) + expected = set([DebianCopyrightFileInPackageHandler]) + assert dupes == expected