Skip to content

Commit

Permalink
Improve package datafile handlers
Browse files Browse the repository at this point in the history
* Add new attributes to the DatafileHandler class for a datasource type
  and the supported operating systems
* Add and test new validate() method to validate DatafileHandler
  collection correctness
* Apply minor refactorings and code formatting

Signed-off-by: Philippe Ombredanne <[email protected]>
  • Loading branch information
pombredanne committed Sep 4, 2024
1 parent dd675aa commit b1cef69
Show file tree
Hide file tree
Showing 11 changed files with 141 additions and 35 deletions.
7 changes: 3 additions & 4 deletions src/packagedcode/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@
# See https://aboutcode.org for more information about nexB OSS projects.
#

import attr

from commoncode.system import on_linux
from packagedcode import about
from packagedcode import alpine
Expand Down Expand Up @@ -254,6 +252,7 @@
]
)

# registry of all handler classes keyed by datasource_id
HANDLER_BY_DATASOURCE_ID = {handler.datasource_id: handler for handler in ALL_DATAFILE_HANDLERS}


Expand All @@ -263,8 +262,8 @@ class UnknownPackageDatasource(Exception):

def get_package_handler(package_data):
"""
Return the DatafileHandler class that corresponds to a ``package_data``
PackageData object. Raise a UnknownPackageDatasource error if the
Return the DatafileHandler class that corresponds to the datasource_id of a
``package_data`` PackageData object. Raise a UnknownPackageDatasource error if the
DatafileHandler is not found.
"""
ppc = HANDLER_BY_DATASOURCE_ID.get(package_data.datasource_id)
Expand Down
1 change: 1 addition & 0 deletions src/packagedcode/alpine.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ def get_license_detections_and_expression(package):

class AlpineInstalledDatabaseHandler(models.DatafileHandler):
datasource_id = 'alpine_installed_db'
datasource_type = 'sys'
path_patterns = ('*lib/apk/db/installed',)
default_package_type = 'alpine'
description = 'Alpine Linux installed package database'
Expand Down
4 changes: 4 additions & 0 deletions src/packagedcode/debian.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,7 @@ def assign_package_to_resources(cls, package, resource, codebase, package_adder)

class DebianInstalledStatusDatabaseHandler(models.DatafileHandler):
datasource_id = 'debian_installed_status_db'
datasource_type = 'sys'
default_package_type = 'deb'
path_patterns = ('*var/lib/dpkg/status',)
description = 'Debian installed packages database'
Expand Down Expand Up @@ -391,6 +392,7 @@ def assemble(cls, package_data, resource, codebase, package_adder):

class DebianDistrolessInstalledDatabaseHandler(models.DatafileHandler):
datasource_id = 'debian_distroless_installed_db'
datasource_type = 'sys'
default_package_type = 'deb'
path_patterns = ('*var/lib/dpkg/status.d/*',)
description = 'Debian distroless installed database'
Expand Down Expand Up @@ -474,6 +476,7 @@ class DebianInstalledFilelistHandler(models.DatafileHandler):
# seen in installed rootfs in:
# - /var/lib/dpkg/info/<package name>.list
datasource_id = 'debian_installed_files_list'
datasource_type = 'sys'
default_package_type = 'deb'
path_patterns = (
'*var/lib/dpkg/info/*.list',
Expand All @@ -499,6 +502,7 @@ class DebianInstalledMd5sumFilelistHandler(models.DatafileHandler):
# - /var/lib/dpkg/info/<package name>.md5sums
# - /var/lib/dpkg/info/<package name:arch>.md5sums
datasource_id = 'debian_installed_md5sums'
datasource_type = 'sys'
default_package_type = 'deb'
path_patterns = (
'*var/lib/dpkg/info/*.md5sums',
Expand Down
24 changes: 14 additions & 10 deletions src/packagedcode/maven.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
there is no pom.properties check if there are side-by-side artifacts
"""


class MavenBasePackageHandler(models.DatafileHandler):

@classmethod
Expand All @@ -71,7 +72,7 @@ def assemble(cls, package_data, resource, codebase, package_adder=models.add_to_
datafile_path = resource.path

# This order is important as we want pom.xml to be used for package
# creation and then to update from MANIFEST later
# creation and then to update from MANIFEST later
manifest_path_pattern = '*/META-INF/MANIFEST.MF'
nested_pom_xml_path_pattern = '*/META-INF/maven/**/pom.xml'
datafile_name_patterns = (nested_pom_xml_path_pattern, manifest_path_pattern)
Expand Down Expand Up @@ -103,7 +104,7 @@ def assemble(cls, package_data, resource, codebase, package_adder=models.add_to_
return

if manifests and pom_xmls:
#raise Exception(resource.path, meta_inf_resource, datafile_name_patterns, package_adder)
# raise Exception(resource.path, meta_inf_resource, datafile_name_patterns, package_adder)
parent_resource = meta_inf_resource.parent(codebase)
if not parent_resource:
parent_resource = meta_inf_resource
Expand Down Expand Up @@ -272,7 +273,7 @@ def assign_package_to_resources(cls, package, resource, codebase, package_adder)
for child in root.walk(codebase):
if 'pom.xml' in child.path:
number_poms += 1

if number_poms > 1:
root = resource
else:
Expand Down Expand Up @@ -315,7 +316,7 @@ def parse(cls, location, package_only=False):
if TRACE:
logger.debug(f'MavenPomPropertiesHandler.parse: properties: {properties!r}')
if properties:
yield from cls.parse_pom_properties(properties=properties, package_only=package_only)
yield from cls.parse_pom_properties(properties=properties, package_only=package_only)

@classmethod
def parse_pom_properties(cls, properties, package_only=False):
Expand Down Expand Up @@ -1308,11 +1309,14 @@ def _parse(
)
return MavenPackageData.from_data(package_data, package_only)


class MavenPackageData(models.PackageData):

datasource_id = 'maven_pom'

@classmethod
def get_license_detections_for_extracted_license_statement(
cls,
extracted_license,
try_as_expression=True,
approximate=True,
Expand All @@ -1321,16 +1325,16 @@ def get_license_detections_for_extracted_license_statement(
from packagedcode.licensing import get_normalized_license_detections
from packagedcode.licensing import get_license_detections_for_extracted_license_statement

if not MavenPackageData.check_extracted_license_statement_structure(extracted_license):
if not cls.check_extracted_license_statement_structure(extracted_license):
return get_normalized_license_detections(
extracted_license=extracted_license,
try_as_expression=try_as_expression,
approximate=approximate,
expression_symbols=expression_symbols,
)

new_extracted_license = extracted_license.copy()

for license_entry in new_extracted_license:
license_entry.pop("distribution")
if not license_entry.get("name"):
Expand All @@ -1349,8 +1353,8 @@ def get_license_detections_for_extracted_license_statement(
expression_symbols=expression_symbols,
)


def check_extracted_license_statement_structure(extracted_license):
@classmethod
def check_extracted_license_statement_structure(cls, extracted_license):

is_list_of_mappings = False
if not isinstance(extracted_license, list):
Expand All @@ -1362,7 +1366,7 @@ def check_extracted_license_statement_structure(extracted_license):
if not isinstance(extracted_license_item, dict):
is_list_of_mappings = False
break

return is_list_of_mappings


Expand Down
82 changes: 64 additions & 18 deletions src/packagedcode/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,29 +7,30 @@
# See https://aboutcode.org for more information about nexB OSS projects.
#

import logging
import os
import uuid
from fnmatch import fnmatchcase
import logging
import sys

from fnmatch import fnmatchcase

import attr
from packageurl import normalize_qualifiers
from packageurl import PackageURL
import saneyaml

from commoncode import filetype
from commoncode.fileutils import as_posixpath
from commoncode.datautils import choices
from commoncode.datautils import Boolean
from commoncode.datautils import Date
from commoncode.datautils import Integer
from commoncode.datautils import List
from commoncode.datautils import Mapping
from commoncode.datautils import String
from commoncode.fileutils import as_posixpath
from commoncode.resource import Resource
from license_expression import combine_expressions
from license_expression import Licensing
from packageurl import normalize_qualifiers
from packageurl import PackageURL

try:
from typecode import contenttype
Expand All @@ -41,6 +42,7 @@
except ImportError:
licensing = None

# FIXME: what if licensing is not importable?
from packagedcode.licensing import get_declared_license_expression_spdx

"""
Expand Down Expand Up @@ -963,7 +965,7 @@ def get_license_detections_and_expression(self):
return [], None

if self.datasource_id:
default_relation_license=get_default_relation_license(
default_relation_license = get_default_relation_license(
datasource_id=self.datasource_id,
)
else:
Expand Down Expand Up @@ -1017,12 +1019,11 @@ def add_to_package(package_uid, resource, codebase):

class DatafileHandler:
"""
A base handler class to handle any package manifests, lockfiles and data
files. Each subclass handles a package datafile format to parse datafiles
and assemble Package and Depdencies from these:
A base handler class to handle any package manifest, lockfile, package database
and related data files. Each subclass handles a package datafile format to parse
datafiles and assemble Package and Dependencies from these:
- parses a datafile format and yields package data.
- assembles this datafile package data in top-level packages and dependencies
- assigns package files to their package
"""
Expand All @@ -1033,6 +1034,16 @@ class DatafileHandler:
# can only contain ASCII letters, digits and underscore. Must be lowercase
datasource_id = None

# Kind of package data processed by this handler: "app" for an application package like npm,
# "sys" for a system package like rpm, or "info" for an informational data file that provides
# hints but is not a package manifest, such as a README file.
# Possible values are app, sys and info.
datasource_type = 'app'

# tuple of specifically supported operating systems. If None or empty, all platforms are supported
# possible values are win, mac, linux, freebsd
supported_oses = tuple()

# Sequence of known fnmatch-style case-insensitive glob patterns (e.g., Unix
# shell style patterns) that apply on the whole POSIX path for package
# datafiles recognized and parsed by this parser. See fnmatch.fnmatch().
Expand All @@ -1053,7 +1064,7 @@ class DatafileHandler:
# Informational: Default primary language for this parser.
default_primary_language = None

# If the datafilehandler contains only resolved dependencies
# If the handler is for a lockfile that contains locked/pinned, pre-resolved dependencies
is_lockfile = False

# Informational: Description of this parser
Expand All @@ -1062,7 +1073,9 @@ class DatafileHandler:
# Informational: URL that documents this file format
documentation_url = None

# Default Relation between license elements detected in an `extracted_license_statement`
# Default license expression relation between the license detected in an
# `extracted_license_statement` for this data file.
# This may vary for each data file based on conventions and specifications.
default_relation_license = None

@classmethod
Expand Down Expand Up @@ -1491,11 +1504,44 @@ def get_top_level_resources(cls, manifest_resource, codebase):
"""
pass

@classmethod
def validate(cls):
    """
    Validate the configuration of this handler class.

    Check that ``datasource_id`` is not empty, that ``datasource_type`` is one
    of app, sys or info, and that every entry of ``supported_oses``, if any,
    is one of linux, win, mac or freebsd.

    Return None when the class is valid.
    Raise ImproperlyConfiguredDatafileHandler exception on errors.
    """

    did = cls.datasource_id
    if not did:
        raise ImproperlyConfiguredDatafileHandler(
            f'The handler {cls!r} has an empty datasource_id {did!r}.')

    DATASOURCE_TYPES = 'app', 'sys', 'info',
    dfs = cls.datasource_type
    if dfs not in DATASOURCE_TYPES:
        raise ImproperlyConfiguredDatafileHandler(
            f'The handler {did!r} : {cls!r} has an invalid '
            f'datasource_type: {dfs!r}: must be one of {DATASOURCE_TYPES!r}.'
        )

    # The macOS value is "mac", as listed in the comment of the supported_oses
    # class attribute; it was misspelled "max" here, which rejected valid
    # handlers and accepted the misspelling.
    oses = 'linux', 'win', 'mac', 'freebsd',
    soses = cls.supported_oses
    # An empty or None supported_oses means that all platforms are supported.
    if soses and not all(s in oses for s in soses):
        raise ImproperlyConfiguredDatafileHandler(
            f'The handler {cls.datasource_id!r} : {cls!r} has invalid '
            f'supported_oses: {soses!r}: must be empty or among {oses!r}'
        )


class ImproperlyConfiguredDatafileHandler(Exception):
    """
    Raised when a ScanCode package DatafileHandler class is not properly
    configured.
    """


class NonAssemblableDatafileHandler(DatafileHandler):
"""
A handler that has no default implmentation for the assemble method, e.g.,
it will not alone trigger the creation of a top-level Pacakge.
A handler whose default implementation of the assemble method does nothing, i.e.,
it will not by itself trigger the creation of a top-level Package.
"""

@classmethod
Expand Down Expand Up @@ -1528,8 +1574,8 @@ def build_purl(mapping):
subpath = mapping.get('subpath')
return PackageURL(
type=ptype,
name=name,
namespace=namespace,
name=name,
version=version,
qualifiers=qualifiers,
subpath=subpath,
Expand Down Expand Up @@ -1601,10 +1647,10 @@ def from_package_data(cls, package_data, datafile_path, package_only=False):
license_match['from_file'] = datafile_path

package = cls.from_dict(package_data_mapping)

if not package.package_uid:
package.package_uid = build_package_uid(package.purl)

if not package_only:
package.populate_license_fields()
package.populate_holder_field()
Expand Down Expand Up @@ -1763,7 +1809,7 @@ def refresh_license_expressions(self, default_relation='AND'):
self.declared_license_expression_spdx = get_declared_license_expression_spdx(
declared_license_expression=self.declared_license_expression,
)

if self.other_license_detections:
self.other_license_expression = str(combine_expressions(
expressions=[
Expand Down
1 change: 1 addition & 0 deletions src/packagedcode/msi.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,7 @@ class MsiInstallerHandler(models.DatafileHandler):
default_package_type = 'msi'
description = 'Microsoft MSI installer'
documentation_url = 'https://docs.microsoft.com/en-us/windows/win32/msi/windows-installer-portal'
supported_oses = ('linux',)

@classmethod
def parse(cls, location, package_only=False):
Expand Down
2 changes: 1 addition & 1 deletion src/packagedcode/pypi.py
Original file line number Diff line number Diff line change
Expand Up @@ -1093,7 +1093,7 @@ def parse(cls, location, package_only=False):
package_only=package_only,
)


# FIXME: this is NOT used
class PypiSdistArchiveHandler(models.DatafileHandler):
datasource_id = 'pypi_sdist'
path_patterns = ('*.tar.gz', '*.tar.bz2', '*.zip',)
Expand Down
Loading

0 comments on commit b1cef69

Please sign in to comment.