diff --git a/src/packagedcode/about.py b/src/packagedcode/about.py index 3f33c6838db..5a1192fdd28 100644 --- a/src/packagedcode/about.py +++ b/src/packagedcode/about.py @@ -47,7 +47,7 @@ class AboutFileHandler(models.DatafileHandler): documentation_url = 'https://aboutcode-toolkit.readthedocs.io/en/latest/specification.html' @classmethod - def parse(cls, location): + def parse(cls, location, package_only=False): """ Yield one or more Package manifest objects given a file ``location`` pointing to a package archive, manifest or similar. @@ -90,7 +90,7 @@ def parse(cls, location): file_references.append(models.FileReference(path=about_resource)) # FIXME: we should put the unprocessed attributes in extra data - yield models.PackageData( + package_data = dict( datasource_id=cls.datasource_id, type=package_type, namespace=package_ns, @@ -103,6 +103,7 @@ def parse(cls, location): download_url=download_url, file_references=file_references, ) + yield models.PackageData.from_data(package_data, package_only) @classmethod def assemble(cls, package_data, resource, codebase, package_adder): diff --git a/src/packagedcode/alpine.py b/src/packagedcode/alpine.py index 59223cf72da..1b155a90c97 100644 --- a/src/packagedcode/alpine.py +++ b/src/packagedcode/alpine.py @@ -63,11 +63,12 @@ class AlpineInstalledDatabaseHandler(models.DatafileHandler): description = 'Alpine Linux installed package database' @classmethod - def parse(cls, location): + def parse(cls, location, package_only=False): yield from parse_alpine_installed_db( location=location, datasource_id=cls.datasource_id, package_type=cls.default_package_type, + package_only=package_only, ) @classmethod @@ -134,9 +135,14 @@ class AlpineApkbuildHandler(models.DatafileHandler): documentation_url = 'https://wiki.alpinelinux.org/wiki/APKBUILD_Reference' @classmethod - def parse(cls, location): - package_data = parse_apkbuild(location, strict=True) - cls.populate_license_fields(package_data) + def parse(cls, location, package_only=False): + package_data = parse_apkbuild( + location=location, + strict=True, + package_only=package_only + ) + if not package_only: + cls.populate_license_fields(package_data) if package_data: yield package_data @@ -165,7 +171,7 @@ def assign_package_to_resources(cls, package, resource, codebase, package_adder) ) -def parse_alpine_installed_db(location, datasource_id, package_type): +def parse_alpine_installed_db(location, datasource_id, package_type, package_only=False): """ Yield PackageData objects from an installed database file at `location` or None. Typically found at '/lib/apk/db/installed' in an Alpine @@ -179,6 +185,7 @@ def parse_alpine_installed_db(location, datasource_id, package_type): package_fields=package_fields, datasource_id=datasource_id, package_type=package_type, + package_only=package_only, ) @@ -241,7 +248,7 @@ def get_alpine_installed_db_fields(location): ]) -def parse_apkbuild(location, strict=False): +def parse_apkbuild(location, strict=False, package_only=False): """ Return a PackageData object from an APKBUILD file at ``location`` or None. 
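Every handler touched by this diff converges on the same parse() shape; a minimal sketch using a hypothetical handler (names and values below are illustrative, not part of this change set):

    from packagedcode import models

    class ExampleManifestHandler(models.DatafileHandler):
        # hypothetical handler, reduced to the attributes the sketch needs
        datasource_id = 'example_manifest'
        default_package_type = 'example'

        @classmethod
        def parse(cls, location, package_only=False):
            # build a plain mapping first...
            package_data = dict(
                datasource_id=cls.datasource_id,
                type=cls.default_package_type,
                name='demo',
                extracted_license_statement='MIT',
            )
            # ...then let from_data() decide whether to run license/copyright
            # detection now; with package_only=True that step is deferred
            # (for instance to assembly, via Package.from_package_data()).
            yield models.PackageData.from_data(package_data, package_only)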
@@ -256,6 +263,7 @@ def parse_apkbuild(location, strict=False): datasource_id=AlpineApkbuildHandler.datasource_id, package_type=AlpineApkbuildHandler.default_package_type, strict=strict, + package_only=package_only, ) @@ -732,7 +740,7 @@ def fix_apkbuild(text): return text -def parse_apkbuild_text(text, datasource_id, package_type, strict=False): +def parse_apkbuild_text(text, datasource_id, package_type, strict=False, package_only=False): """ Return a PackageData object from an APKBUILD text context or None. Only consider variables with a name listed in the ``names`` set. @@ -761,7 +769,8 @@ def parse_apkbuild_text(text, datasource_id, package_type, strict=False): package = build_package_data( variables, datasource_id=datasource_id, - package_type=package_type + package_type=package_type, + package_only=package_only, ) if package and unresolved: @@ -800,7 +809,7 @@ def parse_pkginfo(location): raise NotImplementedError -def build_package_data(package_fields, datasource_id, package_type): +def build_package_data(package_fields, datasource_id, package_type, package_only=False): """ Return a PackageData object from a ``package_fields`` iterable of (name, value) tuples. @@ -850,7 +859,16 @@ def build_package_data(package_fields, datasource_id, package_type): converted_fields.update(converted) - return models.PackageData.from_dict(converted_fields) + fields_not_required = ["current_file", "current_dir"] + for field in fields_not_required: + value = converted_fields.get(field) + if value: + converted_fields.pop(field) + + return models.PackageData.from_data( + package_data=converted_fields, + package_only=package_only, + ) ##################################### # Note: all handlers MUST accept **kwargs as they also receive the current data diff --git a/src/packagedcode/build.py b/src/packagedcode/build.py index 7657f7c64b3..dc7a9cae742 100644 --- a/src/packagedcode/build.py +++ b/src/packagedcode/build.py @@ -55,7 +55,7 @@ class AutotoolsConfigureHandler(models.DatafileHandler): documentation_url = 'https://www.gnu.org/software/automake/' @classmethod - def parse(cls, location): + def parse(cls, location, package_only=False): # we use the parent directory as a package name name = fileutils.file_name(fileutils.parent_directory(location)) # we could use checksums as version in the future @@ -67,12 +67,13 @@ def parse(cls, location): # there are dependencies we could use # dependencies = [] - yield models.PackageData( + package_data = dict( datasource_id=cls.datasource_id, type=cls.default_package_type, name=name, version=version, ) + yield models.PackageData.from_data(package_data, package_only) @classmethod def assign_package_to_resources(cls, package, resource, codebase, package_adder): @@ -112,6 +113,7 @@ def assemble(cls, package_data, resource, codebase, package_adder): package = models.Package.from_package_data( package_data=package_data, datafile_path=resource.path, + package_only=True, ) if TRACE: @@ -143,8 +145,7 @@ def assemble(cls, package_data, resource, codebase, package_adder): yield resource @classmethod - def parse(cls, location): - + def parse(cls, location, package_only=False): # Thanks to Starlark being a Python dialect, we can use `ast` to parse it with open(location, 'rb') as f: tree = ast.parse(f.read()) @@ -196,23 +197,28 @@ def parse(cls, location): if TRACE: logger_debug(f"build: parse: license_files: {license_files}") - package_data = models.PackageData( + package_data = dict( datasource_id=cls.datasource_id, type=cls.default_package_type, name=name, + 
extracted_license_statement=license_files, + ) + # `package_only` is True as we do the license detection + # on assembly + yield models.PackageData.from_data( + package_data=package_data, + package_only=True, ) - - package_data.extracted_license_statement = license_files - yield package_data else: # If we don't find anything in the pkgdata file, we yield a Package # with the parent directory as the name - yield models.PackageData( + package_data = dict( datasource_id=cls.datasource_id, type=cls.default_package_type, name=fileutils.file_name(fileutils.parent_directory(location)) ) + yield models.PackageData.from_data(package_data, package_only) @classmethod def assign_package_to_resources(cls, package, resource, codebase, package_adder, skip_name=None): @@ -334,7 +340,7 @@ class BuckMetadataBzlHandler(BaseStarlarkManifestHandler): documentation_url = 'https://buck.build/' @classmethod - def parse(cls, location): + def parse(cls, location, package_only=True): with open(location, 'rb') as f: tree = ast.parse(f.read()) @@ -386,7 +392,7 @@ def parse(cls, location): ): # TODO: Create function that determines package type from download URL, # then create a package of that package type from the metadata info - yield models.PackageData( + package_data = dict( datasource_id=cls.datasource_id, type=metadata_fields.get('upstream_type', cls.default_package_type), name=metadata_fields.get('name'), @@ -396,6 +402,7 @@ def parse(cls, location): homepage_url=metadata_fields.get('upstream_address', ''), # TODO: Store 'upstream_hash` somewhere ) + yield models.PackageData.from_data(package_data, package_only=True) if ( 'package_type' @@ -409,7 +416,7 @@ def parse(cls, location): and 'vcs_commit_hash' in metadata_fields ): - yield models.PackageData( + package_data = dict( datasource_id=cls.datasource_id, type=metadata_fields.get('package_type', cls.default_package_type), name=metadata_fields.get('name'), @@ -422,6 +429,7 @@ def parse(cls, location): sha1=metadata_fields.get('download_archive_sha1', ''), extra_data=dict(vcs_commit_hash=metadata_fields.get('vcs_commit_hash', '')) ) + yield models.PackageData.from_data(package_data, package_only=True) @classmethod def assign_package_to_resources(cls, package, resource, codebase, package_adder): diff --git a/src/packagedcode/build_gradle.py b/src/packagedcode/build_gradle.py index 308abe5bc62..9c4882ccf04 100644 --- a/src/packagedcode/build_gradle.py +++ b/src/packagedcode/build_gradle.py @@ -59,9 +59,9 @@ class BuildGradleHandler(models.DatafileHandler): description = 'Gradle build script' @classmethod - def parse(cls, location): + def parse(cls, location, package_only=False): dependencies = get_dependencies(location) - return build_package(cls, dependencies) + return build_package(cls, dependencies, package_only) # TODO: handle complex cases of nested builds with many packages @classmethod @@ -328,7 +328,7 @@ def get_dependencies(build_gradle_location): return list(get_dependencies_from_parse_tree(parse_tree)) -def build_package(cls, dependencies): +def build_package(cls, dependencies, package_only=False): """ Yield PackageData from a ``dependencies`` list of mappings. 
""" @@ -364,10 +364,11 @@ def build_package(cls, dependencies): ) ) - yield models.PackageData( + package_data = dict( datasource_id=cls.datasource_id, type=cls.default_package_type, primary_language=BuildGradleHandler.default_primary_language, dependencies=package_dependencies, ) + yield models.PackageData.from_data(package_data, package_only) diff --git a/src/packagedcode/cargo.py b/src/packagedcode/cargo.py index ffe8dea5159..ca46e388383 100644 --- a/src/packagedcode/cargo.py +++ b/src/packagedcode/cargo.py @@ -29,7 +29,7 @@ class CargoTomlHandler(models.DatafileHandler): documentation_url = 'https://doc.rust-lang.org/cargo/reference/manifest.html' @classmethod - def parse(cls, location): + def parse(cls, location, package_only=False): package_data = toml.load(location, _dict=dict) core_package_data = package_data.get('package', {}) @@ -67,7 +67,7 @@ def parse(cls, location): repository_download_url = name and version and f'https://crates.io/api/v1/crates/{name}/{version}/download' api_data_url = name and f'https://crates.io/api/v1/crates/{name}' - yield models.PackageData( + package_data = dict( datasource_id=cls.datasource_id, type=cls.default_package_type, name=name, @@ -83,6 +83,7 @@ def parse(cls, location): api_data_url=api_data_url, dependencies=dependencies, ) + yield models.PackageData.from_data(package_data, package_only) @classmethod def assemble(cls, package_data, resource, codebase, package_adder): @@ -116,7 +117,7 @@ class CargoLockHandler(models.DatafileHandler): # ] @classmethod - def parse(cls, location): + def parse(cls, location, package_only=False): cargo_lock = toml.load(location, _dict=dict) dependencies = [] package = cargo_lock.get('package', []) @@ -137,12 +138,13 @@ def parse(cls, location): ) ) - yield models.PackageData( + package_data = dict( datasource_id=cls.datasource_id, type=cls.default_package_type, primary_language=cls.default_primary_language, dependencies=dependencies, ) + yield models.PackageData.from_data(package_data, package_only) @classmethod def assemble(cls, package_data, resource, codebase, package_adder): diff --git a/src/packagedcode/chef.py b/src/packagedcode/chef.py index 5c378cdce4e..d5df6a2f6c4 100644 --- a/src/packagedcode/chef.py +++ b/src/packagedcode/chef.py @@ -183,14 +183,18 @@ def is_datafile(cls, location, filetypes=tuple()): return not parent.endswith('dist-info') @classmethod - def parse(cls, location): + def parse(cls, location, package_only=False): """ Yield one or more Package manifest objects given a file ``location`` pointing to a package archive, manifest or similar. 
""" with io.open(location, encoding='utf-8') as loc: package_data = json.load(loc) - yield build_package(package_data, datasource_id=cls.datasource_id) + yield build_package( + package_data=package_data, + datasource_id=cls.datasource_id, + package_only=package_only, + ) class ChefMetadataRbHandler(BaseChefMetadataHandler): @@ -202,7 +206,7 @@ class ChefMetadataRbHandler(BaseChefMetadataHandler): documentation_url = 'https://docs.chef.io/config_rb_metadata/' @classmethod - def parse(cls, location): + def parse(cls, location, package_only=False): with io.open(location, encoding='utf-8') as loc: file_contents = loc.read() @@ -213,10 +217,14 @@ def parse(cls, location): ChefMetadataFormatter() ) package_data = json.loads(formatted_file_contents) - yield build_package(package_data, datasource_id=cls.datasource_id) + yield build_package( + package_data=package_data, + datasource_id=cls.datasource_id, + package_only=package_only, + ) -def build_package(package_data, datasource_id): +def build_package(package_data, datasource_id, package_only=False): """ Return a PackageData object from a package_data mapping from a metadata.json or similar or None. @@ -261,7 +269,7 @@ def build_package(package_data, datasource_id): ) ) - return models.PackageData( + package_data = dict( datasource_id=datasource_id, type=ChefMetadataJsonHandler.default_package_type, name=name, @@ -275,3 +283,4 @@ def build_package(package_data, datasource_id): primary_language='Ruby', **get_urls(name, version), ) + return models.PackageData.from_data(package_data, package_only) diff --git a/src/packagedcode/cocoapods.py b/src/packagedcode/cocoapods.py index 9773ae13638..8ca3617800b 100644 --- a/src/packagedcode/cocoapods.py +++ b/src/packagedcode/cocoapods.py @@ -216,7 +216,7 @@ class PodspecHandler(BasePodHandler): documentation_url = 'https://guides.cocoapods.org/syntax/podspec.html' @classmethod - def parse(cls, location): + def parse(cls, location, package_only=False): """ Yield one or more Package manifest objects given a file ``location`` pointing to a package archive, manifest or similar. @@ -258,7 +258,7 @@ def parse(cls, location): homepage_url=homepage_url, vcs_url=vcs_url) - yield models.PackageData( + package_data = dict( datasource_id=cls.datasource_id, type=cls.default_package_type, name=name, @@ -273,6 +273,7 @@ def parse(cls, location): parties=parties, **urls, ) + yield models.PackageData.from_data(package_data, package_only) class PodfileHandler(PodspecHandler): @@ -293,7 +294,7 @@ class PodfileLockHandler(BasePodHandler): documentation_url = 'https://guides.cocoapods.org/using/the-podfile.html' @classmethod - def parse(cls, location): + def parse(cls, location, package_only=False): """ Yield PackageData from a YAML Podfile.lock. 
""" @@ -337,12 +338,13 @@ def parse(cls, location): ) ) - yield models.PackageData( + package_data = dict( datasource_id=cls.datasource_id, type=cls.default_package_type, primary_language=cls.default_primary_language, dependencies=dependencies, ) + yield models.PackageData.from_data(package_data, package_only) class PodspecJsonHandler(models.DatafileHandler): @@ -354,7 +356,7 @@ class PodspecJsonHandler(models.DatafileHandler): documentation_url = 'https://guides.cocoapods.org/syntax/podspec.html' @classmethod - def parse(cls, location): + def parse(cls, location, package_only=False): with open(location) as psj: data = json.load(psj) @@ -423,7 +425,7 @@ def parse(cls, location): name=name, version=version, homepage_url=homepage_url, vcs_url=vcs_url) - yield models.PackageData( + package_data = dict( datasource_id=cls.datasource_id, primary_language=cls.default_primary_language, type=cls.default_package_type, @@ -437,6 +439,7 @@ def parse(cls, location): download_url=download_url, **urls, ) + yield models.PackageData.from_data(package_data, package_only) def get_urls(name=None, version=None, homepage_url=None, vcs_url=None, **kwargs): diff --git a/src/packagedcode/conan.py b/src/packagedcode/conan.py index 7d4b637f9bc..44321c581a7 100644 --- a/src/packagedcode/conan.py +++ b/src/packagedcode/conan.py @@ -123,7 +123,7 @@ class ConanFileHandler(models.DatafileHandler): documentation_url = "https://docs.conan.io/2.0/reference/conanfile.html" @classmethod - def _parse(cls, conan_recipe): + def _parse(cls, conan_recipe, package_only=False): try: tree = ast.parse(conan_recipe) recipe_class_def = next( @@ -150,7 +150,7 @@ def _parse(cls, conan_recipe): dependencies = get_dependencies(parser.requires) - return models.PackageData( + package_data = dict( datasource_id=cls.datasource_id, type=cls.default_package_type, primary_language=cls.default_primary_language, @@ -163,13 +163,14 @@ def _parse(cls, conan_recipe): extracted_license_statement=parser.license, dependencies=dependencies, ) + return models.PackageData.from_data(package_data, package_only) @classmethod - def parse(cls, location): + def parse(cls, location, package_only=False): with io.open(location, encoding="utf-8") as loc: conan_recipe = loc.read() - yield cls._parse(conan_recipe) + yield cls._parse(conan_recipe, package_only) class ConanDataHandler(models.DatafileHandler): @@ -184,7 +185,7 @@ class ConanDataHandler(models.DatafileHandler): ) @classmethod - def parse(cls, location): + def parse(cls, location, package_only=False): with io.open(location, encoding="utf-8") as loc: conan_data = loc.read() @@ -203,7 +204,7 @@ def parse(cls, location): elif isinstance(source_urls, list): url = source_urls[0] - yield models.PackageData( + package_data = dict( datasource_id=cls.datasource_id, type=cls.default_package_type, primary_language=cls.default_primary_language, @@ -212,6 +213,8 @@ def parse(cls, location): download_url=url, sha256=sha256, ) + yield models.PackageData.from_data(package_data, package_only) + @classmethod def assemble( @@ -245,7 +248,7 @@ def assemble( ] = conanfile_package_data.get("extracted_license_statement") datafile_path = resource.path - pkg_data = models.PackageData.from_dict(package_data_dict) + pkg_data = models.PackageData.from_data(package_data_dict) if pkg_data.purl: package = models.Package.from_package_data( diff --git a/src/packagedcode/conda.py b/src/packagedcode/conda.py index 545b3a1fb53..225940d69fc 100644 --- a/src/packagedcode/conda.py +++ b/src/packagedcode/conda.py @@ -79,7 +79,7 @@ def 
assign_package_to_resources(cls, package, resource, codebase, package_adder) ) @classmethod - def parse(cls, location): + def parse(cls, location, package_only=False): metayaml = get_meta_yaml_data(location) package_element = metayaml.get('package') or {} package_name = package_element.get('name') @@ -118,7 +118,7 @@ def parse(cls, location): ) ) - yield models.PackageData( + package_data = dict( datasource_id=cls.datasource_id, type=cls.default_package_type, name=package_name, @@ -131,6 +131,7 @@ def parse(cls, location): extracted_license_statement=extracted_license_statement, dependencies=dependencies, ) + yield models.PackageData.from_data(package_data, package_only) def get_meta_yaml_data(location): diff --git a/src/packagedcode/cran.py b/src/packagedcode/cran.py index ef274105da6..827ff38f9d3 100644 --- a/src/packagedcode/cran.py +++ b/src/packagedcode/cran.py @@ -30,7 +30,7 @@ class CranDescriptionFileHandler(models.DatafileHandler): documentation_url = 'https://r-pkgs.org/description.html' @classmethod - def parse(cls, location): + def parse(cls, location, package_only=False): cran_desc = get_cran_description(location) name = cran_desc.get('Package') @@ -93,7 +93,7 @@ def parse(cls, location): # TODO: Let's handle the release date as a Date type # release_date = cran_desc.get('Date/Publication'), - yield models.PackageData( + package_data = dict( datasource_id=cls.datasource_id, type=cls.default_package_type, name=name, @@ -105,6 +105,7 @@ def parse(cls, location): dependencies=package_dependencies, repository_homepage_url=f'https://cran.r-project.org/package={name}', ) + yield models.PackageData.from_data(package_data, package_only) # FIXME: THIS IS NOT YAML but RFC 822 diff --git a/src/packagedcode/debian.py b/src/packagedcode/debian.py index 860144b9723..6b45dc15a92 100644 --- a/src/packagedcode/debian.py +++ b/src/packagedcode/debian.py @@ -60,7 +60,7 @@ class DebianDebPackageHandler(models.DatafileHandler): documentation_url = 'https://manpages.debian.org/unstable/dpkg-dev/deb.5.en.html' @classmethod - def parse(cls, location): + def parse(cls, location, package_only=False): yield build_package_data_from_package_filename( filename=fileutils.file_name(location), datasource_id=cls.datasource_id, @@ -83,7 +83,7 @@ class DebianSourcePackageMetadataTarballHandler(models.DatafileHandler): documentation_url = 'https://manpages.debian.org/unstable/dpkg-dev/deb.5.en.html' @classmethod - def parse(cls, location): + def parse(cls, location, package_only=False): # strip extension filename, _, _ = location.rpartition('.tar') yield build_package_data_from_package_filename( @@ -108,7 +108,7 @@ class DebianSourcePackageTarballHandler(models.DatafileHandler): documentation_url = 'https://manpages.debian.org/unstable/dpkg-dev/deb.5.en.html' @classmethod - def parse(cls, location): + def parse(cls, location, package_only=False): # strip extension filename, _, _ = location.rpartition('.tar') yield build_package_data_from_package_filename( @@ -132,13 +132,14 @@ class DebianControlFileInExtractedDebHandler(models.DatafileHandler): documentation_url = 'https://www.debian.org/doc/debian-policy/ch-controlfields.html' @classmethod - def parse(cls, location): + def parse(cls, location, package_only=False): # TODO: we cannot know the distro from the name only yield build_package_data( debian_data=get_paragraph_data_from_file(location=location), datasource_id=cls.datasource_id, package_type=cls.default_package_type, distro='debian', + package_only=package_only, ) @classmethod @@ -158,7 +159,7 @@ class 
DebianControlFileInSourceHandler(models.DatafileHandler): documentation_url = 'https://www.debian.org/doc/debian-policy/ch-controlfields.html' @classmethod - def parse(cls, location): + def parse(cls, location, package_only=False): # NOTE: a control file in a source repo or debian.tar tarball can contain more than one package debian_packages = [] for debian_data in get_paragraphs_data_from_file(location=location): @@ -167,6 +168,7 @@ def parse(cls, location): debian_data=debian_data, datasource_id=cls.datasource_id, package_type=cls.default_package_type, + package_only=package_only, ) ) @@ -191,7 +193,7 @@ class DebianDscFileHandler(models.DatafileHandler): documentation_url = 'https://wiki.debian.org/dsc' @classmethod - def parse(cls, location): + def parse(cls, location, package_only=False): # this is typically signed debian_data = get_paragraph_data_from_file( location=location, @@ -207,6 +209,7 @@ def parse(cls, location): debian_data=debian_data, datasource_id=cls.datasource_id, package_type=cls.default_package_type, + package_only=package_only, ) package_data.update_purl_fields(package_data=package_data_from_file) yield package_data @@ -225,7 +228,7 @@ class DebianInstalledStatusDatabaseHandler(models.DatafileHandler): documentation_url = 'https://www.debian.org/doc/debian-policy/ch-controlfields.html' @classmethod - def parse(cls, location): + def parse(cls, location, package_only=False): # note that we do not know yet the distro at this stage # we could get it... but we get that later during assemble() debian_packages = [] @@ -235,6 +238,7 @@ def parse(cls, location): debian_data=debian_data, datasource_id=cls.datasource_id, package_type=cls.default_package_type, + package_only=package_only, ) ) @@ -392,7 +396,7 @@ class DebianDistrolessInstalledDatabaseHandler(models.DatafileHandler): documentation_url = 'https://www.debian.org/doc/debian-policy/ch-controlfields.html' @classmethod - def parse(cls, location): + def parse(cls, location, package_only=False): """ Yield installed PackageData objects given a ``location`` var/lib/dpkg/status.d/ file as found in a distroless container @@ -406,6 +410,7 @@ def parse(cls, location): debian_data=debian_data, datasource_id=cls.datasource_id, package_type=cls.default_package_type, + package_only=package_only, ) ) @@ -475,7 +480,7 @@ class DebianInstalledFilelistHandler(models.DatafileHandler): description = 'Debian installed file paths list' @classmethod - def parse(cls, location): + def parse(cls, location, package_only=False): return parse_debian_files_list( location=location, datasource_id=cls.datasource_id, @@ -501,7 +506,7 @@ class DebianInstalledMd5sumFilelistHandler(models.DatafileHandler): documentation_url = 'https://www.debian.org/doc/manuals/debian-handbook/sect.package-meta-information.en.html#sect.configuration-scripts' @classmethod - def parse(cls, location): + def parse(cls, location, package_only=False): return parse_debian_files_list( location=location, datasource_id=cls.datasource_id, @@ -526,7 +531,7 @@ class DebianMd5sumFilelistInPackageHandler(models.DatafileHandler): documentation_url = 'https://www.debian.org/doc/manuals/debian-handbook/sect.package-meta-information.en.html#sect.configuration-scripts' @classmethod - def parse(cls, location): + def parse(cls, location, package_only=False): return parse_debian_files_list( location=location, datasource_id=cls.datasource_id, @@ -561,7 +566,7 @@ def build_package_data_from_package_filename(filename, datasource_id, package_ty if isinstance(version, DebVersion): version =
str(version) - return models.PackageData( + package_data = dict( datasource_id=datasource_id, type=package_type, name=deb.name, @@ -569,6 +574,7 @@ def build_package_data_from_package_filename(filename, datasource_id, package_ty version=version, qualifiers=qualifiers, ) + return models.PackageData.from_data(package_data) def parse_debian_files_list(location, datasource_id, package_type): @@ -606,16 +612,17 @@ def parse_debian_files_list(location, datasource_id, package_type): if not file_references: return - yield models.PackageData( + package_data = dict( datasource_id=datasource_id, type=package_type, name=name, qualifiers=qualifiers, file_references=file_references, ) + yield models.PackageData.from_data(package_data) -def build_package_data(debian_data, datasource_id, package_type='deb', distro=None): +def build_package_data(debian_data, datasource_id, package_type='deb', distro=None, package_only=False): """ Return a PackageData object from a package_data mapping (from a dpkg status or similar file) or None. @@ -691,7 +698,7 @@ def build_package_data(debian_data, datasource_id, package_type='deb', distro=No source_packages.append(source_pkg_purl) - return models.PackageData( + package_data = dict( datasource_id=datasource_id, type=package_type, namespace=distro, @@ -706,6 +713,7 @@ def build_package_data(debian_data, datasource_id, package_type='deb', distro=No parties=parties, extra_data=extra_data, ) + return models.PackageData.from_data(package_data, package_only) def populate_debian_namespace(packages): diff --git a/src/packagedcode/debian_copyright.py b/src/packagedcode/debian_copyright.py index 510abf620d6..7f08592fe8d 100644 --- a/src/packagedcode/debian_copyright.py +++ b/src/packagedcode/debian_copyright.py @@ -95,7 +95,7 @@ def is_datafile(cls, location, filetypes=tuple(), strict=False): return True @classmethod - def parse(cls, location): + def parse(cls, location, package_only=False): debian_copyright = parse_copyright_file(location) license_fields = DebianLicenseFields.get_license_fields( debian_copyright=debian_copyright @@ -111,20 +111,27 @@ def parse(cls, location): # no name otherwise for now name = None - yield models.PackageData( + package_data = dict( datasource_id=cls.datasource_id, type=cls.default_package_type, name=name, - extracted_license_statement=license_fields.extracted_license_statement, - declared_license_expression=license_fields.declared_license_expression, - declared_license_expression_spdx=license_fields.declared_license_expression_spdx, - license_detections=license_fields.license_detections, - other_license_expression=license_fields.other_license_expression, - other_license_expression_spdx=license_fields.other_license_expression_spdx, - other_license_detections=license_fields.other_license_detections, - copyright=debian_copyright.get_copyright(), ) + if not package_only: + license_data = dict( + extracted_license_statement=license_fields.extracted_license_statement, + declared_license_expression=license_fields.declared_license_expression, + declared_license_expression_spdx=license_fields.declared_license_expression_spdx, + license_detections=license_fields.license_detections, + other_license_expression=license_fields.other_license_expression, + other_license_expression_spdx=license_fields.other_license_expression_spdx, + other_license_detections=license_fields.other_license_detections, + copyright=debian_copyright.get_copyright(), + ) + package_data.update(license_data) + + yield models.PackageData.from_data(package_data, package_only) + @attr.s 
class DebianLicenseFields: @@ -279,14 +286,14 @@ def assemble(cls, package_data, resource, codebase, package_adder): yield from super().assemble(package_data, resource, codebase, package_adder) @classmethod - def parse(cls, location): + def parse(cls, location, package_only=False): """ Gets license/copyright information from file like other copyright files, but also gets purl fields if present in copyright filename, if obtained from upstream metadata archive. """ - package_data = list(super().parse(location)).pop() + package_data = list(super().parse(location, package_only)).pop() package_data_from_file = build_package_data_from_metadata_filename( filename=os.path.basename(location), datasource_id=cls.datasource_id, diff --git a/src/packagedcode/distro.py b/src/packagedcode/distro.py index 6336e6b3111..7e572948a03 100644 --- a/src/packagedcode/distro.py +++ b/src/packagedcode/distro.py @@ -25,7 +25,7 @@ class EtcOsReleaseHandler(models.NonAssemblableDatafileHandler): documentation_url = 'https://www.freedesktop.org/software/systemd/man/os-release.html' @classmethod - def parse(cls, location): + def parse(cls, location, package_only=False): distro = Distro.from_os_release_file(location) distro_identifier = distro.identifier pretty_name = distro.pretty_name and distro.pretty_name.lower() or '' @@ -53,13 +53,14 @@ def parse(cls, location): version = distro.version_id - yield models.PackageData( + package_data = dict( datasource_id=cls.datasource_id, type=cls.default_package_type, namespace=namespace, name=name, version=version, ) + yield models.PackageData.from_data(package_data, package_only) @classmethod def find_linux_rootfs_root_resource(cls, resource, codebase): diff --git a/src/packagedcode/freebsd.py b/src/packagedcode/freebsd.py index c1f5342b562..d170e8f8222 100644 --- a/src/packagedcode/freebsd.py +++ b/src/packagedcode/freebsd.py @@ -52,7 +52,7 @@ class CompactManifestHandler(models.DatafileHandler): documentation_url = 'https://www.freebsd.org/cgi/man.cgi?pkg-create(8)#MANIFEST_FILE_DETAILS' @classmethod - def _parse(cls, yaml_data): + def _parse(cls, yaml_data, package_only=False): package_data = models.PackageData( datasource_id=cls.datasource_id, type=cls.default_package_type, @@ -97,7 +97,8 @@ def _parse(cls, yaml_data): # license_mapper needs multiple fields license_mapper(yaml_data, package_data) - cls.populate_license_fields(package_data) + if not package_only: + cls.populate_license_fields(package_data) if TRACE: logger_debug( @@ -107,7 +108,7 @@ def _parse(cls, yaml_data): return package_data @classmethod - def parse(cls, location): + def parse(cls, location, package_only=False): """ Yield one or more Package manifest objects given a file ``location`` pointing to a package archive, manifest or similar. 
@@ -115,7 +116,7 @@ def parse(cls, location): with io.open(location, encoding='utf-8') as loc: yaml_data = saneyaml.load(loc) - yield cls._parse(yaml_data) + yield cls._parse(yaml_data, package_only) @staticmethod def get_license_detections_and_expression(package_data): diff --git a/src/packagedcode/godeps.py b/src/packagedcode/godeps.py index 68ff54787ff..c9d15a3934a 100644 --- a/src/packagedcode/godeps.py +++ b/src/packagedcode/godeps.py @@ -38,7 +38,7 @@ class GodepsHandler(models.NonAssemblableDatafileHandler): documentation_url = 'https://github.com/tools/godep' @classmethod - def parse(cls, location): + def parse(cls, location, package_only=False): godeps = Godep(location) if godeps.import_path: @@ -64,7 +64,7 @@ def parse(cls, location): ) ) - yield models.PackageData( + package_data = dict( datasource_id=cls.datasource_id, type=cls.default_package_type, namespace=namespace, @@ -72,6 +72,7 @@ def parse(cls, location): primary_language=cls.default_primary_language, dependencies=dependencies, ) + yield models.PackageData.from_data(package_data, package_only) @classmethod def assign_package_to_resources(cls, package, resource, codebase, package_adder): diff --git a/src/packagedcode/golang.py b/src/packagedcode/golang.py index 6075c713e86..68651fa6cb1 100644 --- a/src/packagedcode/golang.py +++ b/src/packagedcode/golang.py @@ -49,7 +49,7 @@ class GoModHandler(BaseGoModuleHandler): documentation_url = 'https://go.dev/ref/mod' @classmethod - def parse(cls, location): + def parse(cls, location, package_only=False): gomods = go_mod.parse_gomod(location) dependencies = [] @@ -89,7 +89,7 @@ def parse(cls, location): if namespace and name: repository_homepage_url = f'https://pkg.go.dev/{namespace}/{name}' - yield models.PackageData( + package_data = dict( datasource_id=cls.datasource_id, type=cls.default_package_type, name=name, @@ -100,6 +100,7 @@ def parse(cls, location): dependencies=dependencies, primary_language=cls.default_primary_language, ) + yield models.PackageData.from_data(package_data, package_only) class GoSumHandler(BaseGoModuleHandler): @@ -111,7 +112,7 @@ class GoSumHandler(BaseGoModuleHandler): documentation_url = 'https://go.dev/ref/mod#go-sum-files' @classmethod - def parse(cls, location): + def parse(cls, location, package_only=False): gosums = go_mod.parse_gosum(location) package_dependencies = [] for gosum in gosums: @@ -126,9 +127,10 @@ def parse(cls, location): ) ) - yield models.PackageData( + package_data = dict( datasource_id=cls.datasource_id, type=cls.default_package_type, dependencies=package_dependencies, primary_language=cls.default_primary_language, ) + yield models.PackageData.from_data(package_data, package_only) diff --git a/src/packagedcode/haxe.py b/src/packagedcode/haxe.py index 99f5be4ef7d..630d4e36175 100644 --- a/src/packagedcode/haxe.py +++ b/src/packagedcode/haxe.py @@ -45,11 +45,11 @@ class HaxelibJsonHandler(models.DatafileHandler): documentation_url = 'https://lib.haxe.org/documentation/creating-a-haxelib-package/' @classmethod - def _parse(cls, json_data): + def _parse(cls, json_data, package_only=False): name = json_data.get('name') version = json_data.get('version') - package_data = models.PackageData( + package_mapping = dict( datasource_id=cls.datasource_id, type=cls.default_package_type, name=name, @@ -60,6 +60,7 @@ def _parse(cls, json_data): description=json_data.get('description'), primary_language=cls.default_primary_language, ) + package_data = models.PackageData.from_data(package_mapping, package_only) if name and version: 
download_url = f'https://lib.haxe.org/p/{name}/{version}/download/' @@ -91,7 +92,7 @@ def _parse(cls, json_data): return package_data @classmethod - def parse(cls, location): + def parse(cls, location, package_only=False): """ Yield one or more Package manifest objects given a file ``location`` pointing to a package_data archive, manifest or similar. @@ -111,4 +112,4 @@ def parse(cls, location): with io.open(location, encoding='utf-8') as loc: json_data = json.load(loc) - yield cls._parse(json_data) + yield cls._parse(json_data, package_only) diff --git a/src/packagedcode/maven.py b/src/packagedcode/maven.py index b5d521dc44a..1498add8cf0 100644 --- a/src/packagedcode/maven.py +++ b/src/packagedcode/maven.py @@ -132,13 +132,14 @@ class JavaJarManifestHandler(MavenBasePackageHandler): documentation_url = 'https://docs.oracle.com/javase/tutorial/deployment/jar/manifestindex.html' @classmethod - def parse(cls, location): + def parse(cls, location, package_only=False): sections = parse_manifest(location) if sections: main_section = sections[0] manifest = get_normalized_java_manifest_data(main_section) if manifest: - yield models.PackageData(**manifest,) + package_data = dict(**manifest,) + yield models.PackageData.from_data(package_data, package_only) class JavaJarManifestHandlerMixin(models.DatafileHandler): @@ -206,13 +207,14 @@ def is_datafile(cls, location, filetypes=tuple()): return True @classmethod - def parse(cls, location, base_url='https://repo1.maven.org/maven2'): + def parse(cls, location, package_only=False, base_url='https://repo1.maven.org/maven2'): package_data = parse( location=location, datasource_id=cls.datasource_id, package_type=cls.default_package_type, primary_language=cls.default_primary_language, base_url=base_url, + package_only=package_only, ) if package_data: yield package_data @@ -303,7 +305,7 @@ class MavenPomPropertiesHandler(models.NonAssemblableDatafileHandler): documentation_url = 'https://maven.apache.org/pom.html' @classmethod - def parse(cls, location): + def parse(cls, location, package_only=False): """ Yield PackageData from a pom.properties file (which is typically side- by-side with its pom file.) 
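A sketch of what the pom.properties handling (changed in the next hunks) yields under the new flag; the property values below are made up, and pom.properties carries no license text, so package_only only skips a no-op detection step here:

    from packagedcode.maven import MavenPomPropertiesHandler

    props = {'groupId': 'org.example', 'artifactId': 'demo', 'version': '1.0'}
    [pkg] = MavenPomPropertiesHandler.parse_pom_properties(props, package_only=True)
    assert (pkg.namespace, pkg.name, pkg.version) == ('org.example', 'demo', '1.0')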
@@ -313,10 +315,10 @@ def parse(cls, location): if TRACE: logger.debug(f'MavenPomPropertiesHandler.parse: properties: {properties!r}') if properties: - yield from cls.parse_pom_properties(properties=properties) + yield from cls.parse_pom_properties(properties=properties, package_only=package_only) @classmethod - def parse_pom_properties(cls, properties): + def parse_pom_properties(cls, properties, package_only=False): namespace = properties.pop("groupId", None) name = properties.pop("artifactId", None) version = properties.pop("version", None) @@ -325,7 +327,7 @@ def parse_pom_properties(cls, properties): else: extra_data = {} - yield models.PackageData( + package_data = dict( datasource_id=cls.datasource_id, type=cls.default_package_type, primary_language=cls.default_primary_language, @@ -334,6 +336,7 @@ def parse_pom_properties(cls, properties): version=version, extra_data=extra_data, ) + yield models.PackageData.from_data(package_data, package_only) def build_url( @@ -1189,6 +1192,7 @@ def parse( package_type, primary_language, base_url='https://repo1.maven.org/maven2', + package_only=False, ): """ Return Packagedata objects from parsing a Maven pom file at `location` or @@ -1199,7 +1203,8 @@ def parse( package_type=package_type, primary_language=primary_language, location=location, - base_url=base_url + base_url=base_url, + package_only=package_only, ) if package: return package @@ -1212,6 +1217,7 @@ def _parse( location=None, text=None, base_url='https://repo1.maven.org/maven2', + package_only=False, ): """ Yield Packagedata objects from parsing a Maven pom file at `location` or @@ -1283,7 +1289,7 @@ def _parse( )) # FIXME: there are still other data to map in a PackageData - return MavenPackageData( + package_data = dict( datasource_id=datasource_id, type=package_type, primary_language=primary_language, @@ -1300,6 +1306,7 @@ def _parse( bug_tracking_url=bug_tracking_url, **urls, ) + return MavenPackageData.from_data(package_data, package_only) class MavenPackageData(models.PackageData): diff --git a/src/packagedcode/models.py b/src/packagedcode/models.py index a30a79af3c1..1b65497e590 100644 --- a/src/packagedcode/models.py +++ b/src/packagedcode/models.py @@ -728,6 +728,9 @@ def from_data(cls, package_data, package_only=False): Skip the license/copyright detection step if `package_only` is True. """ + if "purl" in package_data: + package_data.pop("purl") + package_data = cls(**package_data) if not package_only: @@ -1048,13 +1051,16 @@ def is_datafile(cls, location, filetypes=tuple(), _bare_filename=False): return any(ft in actual_type for ft in filetypes) @classmethod - def parse(cls, location): + def parse(cls, location, package_only=False): """ Yield one or more PackageData objects given a package data file at ``location``. Subclasses must implement and are responsible for returning proper computed license fields and list of resources and files. + + If `package_only`, skip the license/copyright detection on extracted + license/copyright data. 
""" raise NotImplementedError @@ -1515,8 +1521,6 @@ def __attrs_post_init__(self, *args, **kwargs): if not self.package_uid: self.package_uid = build_package_uid(self.purl) - self.populate_license_fields() - def to_dict(self): return super().to_dict(with_details=False) @@ -1528,7 +1532,7 @@ def to_package_data(self): return PackageData.from_dict(mapping) @classmethod - def from_package_data(cls, package_data, datafile_path): + def from_package_data(cls, package_data, datafile_path, package_only=False): """ Return a Package from a ``package_data`` PackageData object or mapping. Or None. @@ -1552,7 +1556,16 @@ def from_package_data(cls, package_data, datafile_path): if not license_match['from_file']: license_match['from_file'] = datafile_path - return cls.from_dict(package_data_mapping) + package = cls.from_dict(package_data_mapping) + + if not package.package_uid: + package.package_uid = build_package_uid(package.purl) + + if not package_only: + package.populate_license_fields() + package.populate_holder_field() + + return package @classmethod def from_dict(cls, mapping): diff --git a/src/packagedcode/msi.py b/src/packagedcode/msi.py index 31deea3040b..5ce1ba534ed 100644 --- a/src/packagedcode/msi.py +++ b/src/packagedcode/msi.py @@ -124,6 +124,7 @@ def create_package_data_from_msiinfo_results( msiinfo_results, datasource_id='msi_installer', package_type='msi', + package_only=False, ): """ Return PackageData from a mapping of `msiinfo_results` @@ -150,7 +151,7 @@ def create_package_data_from_msiinfo_results( description = msiinfo_results.pop('Comments', '') keywords = msiinfo_results.pop('Keywords', []) - return models.PackageData( + package_data = dict( datasource_id=datasource_id, type=package_type, name=name, @@ -160,11 +161,14 @@ def create_package_data_from_msiinfo_results( keywords=keywords, extra_data=msiinfo_results ) + return models.PackageData.from_data(package_data, package_only) -def msi_parse(location, +def msi_parse( + location, datasource_id='msi_installer', package_type='msi', + package_only=False, ): """ Return PackageData from ``location`` @@ -175,6 +179,7 @@ def msi_parse(location, msiinfo_results=info, datasource_id=datasource_id, package_type=package_type, + package_only=package_only, ) else: return models.PackageData( @@ -192,5 +197,5 @@ class MsiInstallerHandler(models.DatafileHandler): documentation_url = 'https://docs.microsoft.com/en-us/windows/win32/msi/windows-installer-portal' @classmethod - def parse(cls, location): - yield msi_parse(location) + def parse(cls, location, package_only=False): + yield msi_parse(location, package_only) diff --git a/src/packagedcode/npm.py b/src/packagedcode/npm.py index c831aa8e855..18ad27d7564 100644 --- a/src/packagedcode/npm.py +++ b/src/packagedcode/npm.py @@ -185,7 +185,7 @@ class NpmPackageJsonHandler(BaseNpmHandler): documentation_url = 'https://docs.npmjs.com/cli/v8/configuring-npm/package-json' @classmethod - def _parse(cls, json_data): + def _parse(cls, json_data, package_only=False): name = json_data.get('name') version = json_data.get('version') homepage_url = json_data.get('homepage', '') @@ -200,7 +200,7 @@ def _parse(cls, json_data): namespace, name = split_scoped_package_name(name) urls = get_urls(namespace, name, version) - package = models.PackageData( + package_data = dict( datasource_id=cls.datasource_id, type=cls.default_package_type, primary_language=cls.default_primary_language, @@ -211,6 +211,7 @@ def _parse(cls, json_data): homepage_url=homepage_url, **urls, ) + package = 
models.PackageData.from_data(package_data, package_only) vcs_revision = json_data.get('gitHead') or None # mapping of top level package.json items to a function accepting as @@ -249,7 +250,8 @@ def _parse(cls, json_data): lics = json_data.get('licenses') package = licenses_mapper(lic, lics, package) - package.populate_license_fields() + if not package_only: + package.populate_license_fields() if TRACE: logger_debug(f'NpmPackageJsonHandler: parse: package: {package.to_dict()}') @@ -257,17 +259,17 @@ def _parse(cls, json_data): return package @classmethod - def parse(cls, location): + def parse(cls, location, package_only=False): with io.open(location, encoding='utf-8') as loc: json_data = json.load(loc) - yield cls._parse(json_data) + yield cls._parse(json_data, package_only) class BaseNpmLockHandler(BaseNpmHandler): @classmethod - def parse(cls, location): + def parse(cls, location, package_only=False): with io.open(location, encoding='utf-8') as loc: package_data = json.load(loc) @@ -280,7 +282,7 @@ def parse(cls, location): extra_data = dict(lockfile_version=lockfile_version) # this is the top level element that we return - root_package_data = models.PackageData( + root_package_mapping = dict( datasource_id=cls.datasource_id, type=cls.default_package_type, primary_language=cls.default_primary_language, @@ -290,6 +292,7 @@ def parse(cls, location): extra_data=extra_data, **get_urls(root_ns, root_name, root_version) ) + root_package_data = models.PackageData.from_data(root_package_mapping, package_only) # https://docs.npmjs.com/cli/v8/configuring-npm/package-lock-json#lockfileversion if lockfile_version == 1: @@ -359,7 +362,7 @@ def parse(cls, location): integrity = dep_data.get('integrity') misc.update(get_algo_hexsum(integrity).items()) - resolved_package = models.PackageData( + resolved_package_mapping = dict( datasource_id=cls.datasource_id, type=cls.default_package_type, primary_language=cls.default_primary_language, @@ -369,6 +372,7 @@ def parse(cls, location): extracted_license_statement=extracted_license_statement, **misc, ) + resolved_package = models.PackageData.from_data(resolved_package_mapping, package_only) # these are paths to the root of the installed package in v2 if dep: resolved_package.file_references = [models.FileReference(path=dep)], @@ -490,7 +494,7 @@ def is_datafile(cls, location, filetypes=tuple()): return super().is_datafile(location, filetypes=filetypes) and is_yarn_v2(location) @classmethod - def parse(cls, location): + def parse(cls, location, package_only=False): """ Parse a new yarn.lock v2 YAML format which looks like this: @@ -545,12 +549,13 @@ def parse(cls, location): ) top_dependencies.append(dependency) - yield models.PackageData( + package_data = dict( datasource_id=cls.datasource_id, type=cls.default_package_type, primary_language=cls.default_primary_language, dependencies=top_dependencies, ) + yield models.PackageData.from_data(package_data, package_only) class YarnLockV1Handler(BaseNpmHandler): @@ -569,7 +574,7 @@ def is_datafile(cls, location, filetypes=tuple()): return super().is_datafile(location, filetypes=filetypes) and not is_yarn_v2(location) @classmethod - def parse(cls, location): + def parse(cls, location, package_only=False): """ Parse a classic yarn.lock format which looks like this: "@babel/core@^7.1.0", "@babel/core@^7.3.4": @@ -657,7 +662,7 @@ def parse(cls, location): misc.update(get_algo_hexsum(integrity).items()) # we create a resolved package with the details - resolved_package_data = models.PackageData( + resolved_package_mapping
= dict( datasource_id=cls.datasource_id, type=cls.default_package_type, namespace=ns, @@ -666,6 +671,7 @@ def parse(cls, location): primary_language=cls.default_primary_language, **misc, ) + resolved_package_data = models.PackageData.from_data(resolved_package_mapping, package_only) # we add the sub-deps to the resolved package for subns, subname, subconstraint in sub_dependencies: @@ -701,12 +707,13 @@ def parse(cls, location): ) dependencies.append(dep) - yield models.PackageData( + package_data = dict( datasource_id=cls.datasource_id, type=cls.default_package_type, primary_language=cls.default_primary_language, dependencies=dependencies, ) + yield models.PackageData.from_data(package_data, package_only) def get_checksum_and_url(url): diff --git a/src/packagedcode/nuget.py b/src/packagedcode/nuget.py index dee1afb39d5..de584038d48 100644 --- a/src/packagedcode/nuget.py +++ b/src/packagedcode/nuget.py @@ -106,7 +106,7 @@ class NugetNuspecHandler(models.DatafileHandler): documentation_url = 'https://docs.microsoft.com/en-us/nuget/reference/nuspec' @classmethod - def parse(cls, location): + def parse(cls, location, package_only=False): with open(location, 'rb') as loc: parsed = xmltodict.parse(loc) @@ -163,7 +163,7 @@ def parse(cls, location): elif 'licenseUrl' in nuspec: extracted_license_statement = nuspec.get('licenseUrl') - yield models.PackageData( + package_data = dict( datasource_id=cls.datasource_id, type=cls.default_package_type, name=name, @@ -177,4 +177,5 @@ def parse(cls, location): vcs_url=vcs_url, **urls, ) + yield models.PackageData.from_data(package_data, package_only) diff --git a/src/packagedcode/opam.py b/src/packagedcode/opam.py index 94e2f4942d7..f2fde2e017e 100644 --- a/src/packagedcode/opam.py +++ b/src/packagedcode/opam.py @@ -31,7 +31,7 @@ def get_package_root(cls, resource, codebase): return resource.parent(codebase) @classmethod - def parse(cls, location): + def parse(cls, location, package_only=False): opams = parse_opam(location) package_dependencies = [] @@ -90,7 +90,7 @@ def parse(cls, location): ) ) - yield models.PackageData( + package_data = dict( datasource_id=cls.datasource_id, type=cls.default_package_type, name=name, @@ -111,6 +111,7 @@ def parse(cls, location): repository_homepage_url=repository_homepage_url, primary_language=cls.default_primary_language ) + yield models.PackageData.from_data(package_data, package_only) @classmethod def assign_package_to_resources(cls, package, resource, codebase, package_adder): diff --git a/src/packagedcode/phpcomposer.py b/src/packagedcode/phpcomposer.py index b729e0bfc25..d217e5283d4 100644 --- a/src/packagedcode/phpcomposer.py +++ b/src/packagedcode/phpcomposer.py @@ -58,7 +58,7 @@ class PhpComposerJsonHandler(BasePhpComposerHandler): documentation_url = 'https://getcomposer.org/doc/04-schema.md' @classmethod - def parse(cls, location): + def parse(cls, location, package_only=False): """ Yield one or more Package manifest objects given a file ``location`` pointing to a package archive, manifest or similar. 
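From a caller's perspective the flag defers license detection on the parsed data; a sketch against this handler (the composer.json path is illustrative):

    from packagedcode.phpcomposer import PhpComposerJsonHandler

    [pkg] = PhpComposerJsonHandler.parse('composer.json', package_only=True)
    # raw fields parsed from the JSON (including any extracted_license_statement)
    # are kept as-is, but the computed declared_license_expression stays empty
    # until a detection pass runs (e.g. a parse with package_only=False).
    assert pkg.declared_license_expression is None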
@@ -69,7 +69,7 @@ def parse(cls, location): with io.open(location, encoding='utf-8') as loc: package_json = json.load(loc) - yield build_package_data(package_json) + yield build_package_data(package_json, package_only) def get_repository_homepage_url(namespace, name): @@ -86,7 +86,7 @@ def get_api_data_url(namespace, name): return f'https://packagist.org/p/packages/{name}.json' -def build_package_data(package_data): +def build_package_data(package_data, package_only=False): # Note: A composer.json without name and description is not a usable PHP # composer package. Name and description fields are required but only for @@ -103,7 +103,7 @@ def build_package_data(package_data): else: ns, _, name = ns_name.rpartition('/') - package = models.PackageData( + package_mapping = dict( datasource_id=PhpComposerJsonHandler.datasource_id, type=PhpComposerJsonHandler.default_package_type, namespace=ns, @@ -112,6 +112,7 @@ def build_package_data(package_data): api_data_url=get_api_data_url(ns, name), primary_language=PhpComposerJsonHandler.default_primary_language, ) + package = models.PackageData.from_data(package_mapping, package_only) # mapping of top level composer.json items to the Package object field name plain_fields = [ @@ -157,7 +158,9 @@ def build_package_data(package_data): vendor_mapper(package) # Per https://getcomposer.org/doc/04-schema.md#license this is an expression - package.populate_license_fields() + if not package_only: + package.populate_license_fields() + return package @@ -170,16 +173,16 @@ class PhpComposerLockHandler(BasePhpComposerHandler): documentation_url = 'https://getcomposer.org/doc/01-basic-usage.md#commit-your-composer-lock-file-to-version-control' @classmethod - def parse(cls, location): + def parse(cls, location, package_only=False): with io.open(location, encoding='utf-8') as loc: package_data = json.load(loc) packages = [ - build_package_data(p) + build_package_data(p, package_only) for p in package_data.get('packages', []) ] packages_dev = [ - build_package_data(p) + build_package_data(p, package_only) for p in package_data.get('packages-dev', []) ] @@ -192,12 +195,13 @@ def parse(cls, location): for p in packages_dev ] - yield models.PackageData( + package_data = dict( datasource_id=cls.datasource_id, type=cls.default_package_type, primary_language=cls.default_primary_language, dependencies=required_deps + required_dev_deps ) + yield models.PackageData.from_data(package_data, package_only) for package in packages + packages_dev: yield package diff --git a/src/packagedcode/pubspec.py b/src/packagedcode/pubspec.py index a6abd5b8642..df18d409ba4 100644 --- a/src/packagedcode/pubspec.py +++ b/src/packagedcode/pubspec.py @@ -60,11 +60,11 @@ class DartPubspecYamlHandler(BaseDartPubspecHandler): documentation_url = 'https://dart.dev/tools/pub/pubspec' @classmethod - def parse(cls, location): + def parse(cls, location, package_only=False): with open(location) as inp: pubspec_data = saneyaml.load(inp.read()) - package_data = build_package(pubspec_data) + package_data = build_package(pubspec_data, package_only) if package_data: yield package_data @@ -78,18 +78,19 @@ class DartPubspecLockHandler(BaseDartPubspecHandler): documentation_url = 'https://web.archive.org/web/20220330081004/https://gpalma.pt/blog/what-is-the-pubspec-lock/' @classmethod - def parse(cls, location): + def parse(cls, location, package_only=False): with open(location) as inp: locks_data = saneyaml.load(inp.read()) dependencies = list(collect_locks(locks_data)) - yield models.PackageData( + package_data 
= dict( datasource_id=cls.datasource_id, type=cls.default_package_type, primary_language=cls.default_primary_language, dependencies=dependencies ) + yield models.PackageData.from_data(package_data, package_only) def collect_locks(locks_data): @@ -238,7 +239,7 @@ def build_dep(name, version, scope, is_runtime=True, is_optional=False): return dep -def build_package(pubspec_data): +def build_package(pubspec_data, package_only=False): """ Return a package object from a package data mapping or None """ @@ -315,7 +316,7 @@ def add_to_extra_if_present(_key): add_to_extra_if_present('executables') add_to_extra_if_present('publish_to') - return models.PackageData( + package_data = dict( datasource_id=DartPubspecYamlHandler.datasource_id, type=DartPubspecYamlHandler.default_primary_language, primary_language=DartPubspecYamlHandler.default_primary_language, @@ -333,3 +334,4 @@ def add_to_extra_if_present(_key): api_data_url=api_data_url, repository_download_url=repository_download_url, ) + return models.PackageData.from_data(package_data, package_only) diff --git a/src/packagedcode/pypi.py b/src/packagedcode/pypi.py index d90dacbabef..33f2ec394ea 100644 --- a/src/packagedcode/pypi.py +++ b/src/packagedcode/pypi.py @@ -79,11 +79,12 @@ class PythonEggPkgInfoFile(models.DatafileHandler): documentation_url = 'https://peps.python.org/pep-0376/' @classmethod - def parse(cls, location): + def parse(cls, location, package_only=False): yield parse_metadata( location=location, datasource_id=cls.datasource_id, package_type=cls.default_package_type, + package_only=package_only, ) @classmethod @@ -103,11 +104,12 @@ class PythonEditableInstallationPkgInfoFile(models.DatafileHandler): documentation_url = 'https://peps.python.org/pep-0376/' @classmethod - def parse(cls, location): + def parse(cls, location, package_only=False): yield parse_metadata( location=location, datasource_id=cls.datasource_id, package_type=cls.default_package_type, + package_only=package_only, ) @classmethod @@ -320,11 +322,12 @@ def is_datafile(cls, location): ) @classmethod - def parse(cls, location): + def parse(cls, location, package_only=False): yield parse_metadata( location=location, datasource_id=cls.datasource_id, package_type=cls.default_package_type, + package_only=package_only, ) @@ -337,11 +340,12 @@ class PythonInstalledWheelMetadataFile(models.DatafileHandler): documentation_url = 'https://packaging.python.org/en/latest/specifications/core-metadata/' @classmethod - def parse(cls, location): + def parse(cls, location, package_only=False): yield parse_metadata( location=location, datasource_id=cls.datasource_id, package_type=cls.default_package_type, + package_only=package_only, ) @classmethod @@ -457,7 +461,7 @@ class PyprojectTomlHandler(models.NonAssemblableDatafileHandler): META_DIR_SUFFIXES = '.dist-info', '.egg-info', 'EGG-INFO', -def parse_metadata(location, datasource_id, package_type): +def parse_metadata(location, datasource_id, package_type, package_only=False): """ Return a PackageData object from a PKG-INFO or METADATA file at ``location`` which is a path string or pathlib.Path-like object (including a possible zip @@ -487,7 +491,7 @@ def parse_metadata(location, datasource_id, package_type): file_references = list(get_file_references(dist)) - return models.PackageData( + package_data = dict( datasource_id=datasource_id, type=package_type, primary_language='Python', @@ -502,6 +506,7 @@ def parse_metadata(location, datasource_id, package_type): extra_data=extra_data, **urls, ) + return 
diff --git a/src/packagedcode/pypi.py b/src/packagedcode/pypi.py
index d90dacbabef..33f2ec394ea 100644
--- a/src/packagedcode/pypi.py
+++ b/src/packagedcode/pypi.py
@@ -79,11 +79,12 @@ class PythonEggPkgInfoFile(models.DatafileHandler):
     documentation_url = 'https://peps.python.org/pep-0376/'
 
     @classmethod
-    def parse(cls, location):
+    def parse(cls, location, package_only=False):
         yield parse_metadata(
             location=location,
             datasource_id=cls.datasource_id,
             package_type=cls.default_package_type,
+            package_only=package_only,
         )
 
     @classmethod
@@ -103,11 +104,12 @@ class PythonEditableInstallationPkgInfoFile(models.DatafileHandler):
     documentation_url = 'https://peps.python.org/pep-0376/'
 
     @classmethod
-    def parse(cls, location):
+    def parse(cls, location, package_only=False):
         yield parse_metadata(
             location=location,
             datasource_id=cls.datasource_id,
             package_type=cls.default_package_type,
+            package_only=package_only,
         )
 
     @classmethod
@@ -320,11 +322,12 @@ def is_datafile(cls, location):
         )
 
     @classmethod
-    def parse(cls, location):
+    def parse(cls, location, package_only=False):
         yield parse_metadata(
             location=location,
             datasource_id=cls.datasource_id,
             package_type=cls.default_package_type,
+            package_only=package_only,
         )
 
 
@@ -337,11 +340,12 @@ class PythonInstalledWheelMetadataFile(models.DatafileHandler):
     documentation_url = 'https://packaging.python.org/en/latest/specifications/core-metadata/'
 
     @classmethod
-    def parse(cls, location):
+    def parse(cls, location, package_only=False):
         yield parse_metadata(
             location=location,
             datasource_id=cls.datasource_id,
             package_type=cls.default_package_type,
+            package_only=package_only,
         )
 
     @classmethod
@@ -457,7 +461,7 @@ class PyprojectTomlHandler(models.NonAssemblableDatafileHandler):
 META_DIR_SUFFIXES = '.dist-info', '.egg-info', 'EGG-INFO',
 
 
-def parse_metadata(location, datasource_id, package_type):
+def parse_metadata(location, datasource_id, package_type, package_only=False):
     """
     Return a PackageData object from a PKG-INFO or METADATA file at
     ``location`` which is a path string or pathlib.Path-like object (including a possible zip
@@ -487,7 +491,7 @@ def parse_metadata(location, datasource_id, package_type):
 
     file_references = list(get_file_references(dist))
 
-    return models.PackageData(
+    package_data = dict(
         datasource_id=datasource_id,
         type=package_type,
         primary_language='Python',
@@ -502,6 +506,7 @@ def parse_metadata(location, datasource_id, package_type):
         extra_data=extra_data,
         **urls,
     )
+    return models.PackageData.from_data(package_data, package_only)
 
 
 def urlsafe_b64decode(data):
@@ -551,7 +556,7 @@ class PypiWheelHandler(models.DatafileHandler):
     documentation_url = 'https://peps.python.org/pep-0427/'
 
     @classmethod
-    def parse(cls, location):
+    def parse(cls, location, package_only=False):
         with zipfile.ZipFile(location) as zf:
             for path in ZipPath(zf).iterdir():
                 if not path.name.endswith(META_DIR_SUFFIXES):
@@ -564,6 +569,7 @@ def parse(cls, location):
                         location=metapath,
                         datasource_id=cls.datasource_id,
                         package_type=cls.default_package_type,
+                        package_only=package_only,
                     )
 
 
@@ -577,7 +583,7 @@ class PypiEggHandler(models.DatafileHandler):
     documentation_url = 'https://web.archive.org/web/20210604075235/http://peak.telecommunity.com/DevCenter/PythonEggs'
 
     @classmethod
-    def parse(cls, location):
+    def parse(cls, location, package_only=False):
         with zipfile.ZipFile(location) as zf:
             for path in ZipPath(zf).iterdir():
                 if not path.name.endswith(META_DIR_SUFFIXES):
@@ -591,6 +597,7 @@ def parse(cls, location):
                         location=metapath,
                         datasource_id=cls.datasource_id,
                         package_type=cls.default_package_type,
+                        package_only=package_only,
                     )
 
 
@@ -610,7 +617,7 @@ def is_datafile(cls, location, filetypes=tuple()):
         return True
 
     @classmethod
-    def parse(cls, location):
+    def parse(cls, location, package_only=False):
         # FIXME: add dependencies
 
         try:
@@ -622,7 +629,7 @@ def parse(cls, location):
         version = sdist.version
 
         urls, extra_data = get_urls(metainfo=sdist, name=name, version=version)
-        yield models.PackageData(
+        package_data = dict(
             datasource_id=cls.datasource_id,
             type=cls.default_package_type,
             primary_language=cls.default_primary_language,
@@ -635,6 +642,7 @@ def parse(cls, location):
             extra_data=extra_data,
             **urls,
         )
+        yield models.PackageData.from_data(package_data, package_only)
 
 
 class PythonSetupPyHandler(BaseExtractedPythonLayout):
@@ -646,7 +654,7 @@ class PythonSetupPyHandler(BaseExtractedPythonLayout):
     documentation_url = 'https://docs.python.org/3.11/distutils/setupscript.html'
 
     @classmethod
-    def parse(cls, location):
+    def parse(cls, location, package_only=False):
         setup_args = get_setup_py_args(location)
 
         # it may be legit to have a name-less package?
@@ -664,7 +672,7 @@ def parse(cls, location):
         python_requires = get_setup_py_python_requires(setup_args)
         extra_data.update(python_requires)
 
-        yield models.PackageData(
+        package_data = dict(
             datasource_id=cls.datasource_id,
             type=cls.default_package_type,
             primary_language=cls.default_primary_language,
@@ -678,6 +686,7 @@ def parse(cls, location):
             extra_data=extra_data,
             **urls,
         )
+        yield models.PackageData.from_data(package_data, package_only)
 
 
 class ResolvedPurl(NamedTuple):
@@ -694,7 +703,7 @@ class BaseDependencyFileHandler(models.DatafileHandler):
     """
 
     @classmethod
-    def parse(cls, location):
+    def parse(cls, location, package_only=False):
         file_name = fileutils.file_name(location)
 
         dependency_type = get_dparse2_supported_file_name(file_name)
@@ -705,12 +714,13 @@ def parse(cls, location):
             location=location,
             file_name=dependency_type,
         )
-        yield models.PackageData(
+        package_data = dict(
             datasource_id=cls.datasource_id,
             type=cls.default_package_type,
             primary_language=cls.default_primary_language,
             dependencies=dependencies,
         )
+        yield models.PackageData.from_data(package_data, package_only)
 
 
 class SetupCfgHandler(BaseExtractedPythonLayout):
@@ -722,7 +732,7 @@ class SetupCfgHandler(BaseExtractedPythonLayout):
     documentation_url = 'https://peps.python.org/pep-0390/'
 
     @classmethod
-    def parse(cls, location):
+    def parse(cls, location, package_only=False):
         metadata = {}
         parser = ConfigParser()
 
@@ -801,7 +811,7 @@ def parse(cls, location):
                 extracted_license_statement = ''
             extracted_license_statement += f" license_files: {license_file_references}"
 
-        yield models.PackageData(
+        package_data = dict(
             datasource_id=cls.datasource_id,
             type=cls.default_package_type,
             name=metadata.get('name'),
@@ -812,6 +822,7 @@ def parse(cls, location):
             dependencies=dependent_packages,
             extracted_license_statement=extracted_license_statement,
         )
+        yield models.PackageData.from_data(package_data, package_only)
 
     @classmethod
     def parse_reqs(cls, reqs, scope):
@@ -873,7 +884,7 @@ class PipfileLockHandler(BaseDependencyFileHandler):
     documentation_url = 'https://github.com/pypa/pipfile'
 
     @classmethod
-    def parse(cls, location):
+    def parse(cls, location, package_only=False):
         with open(location) as f:
             content = f.read()
 
@@ -890,13 +901,14 @@ def parse(cls, location):
             file_name='Pipfile.lock',
         )
 
-        yield models.PackageData(
+        package_data = dict(
             datasource_id=cls.datasource_id,
             type=cls.default_package_type,
             primary_language=cls.default_primary_language,
             sha256=sha256,
             dependencies=dependent_packages,
        )
+        yield models.PackageData.from_data(package_data, package_only)
 
 
 class PipRequirementsFileHandler(BaseDependencyFileHandler):
@@ -919,15 +931,16 @@ class PipRequirementsFileHandler(BaseDependencyFileHandler):
     documentation_url = 'https://pip.pypa.io/en/latest/reference/requirements-file-format/'
 
     @classmethod
-    def parse(cls, location):
+    def parse(cls, location, package_only=False):
         dependencies, extra_data = get_requirements_txt_dependencies(location=location)
 
-        yield models.PackageData(
+        package_data = dict(
             datasource_id=cls.datasource_id,
             type=cls.default_package_type,
             primary_language=cls.default_primary_language,
             dependencies=dependencies,
             extra_data=extra_data,
         )
+        yield models.PackageData.from_data(package_data, package_only)
 
 # TODO: enable nested load
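
For the dependency-file handlers above, package_only leaves the collected dependencies intact and only defers the license/copyright post-processing; at the command line this is what the new --package-only option (documented in the help-text change at the end of this diff) switches on. A hedged usage sketch, with a hypothetical requirements file path:

    # Hedged usage sketch; 'requirements.txt' is a hypothetical path and must
    # exist. parse() is a generator that yields PackageData objects.
    from packagedcode.pypi import PipRequirementsFileHandler

    for package_data in PipRequirementsFileHandler.parse(
        location='requirements.txt',
        package_only=True,
    ):
        # the pinned requirements are still reported as dependencies;
        # license detection on the manifest is simply skipped
        print(package_data.purl, [dep.purl for dep in package_data.dependencies])
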
diff --git a/src/packagedcode/readme.py b/src/packagedcode/readme.py
index 63741e86648..a03886929ba 100644
--- a/src/packagedcode/readme.py
+++ b/src/packagedcode/readme.py
@@ -55,11 +55,11 @@ class ReadmeHandler(models.NonAssemblableDatafileHandler):
     documentation_url = ''
 
     @classmethod
-    def parse(cls, location):
+    def parse(cls, location, package_only=False):
         with open(location, encoding='utf-8') as loc:
             readme_manifest = loc.read()
 
-        package_data = build_package(readme_manifest)
+        package_data = build_package(readme_manifest, package_only)
 
         if not package_data.name:
             # If no name was detected for the Package, then we use the basename
@@ -71,7 +71,7 @@ def parse(cls, location):
         yield package_data
 
 
-def build_package(readme_manifest):
+def build_package(readme_manifest, package_only=False):
     """
     Return a Package object from a readme_manifest mapping (from a
     README.chromium file or similar) or None.
@@ -104,5 +104,8 @@ def build_package(readme_manifest):
                 continue
             setattr(package, package_key, value)
 
-    package.populate_license_fields()
+    if not package_only:
+        package.populate_license_fields()
+        package.populate_holder_field()
+
     return package
diff --git a/src/packagedcode/rpm.py b/src/packagedcode/rpm.py
index bcbe8294070..a3314785fe2 100644
--- a/src/packagedcode/rpm.py
+++ b/src/packagedcode/rpm.py
@@ -124,7 +124,7 @@ def to_string(self):
 class BaseRpmInstalledDatabaseHandler(models.DatafileHandler):
 
     @classmethod
-    def parse(cls, location):
+    def parse(cls, location, package_only=False):
         # we receive the location of the Package database file and we need to
         # scan the parent which is the directory that contains the rpmdb
         loc_path = Path(location)
@@ -136,6 +136,7 @@ def parse(cls, location):
             location=xmlish_loc,
             datasource_id=cls.datasource_id,
             package_type=cls.default_package_type,
+            package_only=package_only,
         )
         # TODO: package_data.namespace = cls.default_package_namespace
         return package_data
@@ -274,7 +275,7 @@ class RpmArchiveHandler(models.DatafileHandler):
     documentation_url = 'https://en.wikipedia.org/wiki/RPM_Package_Manager'
 
     @classmethod
-    def parse(cls, location):
+    def parse(cls, location, package_only=False):
         rpm_tags = get_rpm_tags(location, include_desc=True)
 
         if TRACE: logger_debug('recognize: rpm_tags', rpm_tags)
@@ -351,7 +352,7 @@ def parse(cls, location):
         )
         logger_debug('recognize: data to create a package:\n', data)
 
-        package = models.PackageData(
+        package_data = dict(
             datasource_id=cls.datasource_id,
             type=cls.default_package_type,
             # TODO: namespace=cls.default_package_namespace,
@@ -365,9 +366,9 @@ def parse(cls, location):
         )
 
         if TRACE:
-            logger_debug('recognize: created package:\n', package)
+            logger_debug('recognize: created package:\n', name)
 
-        yield package
+        yield models.PackageData.from_data(package_data, package_only)
 
 
 ALGO_BY_ID = {
diff --git a/src/packagedcode/rpm_installed.py b/src/packagedcode/rpm_installed.py
index 5d01436d2e1..8e6a12261c5 100644
--- a/src/packagedcode/rpm_installed.py
+++ b/src/packagedcode/rpm_installed.py
@@ -36,7 +36,7 @@ def logger_debug(*args):
         return logger.debug(' '.join(isinstance(a, str) and a or repr(a) for a in args))
 
 
-def parse_rpm_xmlish(location, datasource_id, package_type):
+def parse_rpm_xmlish(location, datasource_id, package_type, package_only=False):
     """
     Yield PackageData built from an RPM XML'ish file at ``location``. This is a
     file created with the rpm CLI with the xml query option.
@@ -58,6 +58,7 @@ def parse_rpm_xmlish(location, datasource_id, package_type):
             rpm_tags=tags,
             datasource_id=datasource_id,
             package_type=package_type,
+            package_only=package_only,
         )
 
 
@@ -133,7 +134,7 @@ def collect_tags(raw_tags):
         yield name, value_type, value
 
 
-def build_package(rpm_tags, datasource_id, package_type, package_namespace=None):
+def build_package(rpm_tags, datasource_id, package_type, package_namespace=None, package_only=False):
     """
     Return a PackageData object from an ``rpm_tags`` iterable of (name,
     value_type, value) tuples.
@@ -157,8 +158,12 @@ def build_package(rpm_tags, datasource_id, package_type, package_namespace=None)
         except Exception as e:
             raise Exception(value, converted) from e
         converted.update(handled)
+
+    current_filerefs = converted.get("current_filerefs", None)
+    if current_filerefs:
+        converted.pop("current_filerefs")
 
-    package_data = models.PackageData.from_dict(converted)
+    package_data = models.PackageData.from_data(converted, package_only)
     return package_data
 
################################################################################
@@ -183,7 +188,10 @@ def handler(value, **kwargs):
 
 
 def size_handler(value, **kwargs):
-    return {'size': int(value)}
+    if not value == '0':
+        return {'size': int(value)}
+    else:
+        return {'size': None}
 
 
 def arch_handler(value, **kwargs):
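
The size_handler change above makes a literal RPM size of '0' come out as None ("unknown") instead of the integer 0. A quick, runnable illustration of the new behavior as defined in the hunk above:

    # Behavior of the updated size_handler(); assumes scancode with this patch.
    from packagedcode.rpm_installed import size_handler

    assert size_handler('4096') == {'size': 4096}   # normal sizes still parse to int
    assert size_handler('0') == {'size': None}      # zero is now reported as unknown
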
diff --git a/src/packagedcode/rubygems.py b/src/packagedcode/rubygems.py
index 6b77d6945d6..704005c0d21 100644
--- a/src/packagedcode/rubygems.py
+++ b/src/packagedcode/rubygems.py
@@ -41,12 +41,13 @@ class GemArchiveHandler(models.DatafileHandler):
     )
 
     @classmethod
-    def parse(cls, location):
+    def parse(cls, location, package_only=False):
         metadata = extract_gem_metadata(location)
         metadata = saneyaml.load(metadata)
         yield build_rubygem_package_data(
             gem_data=metadata,
             datasource_id=cls.datasource_id,
+            package_only=package_only,
         )
 
 
@@ -84,13 +85,14 @@ class GemMetadataArchiveExtractedHandler(models.DatafileHandler):
     )
 
     @classmethod
-    def parse(cls, location):
+    def parse(cls, location, package_only=False):
         with open(location, 'rb') as met:
             metadata = met.read()
         metadata = saneyaml.load(metadata)
         yield build_rubygem_package_data(
             gem_data=metadata,
             datasource_id=cls.datasource_id,
+            package_only=package_only,
         )
 
     @classmethod
@@ -129,7 +131,7 @@ class GemspecHandler(models.DatafileHandler):
     documentation_url = 'https://guides.rubygems.org/specification-reference/'
 
     @classmethod
-    def parse(cls, location):
+    def parse(cls, location, package_only=False):
         gemspec = spec.parse_spec(
             location=location,
             package_type=cls.default_package_type,
@@ -152,7 +154,7 @@ def parse(cls, location):
 
         urls = get_urls(name=name, version=version)
 
-        yield models.PackageData(
+        package_data = dict(
             datasource_id=cls.datasource_id,
             type=cls.default_package_type,
             name=name,
@@ -166,6 +168,7 @@ def parse(cls, location):
             dependencies=dependencies,
             **urls
         )
+        yield models.PackageData.from_data(package_data, package_only)
 
 class GemspecInExtractedGemHandler(GemspecHandler):
     datasource_id = 'gemspec_extracted'
@@ -234,7 +237,7 @@ class GemfileLockHandler(BaseGemProjectHandler):
     documentation_url = 'https://bundler.io/man/gemfile.5.html'
 
     @classmethod
-    def parse(cls, location):
+    def parse(cls, location, package_only=False):
         gemfile_lock = GemfileLockParser(location)
         all_gems = list(gemfile_lock.all_gems.values())
         if not all_gems:
@@ -258,7 +261,7 @@ def parse(cls, location):
             ]
 
             urls = get_urls(primary_gem.name, primary_gem.version)
-            yield models.PackageData(
+            package_data = dict(
                 datasource_id=cls.datasource_id,
                 primary_language=cls.default_primary_language,
                 type=cls.default_package_type,
@@ -267,6 +270,7 @@ def parse(cls, location):
                 dependencies=deps,
                 **urls
             )
+            yield models.PackageData.from_data(package_data, package_only)
         else:
             deps = [
                 models.DependentPackage(
@@ -284,12 +288,13 @@ def parse(cls, location):
                 )
                 for gem in all_gems
             ]
-            yield models.PackageData(
+            package_data = dict(
                 datasource_id=cls.datasource_id,
                 type=cls.default_package_type,
                 dependencies=deps,
                 primary_language=cls.default_primary_language,
            )
+            yield models.PackageData.from_data(package_data, package_only)
 
 
 class GemfileLockInExtractedGemHandler(GemfileLockHandler):
@@ -417,7 +422,7 @@ def extract_gem_metadata(location):
         fileutils.delete(extract_loc)
 
 
-def build_rubygem_package_data(gem_data, datasource_id):
+def build_rubygem_package_data(gem_data, datasource_id, package_only=False):
     """
     Return a PackageData for ``datasource_id`` built from a Gem `gem_data`
     mapping or None. The ``gem_data`` can come from a .gemspec or .gem/metadata.
@@ -461,7 +466,7 @@ def build_rubygem_package_data(gem_data, datasource_id):
     dependencies = get_dependencies(gem_data.get('dependencies'))
     file_references = get_file_references(metadata.get('files'))
 
-    package_data = models.PackageData(
+    package_mapping = dict(
         datasource_id=datasource_id,
         type=GemArchiveHandler.default_package_type,
         primary_language=GemArchiveHandler.default_primary_language,
@@ -477,6 +482,7 @@ def build_rubygem_package_data(gem_data, datasource_id):
         dependencies=dependencies,
         **urls,
     )
+    package_data = models.PackageData.from_data(package_mapping, package_only)
 
     # we can have one singular or a plural list of authors
     authors = gem_data.get('authors') or []
diff --git a/src/packagedcode/win_pe.py b/src/packagedcode/win_pe.py
index a51ceb9496e..ce040eaedfe 100644
--- a/src/packagedcode/win_pe.py
+++ b/src/packagedcode/win_pe.py
@@ -276,7 +276,7 @@ def is_datafile(cls, location, filetypes=tuple()):
         return True
 
     @classmethod
-    def parse(cls, location):
+    def parse(cls, location, package_only=False):
         infos = pe_info(location)
 
         version = get_first(
@@ -328,7 +328,7 @@ def parse(cls, location):
         parties = [Party(type=party_org, role='author', name=cname)]
         homepage_url = get_first(infos, 'URL', 'WWW')
 
-        yield models.PackageData(
+        package_data = dict(
             datasource_id=cls.datasource_id,
             type=cls.default_package_type,
             name=name,
@@ -340,3 +340,4 @@ def parse(cls, location):
             parties=parties,
             homepage_url=homepage_url,
         )
+        yield models.PackageData.from_data(package_data, package_only)
diff --git a/src/packagedcode/win_reg.py b/src/packagedcode/win_reg.py
index e4e59cd3b1e..aac41691f01 100644
--- a/src/packagedcode/win_reg.py
+++ b/src/packagedcode/win_reg.py
@@ -77,6 +77,7 @@ def get_installed_dotnet_versions_from_hive(
     datasource_id,
     package_type,
     registry_path='\\Microsoft\\NET Framework Setup\\NDP',
+    package_only=False,
 ):
     """
     Yield PackageData for the installed versions of .NET framework from the
@@ -90,6 +91,7 @@ def get_installed_dotnet_versions_from_hive(
         registry_tree=registry_tree,
         datasource_id=datasource_id,
         package_type=package_type,
+        package_only=package_only,
     )
 
 
@@ -97,6 +99,7 @@ def get_installed_dotnet_versions_from_regtree(
     registry_tree,
     datasource_id,
     package_type,
+    package_only=False,
 ):
     """
     Yield PackageData for the installed versions of .NET framework from a
@@ -122,13 +125,14 @@ def get_installed_dotnet_versions_from_regtree(
             if key == 'InstallPath':
                 file_references.append(models.FileReference(path=value))
 
-        yield models.PackageData(
+        package_data = dict(
            datasource_id=datasource_id,
            type=package_type,
            name='microsoft-dot-net-framework',
            version=version,
            file_references=file_references,
        )
+        yield models.PackageData.from_data(package_data, package_only)
 
 
 def get_installed_windows_programs_from_hive(
@@ -136,6 +140,7 @@ def get_installed_windows_programs_from_hive(
     datasource_id,
     package_type,
     registry_path='\\Microsoft\\Windows\\CurrentVersion\\Uninstall',
+    package_only=False,
 ):
     """
     Yield installed Windows PackageData from a Windows registry file at
@@ -151,6 +156,7 @@ def get_installed_windows_programs_from_hive(
         registry_tree=registry_tree,
         datasource_id=datasource_id,
         package_type=package_type,
+        package_only=package_only,
     )
 
 
@@ -158,6 +164,7 @@ def get_installed_windows_programs_from_regtree(
     registry_tree,
     datasource_id,
     package_type,
+    package_only=False,
 ):
     """
     Yield installed Windows PackageData from a Windows ``registry_tree``.
@@ -213,7 +220,7 @@ def get_installed_windows_programs_from_regtree(
         if uninstall_string:
             file_references.append(models.FileReference(path=uninstall_string))
 
-        yield models.PackageData(
+        package_data = dict(
             datasource_id=datasource_id,
             type=package_type,
             name=name,
@@ -222,12 +229,14 @@ def get_installed_windows_programs_from_regtree(
             homepage_url=homepage_url,
             file_references=file_references,
         )
+        yield models.PackageData.from_data(package_data, package_only)
 
 
 def get_packages_from_registry_from_hive(
     location,
     datasource_id,
     package_type,
+    package_only=False,
 ):
     """
     Yield PackageData for Installed Windows Programs from the Windows registry
@@ -238,6 +247,7 @@ def get_packages_from_registry_from_hive(
         datasource_id=datasource_id,
         package_type=package_type,
         registry_path='\\Microsoft\\Windows\\CurrentVersion\\Uninstall',
+        package_only=package_only,
     )
 
     yield from get_installed_windows_programs_from_hive(
@@ -245,6 +255,7 @@ def get_packages_from_registry_from_hive(
         datasource_id=datasource_id,
         package_type=package_type,
         registry_path='\\Wow6432Node\\Microsoft\\Windows\\CurrentVersion\\Uninstall',
+        package_only=package_only,
     )
 
     yield from get_installed_dotnet_versions_from_hive(
@@ -252,10 +263,11 @@ def get_packages_from_registry_from_hive(
         datasource_id=datasource_id,
         package_type=package_type,
         registry_path='\\Microsoft\\NET Framework Setup\\NDP',
+        package_only=package_only,
     )
 
 
-def get_installed_packages(root_dir, is_container=True):
+def get_installed_packages(root_dir, is_container=True, package_only=False):
     """
     Yield PackageData for Installed Windows Programs for every detected
     installed program from Windows registry hive files found in well known
@@ -280,7 +292,7 @@ def get_installed_packages(root_dir, is_container=True):
     for software_reg_loc, root_prefix in root_prefixes_by_software_reg_locations.items():
         if not os.path.exists(software_reg_loc):
             continue
-        for package in get_packages_from_registry_from_hive(software_reg_loc):
+        for package in get_packages_from_registry_from_hive(software_reg_loc, package_only):
             package.populate_installed_files(root_dir, root_prefix=root_prefix)
             yield package
 
@@ -342,11 +354,12 @@ class BaseRegInstalledProgramHandler(models.DatafileHandler):
     root_path_relative_to_datafile_path = None
 
     @classmethod
-    def parse(cls, location):
+    def parse(cls, location, package_only=False):
         yield from get_packages_from_registry_from_hive(
             location=location,
             datasource_id=cls.datasource_id,
             package_type=cls.default_package_type,
+            package_only=package_only,
         )
 
     @classmethod
diff --git a/src/packagedcode/windows.py b/src/packagedcode/windows.py
index 09d62f4c2d9..6fdb07815e3 100644
--- a/src/packagedcode/windows.py
+++ b/src/packagedcode/windows.py
@@ -20,7 +20,7 @@ class MicrosoftUpdateManifestHandler(models.NonAssemblableDatafileHandler):
     description = 'Microsoft Update Manifest .mum file'
 
     @classmethod
-    def parse(cls, location):
+    def parse(cls, location, package_only=False):
         with open(location , 'rb') as loc:
             parsed = xmltodict.parse(loc)
 
@@ -47,7 +47,7 @@ def parse(cls, location):
                 )
             )
 
-        yield models.PackageData(
+        package_data = dict(
             datasource_id=cls.datasource_id,
             type=cls.default_package_type,
             name=name,
@@ -57,3 +57,4 @@ def parse(cls, location):
             parties=parties,
             copyright=copyrght,
         )
+        yield models.PackageData.from_data(package_data, package_only)
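
The test and fixture updates that follow build expected results through the same from_data(..., package_only=True) call that the handlers now use, so that expectations and parse() output stay comparable. A hedged sketch of that pattern, reusing names and values from tests/packagedcode/test_build.py below (compare_package_results is the existing test helper referenced there):

    # Hedged sketch of the test pattern used in the hunks below; values are
    # illustrative and mirror the METADATA.bzl test fixtures.
    from packagedcode import build, models

    package_data = dict(
        datasource_id=build.BuckMetadataBzlHandler.datasource_id,
        type='github',
        name='example',
        version='0.0.1',
    )
    # package_only=True keeps the expected object free of derived license/holder
    # fields, matching what parse(..., package_only=True) yields.
    expected_packages = [models.PackageData.from_data(package_data=package_data, package_only=True)]
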
diff --git a/tests/packagedcode/data/build/buck/end2end-expected.json b/tests/packagedcode/data/build/buck/end2end-expected.json
index 70f7b321317..2b169491ec3 100644
--- a/tests/packagedcode/data/build/buck/end2end-expected.json
+++ b/tests/packagedcode/data/build/buck/end2end-expected.json
@@ -256,9 +256,7 @@
       "other_license_expression": null,
       "other_license_expression_spdx": null,
       "other_license_detections": [],
-      "extracted_license_statement": [
-        "LICENSE"
-      ],
+      "extracted_license_statement": "- LICENSE\n",
       "notice_text": null,
       "source_packages": [],
       "file_references": [],
diff --git a/tests/packagedcode/test_build.py b/tests/packagedcode/test_build.py
index 3d0f438e551..147a075b15b 100644
--- a/tests/packagedcode/test_build.py
+++ b/tests/packagedcode/test_build.py
@@ -85,48 +85,46 @@ def test_BuckPackage_recognize_with_license(self):
 
     def test_MetadataBzl_parse(self):
         test_file = self.get_test_loc('metadatabzl/METADATA.bzl')
-        result_packages = build.BuckMetadataBzlHandler.parse(test_file)
-        expected_packages = [
-            models.PackageData(
-                datasource_id=build.BuckMetadataBzlHandler.datasource_id,
-                type='github',
-                name='example',
-                version='0.0.1',
-                extracted_license_statement=['BSD-3-Clause'],
-                parties=[
-                    models.Party(
-                        type=models.party_org,
-                        name='oss_foundation',
-                        role='maintainer'
-                    )
-                ],
-                homepage_url='https://github.com/example/example',
-            ),
-        ]
+        result_packages = build.BuckMetadataBzlHandler.parse(test_file, package_only=True)
+        package_data = dict(
+            datasource_id=build.BuckMetadataBzlHandler.datasource_id,
+            type='github',
+            name='example',
+            version='0.0.1',
+            extracted_license_statement=['BSD-3-Clause'],
+            parties=[
+                models.Party(
+                    type=models.party_org,
+                    name='oss_foundation',
+                    role='maintainer'
+                )
+            ],
+            homepage_url='https://github.com/example/example',
+        )
+        expected_packages = [models.PackageData.from_data(package_data=package_data, package_only=True)]
         compare_package_results(expected_packages, result_packages)
 
     def test_MetadataBzl_recognize_new_format(self):
         test_file = self.get_test_loc('metadatabzl/new-format/METADATA.bzl')
-        result_packages = build.BuckMetadataBzlHandler.parse(test_file)
-        expected_packages = [
-            models.PackageData(
-                datasource_id=build.BuckMetadataBzlHandler.datasource_id,
-                type='github',
-                name='example/example',
-                version='0.0.1',
-                extracted_license_statement='BSD-3-Clause',
-                parties=[
-                    models.Party(
-                        type=models.party_org,
-                        name='example_org',
-                        role='maintainer'
-                    )
-                ],
-                download_url='',
-                sha1='',
-                homepage_url='https://github.com/example/example',
-                vcs_url='https://github.com/example/example.git',
-                extra_data=dict(vcs_commit_hash="deadbeef")
-            )
-        ]
+        result_packages = build.BuckMetadataBzlHandler.parse(test_file, package_only=True)
+        package_data = dict(
+            datasource_id=build.BuckMetadataBzlHandler.datasource_id,
+            type='github',
+            name='example/example',
+            version='0.0.1',
+            extracted_license_statement='BSD-3-Clause',
+            parties=[
+                models.Party(
+                    type=models.party_org,
+                    name='example_org',
+                    role='maintainer'
+                )
+            ],
+            download_url='',
+            sha1='',
+            homepage_url='https://github.com/example/example',
+            vcs_url='https://github.com/example/example.git',
+            extra_data=dict(vcs_commit_hash="deadbeef")
+        )
+        expected_packages = [models.PackageData.from_data(package_data=package_data, package_only=True)]
         compare_package_results(expected_packages, result_packages)
diff --git a/tests/packagedcode/test_package_models.py b/tests/packagedcode/test_package_models.py
index 98a3606093d..62369a9c115 100644
--- a/tests/packagedcode/test_package_models.py
+++ b/tests/packagedcode/test_package_models.py
@@ -76,7 +76,7 @@ def test_Package_creation_and_dump(self):
         assert list(pd.to_dict().items()) == expected
 
     def test_Package_simple(self):
-        package = PackageData(
+        package_mapping = dict(
             datasource_id = 'rpm_archive',
             type='rpm',
             name='Sample',
@@ -86,8 +86,9 @@ def test_Package_simple(self):
             vcs_url='git+https://somerepo.com/that.git',
             extracted_license_statement='apache-2.0',
         )
+        package_data = PackageData.from_data(package_data=package_mapping, package_only=False)
         expected_loc = 'models/simple-expected.json'
-        self.check_package_data(package, expected_loc, regen=REGEN_TEST_FIXTURES)
+        self.check_package_data(package_data, expected_loc, regen=REGEN_TEST_FIXTURES)
 
     def test_Package_model_qualifiers_are_serialized_as_mappings(self):
         package = models.PackageData(
@@ -108,7 +109,7 @@ def test_Package_model_qualifiers_are_converted_to_mappings(self):
         assert package.qualifiers == dict(this='that')
 
     def test_Package_full(self):
-        package = PackageData(
+        package_mapping = dict(
             type='rpm',
             datasource_id = 'rpm_archive',
             namespace='fedora',
@@ -136,8 +137,9 @@ def test_Package_full(self):
             notice_text='licensed under the apacche 2.0 \nlicense',
             source_packages=["pkg:maven/aspectj/aspectjtools@1.5.4?classifier=sources"],
         )
+        package_data = PackageData.from_data(package_data=package_mapping, package_only=False)
         expected_loc = 'models/full-expected.json'
-        self.check_package_data(package, expected_loc, regen=REGEN_TEST_FIXTURES)
+        self.check_package_data(package_data, expected_loc, regen=REGEN_TEST_FIXTURES)
 
     def test_package_data_datasource_id_are_unique(self):
         """
@@ -177,11 +179,12 @@ def test_package_data_file_patterns_are_tuples(self):
 
     def test_add_to_package(self):
         test_loc = self.get_test_loc('npm/electron')
-        test_package = models.Package(
+        test_package_data = dict(
             type='npm',
             name='electron',
             version='3.1.11',
         )
+        test_package = models.Package.from_data(test_package_data)
         test_package_uid = test_package.package_uid
         test_codebase = Codebase(
             location=test_loc,
@@ -241,12 +244,13 @@ def test_create_package_not_handled_by_packagedcode(self):
             'gpl',
             'GNU General Public License version 2.0 (GPLv2)',
         ]
-        package = PackageData(
+        package_mapping = dict(
             type='sourceforge',
             name='openstunts',
             copyright='Copyright (c) openstunts project',
             extracted_license_statement=extracted_license_statement,
        )
+        package = PackageData.from_data(package_data=package_mapping, package_only=False)
         # Test generated fields
         assert package.purl == 'pkg:sourceforge/openstunts'
         assert package.holder == 'openstunts project'
diff --git a/tests/scancode/data/help/help.txt b/tests/scancode/data/help/help.txt
index 3076354074c..340d674bdc5 100644
--- a/tests/scancode/data/help/help.txt
+++ b/tests/scancode/data/help/help.txt
@@ -12,6 +12,9 @@ Options:
   -p, --package                    Scan for application package and dependency
                                    manifests, lockfiles and related data.
   --system-package                 Scan for installed system package databases.
+  --package-only                   Only detect package information and skip license/copyright
+                                   detection steps, in application package and dependency
+                                   manifests, lockfiles and related data.
   -c, --copyright                  Scan for copyrights.
 
 other scans: