diff --git a/src/packagedcode/about.py b/src/packagedcode/about.py index 3f33c6838db..9f9d4238861 100644 --- a/src/packagedcode/about.py +++ b/src/packagedcode/about.py @@ -47,7 +47,7 @@ class AboutFileHandler(models.DatafileHandler): documentation_url = 'https://aboutcode-toolkit.readthedocs.io/en/latest/specification.html' @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): """ Yield one or more Package manifest objects given a file ``location`` pointing to a package archive, manifest or similar. @@ -71,6 +71,15 @@ def parse(cls, location): name = package_data.get('name') version = package_data.get('version') + if purl_only: + yield models.PackageData( + datasource_id=cls.datasource_id, + type=package_type, + namespace=package_ns, + name=name, + version=version, + ) + return homepage_url = package_data.get('home_url') or package_data.get('homepage_url') download_url = package_data.get('download_url') diff --git a/src/packagedcode/alpine.py b/src/packagedcode/alpine.py index 59223cf72da..58ebdfcc0e3 100644 --- a/src/packagedcode/alpine.py +++ b/src/packagedcode/alpine.py @@ -63,11 +63,12 @@ class AlpineInstalledDatabaseHandler(models.DatafileHandler): description = 'Alpine Linux installed package database' @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): yield from parse_alpine_installed_db( location=location, datasource_id=cls.datasource_id, package_type=cls.default_package_type, + purl_only=purl_only, ) @classmethod @@ -134,9 +135,10 @@ class AlpineApkbuildHandler(models.DatafileHandler): documentation_url = 'https://wiki.alpinelinux.org/wiki/APKBUILD_Reference' @classmethod - def parse(cls, location): - package_data = parse_apkbuild(location, strict=True) - cls.populate_license_fields(package_data) + def parse(cls, location, purl_only=False): + package_data = parse_apkbuild(location, strict=True, purl_only=purl_only) + if not purl_only: + cls.populate_license_fields(package_data) if package_data: yield package_data @@ -165,7 +167,12 @@ def assign_package_to_resources(cls, package, resource, codebase, package_adder) ) -def parse_alpine_installed_db(location, datasource_id, package_type): +def parse_alpine_installed_db( + location, + datasource_id, + package_type, + purl_only=False, + ): """ Yield PackageData objects from an installed database file at `location` or None. Typically found at '/lib/apk/db/installed' in an Alpine @@ -179,6 +186,7 @@ def parse_alpine_installed_db(location, datasource_id, package_type): package_fields=package_fields, datasource_id=datasource_id, package_type=package_type, + purl_only=purl_only, ) @@ -241,7 +249,7 @@ def get_alpine_installed_db_fields(location): ]) -def parse_apkbuild(location, strict=False): +def parse_apkbuild(location, strict=False, purl_only=False): """ Return a PackageData object from an APKBUILD file at ``location`` or None. @@ -256,6 +264,7 @@ def parse_apkbuild(location, strict=False): datasource_id=AlpineApkbuildHandler.datasource_id, package_type=AlpineApkbuildHandler.default_package_type, strict=strict, + purl_only=purl_only, ) @@ -732,7 +741,13 @@ def fix_apkbuild(text): return text -def parse_apkbuild_text(text, datasource_id, package_type, strict=False): +def parse_apkbuild_text( + text, + datasource_id, + package_type, + strict=False, + purl_only=False + ): """ Return a PackageData object from an APKBUILD text context or None. Only consider variables with a name listed in the ``names`` set. 
@@ -761,7 +776,8 @@ def parse_apkbuild_text(text, datasource_id, package_type, strict=False): package = build_package_data( variables, datasource_id=datasource_id, - package_type=package_type + package_type=package_type, + purl_only=purl_only, ) if package and unresolved: @@ -800,7 +816,12 @@ def parse_pkginfo(location): raise NotImplementedError -def build_package_data(package_fields, datasource_id, package_type): +def build_package_data( + package_fields, + datasource_id, + package_type, + purl_only=False + ): """ Return a PackageData object from a ``package_fields`` iterable of (name, value) tuples. @@ -832,10 +853,17 @@ def build_package_data(package_fields, datasource_id, package_type): 'type': package_type, } for name, value in package_fields: - handler = package_handlers_by_field_name.get(name) + handler = package_handlers_by_field_name_purl_only.get(name) + if not purl_only and not handler: + handler = package_handlers_by_field_name_others.get(name) + if handler: try: - converted = handler(value, all_fields=all_fields, **converted_fields) + converted = handler( + value, + all_fields=all_fields, + **converted_fields + ) except: raise Exception(*list(package_fields)) @@ -1199,11 +1227,11 @@ def source_handler(value, **kwargs): # mapping of: # - the package field one letter name in the installed db, # - an handler for that field -package_handlers_by_field_name = { +package_handlers_by_field_name_purl_only = { - ############################################################################ - # per-package fields - ############################################################################ + ########################################################################### + # per-package fields (only purl fields) + ########################################################################### # name of the package # For example: P:busybox @@ -1218,6 +1246,22 @@ def source_handler(value, **kwargs): 'V': build_name_value_str_handler('version'), 'pkgver': apkbuild_version_handler, + # For example: D:scanelf so:libc.musl-x86_64.so.1 + # For example: D:so:libc.musl-x86_64.so.1 so:libcrypto.so.1.1 so:libssl.so.1.1 so:libz.so.1 + # Can occur more than once + # 'depend' in .PKGINFO and APKBUILD + # TODO: add other dependencies (e.g. makedepends) + 'D': D_dependencies_handler, + 'depend': D_dependencies_handler, +} + + +package_handlers_by_field_name_others = { + + ########################################################################### + # per-package fields (other than purls) + ########################################################################### + # For example: T:Size optimized toolbox of many common UNIX utilities # 'pkgdesc' in .PKGINFO and APKBUILD 'T': build_name_value_str_handler('description'), @@ -1272,14 +1316,6 @@ def source_handler(value, **kwargs): 'c': c_git_commit_handler, 'commit': c_git_commit_handler, - # For example: D:scanelf so:libc.musl-x86_64.so.1 - # For example: D:so:libc.musl-x86_64.so.1 so:libcrypto.so.1.1 so:libssl.so.1.1 so:libz.so.1 - # Can occur more than once - # 'depend' in .PKGINFO and APKBUILD - # TODO: add other dependencies (e.g. 
makedepends) - 'D': D_dependencies_handler, - 'depend': D_dependencies_handler, - # For example: source="http://liba52.sourceforge.net/files/$pkgname-$pkgver.tar.gz # automake.patch # fix-globals-test-x86-pie.patch" diff --git a/src/packagedcode/bower.py b/src/packagedcode/bower.py index fe8c197cc87..d96f540a9e2 100644 --- a/src/packagedcode/bower.py +++ b/src/packagedcode/bower.py @@ -25,42 +25,13 @@ class BowerJsonHandler(models.DatafileHandler): documentation_url = 'https://bower.io' @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): with io.open(location, encoding='utf-8') as loc: package_data = json.load(loc) # note: having no name is not a problem for private packages. See #1514 name = package_data.get('name') - - description = package_data.get('description') version = package_data.get('version') - extracted_license_statement = package_data.get('license') - keywords = package_data.get('keywords') or [] - - parties = [] - - authors = package_data.get('authors') or [] - for author in authors: - if isinstance(author, dict): - name = author.get('name') - email = author.get('email') - url = author.get('homepage') - party = models.Party(name=name, role='author', email=email, url=url) - parties.append(party) - elif isinstance(author, str): - parties.append(models.Party(name=author, role='author')) - else: - parties.append(models.Party(name=repr(author), role='author')) - - homepage_url = package_data.get('homepage') - - repository = package_data.get('repository') or {} - repo_type = repository.get('type') - repo_url = repository.get('url') - - vcs_url = None - if repo_type and repo_url: - vcs_url = f'{repo_type}+{repo_url}' deps = package_data.get('dependencies') or {} dependencies = [] @@ -86,17 +57,52 @@ def parse(cls, location): is_optional=True, ) ) - - yield models.PackageData( + + pkg = models.PackageData( datasource_id=cls.datasource_id, type=cls.default_package_type, name=name, - description=description, version=version, - extracted_license_statement=extracted_license_statement, - keywords=keywords, - parties=parties, - homepage_url=homepage_url, - vcs_url=vcs_url, - dependencies=dependencies + dependencies=dependencies, ) + if purl_only: + yield pkg + return + + description = package_data.get('description') + extracted_license_statement = package_data.get('license') + keywords = package_data.get('keywords') or [] + + parties = [] + + authors = package_data.get('authors') or [] + for author in authors: + if isinstance(author, dict): + name = author.get('name') + email = author.get('email') + url = author.get('homepage') + party = models.Party(name=name, role='author', email=email, url=url) + parties.append(party) + elif isinstance(author, str): + parties.append(models.Party(name=author, role='author')) + else: + parties.append(models.Party(name=repr(author), role='author')) + + homepage_url = package_data.get('homepage') + + repository = package_data.get('repository') or {} + repo_type = repository.get('type') + repo_url = repository.get('url') + + vcs_url = None + if repo_type and repo_url: + vcs_url = f'{repo_type}+{repo_url}' + + pkg.description = description + pkg.primary_language = BowerJsonHandler.default_primary_language + pkg.extracted_license_statement = extracted_license_statement + pkg.keywords = keywords + pkg.parties = parties + pkg.homepage_url = homepage_url + pkg.vcs_url = vcs_url + yield pkg diff --git a/src/packagedcode/build.py b/src/packagedcode/build.py index 7657f7c64b3..768fea3ea93 100644 --- a/src/packagedcode/build.py +++ 
b/src/packagedcode/build.py @@ -55,7 +55,7 @@ class AutotoolsConfigureHandler(models.DatafileHandler): documentation_url = 'https://www.gnu.org/software/automake/' @classmethod - def parse(cls, location): + def parse(cls, location, **kwargs): # we use the parent directory as a package name name = fileutils.file_name(fileutils.parent_directory(location)) # we could use checksums as version in the future @@ -143,7 +143,7 @@ def assemble(cls, package_data, resource, codebase, package_adder): yield resource @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): # Thanks to Starlark being a Python dialect, we can use `ast` to parse it with open(location, 'rb') as f: @@ -191,18 +191,20 @@ def parse(cls, location): if not name: continue - license_files = args.get('licenses') - - if TRACE: - logger_debug(f"build: parse: license_files: {license_files}") - package_data = models.PackageData( datasource_id=cls.datasource_id, type=cls.default_package_type, name=name, ) - package_data.extracted_license_statement = license_files + if not purl_only: + license_files = args.get('licenses') + package_data.extracted_license_statement = license_files + if TRACE: + logger_debug( + f"build: parse: license_files: {license_files}" + ) + yield package_data else: @@ -334,7 +336,7 @@ class BuckMetadataBzlHandler(BaseStarlarkManifestHandler): documentation_url = 'https://buck.build/' @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): with open(location, 'rb') as f: tree = ast.parse(f.read()) @@ -366,15 +368,16 @@ def parse(cls, location): metadata_fields[key_name] = value parties = [] - maintainers = metadata_fields.get('maintainers', []) or [] - for maintainer in maintainers: - parties.append( - models.Party( - type=models.party_org, - name=maintainer, - role='maintainer', + if not purl_only: + maintainers = metadata_fields.get('maintainers', []) or [] + for maintainer in maintainers: + parties.append( + models.Party( + type=models.party_org, + name=maintainer, + role='maintainer', + ) ) - ) if ( 'upstream_type' @@ -386,16 +389,19 @@ def parse(cls, location): ): # TODO: Create function that determines package type from download URL, # then create a package of that package type from the metadata info - yield models.PackageData( + + pkg = models.PackageData( datasource_id=cls.datasource_id, type=metadata_fields.get('upstream_type', cls.default_package_type), name=metadata_fields.get('name'), version=metadata_fields.get('version'), - extracted_license_statement=metadata_fields.get('licenses', []), - parties=parties, - homepage_url=metadata_fields.get('upstream_address', ''), # TODO: Store 'upstream_hash` somewhere ) + if not purl_only: + pkg.extracted_license_statement = metadata_fields.get('licenses', []) + pkg.parties = parties + pkg.homepage_url = metadata_fields.get('upstream_address', '') + yield pkg if ( 'package_type' @@ -409,19 +415,20 @@ def parse(cls, location): and 'vcs_commit_hash' in metadata_fields ): - yield models.PackageData( + pkg = models.PackageData( datasource_id=cls.datasource_id, type=metadata_fields.get('package_type', cls.default_package_type), name=metadata_fields.get('name'), version=metadata_fields.get('version'), - extracted_license_statement=metadata_fields.get('license_expression', ''), - parties=parties, - homepage_url=metadata_fields.get('homepage_url', ''), - download_url=metadata_fields.get('download_url', ''), - vcs_url=metadata_fields.get('vcs_url', ''), - sha1=metadata_fields.get('download_archive_sha1', ''), - 
extra_data=dict(vcs_commit_hash=metadata_fields.get('vcs_commit_hash', '')) ) + pkg.extracted_license_statement = metadata_fields.get('license_expression', '') + pkg.parties = parties + pkg.homepage_url = metadata_fields.get('homepage_url', '') + pkg.download_url = metadata_fields.get('download_url', '') + pkg.vcs_url = metadata_fields.get('vcs_url', '') + pkg.sha1 = metadata_fields.get('download_archive_sha1', '') + pkg.extra_data = dict(vcs_commit_hash=metadata_fields.get('vcs_commit_hash', '')) + yield pkg @classmethod def assign_package_to_resources(cls, package, resource, codebase, package_adder): diff --git a/src/packagedcode/build_gradle.py b/src/packagedcode/build_gradle.py index 308abe5bc62..51669a4b962 100644 --- a/src/packagedcode/build_gradle.py +++ b/src/packagedcode/build_gradle.py @@ -59,9 +59,9 @@ class BuildGradleHandler(models.DatafileHandler): description = 'Gradle build script' @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): dependencies = get_dependencies(location) - return build_package(cls, dependencies) + return build_package(cls, dependencies, purl_only=purl_only) # TODO: handle complex cases of nested builds with many packages @classmethod @@ -328,7 +328,7 @@ def get_dependencies(build_gradle_location): return list(get_dependencies_from_parse_tree(parse_tree)) -def build_package(cls, dependencies): +def build_package(cls, dependencies, purl_only=False): """ Yield PackageData from a ``dependencies`` list of mappings. """ @@ -364,10 +364,12 @@ def build_package(cls, dependencies): ) ) - yield models.PackageData( + + pkg = models.PackageData( datasource_id=cls.datasource_id, type=cls.default_package_type, - primary_language=BuildGradleHandler.default_primary_language, dependencies=package_dependencies, ) - + if purl_only: + pkg.primary_language = BuildGradleHandler.default_primary_language + yield pkg \ No newline at end of file diff --git a/src/packagedcode/cargo.py b/src/packagedcode/cargo.py index ffe8dea5159..a6ddbd9a382 100644 --- a/src/packagedcode/cargo.py +++ b/src/packagedcode/cargo.py @@ -29,13 +29,31 @@ class CargoTomlHandler(models.DatafileHandler): documentation_url = 'https://doc.rust-lang.org/cargo/reference/manifest.html' @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): package_data = toml.load(location, _dict=dict) core_package_data = package_data.get('package', {}) name = core_package_data.get('name') version = core_package_data.get('version') + + # cargo dependencies are complex and can be overriden at multiple levels + dependencies = [] + for key, value in core_package_data.items(): + if key.endswith('dependencies'): + dependencies.extend(dependency_mapper(dependencies=value, scope=key)) + + pkg = models.PackageData( + datasource_id=cls.datasource_id, + type=cls.default_package_type, + name=name, + version=version, + dependencies=dependencies, + ) + if purl_only: + yield pkg + return + description = core_package_data.get('description') or '' description = description.strip() @@ -50,12 +68,6 @@ def parse(cls, location): categories = core_package_data.get('categories') or [] keywords.extend(categories) - # cargo dependencies are complex and can be overriden at multiple levels - dependencies = [] - for key, value in core_package_data.items(): - if key.endswith('dependencies'): - dependencies.extend(dependency_mapper(dependencies=value, scope=key)) - # TODO: add file refs: # - readme, include and exclude # TODO: other URLs @@ -67,22 +79,17 @@ def parse(cls, location): 
repository_download_url = name and version and f'https://crates.io/api/v1/crates/{name}/{version}/download' api_data_url = name and f'https://crates.io/api/v1/crates/{name}' - yield models.PackageData( - datasource_id=cls.datasource_id, - type=cls.default_package_type, - name=name, - version=version, - primary_language=cls.default_primary_language, - description=description, - parties=parties, - extracted_license_statement=extracted_license_statement, - vcs_url=vcs_url, - homepage_url=homepage_url, - repository_homepage_url=repository_homepage_url, - repository_download_url=repository_download_url, - api_data_url=api_data_url, - dependencies=dependencies, - ) + pkg.primary_language = cls.default_primary_language + pkg.description = description + pkg.parties = parties + pkg.extracted_license_statement = extracted_license_statement + pkg.vcs_url = vcs_url + pkg.homepage_url = homepage_url + pkg.repository_homepage_url = repository_homepage_url + pkg.repository_download_url = repository_download_url + pkg.api_data_url = api_data_url + pkg.dependencies = dependencies + yield pkg @classmethod def assemble(cls, package_data, resource, codebase, package_adder): @@ -116,7 +123,7 @@ class CargoLockHandler(models.DatafileHandler): # ] @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): cargo_lock = toml.load(location, _dict=dict) dependencies = [] package = cargo_lock.get('package', []) @@ -137,12 +144,14 @@ def parse(cls, location): ) ) - yield models.PackageData( + pkg = models.PackageData( datasource_id=cls.datasource_id, type=cls.default_package_type, - primary_language=cls.default_primary_language, dependencies=dependencies, ) + if not purl_only: + pkg.primary_language = cls.default_primary_language + yield pkg @classmethod def assemble(cls, package_data, resource, codebase, package_adder): diff --git a/src/packagedcode/chef.py b/src/packagedcode/chef.py index 5c378cdce4e..b5319ddd341 100644 --- a/src/packagedcode/chef.py +++ b/src/packagedcode/chef.py @@ -183,14 +183,18 @@ def is_datafile(cls, location, filetypes=tuple()): return not parent.endswith('dist-info') @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): """ Yield one or more Package manifest objects given a file ``location`` pointing to a package archive, manifest or similar. """ with io.open(location, encoding='utf-8') as loc: package_data = json.load(loc) - yield build_package(package_data, datasource_id=cls.datasource_id) + yield build_package( + package_data, + datasource_id=cls.datasource_id, + purl_only=purl_only, + ) class ChefMetadataRbHandler(BaseChefMetadataHandler): @@ -202,7 +206,7 @@ class ChefMetadataRbHandler(BaseChefMetadataHandler): documentation_url = 'https://docs.chef.io/config_rb_metadata/' @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): with io.open(location, encoding='utf-8') as loc: file_contents = loc.read() @@ -213,10 +217,14 @@ def parse(cls, location): ChefMetadataFormatter() ) package_data = json.loads(formatted_file_contents) - yield build_package(package_data, datasource_id=cls.datasource_id) + yield build_package( + package_data, + datasource_id=cls.datasource_id, + purl_only=purl_only, + ) -def build_package(package_data, datasource_id): +def build_package(package_data, datasource_id, purl_only=False): """ Return a PackageData object from a package_data mapping from a metadata.json or similar or None. 
@@ -224,6 +232,32 @@ def build_package(package_data, datasource_id): name = package_data.get('name') version = package_data.get('version') + deps = dict(package_data.get('dependencies', {}) or {}) + deps.update(package_data.get('depends', {}) or {}) + + dependencies = [] + for dependency_name, requirement in deps.items(): + dependencies.append( + models.DependentPackage( + purl=PackageURL(type='chef', name=dependency_name).to_string(), + scope='dependencies', + extracted_requirement=requirement, + is_runtime=True, + is_optional=False, + ) + ) + + pkg = models.PackageData( + datasource_id=datasource_id, + type=ChefMetadataJsonHandler.default_package_type, + name=name, + version=version, + dependencies=dependencies, + ) + if purl_only: + yield pkg + return + maintainer_name = package_data.get('maintainer', '') maintainer_email = package_data.get('maintainer_email', '') parties = [] @@ -246,32 +280,16 @@ def build_package(package_data, datasource_id): code_view_url = package_data.get('source_url', '') bug_tracking_url = package_data.get('issues_url', '') - deps = dict(package_data.get('dependencies', {}) or {}) - deps.update(package_data.get('depends', {}) or {}) - - dependencies = [] - for dependency_name, requirement in deps.items(): - dependencies.append( - models.DependentPackage( - purl=PackageURL(type='chef', name=dependency_name).to_string(), - scope='dependencies', - extracted_requirement=requirement, - is_runtime=True, - is_optional=False, - ) - ) - - return models.PackageData( - datasource_id=datasource_id, - type=ChefMetadataJsonHandler.default_package_type, - name=name, - version=version, - parties=parties, - description=description.strip() or None, - extracted_license_statement=extracted_license_statement, - code_view_url=code_view_url.strip() or None, - bug_tracking_url=bug_tracking_url.strip() or None, - dependencies=dependencies, - primary_language='Ruby', - **get_urls(name, version), - ) + pkg.parties = parties + pkg.description = description.strip() or None + pkg.extracted_license_statement = extracted_license_statement + pkg.code_view_url = code_view_url.strip() or None + pkg.bug_tracking_url = bug_tracking_url.strip() or None + pkg.primary_language = 'Ruby' + ( + pkg.download_url, + pkg.repository_download_url, + pkg.repository_homepage_url, + pkg.api_data_url + ) = get_urls(name, version) + yield pkg diff --git a/src/packagedcode/cocoapods.py b/src/packagedcode/cocoapods.py index 9773ae13638..a0a9c3fd8c0 100644 --- a/src/packagedcode/cocoapods.py +++ b/src/packagedcode/cocoapods.py @@ -216,7 +216,7 @@ class PodspecHandler(BasePodHandler): documentation_url = 'https://guides.cocoapods.org/syntax/podspec.html' @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): """ Yield one or more Package manifest objects given a file ``location`` pointing to a package archive, manifest or similar. 
@@ -224,10 +224,24 @@ def parse(cls, location): podspec = spec.parse_spec( location=location, package_type=cls.default_package_type, + purl_only=purl_only, ) name = podspec.get('name') version = podspec.get('version') + dependencies = podspec.get('dependencies') + + pkg = models.PackageData( + datasource_id=cls.datasource_id, + type=cls.default_package_type, + name=name, + version=version, + dependencies=dependencies, + ) + if purl_only: + yield pkg + return + homepage_url = podspec.get('homepage') extracted_license_statement = podspec.get('license') summary = podspec.get('summary') @@ -252,27 +266,27 @@ def parse(cls, location): ) parties.append(party) - urls = get_urls( + ( + pkg.repository_download_url, + pkg.repository_homepage_url, + pkg.code_view_url, + pkg.bug_tracking_url, + pkg.api_data_url, + ) = get_urls( name=name, version=version, homepage_url=homepage_url, - vcs_url=vcs_url) - - yield models.PackageData( - datasource_id=cls.datasource_id, - type=cls.default_package_type, - name=name, - version=version, - primary_language=cls.default_primary_language, vcs_url=vcs_url, - # FIXME: a source should be a PURL, not a list of URLs - # source_packages=vcs_url.split('\n'), - description=description, - extracted_license_statement=extracted_license_statement, - homepage_url=homepage_url, - parties=parties, - **urls, ) + pkg.primary_language = cls.default_primary_language + pkg.vcs_url = vcs_url + # FIXME: a source should be a PURL, not a list of URLs + # source_packages=vcs_url.split('\n'), + pkg.description = description + pkg.extracted_license_statement = extracted_license_statement + pkg.homepage_url = homepage_url + pkg.parties = parties + yield pkg class PodfileHandler(PodspecHandler): @@ -293,7 +307,7 @@ class PodfileLockHandler(BasePodHandler): documentation_url = 'https://guides.cocoapods.org/using/the-podfile.html' @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): """ Yield PackageData from a YAML Podfile.lock. 
""" @@ -337,12 +351,14 @@ def parse(cls, location): ) ) - yield models.PackageData( + pkg = models.PackageData( datasource_id=cls.datasource_id, type=cls.default_package_type, - primary_language=cls.default_primary_language, dependencies=dependencies, ) + if not purl_only: + pkg.primary_language = cls.default_primary_language + yield pkg class PodspecJsonHandler(models.DatafileHandler): @@ -354,12 +370,27 @@ class PodspecJsonHandler(models.DatafileHandler): documentation_url = 'https://guides.cocoapods.org/syntax/podspec.html' @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): with open(location) as psj: data = json.load(psj) name = data.get('name') version = data.get('version') + dependencies = data.get('dependencies', '') + if dependencies: + extra_data['dependencies'] = dependencies + + pkg = models.PackageData( + datasource_id=cls.datasource_id, + primary_language=cls.default_primary_language, + type=cls.default_package_type, + name=name, + version=version, + ) + if purl_only: + yield pkg + return + summary = data.get('summary', '') description = data.get('description', '') homepage_url = data.get('homepage') @@ -414,29 +445,25 @@ def parse(cls, location): extra_data = {} extra_data['source'] = data['source'] - dependencies = data.get('dependencies', '') - if dependencies: - extra_data['dependencies'] = dependencies extra_data['podspec.json'] = data - urls = get_urls( - name=name, - version=version, homepage_url=homepage_url, vcs_url=vcs_url) - - yield models.PackageData( - datasource_id=cls.datasource_id, - primary_language=cls.default_primary_language, - type=cls.default_package_type, + ( + pkg.repository_download_url, + pkg.repository_homepage_url, + pkg.code_view_url, + pkg.bug_tracking_url, + pkg.api_data_url, + ) = get_urls( name=name, version=version, - description=description, - extracted_license_statement=extracted_license_statement, - parties=parties, - vcs_url=vcs_url, homepage_url=homepage_url, - download_url=download_url, - **urls, + vcs_url=vcs_url, ) + pkg.primary_language = cls.default_primary_language + pkg.description = description + pkg.extracted_license_statement = extracted_license_statement + pkg.parties = parties + yield pkg def get_urls(name=None, version=None, homepage_url=None, vcs_url=None, **kwargs): diff --git a/src/packagedcode/conda.py b/src/packagedcode/conda.py index 545b3a1fb53..1715e20121c 100644 --- a/src/packagedcode/conda.py +++ b/src/packagedcode/conda.py @@ -79,7 +79,7 @@ def assign_package_to_resources(cls, package, resource, codebase, package_adder) ) @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): metayaml = get_meta_yaml_data(location) package_element = metayaml.get('package') or {} package_name = package_element.get('name') @@ -87,17 +87,6 @@ def parse(cls, location): return version = package_element.get('version') - # FIXME: source is source, not download - source = metayaml.get('source') or {} - download_url = source.get('url') - sha256 = source.get('sha256') - - about = metayaml.get('about') or {} - homepage_url = about.get('home') - extracted_license_statement = about.get('license') - description = about.get('summary') - vcs_url = about.get('dev_url') - dependencies = [] requirements = metayaml.get('requirements') or {} for scope, reqs in requirements.items(): @@ -118,20 +107,36 @@ def parse(cls, location): ) ) - yield models.PackageData( + pkg = models.PackageData( datasource_id=cls.datasource_id, type=cls.default_package_type, name=package_name, version=version, - 
download_url=download_url, - homepage_url=homepage_url, - vcs_url=vcs_url, - description=description, - sha256=sha256, - extracted_license_statement=extracted_license_statement, dependencies=dependencies, ) + if purl_only: + yield pkg + return + + # FIXME: source is source, not download + source = metayaml.get('source') or {} + download_url = source.get('url') + sha256 = source.get('sha256') + + about = metayaml.get('about') or {} + homepage_url = about.get('home') + extracted_license_statement = about.get('license') + description = about.get('summary') + vcs_url = about.get('dev_url') + + pkg.download_url = download_url + pkg.homepage_url = homepage_url + pkg.vcs_url = vcs_url + pkg.description = description + pkg.sha256 = sha256 + pkg.extracted_license_statement = extracted_license_statement + yield pkg def get_meta_yaml_data(location): """ diff --git a/src/packagedcode/cran.py b/src/packagedcode/cran.py index ef274105da6..ea68af26467 100644 --- a/src/packagedcode/cran.py +++ b/src/packagedcode/cran.py @@ -30,38 +30,12 @@ class CranDescriptionFileHandler(models.DatafileHandler): documentation_url = 'https://r-pkgs.org/description.html' @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): cran_desc = get_cran_description(location) - name = cran_desc.get('Package') if not name: return - parties = [] - maintainers = cran_desc.get('Maintainer') or '' - for maintainer in maintainers.split(',\n'): - maintainer_name, maintainer_email = get_party_info(maintainer) - if maintainer_name or maintainer_email: - parties.append( - models.Party( - name=maintainer_name, - role='maintainer', - email=maintainer_email, - ) - ) - - authors = cran_desc.get('Author') or '' - for author in authors.split(',\n'): - author_name, author_email = get_party_info(author) - if author_name or author_email: - parties.append( - models.Party( - name=author_name, - role='author', - email=author_email, - ) - ) - package_dependencies = [] dependencies = cran_desc.get('Depends') or '' for dependency in dependencies.split(',\n'): @@ -88,23 +62,53 @@ def parse(cls, location): ) ) - extracted_license_statement = cran_desc.get('License') - - # TODO: Let's handle the release date as a Date type - # release_date = cran_desc.get('Date/Publication'), - - yield models.PackageData( + pkg = models.PackageData( datasource_id=cls.datasource_id, type=cls.default_package_type, name=name, version=cran_desc.get('Version'), - # TODO: combine both together - description=cran_desc.get('Description', '') or cran_desc.get('Title', ''), - extracted_license_statement=extracted_license_statement, - parties=parties, dependencies=package_dependencies, - repository_homepage_url=f'https://cran.r-project.org/package={name}', ) + if purl_only: + yield pkg + return + + parties = [] + maintainers = cran_desc.get('Maintainer') or '' + for maintainer in maintainers.split(',\n'): + maintainer_name, maintainer_email = get_party_info(maintainer) + if maintainer_name or maintainer_email: + parties.append( + models.Party( + name=maintainer_name, + role='maintainer', + email=maintainer_email, + ) + ) + + authors = cran_desc.get('Author') or '' + for author in authors.split(',\n'): + author_name, author_email = get_party_info(author) + if author_name or author_email: + parties.append( + models.Party( + name=author_name, + role='author', + email=author_email, + ) + ) + + extracted_license_statement = cran_desc.get('License') + + # TODO: Let's handle the release date as a Date type + # release_date = cran_desc.get('Date/Publication'), + + 
# TODO: combine both together + pkg.description = cran_desc.get('Description', '') or cran_desc.get('Title', '') + pkg.extracted_license_statement = extracted_license_statement + pkg.parties = parties + pkg.repository_homepage_url = f'https://cran.r-project.org/package={name}' + yield pkg # FIXME: THIS IS NOT YAML but RFC 822 diff --git a/src/packagedcode/debian.py b/src/packagedcode/debian.py index f84fc9e9afe..37321454069 100644 --- a/src/packagedcode/debian.py +++ b/src/packagedcode/debian.py @@ -59,11 +59,12 @@ class DebianDebPackageHandler(models.DatafileHandler): documentation_url = 'https://manpages.debian.org/unstable/dpkg-dev/deb.5.en.html' @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): yield build_package_data_from_package_filename( filename=fileutils.file_name(location), datasource_id=cls.datasource_id, package_type=cls.default_package_type, + purl_only=purl_only, ) @classmethod @@ -82,13 +83,14 @@ class DebianSourcePackageMetadataTarballHandler(models.DatafileHandler): documentation_url = 'https://manpages.debian.org/unstable/dpkg-dev/deb.5.en.html' @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): # strip extension filename, _, _ = location.rpartition('.tar') yield build_package_data_from_package_filename( filename=filename, datasource_id=cls.datasource_id, package_type=cls.default_package_type, + purl_only=purl_only, ) @classmethod @@ -107,13 +109,14 @@ class DebianSourcePackageTarballHandler(models.DatafileHandler): documentation_url = 'https://manpages.debian.org/unstable/dpkg-dev/deb.5.en.html' @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): # strip extension filename, _, _ = location.rpartition('.tar') yield build_package_data_from_package_filename( filename=filename, datasource_id=cls.datasource_id, package_type=cls.default_package_type, + purl_only=purl_only, ) @classmethod @@ -131,12 +134,13 @@ class DebianControlFileInExtractedDebHandler(models.DatafileHandler): documentation_url = 'https://www.debian.org/doc/debian-policy/ch-controlfields.html' @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): # TODO: we cannot know the distro from the name only yield build_package_data( debian_data=get_paragraph_data_from_file(location=location), datasource_id=cls.datasource_id, package_type=cls.default_package_type, + purl_only=purl_only, ) @classmethod @@ -156,7 +160,7 @@ class DebianControlFileInSourceHandler(models.DatafileHandler): documentation_url = 'https://www.debian.org/doc/debian-policy/ch-controlfields.html' @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): # TODO: we cannot know the distro from the name only # NOTE: a control file in a source repo or debina.tar tarball can contain more than one package for debian_data in get_paragraphs_data_from_file(location=location): @@ -164,6 +168,7 @@ def parse(cls, location): debian_data, datasource_id=cls.datasource_id, package_type=cls.default_package_type, + purl_only=purl_only, ) @classmethod @@ -185,7 +190,7 @@ class DebianDscFileHandler(models.DatafileHandler): documentation_url = 'https://wiki.debian.org/dsc' @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): # this is typically signed debian_data = get_paragraph_data_from_file( location=location, @@ -195,6 +200,7 @@ def parse(cls, location): debian_data=debian_data, datasource_id=cls.datasource_id, package_type=cls.default_package_type, + 
purl_only=purl_only, ) @classmethod @@ -211,7 +217,7 @@ class DebianInstalledStatusDatabaseHandler(models.DatafileHandler): documentation_url = 'https://www.debian.org/doc/debian-policy/ch-controlfields.html' @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): # note that we do not know yet the distro at this stage # we could get it... but we get that later during assemble() for debian_data in get_paragraphs_data_from_file(location): @@ -219,6 +225,7 @@ def parse(cls, location): debian_data, datasource_id=cls.datasource_id, package_type=cls.default_package_type, + purl_only=purl_only, ) @classmethod @@ -372,7 +379,7 @@ class DebianDistrolessInstalledDatabaseHandler(models.DatafileHandler): documentation_url = 'https://www.debian.org/doc/debian-policy/ch-controlfields.html' @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): """ Yield installed PackageData objects given a ``location`` var/lib/dpkg/status.d/ file as found in a distroless container @@ -385,6 +392,7 @@ def parse(cls, location): datasource_id=cls.datasource_id, package_type=cls.default_package_type, distro='distroless', + purl_only=purl_only, ) @classmethod @@ -451,11 +459,12 @@ class DebianInstalledFilelistHandler(models.DatafileHandler): description = 'Debian installed file paths list' @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): return parse_debian_files_list( location=location, datasource_id=cls.datasource_id, package_type=cls.default_package_type, + purl_only=purl_only, ) @classmethod @@ -477,11 +486,12 @@ class DebianInstalledMd5sumFilelistHandler(models.DatafileHandler): documentation_url = 'https://www.debian.org/doc/manuals/debian-handbook/sect.package-meta-information.en.html#sect.configuration-scripts' @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): return parse_debian_files_list( location=location, datasource_id=cls.datasource_id, package_type=cls.default_package_type, + purl_only=purl_only, ) @classmethod @@ -502,11 +512,12 @@ class DebianMd5sumFilelistInPackageHandler(models.DatafileHandler): documentation_url = 'https://www.debian.org/doc/manuals/debian-handbook/sect.package-meta-information.en.html#sect.configuration-scripts' @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): return parse_debian_files_list( location=location, datasource_id=cls.datasource_id, package_type=cls.default_package_type, + purl_only=purl_only, ) @classmethod @@ -517,7 +528,12 @@ def assign_package_to_resources(cls, package, resource, codebase, package_adder) return models.DatafileHandler.assign_package_to_resources(package, root, codebase, package_adder) -def build_package_data_from_package_filename(filename, datasource_id, package_type): +def build_package_data_from_package_filename( + filename, + datasource_id, + package_type, + **kwargs, +): """ Return a PackageData built from the filename of a Debian package archive. """ @@ -543,7 +559,12 @@ def build_package_data_from_package_filename(filename, datasource_id, package_ty ) -def parse_debian_files_list(location, datasource_id, package_type): +def parse_debian_files_list( + location, + datasource_id, + package_type, + purl_only=False, +): """ Yield PackageData from a list of file paths at locations such as an from a Debian installed .list or .md5sums file. 
@@ -578,16 +599,26 @@ def parse_debian_files_list(location, datasource_id, package_type): if not file_references: return - yield models.PackageData( + package = models.PackageData( datasource_id=datasource_id, type=package_type, name=name, qualifiers=qualifiers, - file_references=file_references, ) + if purl_only: + yield package + else: + package.file_references = file_references + yield package -def build_package_data(debian_data, datasource_id, package_type='deb', distro=None): +def build_package_data( + debian_data, + datasource_id, + package_type='deb', + distro=None, + purl_only=False, +): """ Return a PackageData object from a package_data mapping (from a dpkg status or similar file) or None. @@ -600,6 +631,17 @@ def build_package_data(debian_data, datasource_id, package_type='deb', distro=No if architecture: qualifiers['architecture'] = architecture + package = models.PackageData( + datasource_id=datasource_id, + type=package_type, + namespace=distro, + name=name, + version=version, + qualifiers=qualifiers, + ) + if purl_only: + return package + extra_data = {} # Multi-Arch can be: "foreign", "same", "allowed", "all", "optional" or # empty/non-present. See https://wiki.debian.org/Multiarch/HOWTO @@ -639,21 +681,14 @@ def build_package_data(debian_data, datasource_id, package_type='deb', distro=No source_packages.append(source_pkg_purl) - return models.PackageData( - datasource_id=datasource_id, - type=package_type, - namespace=distro, - name=name, - version=version, - qualifiers=qualifiers, - description=description, - homepage_url=homepage_url, - size=size, - source_packages=source_packages, - keywords=keywords, - parties=parties, - extra_data=extra_data, - ) + package.description = description + package.homepage_url = homepage_url + package.size = size + package.source_packages = source_packages + package.keywords = keywords + package.parties = parties + package.extra_data = extra_data + return package ignored_root_dirs = { diff --git a/src/packagedcode/debian_copyright.py b/src/packagedcode/debian_copyright.py index fe7aebc852d..f06a8246eee 100644 --- a/src/packagedcode/debian_copyright.py +++ b/src/packagedcode/debian_copyright.py @@ -93,12 +93,8 @@ def is_datafile(cls, location, filetypes=tuple(), strict=False): return True @classmethod - def parse(cls, location): - debian_copyright = parse_copyright_file(location) - license_fields = DebianLicenseFields.get_license_fields( - debian_copyright=debian_copyright - ) - + def parse(cls, location, purl_only=False): + # TODO: collect the upstream source package details # find a name... 
TODO: this should be pushed down to each handler @@ -109,19 +105,29 @@ def parse(cls, location): # no name otherwise for now name = None - yield models.PackageData( + pkg = models.PackageData( datasource_id=cls.datasource_id, type=cls.default_package_type, name=name, - extracted_license_statement=license_fields.extracted_license_statement, - declared_license_expression=license_fields.declared_license_expression, - declared_license_expression_spdx=license_fields.declared_license_expression_spdx, - license_detections=license_fields.license_detections, - other_license_expression=license_fields.other_license_expression, - other_license_expression_spdx=license_fields.other_license_expression_spdx, - other_license_detections=license_fields.other_license_detections, - copyright=debian_copyright.get_copyright(), ) + if purl_only: + yield pkg + return + + debian_copyright = parse_copyright_file(location) + license_fields = DebianLicenseFields.get_license_fields( + debian_copyright=debian_copyright + ) + + pkg.extracted_license_statement = license_fields.extracted_license_statement + pkg.declared_license_expression = license_fields.declared_license_expression + pkg.declared_license_expression_spdx = license_fields.declared_license_expression_spdx + pkg.license_detections = license_fields.license_detections + pkg.other_license_expression = license_fields.other_license_expression + pkg.other_license_expression_spdx = license_fields.other_license_expression_spdx + pkg.other_license_detections = license_fields.other_license_detections + pkg.copyright = debian_copyright.get_copyright() + yield pkg @attr.s diff --git a/src/packagedcode/distro.py b/src/packagedcode/distro.py index 6336e6b3111..9c22dc5bf1d 100644 --- a/src/packagedcode/distro.py +++ b/src/packagedcode/distro.py @@ -25,7 +25,7 @@ class EtcOsReleaseHandler(models.NonAssemblableDatafileHandler): documentation_url = 'https://www.freedesktop.org/software/systemd/man/os-release.html' @classmethod - def parse(cls, location): + def parse(cls, location, **kwargs): distro = Distro.from_os_release_file(location) distro_identifier = distro.identifier pretty_name = distro.pretty_name and distro.pretty_name.lower() or '' diff --git a/src/packagedcode/freebsd.py b/src/packagedcode/freebsd.py index c1f5342b562..885ef100a10 100644 --- a/src/packagedcode/freebsd.py +++ b/src/packagedcode/freebsd.py @@ -52,7 +52,7 @@ class CompactManifestHandler(models.DatafileHandler): documentation_url = 'https://www.freebsd.org/cgi/man.cgi?pkg-create(8)#MANIFEST_FILE_DETAILS' @classmethod - def _parse(cls, yaml_data): + def _parse(cls, yaml_data, purl_only=False): package_data = models.PackageData( datasource_id=cls.datasource_id, type=cls.default_package_type, @@ -63,9 +63,23 @@ def _parse(cls, yaml_data): ) # mapping of top level manifest items to the PackageData object field name - plain_fields = [ + purl_fields = [ ('name', 'name'), ('version', 'version'), + ] + + for source, target in purl_fields: + value = yaml_data.get(source) + if value: + if isinstance(value, str): + value = value.strip() + if value: + setattr(package_data, target, value) + + if purl_only: + return package_data + + plain_fields = [ ('www', 'homepage_url'), ('desc', 'description'), ('categories', 'keywords'), @@ -96,7 +110,6 @@ def _parse(cls, yaml_data): # license_mapper needs multiple fields license_mapper(yaml_data, package_data) - cls.populate_license_fields(package_data) if TRACE: @@ -107,15 +120,15 @@ def _parse(cls, yaml_data): return package_data @classmethod - def parse(cls, location): + 
def parse(cls, location, purl_only=False): """ - Yield one or more Package manifest objects given a file ``location`` pointing to a - package archive, manifest or similar. + Yield one or more Package manifest objects given a file ``location`` + pointing to a package archive, manifest or similar. """ with io.open(location, encoding='utf-8') as loc: yaml_data = saneyaml.load(loc) - yield cls._parse(yaml_data) + yield cls._parse(yaml_data=yaml_data, purl_only=purl_only) @staticmethod def get_license_detections_and_expression(package_data): diff --git a/src/packagedcode/godeps.py b/src/packagedcode/godeps.py index 68ff54787ff..4bebfb7e055 100644 --- a/src/packagedcode/godeps.py +++ b/src/packagedcode/godeps.py @@ -38,7 +38,7 @@ class GodepsHandler(models.NonAssemblableDatafileHandler): documentation_url = 'https://github.com/tools/godep' @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): godeps = Godep(location) if godeps.import_path: @@ -64,14 +64,20 @@ def parse(cls, location): ) ) - yield models.PackageData( + pkg = models.PackageData( datasource_id=cls.datasource_id, type=cls.default_package_type, namespace=namespace, name=name, - primary_language=cls.default_primary_language, dependencies=dependencies, - ) + ) + + if purl_only: + yield pkg + return + + pkg.primary_language = cls.default_primary_language + yield pkg @classmethod def assign_package_to_resources(cls, package, resource, codebase, package_adder): diff --git a/src/packagedcode/golang.py b/src/packagedcode/golang.py index 6075c713e86..21374b45f66 100644 --- a/src/packagedcode/golang.py +++ b/src/packagedcode/golang.py @@ -49,7 +49,7 @@ class GoModHandler(BaseGoModuleHandler): documentation_url = 'https://go.dev/ref/mod' @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): gomods = go_mod.parse_gomod(location) dependencies = [] @@ -82,6 +82,18 @@ def parse(cls, location): name = gomods.name namespace = gomods.namespace + pkg = models.PackageData( + datasource_id=cls.datasource_id, + type=cls.default_package_type, + name=name, + namespace=namespace, + dependencies=dependencies, + ) + + if purl_only: + yield pkg + return + homepage_url = f'https://pkg.go.dev/{gomods.namespace}/{gomods.name}' vcs_url = f'https://{gomods.namespace}/{gomods.name}.git' @@ -89,17 +101,11 @@ def parse(cls, location): if namespace and name: repository_homepage_url = f'https://pkg.go.dev/{namespace}/{name}' - yield models.PackageData( - datasource_id=cls.datasource_id, - type=cls.default_package_type, - name=name, - namespace=namespace, - vcs_url=vcs_url, - homepage_url=homepage_url, - repository_homepage_url=repository_homepage_url, - dependencies=dependencies, - primary_language=cls.default_primary_language, - ) + pkg.vcs_url = vcs_url + pkg.homepage_url = homepage_url + pkg.repository_homepage_url = repository_homepage_url + pkg.primary_language = cls.default_primary_language + yield pkg class GoSumHandler(BaseGoModuleHandler): @@ -111,7 +117,7 @@ class GoSumHandler(BaseGoModuleHandler): documentation_url = 'https://go.dev/ref/mod#go-sum-files' @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): gosums = go_mod.parse_gosum(location) package_dependencies = [] for gosum in gosums: @@ -126,9 +132,14 @@ def parse(cls, location): ) ) - yield models.PackageData( + pkg = models.PackageData( datasource_id=cls.datasource_id, type=cls.default_package_type, dependencies=package_dependencies, - primary_language=cls.default_primary_language, ) + + if purl_only: + 
yield pkg + else: + pkg.primary_language = cls.default_primary_language + yield pkg diff --git a/src/packagedcode/haxe.py b/src/packagedcode/haxe.py index 99f5be4ef7d..961792313de 100644 --- a/src/packagedcode/haxe.py +++ b/src/packagedcode/haxe.py @@ -45,7 +45,7 @@ class HaxelibJsonHandler(models.DatafileHandler): documentation_url = 'https://lib.haxe.org/documentation/creating-a-haxelib-package/' @classmethod - def _parse(cls, json_data): + def _parse(cls, json_data, purl_only=False): name = json_data.get('name') version = json_data.get('version') @@ -54,13 +54,28 @@ def _parse(cls, json_data): type=cls.default_package_type, name=name, version=version, - homepage_url=json_data.get('url'), - extracted_license_statement=json_data.get('license'), - keywords=json_data.get('tags'), - description=json_data.get('description'), - primary_language=cls.default_primary_language, ) + for dep_name, dep_version in json_data.get('dependencies', {}).items(): + dep_version = dep_version and dep_version.strip() + is_resolved = bool(dep_version) + dep_purl = PackageURL( + type=cls.default_package_type, + name=dep_name, + version=dep_version + ).to_string() + dep = models.DependentPackage(purl=dep_purl, is_resolved=is_resolved,) + package_data.dependencies.append(dep) + + if purl_only: + return package_data + + package_data.homepage_url = json_data.get('url') + package_data.extracted_license_statement = json_data.get('license') + package_data.keywords = json_data.get('tags') + package_data.description = json_data.get('description') + package_data.primary_language = cls.default_primary_language + if name and version: download_url = f'https://lib.haxe.org/p/{name}/{version}/download/' package_data.repository_download_url = download_url @@ -77,21 +92,10 @@ def _parse(cls, json_data): url='https://lib.haxe.org/u/{}'.format(contrib)) package_data.parties.append(party) - for dep_name, dep_version in json_data.get('dependencies', {}).items(): - dep_version = dep_version and dep_version.strip() - is_resolved = bool(dep_version) - dep_purl = PackageURL( - type=cls.default_package_type, - name=dep_name, - version=dep_version - ).to_string() - dep = models.DependentPackage(purl=dep_purl, is_resolved=is_resolved,) - package_data.dependencies.append(dep) - return package_data @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): """ Yield one or more Package manifest objects given a file ``location`` pointing to a package_data archive, manifest or similar. @@ -111,4 +115,4 @@ def parse(cls, location): with io.open(location, encoding='utf-8') as loc: json_data = json.load(loc) - yield cls._parse(json_data) + yield cls._parse(json_data=json_data, purl_only=purl_only) diff --git a/src/packagedcode/jar_manifest.py b/src/packagedcode/jar_manifest.py index 48df5373eb1..8e56ce62ded 100644 --- a/src/packagedcode/jar_manifest.py +++ b/src/packagedcode/jar_manifest.py @@ -62,7 +62,7 @@ def parse_section(section): return data -def get_normalized_java_manifest_data(manifest_mapping): +def get_normalized_java_manifest_data(manifest_mapping, purl_only=False): """ Return a mapping of package-like data normalized from a mapping of the `manifest_mapping` data mapping or None. 
@@ -231,6 +231,10 @@ def dget(s): package['namespace'] = namespace package['name'] = name package['version'] = version + + if purl_only: + return package + package['description'] = description # licensing diff --git a/src/packagedcode/maven.py b/src/packagedcode/maven.py index b5d521dc44a..ab97bc70353 100644 --- a/src/packagedcode/maven.py +++ b/src/packagedcode/maven.py @@ -132,11 +132,14 @@ class JavaJarManifestHandler(MavenBasePackageHandler): documentation_url = 'https://docs.oracle.com/javase/tutorial/deployment/jar/manifestindex.html' @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): sections = parse_manifest(location) if sections: main_section = sections[0] - manifest = get_normalized_java_manifest_data(main_section) + manifest = get_normalized_java_manifest_data( + manifest_mapping=main_section, + purl_only=purl_only + ) if manifest: yield models.PackageData(**manifest,) @@ -206,13 +209,19 @@ def is_datafile(cls, location, filetypes=tuple()): return True @classmethod - def parse(cls, location, base_url='https://repo1.maven.org/maven2'): + def parse( + cls, + location, + base_url='https://repo1.maven.org/maven2', + purl_only=False, + ): package_data = parse( location=location, datasource_id=cls.datasource_id, package_type=cls.default_package_type, primary_language=cls.default_primary_language, base_url=base_url, + purl_only=purl_only, ) if package_data: yield package_data @@ -303,7 +312,7 @@ class MavenPomPropertiesHandler(models.NonAssemblableDatafileHandler): documentation_url = 'https://maven.apache.org/pom.html' @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): """ Yield PackageData from a pom.properties file (which is typically side- by-side with its pom file.) @@ -313,27 +322,36 @@ def parse(cls, location): if TRACE: logger.debug(f'MavenPomPropertiesHandler.parse: properties: {properties!r}') if properties: - yield from cls.parse_pom_properties(properties=properties) + yield from cls.parse_pom_properties( + properties=properties, + purl_only=purl_only, + ) @classmethod - def parse_pom_properties(cls, properties): + def parse_pom_properties(cls, properties, purl_only=False): namespace = properties.pop("groupId", None) name = properties.pop("artifactId", None) version = properties.pop("version", None) - if properties: - extra_data = dict(pom_properties=properties) - else: - extra_data = {} - yield models.PackageData( + pkg = models.PackageData( datasource_id=cls.datasource_id, type=cls.default_package_type, - primary_language=cls.default_primary_language, name=name, namespace=namespace, version=version, - extra_data=extra_data, ) + if purl_only: + yield purl_only + return + + if properties: + extra_data = dict(pom_properties=properties) + else: + extra_data = {} + + pkg.primary_language = cls.default_primary_language + pkg.extra_data = extra_data + yield pkg def build_url( @@ -1189,6 +1207,7 @@ def parse( package_type, primary_language, base_url='https://repo1.maven.org/maven2', + purl_only=False, ): """ Return Packagedata objects from parsing a Maven pom file at `location` or @@ -1199,7 +1218,8 @@ def parse( package_type=package_type, primary_language=primary_language, location=location, - base_url=base_url + base_url=base_url, + purl_only=purl_only, ) if package: return package @@ -1212,6 +1232,7 @@ def _parse( location=None, text=None, base_url='https://repo1.maven.org/maven2', + purl_only=False, ): """ Yield Packagedata objects from parsing a Maven pom file at `location` or @@ -1249,6 +1270,18 @@ def 
_parse( group_id = pom.group_id artifact_id = pom.artifact_id + pkg = MavenPackageData( + datasource_id=datasource_id, + type=package_type, + namespace=group_id, + name=artifact_id, + version=version, + qualifiers=qualifiers or None, + ) + pkg.dependencies = get_dependencies(pom) + if purl_only: + return pkg + # craft a source package purl for the main binary source_packages = [] is_main_binary_jar = not classifier and all([group_id, artifact_id, version]) @@ -1283,23 +1316,22 @@ def _parse( )) # FIXME: there are still other data to map in a PackageData - return MavenPackageData( - datasource_id=datasource_id, - type=package_type, - primary_language=primary_language, - namespace=group_id, - name=artifact_id, - version=version, - qualifiers=qualifiers or None, - description=description or None, - homepage_url=pom.url or None, - extracted_license_statement=extracted_license_statement or None, - parties=get_parties(pom), - dependencies=get_dependencies(pom), - source_packages=source_packages, - bug_tracking_url=bug_tracking_url, - **urls, - ) + pkg.primary_language = primary_language + pkg.description = description or None + pkg.homepage_url = pom.url or None + pkg.extracted_license_statement = extracted_license_statement or None + pkg.parties = get_parties(pom) + pkg.source_packages = source_packages + pkg.bug_tracking_url = bug_tracking_url + ( + pkg.vcs_url, + pkg.code_view_url, + pkg.repository_homepage_url, + pkg.repository_download_url, + pkg.api_data_url, + ) = urls + return pkg + class MavenPackageData(models.PackageData): diff --git a/src/packagedcode/msi.py b/src/packagedcode/msi.py index 31deea3040b..16d97e24396 100644 --- a/src/packagedcode/msi.py +++ b/src/packagedcode/msi.py @@ -124,10 +124,23 @@ def create_package_data_from_msiinfo_results( msiinfo_results, datasource_id='msi_installer', package_type='msi', + purl_only=False, ): """ Return PackageData from a mapping of `msiinfo_results` """ + subject = msiinfo_results.pop('Subject', '') + name = subject + version = get_version_from_subject_line(subject) + pkg = models.PackageData( + datasource_id=datasource_id, + type=package_type, + name=name, + version=version, + ) + if purl_only: + return pkg + author_name = msiinfo_results.pop('Author', '') parties = [] if author_name: @@ -144,27 +157,18 @@ def create_package_data_from_msiinfo_results( # the time. 
Getting the version out of the `Subject` string is not # straightforward because the format of the string is usually different # between different MSIs - subject = msiinfo_results.pop('Subject', '') - name = subject - version = get_version_from_subject_line(subject) - description = msiinfo_results.pop('Comments', '') - keywords = msiinfo_results.pop('Keywords', []) - return models.PackageData( - datasource_id=datasource_id, - type=package_type, - name=name, - version=version, - description=description, - parties=parties, - keywords=keywords, - extra_data=msiinfo_results - ) + pkg.description = msiinfo_results.pop('Comments', '') + pkg.parties = parties + pkg.keywords = msiinfo_results.pop('Keywords', []) + pkg.extra_data = msiinfo_results + return pkg def msi_parse(location, datasource_id='msi_installer', package_type='msi', + purl_only=False, ): """ Return PackageData from ``location`` @@ -175,6 +179,7 @@ def msi_parse(location, msiinfo_results=info, datasource_id=datasource_id, package_type=package_type, + purl_only=purl_only, ) else: return models.PackageData( @@ -192,5 +197,5 @@ class MsiInstallerHandler(models.DatafileHandler): documentation_url = 'https://docs.microsoft.com/en-us/windows/win32/msi/windows-installer-portal' @classmethod - def parse(cls, location): - yield msi_parse(location) + def parse(cls, location, purl_only=False): + yield msi_parse(location=location, purl_only=purl_only) diff --git a/src/packagedcode/npm.py b/src/packagedcode/npm.py index 6059850448e..91cdbf61d5f 100644 --- a/src/packagedcode/npm.py +++ b/src/packagedcode/npm.py @@ -185,34 +185,55 @@ class NpmPackageJsonHandler(BaseNpmHandler): documentation_url = 'https://docs.npmjs.com/cli/v8/configuring-npm/package-json' @classmethod - def _parse(cls, json_data): + def _parse(cls, json_data, purl_only=False): name = json_data.get('name') version = json_data.get('version') - homepage_url = json_data.get('homepage', '') - - # a package.json without name and version can be a private package - - if homepage_url and isinstance(homepage_url, list): - # TODO: should we keep other URLs - homepage_url = homepage_url[0] - homepage_url = homepage_url.strip() or None - namespace, name = split_scoped_package_name(name) - urls = get_urls(namespace, name, version) package = models.PackageData( datasource_id=cls.datasource_id, type=cls.default_package_type, - primary_language=cls.default_primary_language, namespace=namespace or None, name=name, version=version or None, - description=json_data.get('description', '').strip() or None, - homepage_url=homepage_url, - **urls, ) + deps_mappers = [ + ('dependencies', partial(deps_mapper, field_name='dependencies')), + ('devDependencies', partial(deps_mapper, field_name='devDependencies')), + ('peerDependencies', partial(deps_mapper, field_name='peerDependencies')), + ('optionalDependencies', partial(deps_mapper, field_name='optionalDependencies')), + ('bundledDependencies', bundle_deps_mapper), + ] + for source, func in deps_mappers: + value = json_data.get(source) or None + if value: + if isinstance(value, str): + value = value.strip() + if value: + func(value, package) + + if purl_only: + return package + + homepage_url = json_data.get('homepage', '') + # a package.json without name and version can be a private package + + if homepage_url and isinstance(homepage_url, list): + # TODO: should we keep other URLs + homepage_url = homepage_url[0] + homepage_url = homepage_url.strip() or None + + ( + package.repository_homepage_url, + package.repository_download_url, + 
package.api_data_url, + ) = get_urls(namespace, name, version) + package.primary_language = cls.default_primary_language + package.description = json_data.get('description', '').strip() or None + package.homepage_url = homepage_url vcs_revision = json_data.get('gitHead') or None + # mapping of top level package.json items to a function accepting as # arguments the package.json element value and returning an iterable of (key, # values) to update on a package @@ -220,12 +241,6 @@ def _parse(cls, json_data): ('author', partial(party_mapper, party_type='author')), ('contributors', partial(party_mapper, party_type='contributor')), ('maintainers', partial(party_mapper, party_type='maintainer')), - - ('dependencies', partial(deps_mapper, field_name='dependencies')), - ('devDependencies', partial(deps_mapper, field_name='devDependencies')), - ('peerDependencies', partial(deps_mapper, field_name='peerDependencies')), - ('optionalDependencies', partial(deps_mapper, field_name='optionalDependencies')), - ('bundledDependencies', bundle_deps_mapper), ('repository', partial(vcs_repository_mapper, vcs_revision=vcs_revision)), ('keywords', keywords_mapper,), ('bugs', bugs_mapper), @@ -257,17 +272,17 @@ def _parse(cls, json_data): return package @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): with io.open(location, encoding='utf-8') as loc: json_data = json.load(loc) - yield cls._parse(json_data) + yield cls._parse(json_data=json_data, purl_only=purl_only) class BaseNpmLockHandler(BaseNpmHandler): @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): with io.open(location, encoding='utf-8') as loc: package_data = json.load(loc) @@ -278,17 +293,13 @@ def parse(cls, location): root_version = package_data.get('version') root_ns, _ , root_name = root_name.rpartition('/') - extra_data = dict(lockfile_version=lockfile_version) # this is the top level element that we return root_package_data = models.PackageData( datasource_id=cls.datasource_id, type=cls.default_package_type, - primary_language=cls.default_primary_language, namespace=root_ns, name=root_name, version=root_version, - extra_data=extra_data, - **get_urls(root_ns, root_name, root_version) ) # https://docs.npmjs.com/cli/v8/configuring-npm/package-lock-json#lockfileversion @@ -299,7 +310,6 @@ def parse(cls, location): deps_key = 'packages' deps_mapping = package_data.get(deps_key) or {} - dependencies = [] for dep, dep_data in deps_mapping.items(): @@ -359,19 +369,29 @@ def parse(cls, location): integrity = dep_data.get('integrity') misc.update(get_algo_hexsum(integrity).items()) - resolved_package = models.PackageData( - datasource_id=cls.datasource_id, - type=cls.default_package_type, - primary_language=cls.default_primary_language, - namespace=ns, - name=name, - version=version, - extracted_license_statement=extracted_license_statement, - **misc, - ) - # these are paths t the root of the installed package in v2 - if dep: - resolved_package.file_references = [models.FileReference(path=dep)], + if purl_only: + resolved_package = models.PackageData( + datasource_id=cls.datasource_id, + type=cls.default_package_type, + namespace=ns, + name=name, + version=version, + ) + else: + resolved_package = models.PackageData( + datasource_id=cls.datasource_id, + type=cls.default_package_type, + primary_language=cls.default_primary_language, + namespace=ns, + name=name, + version=version, + extracted_license_statement=extracted_license_statement, + **misc, + ) + + # these are paths t the root of 
the installed package in v2 + if dep: + resolved_package.file_references = [models.FileReference(path=dep)], # v1 as name/constraint pairs subrequires = dep_data.get('requires') or {} @@ -416,6 +436,17 @@ def parse(cls, location): root_package_data.dependencies = dependencies + if purl_only: + yield root_package_data + return + + root_package_data.primary_language = cls.default_primary_language + root_package_data.extra_data = dict(lockfile_version=lockfile_version) + ( + root_package_data.repository_homepage_url, + root_package_data.repository_download_url, + root_package_data.api_data_url, + ) = get_urls(root_ns, root_name, root_version) yield root_package_data @@ -490,7 +521,7 @@ def is_datafile(cls, location, filetypes=tuple()): return super().is_datafile(location, filetypes=filetypes) and is_yarn_v2(location) @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): """ Parse a bew yarn.lock v2 YAML format which looks like this: @@ -545,12 +576,16 @@ def parse(cls, location): ) top_dependencies.append(dependency) - yield models.PackageData( + pkg = models.PackageData( datasource_id=cls.datasource_id, type=cls.default_package_type, - primary_language=cls.default_primary_language, dependencies=top_dependencies, ) + if purl_only: + yield pkg + else: + pkg.primary_language = cls.default_primary_language + yield pkg class YarnLockV1Handler(BaseNpmHandler): @@ -569,7 +604,7 @@ def is_datafile(cls, location, filetypes=tuple()): return super().is_datafile(location, filetypes=filetypes) and not is_yarn_v2(location) @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): """ Parse a classic yarn.lock format which looks like this: "@babel/core@^7.1.0", "@babel/core@^7.3.4": @@ -651,15 +686,24 @@ def parse(cls, location): misc.update(get_algo_hexsum(integrity).items()) # we create a resolve package with the details - resolved_package_data = models.PackageData( - datasource_id=cls.datasource_id, - type=cls.default_package_type, - namespace=ns, - name=name, - version=version, - primary_language=cls.default_primary_language, - **misc, - ) + if purl_only: + resolved_package_data = models.PackageData( + datasource_id=cls.datasource_id, + type=cls.default_package_type, + namespace=ns, + name=name, + version=version, + ) + else: + resolved_package_data = models.PackageData( + datasource_id=cls.datasource_id, + type=cls.default_package_type, + namespace=ns, + name=name, + version=version, + primary_language=cls.default_primary_language, + **misc, + ) # we add the sub-deps to the resolved package for subns, subname, subconstraint in sub_dependencies: @@ -695,12 +739,17 @@ def parse(cls, location): ) dependencies.append(dep) - yield models.PackageData( + pkg = models.PackageData( datasource_id=cls.datasource_id, type=cls.default_package_type, - primary_language=cls.default_primary_language, dependencies=dependencies, ) + if purl_only: + yield pkg + else: + pkg.primary_language = cls.default_primary_language + yield pkg + def get_checksum_and_url(url): diff --git a/src/packagedcode/nuget.py b/src/packagedcode/nuget.py index dee1afb39d5..adf4a9abe06 100644 --- a/src/packagedcode/nuget.py +++ b/src/packagedcode/nuget.py @@ -106,7 +106,7 @@ class NugetNuspecHandler(models.DatafileHandler): documentation_url = 'https://docs.microsoft.com/en-us/nuget/reference/nuspec' @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): with open(location, 'rb') as loc: parsed = xmltodict.parse(loc) @@ -121,6 +121,16 @@ def parse(cls, 
location): name = nuspec.get('id') version = nuspec.get('version') + pkg = models.PackageData( + datasource_id=cls.datasource_id, + type=cls.default_package_type, + name=name, + version=version, + ) + if purl_only: + yield pkg + return + # Summary: A short description of the package for UI display. If omitted, a # truncated version of description is used. description = build_description(nuspec.get('summary'), nuspec.get('description')) @@ -152,7 +162,11 @@ def parse(cls, location): else: vcs_url = vcs_repository - urls = get_urls(name, version) + ( + pkg.repository_homepage_url, + pkg.repository_download_url, + pkg.api_data_url, + ) = get_urls(name, version) extracted_license_statement = None # See https://docs.microsoft.com/en-us/nuget/reference/nuspec#license @@ -163,18 +177,11 @@ def parse(cls, location): elif 'licenseUrl' in nuspec: extracted_license_statement = nuspec.get('licenseUrl') - yield models.PackageData( - datasource_id=cls.datasource_id, - type=cls.default_package_type, - name=name, - version=version, - description=description or None, - homepage_url=nuspec.get('projectUrl') or None, - parties=parties, - dependencies=list(get_dependencies(nuspec)), - extracted_license_statement=extracted_license_statement, - copyright=nuspec.get('copyright') or None, - vcs_url=vcs_url, - **urls, - ) - + pkg.description = description or None + pkg.homepage_url = nuspec.get('projectUrl') or None + pkg.parties = parties + pkg.dependencies = list(get_dependencies(nuspec)) + pkg.extracted_license_statement = extracted_license_statement + pkg.copyright = nuspec.get('copyright') or None + pkg.vcs_url = vcs_url + yield pkg diff --git a/src/packagedcode/opam.py b/src/packagedcode/opam.py index 94e2f4942d7..9e6957487e4 100644 --- a/src/packagedcode/opam.py +++ b/src/packagedcode/opam.py @@ -31,7 +31,7 @@ def get_package_root(cls, resource, codebase): return resource.parent(codebase) @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): opams = parse_opam(location) package_dependencies = [] @@ -51,24 +51,35 @@ def parse(cls, location): name = opams.get('name') version = opams.get('version') - homepage_url = opams.get('homepage') - download_url = opams.get('src') - vcs_url = opams.get('dev-repo') - bug_tracking_url = opams.get('bug-reports') - extracted_license_statement = opams.get('license') - sha1 = opams.get('sha1') - md5 = opams.get('md5') - sha256 = opams.get('sha256') - sha512 = opams.get('sha512') - repository_homepage_url = get_repository_homepage_url(name) - api_data_url = get_api_data_url(name, version) + pkg = models.PackageData( + datasource_id=cls.datasource_id, + type=cls.default_package_type, + name=name, + version=version, + dependencies=package_dependencies, + ) + if purl_only: + yield pkg + return + + pkg.homepage_url = opams.get('homepage') + pkg.download_url = opams.get('src') + pkg.vcs_url = opams.get('dev-repo') + pkg.bug_tracking_url = opams.get('bug-reports') + pkg.extracted_license_statement = opams.get('license') + pkg.sha1 = opams.get('sha1') + pkg.md5 = opams.get('md5') + pkg.sha256 = opams.get('sha256') + pkg.sha512 = opams.get('sha512') + pkg.repository_homepage_url = get_repository_homepage_url(name) + pkg.api_data_url = get_api_data_url(name, version) short_desc = opams.get('synopsis') or '' long_desc = opams.get('description') or '' if long_desc == short_desc: long_desc = None descriptions = [d for d in (short_desc, long_desc) if d and d.strip()] - description = '\n'.join(descriptions) + pkg.description = '\n'.join(descriptions) parties = 
[] authors = opams.get('authors') or [] @@ -90,27 +101,9 @@ def parse(cls, location): ) ) - yield models.PackageData( - datasource_id=cls.datasource_id, - type=cls.default_package_type, - name=name, - version=version, - vcs_url=vcs_url, - homepage_url=homepage_url, - download_url=download_url, - sha1=sha1, - md5=md5, - sha256=sha256, - sha512=sha512, - bug_tracking_url=bug_tracking_url, - extracted_license_statement=extracted_license_statement, - description=description, - parties=parties, - dependencies=package_dependencies, - api_data_url=api_data_url, - repository_homepage_url=repository_homepage_url, - primary_language=cls.default_primary_language - ) + pkg.parties = parties + pkg.primary_language = cls.default_primary_language + yield pkg @classmethod def assign_package_to_resources(cls, package, resource, codebase, package_adder): diff --git a/src/packagedcode/phpcomposer.py b/src/packagedcode/phpcomposer.py index b729e0bfc25..86f7a0b7d45 100644 --- a/src/packagedcode/phpcomposer.py +++ b/src/packagedcode/phpcomposer.py @@ -58,7 +58,7 @@ class PhpComposerJsonHandler(BasePhpComposerHandler): documentation_url = 'https://getcomposer.org/doc/04-schema.md' @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): """ Yield one or more Package manifest objects given a file ``location`` pointing to a package archive, manifest or similar. @@ -69,7 +69,10 @@ def parse(cls, location): with io.open(location, encoding='utf-8') as loc: package_json = json.load(loc) - yield build_package_data(package_json) + yield build_package_data( + package_json=package_json, + purl_only=purl_only + ) def get_repository_homepage_url(namespace, name): @@ -86,7 +89,7 @@ def get_api_data_url(namespace, name): return f'https://packagist.org/p/packages/{name}.json' -def build_package_data(package_data): +def build_package_data(package_data, purl_only=False): # Note: A composer.json without name and description is not a usable PHP # composer package. 
Name and description fields are required but only for @@ -108,10 +111,13 @@ def build_package_data(package_data): type=PhpComposerJsonHandler.default_package_type, namespace=ns, name=name, - repository_homepage_url=get_repository_homepage_url(ns, name), - api_data_url=get_api_data_url(ns, name), - primary_language=PhpComposerJsonHandler.default_primary_language, ) + if purl_only: + return package + + package.repository_homepage_url = get_repository_homepage_url(ns, name) + package.api_data_url = get_api_data_url(ns, name) + package.primary_language = PhpComposerJsonHandler.default_primary_language # mapping of top level composer.json items to the Package object field name plain_fields = [ @@ -170,16 +176,16 @@ class PhpComposerLockHandler(BasePhpComposerHandler): documentation_url = 'https://getcomposer.org/doc/01-basic-usage.md#commit-your-composer-lock-file-to-version-control' @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): with io.open(location, encoding='utf-8') as loc: package_data = json.load(loc) packages = [ - build_package_data(p) + build_package_data(package_data=p, purl_only=purl_only) for p in package_data.get('packages', []) ] packages_dev = [ - build_package_data(p) + build_package_data(package_data=p, purl_only=purl_only) for p in package_data.get('packages-dev', []) ] @@ -192,12 +198,16 @@ def parse(cls, location): for p in packages_dev ] - yield models.PackageData( + pkg = models.PackageData( datasource_id=cls.datasource_id, type=cls.default_package_type, - primary_language=cls.default_primary_language, dependencies=required_deps + required_dev_deps ) + if purl_only: + yield pkg + else: + pkg.primary_language = cls.default_primary_language + yield pkg for package in packages + packages_dev: yield package diff --git a/src/packagedcode/plugin_package.py b/src/packagedcode/plugin_package.py index 2e1e4ad6e07..c994a3d762e 100644 --- a/src/packagedcode/plugin_package.py +++ b/src/packagedcode/plugin_package.py @@ -159,7 +159,16 @@ class PackageScanner(ScanPlugin): help_group=SCAN_GROUP, sort_order=21, ), - + PluggableCommandLineOption( + ( + '--purl', + ), + is_flag=True, + default=False, + help='Only detect PURL fields in application package and dependency manifests, lockfiles and related data.', + help_group=SCAN_GROUP, + sort_order=22, + ), PluggableCommandLineOption( ('--list-packages',), is_flag=True, @@ -170,10 +179,10 @@ class PackageScanner(ScanPlugin): ), ] - def is_enabled(self, package, system_package, **kwargs): - return package or system_package + def is_enabled(self, package, system_package, purl, **kwargs): + return package or system_package or purl - def get_scanner(self, package=True, system_package=False, **kwargs): + def get_scanner(self, package=True, system_package=False, purl=False, **kwargs): """ Return a scanner callable to scan a file for package data. 
""" @@ -183,9 +192,10 @@ def get_scanner(self, package=True, system_package=False, **kwargs): get_package_data, application=package, system=system_package, + purl_only=purl, ) - def process_codebase(self, codebase, strip_root=False, **kwargs): + def process_codebase(self, codebase, strip_root=False, purl=False, **kwargs): """ Populate the ``codebase`` top level ``packages`` and ``dependencies`` with package and dependency instances, assembling parsed package data @@ -194,6 +204,11 @@ def process_codebase(self, codebase, strip_root=False, **kwargs): Also perform additional package license detection that depends on either file license detection or the package detections. """ + # If we only want purls, we want to skip both the package + # assembly and the extra package license detection steps + if purl: + return + has_licenses = hasattr(codebase.root, 'license_detections') # These steps add proper license detections to package_data and hence diff --git a/src/packagedcode/pubspec.py b/src/packagedcode/pubspec.py index a6abd5b8642..34285f0202a 100644 --- a/src/packagedcode/pubspec.py +++ b/src/packagedcode/pubspec.py @@ -60,11 +60,14 @@ class DartPubspecYamlHandler(BaseDartPubspecHandler): documentation_url = 'https://dart.dev/tools/pub/pubspec' @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): with open(location) as inp: pubspec_data = saneyaml.load(inp.read()) - package_data = build_package(pubspec_data) + package_data = build_package( + pubspec_data=pubspec_data, + purl_only=purl_only + ) if package_data: yield package_data @@ -78,18 +81,22 @@ class DartPubspecLockHandler(BaseDartPubspecHandler): documentation_url = 'https://web.archive.org/web/20220330081004/https://gpalma.pt/blog/what-is-the-pubspec-lock/' @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): with open(location) as inp: locks_data = saneyaml.load(inp.read()) dependencies = list(collect_locks(locks_data)) - yield models.PackageData( + pkg = models.PackageData( datasource_id=cls.datasource_id, type=cls.default_package_type, - primary_language=cls.default_primary_language, dependencies=dependencies ) + if purl_only: + yield pkg + else: + pkg.primary_language = cls.default_primary_language + yield pkg def collect_locks(locks_data): @@ -238,44 +245,12 @@ def build_dep(name, version, scope, is_runtime=True, is_optional=False): return dep -def build_package(pubspec_data): +def build_package(pubspec_data, purl_only=False): """ Return a package object from a package data mapping or None """ name = pubspec_data.get('name') version = pubspec_data.get('version') - description = pubspec_data.get('description') - homepage_url = pubspec_data.get('homepage') - extracted_license_statement = pubspec_data.get('license') - vcs_url = pubspec_data.get('repository') - download_url = pubspec_data.get('archive_url') - - api_data_url = name and version and f'https://pub.dev/api/packages/{name}/versions/{version}' - repository_homepage_url = name and version and f'https://pub.dev/packages/{name}/versions/{version}' - - # A URL should be in the form of: - # https://pub.dartlang.org/packages/url_launcher/versions/6.0.9.tar.gz - # And it may resolve to: - # https://storage.googleapis.com/pub-packages/packages/http-0.13.2.tar.gz - # as seen in the pub.dev web pages - repository_download_url = name and version and f'https://pub.dartlang.org/packages/{name}/versions/{version}.tar.gz' - - download_url = download_url or repository_download_url - - # Author and authors are deprecated - 
authors = [] - author = pubspec_data.get('author') - if author: - authors.append(author) - authors.extend(pubspec_data.get('authors') or []) - - parties = [] - for auth in authors: - parties.append(models.Party( - type=models.party_person, - role='author', - name=auth - )) package_dependencies = [] dependencies = collect_deps( @@ -302,6 +277,51 @@ def build_package(pubspec_data): ) package_dependencies.extend(env_dependencies) + package = models.PackageData( + datasource_id=DartPubspecYamlHandler.datasource_id, + type=DartPubspecYamlHandler.default_primary_language, + name=name, + version=version, + dependencies=package_dependencies, + ) + if purl_only: + return package + + package.primary_language = DartPubspecYamlHandler.default_primary_language + package.description = pubspec_data.get('description') + package.homepage_url = pubspec_data.get('homepage') + package.extracted_license_statement = pubspec_data.get('license') + package.vcs_url = pubspec_data.get('repository') + package.download_url = pubspec_data.get('archive_url') + + package.api_data_url = name and version and f'https://pub.dev/api/packages/{name}/versions/{version}' + package.repository_homepage_url = name and version and f'https://pub.dev/packages/{name}/versions/{version}' + + # A URL should be in the form of: + # https://pub.dartlang.org/packages/url_launcher/versions/6.0.9.tar.gz + # And it may resolve to: + # https://storage.googleapis.com/pub-packages/packages/http-0.13.2.tar.gz + # as seen in the pub.dev web pages + package.repository_download_url = name and version and f'https://pub.dartlang.org/packages/{name}/versions/{version}.tar.gz' + package.download_url = package.download_url or package.repository_download_url + + # Author and authors are deprecated + authors = [] + author = pubspec_data.get('author') + if author: + authors.append(author) + authors.extend(pubspec_data.get('authors') or []) + + parties = [] + for auth in authors: + parties.append(models.Party( + type=models.party_person, + role='author', + name=auth + )) + + package.parties = parties + extra_data = {} def add_to_extra_if_present(_key): @@ -314,22 +334,6 @@ def add_to_extra_if_present(_key): add_to_extra_if_present('dependencies_overrides') add_to_extra_if_present('executables') add_to_extra_if_present('publish_to') + package.extra_data = extra_data - return models.PackageData( - datasource_id=DartPubspecYamlHandler.datasource_id, - type=DartPubspecYamlHandler.default_primary_language, - primary_language=DartPubspecYamlHandler.default_primary_language, - name=name, - version=version, - download_url=download_url, - vcs_url=vcs_url, - description=description, - extracted_license_statement=extracted_license_statement, - parties=parties, - homepage_url=homepage_url, - dependencies=package_dependencies, - extra_data=extra_data, - repository_homepage_url=repository_homepage_url, - api_data_url=api_data_url, - repository_download_url=repository_download_url, - ) + return package diff --git a/src/packagedcode/pypi.py b/src/packagedcode/pypi.py index cde769d4de1..b80334e4db3 100644 --- a/src/packagedcode/pypi.py +++ b/src/packagedcode/pypi.py @@ -79,11 +79,12 @@ class PythonEggPkgInfoFile(models.DatafileHandler): documentation_url = 'https://peps.python.org/pep-0376/' @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): yield parse_metadata( location=location, datasource_id=cls.datasource_id, package_type=cls.default_package_type, + purl_only=purl_only, ) @classmethod @@ -103,11 +104,12 @@ class 
PythonEditableInstallationPkgInfoFile(models.DatafileHandler): documentation_url = 'https://peps.python.org/pep-0376/' @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): yield parse_metadata( location=location, datasource_id=cls.datasource_id, package_type=cls.default_package_type, + purl_only=purl_only, ) @classmethod @@ -320,11 +322,12 @@ def is_datafile(cls, location): ) @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): yield parse_metadata( location=location, datasource_id=cls.datasource_id, package_type=cls.default_package_type, + purl_only=purl_only, ) @@ -337,11 +340,12 @@ class PythonInstalledWheelMetadataFile(models.DatafileHandler): documentation_url = 'https://packaging.python.org/en/latest/specifications/core-metadata/' @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): yield parse_metadata( location=location, datasource_id=cls.datasource_id, package_type=cls.default_package_type, + purl_only=purl_only, ) @classmethod @@ -457,7 +461,7 @@ class PyprojectTomlHandler(models.NonAssemblableDatafileHandler): META_DIR_SUFFIXES = '.dist-info', '.egg-info', 'EGG-INFO', -def parse_metadata(location, datasource_id, package_type): +def parse_metadata(location, datasource_id, package_type, purl_only=False): """ Return a PackageData object from a PKG-INFO or METADATA file at ``location`` which is a path string or pathlib.Path-like object (including a possible zip @@ -480,28 +484,34 @@ def parse_metadata(location, datasource_id, package_type): name = get_attribute(meta, 'Name') version = get_attribute(meta, 'Version') - - urls, extra_data = get_urls(metainfo=meta, name=name, version=version) - dependencies = get_dist_dependencies(dist) - file_references = list(get_file_references(dist)) - - return models.PackageData( - datasource_id=datasource_id, - type=package_type, - primary_language='Python', - name=name, - version=version, - extracted_license_statement=get_declared_license(meta), - description=get_description(metainfo=meta, location=str(location)), - keywords=get_keywords(meta), - parties=get_parties(meta), - dependencies=dependencies, - file_references=file_references, - extra_data=extra_data, - **urls, - ) + if purl_only: + return models.PackageData( + datasource_id=datasource_id, + type=package_type, + name=name, + version=version, + dependencies=dependencies, + ) + else: + urls, extra_data = get_urls(metainfo=meta, name=name, version=version) + file_references = list(get_file_references(dist)) + return models.PackageData( + datasource_id=datasource_id, + type=package_type, + primary_language='Python', + name=name, + version=version, + dependencies=dependencies, + extracted_license_statement=get_declared_license(meta), + description=get_description(metainfo=meta, location=str(location)), + keywords=get_keywords(meta), + parties=get_parties(meta), + file_references=file_references, + extra_data=extra_data, + **urls, + ) def urlsafe_b64decode(data): @@ -551,7 +561,7 @@ class PypiWheelHandler(models.DatafileHandler): documentation_url = 'https://peps.python.org/pep-0427/' @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): with zipfile.ZipFile(location) as zf: for path in ZipPath(zf).iterdir(): if not path.name.endswith(META_DIR_SUFFIXES): @@ -564,6 +574,7 @@ def parse(cls, location): location=metapath, datasource_id=cls.datasource_id, package_type=cls.default_package_type, + purl_only=purl_only, ) @@ -577,7 +588,7 @@ class 
PypiEggHandler(models.DatafileHandler): documentation_url = 'https://web.archive.org/web/20210604075235/http://peak.telecommunity.com/DevCenter/PythonEggs' @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): with zipfile.ZipFile(location) as zf: for path in ZipPath(zf).iterdir(): if not path.name.endswith(META_DIR_SUFFIXES): @@ -591,6 +602,7 @@ def parse(cls, location): location=metapath, datasource_id=cls.datasource_id, package_type=cls.default_package_type, + purl_only=purl_only, ) @@ -610,7 +622,7 @@ def is_datafile(cls, location, filetypes=tuple()): return True @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): # FIXME: add dependencies try: @@ -622,19 +634,27 @@ def parse(cls, location): version = sdist.version urls, extra_data = get_urls(metainfo=sdist, name=name, version=version) - yield models.PackageData( - datasource_id=cls.datasource_id, - type=cls.default_package_type, - primary_language=cls.default_primary_language, - name=name, - version=version, - description=get_description(sdist, location=location), - extracted_license_statement=get_declared_license(sdist), - keywords=get_keywords(sdist), - parties=get_parties(sdist), - extra_data=extra_data, - **urls, - ) + if purl_only: + yield models.PackageData( + datasource_id=cls.datasource_id, + type=cls.default_package_type, + name=name, + version=version, + ) + else: + yield models.PackageData( + datasource_id=cls.datasource_id, + type=cls.default_package_type, + primary_language=cls.default_primary_language, + name=name, + version=version, + description=get_description(sdist, location=location), + extracted_license_statement=get_declared_license(sdist), + keywords=get_keywords(sdist), + parties=get_parties(sdist), + extra_data=extra_data, + **urls, + ) class PythonSetupPyHandler(BaseExtractedPythonLayout): @@ -646,38 +666,46 @@ class PythonSetupPyHandler(BaseExtractedPythonLayout): documentation_url = 'https://docs.python.org/3/distutils/setupscript.html' @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): setup_args = get_setup_py_args(location) # it may be legit to have a name-less package? 
# in anycase we do not want to fail because of that name = setup_args.get('name') - version = setup_args.get('version') if not version: # search for possible dunder versions here and elsewhere version = detect_version_attribute(location) - urls, extra_data = get_urls(metainfo=setup_args, name=name, version=version) - dependencies = get_setup_py_dependencies(setup_args) - python_requires = get_setup_py_python_requires(setup_args) - extra_data.update(python_requires) - yield models.PackageData( - datasource_id=cls.datasource_id, - type=cls.default_package_type, - primary_language=cls.default_primary_language, - name=name, - version=version, - description=get_description(setup_args), - parties=get_setup_parties(setup_args), - extracted_license_statement=get_declared_license(setup_args), - dependencies=dependencies, - keywords=get_keywords(setup_args), - extra_data=extra_data, - **urls, - ) + if purl_only: + yield models.PackageData( + datasource_id=cls.datasource_id, + type=cls.default_package_type, + name=name, + version=version, + dependencies=dependencies, + ) + else: + urls, extra_data = get_urls(metainfo=setup_args, name=name, version=version) + python_requires = get_setup_py_python_requires(setup_args) + extra_data.update(python_requires) + + yield models.PackageData( + datasource_id=cls.datasource_id, + type=cls.default_package_type, + primary_language=cls.default_primary_language, + name=name, + version=version, + description=get_description(setup_args), + parties=get_setup_parties(setup_args), + extracted_license_statement=get_declared_license(setup_args), + dependencies=dependencies, + keywords=get_keywords(setup_args), + extra_data=extra_data, + **urls, + ) class ResolvedPurl(NamedTuple): @@ -694,7 +722,7 @@ class BaseDependencyFileHandler(models.DatafileHandler): """ @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): file_name = fileutils.file_name(location) dependency_type = get_dparse2_supported_file_name(file_name) @@ -705,12 +733,19 @@ def parse(cls, location): location=location, file_name=dependency_type, ) - yield models.PackageData( - datasource_id=cls.datasource_id, - type=cls.default_package_type, - primary_language=cls.default_primary_language, - dependencies=dependencies, - ) + if purl_only: + yield models.PackageData( + datasource_id=cls.datasource_id, + type=cls.default_package_type, + dependencies=dependencies, + ) + else: + yield models.PackageData( + datasource_id=cls.datasource_id, + type=cls.default_package_type, + primary_language=cls.default_primary_language, + dependencies=dependencies, + ) class SetupCfgHandler(BaseExtractedPythonLayout): @@ -722,7 +757,7 @@ class SetupCfgHandler(BaseExtractedPythonLayout): documentation_url = 'https://peps.python.org/pep-0390/' @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): metadata = {} parser = ConfigParser() @@ -781,35 +816,45 @@ def parse(cls, location): if not content: continue metadata[name] = content - - parties = [] - author = metadata.get('author') - if author: - parties = [ - models.Party( - type=models.party_person, - name=author, - role='author', - email=metadata.get('author_email'), - ) - ] - - extracted_license_statement = metadata.get('license') - license_file_references = metadata.get('license_files') - if license_file_references: - extracted_license_statement = f"{extracted_license_statement} {license_file_references}" - - yield models.PackageData( + + package = models.PackageData( datasource_id=cls.datasource_id, 
type=cls.default_package_type, name=metadata.get('name'), version=metadata.get('version'), - parties=parties, - homepage_url=metadata.get('url'), - primary_language=cls.default_primary_language, dependencies=dependent_packages, - extracted_license_statement=extracted_license_statement, ) + if purl_only: + yield package + else: + parties = [] + author = metadata.get('author') + if author: + parties = [ + models.Party( + type=models.party_person, + name=author, + role='author', + email=metadata.get('author_email'), + ) + ] + + extracted_license_statement = metadata.get('license') + license_file_references = metadata.get('license_files') + if license_file_references: + extracted_license_statement = f"{extracted_license_statement} {license_file_references}" + + yield models.PackageData( + datasource_id=cls.datasource_id, + type=cls.default_package_type, + name=metadata.get('name'), + version=metadata.get('version'), + parties=parties, + homepage_url=metadata.get('url'), + primary_language=cls.default_primary_language, + dependencies=dependent_packages, + extracted_license_statement=extracted_license_statement, + ) @classmethod def parse_reqs(cls, reqs, scope): @@ -871,7 +916,7 @@ class PipfileLockHandler(BaseDependencyFileHandler): documentation_url = 'https://github.com/pypa/pipfile' @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): with open(location) as f: content = f.read() @@ -888,13 +933,17 @@ def parse(cls, location): file_name='Pipfile.lock', ) - yield models.PackageData( + package = models.PackageData( datasource_id=cls.datasource_id, type=cls.default_package_type, - primary_language=cls.default_primary_language, - sha256=sha256, dependencies=dependent_packages, ) + if purl_only: + yield package + else: + package.primary_language = cls.default_primary_language + package.sha256 = sha256 + yield package class PipRequirementsFileHandler(BaseDependencyFileHandler): @@ -917,15 +966,19 @@ class PipRequirementsFileHandler(BaseDependencyFileHandler): documentation_url = 'https://pip.pypa.io/en/latest/reference/requirements-file-format/' @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): dependencies, extra_data = get_requirements_txt_dependencies(location=location) - yield models.PackageData( + package = models.PackageData( datasource_id=cls.datasource_id, type=cls.default_package_type, - primary_language=cls.default_primary_language, dependencies=dependencies, - extra_data=extra_data, ) + if purl_only: + yield package + else: + package.primary_language = cls.default_primary_language + package.extra_data = extra_data + yield package # TODO: enable nested load diff --git a/src/packagedcode/readme.py b/src/packagedcode/readme.py index 63741e86648..4cf4a9a8c69 100644 --- a/src/packagedcode/readme.py +++ b/src/packagedcode/readme.py @@ -55,11 +55,14 @@ class ReadmeHandler(models.NonAssemblableDatafileHandler): documentation_url = '' @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): with open(location, encoding='utf-8') as loc: readme_manifest = loc.read() - package_data = build_package(readme_manifest) + package_data = build_package( + readme_manifest=readme_manifest, + purl_only=purl_only + ) if not package_data.name: # If no name was detected for the Package, then we use the basename @@ -71,7 +74,7 @@ def parse(cls, location): yield package_data -def build_package(readme_manifest): +def build_package(readme_manifest, purl_only=False): """ Return a Package object from a readme_manifest 
mapping (from a README.chromium file or similar) or None. @@ -102,7 +105,11 @@ def build_package(readme_manifest): package_key = PACKAGE_FIELD_BY_README_FIELD.get(key) if not package_key: continue + if purl_only and package_key not in ["name", "version"]: + continue + setattr(package, package_key, value) - package.populate_license_fields() + if not purl_only: + package.populate_license_fields() return package diff --git a/src/packagedcode/recognize.py b/src/packagedcode/recognize.py index c7e794ecaf6..e0dc315cb30 100644 --- a/src/packagedcode/recognize.py +++ b/src/packagedcode/recognize.py @@ -44,6 +44,7 @@ def recognize_package_data( location, application=True, system=False, + purl_only=False, ): """ Return a list of Package objects if any package_data were recognized for @@ -63,12 +64,19 @@ def recognize_package_data( elif system: datafile_handlers = SYSTEM_PACKAGE_DATAFILE_HANDLERS - return list(_parse(location, datafile_handlers=datafile_handlers)) + return list( + _parse( + location, + datafile_handlers=datafile_handlers, + purl_only=purl_only, + ) + ) def _parse( location, datafile_handlers=APPLICATION_PACKAGE_DATAFILE_HANDLERS, + purl_only=False, ): """ Yield parsed PackageData objects from ``location``. Raises Exceptions on errors. @@ -85,7 +93,7 @@ def _parse( logger_debug(f'_parse:.is_datafile: {location}') try: - for parsed in handler.parse(location): + for parsed in handler.parse(location, purl_only=purl_only): if TRACE: logger_debug(f' _parse: parsed: {parsed!r}') yield parsed diff --git a/src/packagedcode/rpm.py b/src/packagedcode/rpm.py index 01ae1ab850e..cc4e8bccb27 100644 --- a/src/packagedcode/rpm.py +++ b/src/packagedcode/rpm.py @@ -121,7 +121,7 @@ def to_string(self): class BaseRpmInstalledDatabaseHandler(models.DatafileHandler): @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): # we receive the location of the Package database file and we need to # scan the parent which is the directory that contains the rpmdb loc_path = Path(location) @@ -133,6 +133,7 @@ def parse(cls, location): location=xmlish_loc, datasource_id=cls.datasource_id, package_type=cls.default_package_type, + purl_only=purl_only, ) # TODO: package_data.namespace = cls.default_package_namespace return package_data @@ -272,7 +273,7 @@ class RpmArchiveHandler(models.DatafileHandler): documentation_url = 'https://en.wikipedia.org/wiki/RPM_Package_Manager' @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): rpm_tags = get_rpm_tags(location, include_desc=True) if TRACE: logger_debug('recognize: rpm_tags', rpm_tags) @@ -319,6 +320,18 @@ def parse(cls, location): if TRACE: logger_debug('recognize: source_rpm', src_purl) source_packages = [src_purl] + package = models.PackageData( + datasource_id=cls.datasource_id, + type=cls.default_package_type, + # TODO: namespace=cls.default_package_namespace, + name=name, + version=evr, + source_packages=source_packages, + ) + if purl_only: + yield package + return + parties = [] # TODO: also use me to craft a namespace!!! 
@@ -335,13 +348,13 @@ def parse(cls, location): if rpm_tags.vendor: parties.append(models.Party(name=rpm_tags.vendor, role='vendor')) - description = build_description(summary=rpm_tags.summary, description=rpm_tags.description) + package.description = build_description(summary=rpm_tags.summary, description=rpm_tags.description) if TRACE: data = dict( name=name, version=evr, - description=description or None, + description=package.description or None, homepage_url=rpm_tags.url or None, parties=parties, extracted_license_statement=rpm_tags.license or None, @@ -349,18 +362,9 @@ def parse(cls, location): ) logger_debug('recognize: data to create a package:\n', data) - package = models.PackageData( - datasource_id=cls.datasource_id, - type=cls.default_package_type, - # TODO: namespace=cls.default_package_namespace, - name=name, - version=evr, - description=description or None, - homepage_url=rpm_tags.url or None, - parties=parties, - extracted_license_statement=rpm_tags.license or None, - source_packages=source_packages, - ) + package.homepage_url = rpm_tags.url or None + package.parties = parties + package.extracted_license_statement = rpm_tags.license or None if TRACE: logger_debug('recognize: created package:\n', package) diff --git a/src/packagedcode/rpm_installed.py b/src/packagedcode/rpm_installed.py index 5d01436d2e1..aa66d32c924 100644 --- a/src/packagedcode/rpm_installed.py +++ b/src/packagedcode/rpm_installed.py @@ -36,7 +36,7 @@ def logger_debug(*args): return logger.debug(' '.join(isinstance(a, str) and a or repr(a) for a in args)) -def parse_rpm_xmlish(location, datasource_id, package_type): +def parse_rpm_xmlish(location, datasource_id, package_type, purl_only=False): """ Yield PackageData built from an RPM XML'ish file at ``location``. This is a file created with the rpm CLI with the xml query option. @@ -58,6 +58,7 @@ def parse_rpm_xmlish(location, datasource_id, package_type): rpm_tags=tags, datasource_id=datasource_id, package_type=package_type, + purl_only=purl_only, ) @@ -133,7 +134,13 @@ def collect_tags(raw_tags): yield name, value_type, value -def build_package(rpm_tags, datasource_id, package_type, package_namespace=None): +def build_package( + rpm_tags, + datasource_id, + package_type, + package_namespace=None, + purl_only=False, +): """ Return a PackageData object from an ``rpm_tags`` iterable of (name, value_type, value) tuples. 
@@ -147,7 +154,10 @@ def build_package(rpm_tags, datasource_id, package_type, package_namespace=None) } for name, _value_type, value in rpm_tags: - handler = RPM_TAG_HANDLER_BY_NAME.get(name) + handler = RPM_TAG_HANDLER_BY_NAME_PURLS.get(name) + if not handler and not purl_only: + handler = RPM_TAG_HANDLER_BY_NAME_OTHERS.get(name) + # FIXME: we need to handle EVRA correctly # TODO: add more fields # TODO: merge with tag handling in rpm.py @@ -283,7 +293,7 @@ def dirname_handler(value, **kwargs): # PackageData field name -RPM_TAG_HANDLER_BY_NAME = { +RPM_TAG_HANDLER_BY_NAME_PURLS = { ############################################################################ # per-package fields @@ -294,11 +304,15 @@ def dirname_handler(value, **kwargs): # 'Epoch' # 'Release' 11.3.2 'Version': name_value_str_handler('version'), + 'Arch': arch_handler, +} + + +RPM_TAG_HANDLER_BY_NAME_OTHERS = { 'Description': name_value_str_handler('description'), 'Sha1header': name_value_str_handler('sha1'), 'Url': name_value_str_handler('homepage_url'), 'License': name_value_str_handler('extracted_license_statement'), - 'Arch': arch_handler, 'Size': size_handler, # TODO: diff --git a/src/packagedcode/rubygems.py b/src/packagedcode/rubygems.py index 6b77d6945d6..1a5834484a5 100644 --- a/src/packagedcode/rubygems.py +++ b/src/packagedcode/rubygems.py @@ -41,12 +41,13 @@ class GemArchiveHandler(models.DatafileHandler): ) @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): metadata = extract_gem_metadata(location) metadata = saneyaml.load(metadata) yield build_rubygem_package_data( gem_data=metadata, datasource_id=cls.datasource_id, + purl_only=purl_only, ) @@ -84,13 +85,14 @@ class GemMetadataArchiveExtractedHandler(models.DatafileHandler): ) @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): with open(location, 'rb') as met: metadata = met.read() metadata = saneyaml.load(metadata) yield build_rubygem_package_data( gem_data=metadata, datasource_id=cls.datasource_id, + purl_only=purl_only, ) @classmethod @@ -129,7 +131,7 @@ class GemspecHandler(models.DatafileHandler): documentation_url = 'https://guides.rubygems.org/specification-reference/' @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): gemspec = spec.parse_spec( location=location, package_type=cls.default_package_type, @@ -137,6 +139,19 @@ def parse(cls, location): name = gemspec.get('name') version = gemspec.get('version') + dependencies = gemspec.get('dependencies') or [] + + package = models.PackageData( + datasource_id=cls.datasource_id, + type=cls.default_package_type, + name=name, + version=version, + dependencies=dependencies, + ) + if purl_only: + yield package + return + homepage_url = gemspec.get('homepage') description = build_description( @@ -148,24 +163,22 @@ def parse(cls, location): extracted_license_statement = gemspec.get('license') parties = get_parties(gemspec) - dependencies = gemspec.get('dependencies') or [] - urls = get_urls(name=name, version=version) + ( + package.repository_homepage_url, + package.repository_download_url, + package.api_data_url, + package.download_url, + ) = get_urls(name=name, version=version) - yield models.PackageData( - datasource_id=cls.datasource_id, - type=cls.default_package_type, - name=name, - version=version, - parties=parties, - homepage_url=homepage_url, - vcs_url=vcs_url, - description=description, - extracted_license_statement=extracted_license_statement, - primary_language=cls.default_primary_language, - 
dependencies=dependencies, - **urls - ) + package.parties = parties + package.homepage_url = homepage_url + package.vcs_url = vcs_url + package.description = description + package.extracted_license_statement = extracted_license_statement + package.primary_language = cls.default_primary_language + + yield package class GemspecInExtractedGemHandler(GemspecHandler): datasource_id = 'gemspec_extracted' @@ -234,7 +247,7 @@ class GemfileLockHandler(BaseGemProjectHandler): documentation_url = 'https://bundler.io/man/gemfile.5.html' @classmethod - def parse(cls, location): + def parse(cls, location, purl_only=False): gemfile_lock = GemfileLockParser(location) all_gems = list(gemfile_lock.all_gems.values()) if not all_gems: @@ -257,16 +270,25 @@ def parse(cls, location): ) for dep in all_gems if dep != primary_gem ] urls = get_urls(primary_gem.name, primary_gem.version) - - yield models.PackageData( + + package = models.PackageData( datasource_id=cls.datasource_id, - primary_language=cls.default_primary_language, type=cls.default_package_type, name=primary_gem.name, version=primary_gem.version, dependencies=deps, - **urls ) + if purl_only: + yield package + else: + ( + package.repository_homepage_url, + package.repository_download_url, + package.api_data_url, + package.download_url, + ) = urls + package.primary_language = cls.default_primary_language + yield package else: deps = [ models.DependentPackage( @@ -417,7 +439,7 @@ def extract_gem_metadata(location): fileutils.delete(extract_loc) -def build_rubygem_package_data(gem_data, datasource_id): +def build_rubygem_package_data(gem_data, datasource_id, purl_only=False): """ Return a PackageData for ``datasource_id`` built from a Gem `gem_data` mapping or None. The ``gem_data`` can come from a .gemspec or .gem/metadata. 
@@ -439,6 +461,17 @@ def build_rubygem_package_data(gem_data, datasource_id): else: qualifiers = {} + dependencies = get_dependencies(gem_data.get('dependencies')) + + package_data = models.PackageData( + datasource_id=datasource_id, + type=GemArchiveHandler.default_package_type, + name=name, + version=version, + qualifiers=qualifiers, + dependencies=dependencies, + ) + description = build_description( summary=gem_data.get('summary'), description=gem_data.get('description'), @@ -457,26 +490,21 @@ def build_rubygem_package_data(gem_data, datasource_id): if not homepage_url: homepage_url = gem_data.get('homepage') - urls = get_urls(name, version, platform) - dependencies = get_dependencies(gem_data.get('dependencies')) + ( + package_data.repository_homepage_url, + package_data.repository_download_url, + package_data.api_data_url, + package_data.download_url, + ) = get_urls(name, version, platform) file_references = get_file_references(metadata.get('files')) - package_data = models.PackageData( - datasource_id=datasource_id, - type=GemArchiveHandler.default_package_type, - primary_language=GemArchiveHandler.default_primary_language, - name=name, - version=version, - qualifiers=qualifiers, - description=description, - homepage_url=homepage_url, - extracted_license_statement=extracted_license_statement, - bug_tracking_url=metadata.get('bug_tracking_uri'), - code_view_url=metadata.get('source_code_uri'), - file_references=file_references, - dependencies=dependencies, - **urls, - ) + package_data.primary_language = GemArchiveHandler.default_primary_language + package_data.description = description + package_data.homepage_url = homepage_url + package_data.extracted_license_statement = extracted_license_statement + package_data.bug_tracking_url = metadata.get('bug_tracking_uri') + package_data.code_view_url = metadata.get('source_code_uri') + package_data.file_references = file_references # we can have one singular or a plural list of authors authors = gem_data.get('authors') or [] diff --git a/src/packagedcode/spec.py b/src/packagedcode/spec.py index c0bf7083c51..49762a3776d 100644 --- a/src/packagedcode/spec.py +++ b/src/packagedcode/spec.py @@ -120,9 +120,12 @@ def get_authors(line): # mapping of parser callable by its field name -PARSER_BY_NAME = { +PARSER_BY_NAME_PURL_ONLY = { 'name': partial(get_value, name='name', matcher=parse_name), 'version': partial(get_value, name='version', matcher=parse_version), +} + +PARSER_BY_NAME_OTHERS = { 'license': partial(get_value, name='license', matcher=parse_license), 'summary': partial(get_value, name='summary', matcher=parse_summary), 'description': partial(get_value, name='description', matcher=parse_description, clean=False), @@ -133,7 +136,7 @@ def get_authors(line): } -def parse_spec(location, package_type): +def parse_spec(location, package_type, purl_only=False): """ Return a mapping of data parsed from a podspec/gemspec/Pofile/Gemfile file at ``location``. Use ``package_type`` a Package URL type for dependencies. 
@@ -147,10 +150,27 @@ def parse_spec(location, package_type):

     for line in lines:
         line = pre_process(line)
-        for attribute_name, parser in PARSER_BY_NAME.items():
+        for attribute_name, parser in PARSER_BY_NAME_PURL_ONLY.items():
             parsed = parser(line=line)
             if parsed:
                 spec_data[attribute_name] = parsed
+
+        if not purl_only:
+            for attribute_name, parser in PARSER_BY_NAME_OTHERS.items():
+                parsed = parser(line=line)
+                if parsed:
+                    spec_data[attribute_name] = parsed
+
+    # We avoid reloading twice the file but we are still parsing twice: need to
+    # merge all in gemfileparser or write a better parser.
+    spec_data['dependencies'] = list(get_dependent_packages(
+        lines=lines,
+        location=location,
+        package_type=package_type,
+    ))
+
+    if purl_only:
+        return spec_data

     # description can be in single or multi-lines
     # There are many different ways to write description.
@@ -167,14 +187,6 @@ def parse_spec(location, package_type):
         # a single quoted description
         spec_data['description'] = get_cleaned_string(description)

-    # We avoid reloading twice the file but we are still parsing twice: need to
-    # merge all in gemfileparser or write a better parser.
-    spec_data['dependencies'] = list(get_dependent_packages(
-        lines=lines,
-        location=location,
-        package_type=package_type,
-    ))
-
     return spec_data


diff --git a/src/packagedcode/win_pe.py b/src/packagedcode/win_pe.py
index a51ceb9496e..ce077ce12ca 100644
--- a/src/packagedcode/win_pe.py
+++ b/src/packagedcode/win_pe.py
@@ -276,7 +276,7 @@ def is_datafile(cls, location, filetypes=tuple()):
         return True

     @classmethod
-    def parse(cls, location):
+    def parse(cls, location, purl_only=False):
         infos = pe_info(location)

         version = get_first(
@@ -298,6 +298,16 @@ def parse(cls, location):
             'OriginalFilename',
             'InternalName',
         )
+        package = models.PackageData(
+            datasource_id=cls.datasource_id,
+            type=cls.default_package_type,
+            name=name,
+            version=version,
+        )
+        if purl_only:
+            yield package
+            return
+
         copyr = get_first(infos, 'LegalCopyright')
         LegalCopyright = copyr,

@@ -328,15 +338,10 @@ def parse(cls, location):
             parties = [Party(type=party_org, role='author', name=cname)]

         homepage_url = get_first(infos, 'URL', 'WWW')

-        yield models.PackageData(
-            datasource_id=cls.datasource_id,
-            type=cls.default_package_type,
-            name=name,
-            version=version,
-            release_date=release_date,
-            copyright=copyr,
-            extracted_license_statement=extracted_license_statement,
-            description=description,
-            parties=parties,
-            homepage_url=homepage_url,
-        )
+        package.release_date = release_date
+        package.copyright = copyr
+        package.extracted_license_statement = extracted_license_statement
+        package.description = description
+        package.parties = parties
+        package.homepage_url = homepage_url
+        yield package
diff --git a/src/packagedcode/win_reg.py b/src/packagedcode/win_reg.py
index e4e59cd3b1e..7a6902ce961 100644
--- a/src/packagedcode/win_reg.py
+++ b/src/packagedcode/win_reg.py
@@ -77,6 +77,7 @@ def get_installed_dotnet_versions_from_hive(
     datasource_id,
     package_type,
     registry_path='\\Microsoft\\NET Framework Setup\\NDP',
+    purl_only=False,
 ):
     """
     Yield PackageData for the installed versions of .NET framework from the
@@ -90,6 +91,7 @@ def get_installed_dotnet_versions_from_hive(
         registry_tree=registry_tree,
         datasource_id=datasource_id,
         package_type=package_type,
+        purl_only=purl_only,
     )


@@ -97,6 +99,7 @@ def get_installed_dotnet_versions_from_regtree(
     registry_tree,
     datasource_id,
     package_type,
+    purl_only=False,
 ):
     """
     Yield PackageData for the installed versions of .NET framework from a
@@ -111,6 +114,11 @@ def get_installed_dotnet_versions_from_regtree(
         if not entry.get('path', '').endswith('\\Full'):
             continue

+        package = models.PackageData(
+            datasource_id=datasource_id,
+            type=package_type,
+            name='microsoft-dot-net-framework',
+        )
         file_references = []
         version = None
         for values in entry.get('values', []):
@@ -122,13 +130,13 @@ def get_installed_dotnet_versions_from_regtree(
             if key == 'InstallPath':
                 file_references.append(models.FileReference(path=value))

-        yield models.PackageData(
-            datasource_id=datasource_id,
-            type=package_type,
-            name='microsoft-dot-net-framework',
-            version=version,
-            file_references=file_references,
-        )
+        package.version = version
+        if purl_only:
+            yield package
+            continue
+
+        package.file_references = file_references
+        yield package


 def get_installed_windows_programs_from_hive(
@@ -136,6 +144,7 @@ def get_installed_windows_programs_from_hive(
     datasource_id,
     package_type,
     registry_path='\\Microsoft\\Windows\\CurrentVersion\\Uninstall',
+    purl_only=False,
 ):
     """
     Yield installed Windows PackageData from a Windows registry file at
@@ -151,6 +160,7 @@ def get_installed_windows_programs_from_hive(
         registry_tree=registry_tree,
         datasource_id=datasource_id,
         package_type=package_type,
+        purl_only=purl_only,
     )


@@ -158,6 +168,7 @@ def get_installed_windows_programs_from_regtree(
     registry_tree,
     datasource_id,
     package_type,
+    purl_only=False,
 ):
     """
     Yield installed Windows PackageData from a Windows ``registry_tree``.
@@ -187,6 +198,16 @@ def get_installed_windows_programs_from_regtree(
         name = package_info.get('name')
         version = package_info.get('version')

+        package = models.PackageData(
+            datasource_id=datasource_id,
+            type=package_type,
+            name=name,
+            version=version,
+        )
+        if purl_only:
+            yield package
+            continue
+
         homepage_url = package_info.get('homepage_url')
         publisher = package_info.get('publisher')

@@ -213,21 +234,17 @@ def get_installed_windows_programs_from_regtree(
         if uninstall_string:
             file_references.append(models.FileReference(path=uninstall_string))

-        yield models.PackageData(
-            datasource_id=datasource_id,
-            type=package_type,
-            name=name,
-            version=version,
-            parties=parties,
-            homepage_url=homepage_url,
-            file_references=file_references,
-        )
+        package.parties = parties
+        package.homepage_url = homepage_url
+        package.file_references = file_references
+        yield package


 def get_packages_from_registry_from_hive(
     location,
     datasource_id,
     package_type,
+    purl_only=False,
 ):
     """
     Yield PackageData for Installed Windows Programs from the Windows registry
@@ -238,6 +255,7 @@ def get_packages_from_registry_from_hive(
         datasource_id=datasource_id,
         package_type=package_type,
         registry_path='\\Microsoft\\Windows\\CurrentVersion\\Uninstall',
+        purl_only=purl_only,
     )

     yield from get_installed_windows_programs_from_hive(
@@ -245,6 +263,7 @@ def get_packages_from_registry_from_hive(
         datasource_id=datasource_id,
         package_type=package_type,
         registry_path='\\Wow6432Node\\Microsoft\\Windows\\CurrentVersion\\Uninstall',
+        purl_only=purl_only,
     )

     yield from get_installed_dotnet_versions_from_hive(
@@ -252,6 +271,7 @@ def get_packages_from_registry_from_hive(
         datasource_id=datasource_id,
         package_type=package_type,
         registry_path='\\Microsoft\\NET Framework Setup\\NDP',
+        purl_only=purl_only,
     )


@@ -342,11 +362,12 @@ class BaseRegInstalledProgramHandler(models.DatafileHandler):
     root_path_relative_to_datafile_path = None

     @classmethod
-    def parse(cls, location):
+    def parse(cls, location, purl_only=False):
         yield from get_packages_from_registry_from_hive(
             location=location,
             datasource_id=cls.datasource_id,
             package_type=cls.default_package_type,
+            purl_only=purl_only,
         )

     @classmethod
diff --git a/src/packagedcode/windows.py b/src/packagedcode/windows.py
index 09d62f4c2d9..bc03f04b62d 100644
--- a/src/packagedcode/windows.py
+++ b/src/packagedcode/windows.py
@@ -20,22 +20,30 @@ class MicrosoftUpdateManifestHandler(models.NonAssemblableDatafileHandler):
     description = 'Microsoft Update Manifest .mum file'

     @classmethod
-    def parse(cls, location):
+    def parse(cls, location, purl_only=False):
         with open(location , 'rb') as loc:
             parsed = xmltodict.parse(loc)

         if not parsed:
             return

         assembly = parsed.get('assembly', {})
-        description = assembly.get('@description', '')
-        company = assembly.get('@company', '')
-        copyrght = assembly.get('@copyright', '')
-        support_url = assembly.get('@supportInformation', '')
-
         assembly_identity = assembly.get('assemblyIdentity', {})
-        name = assembly_identity.get('@name', '')
-        version = assembly_identity.get('@version', '')
+
+        package = models.PackageData(
+            datasource_id=cls.datasource_id,
+            type=cls.default_package_type,
+            name=assembly_identity.get('@name', ''),
+            version=assembly_identity.get('@version', ''),
+        )
+        if purl_only:
+            yield package
+            return
+
+        package.description = assembly.get('@description', '')
+        company = assembly.get('@company', '')
+        package.copyright = assembly.get('@copyright', '')
+        package.homepage_url = assembly.get('@supportInformation', '')

         parties = []
         if company:
@@ -46,14 +54,5 @@ def parse(cls, location):
                     role='owner',
                 )
             )
-
-        yield models.PackageData(
-            datasource_id=cls.datasource_id,
-            type=cls.default_package_type,
-            name=name,
-            version=version,
-            description=description,
-            homepage_url=support_url,
-            parties=parties,
-            copyright=copyrght,
-        )
+        package.parties = parties
+        yield package
diff --git a/src/scancode/api.py b/src/scancode/api.py
index 7d3edbf1516..7c866aeed0c 100644
--- a/src/scancode/api.py
+++ b/src/scancode/api.py
@@ -248,7 +254,13 @@ def get_licenses(
 SCANCODE_DEBUG_PACKAGE_API = os.environ.get('SCANCODE_DEBUG_PACKAGE_API', False)

-def _get_package_data(location, application=True, system=False, **kwargs):
+def _get_package_data(
+    location,
+    application=True,
+    system=False,
+    purl_only=False,
+    **kwargs
+    ):
     """
     Return a mapping of package manifest information detected in the file at
     ``location``. Include ``application`` packages (such as pypi) and/or ``system`` packages.
@@ -261,7 +267,8 @@ def _get_package_data(location, application=True, system=False, **kwargs):
         return recognize_package_data(
             location=location,
             application=application,
-            system=system
+            system=system,
+            purl_only=purl_only,
         ) or []

     except Exception as e:
@@ -291,7 +298,13 @@ def get_package_info(location, **kwargs):
     return dict(packages=[p.to_dict() for p in packages])

-def get_package_data(location, application=True, system=False, **kwargs):
+def get_package_data(
+    location,
+    application=True,
+    system=False,
+    purl_only=False,
+    **kwargs
+    ):
     """
     Return a mapping of package manifest information detected in the file at
     `location`.
@@ -304,6 +317,7 @@ def get_package_data(location, application=True, system=False, **kwargs):
         location=location,
         application=application,
         system=system,
+        purl_only=purl_only,
         **kwargs,
     ) or []
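Usage sketch (illustrative only, not part of the patch): assuming the `--purl` CLI flag and the `purl_only` keyword added above, and assuming `get_package_data` keeps returning a mapping with a `package_data` list as in the current API, a purl-only parse of a single manifest could look roughly like this; the input path and output file name are placeholders.

    # hypothetical CLI invocation of the new flag added in plugin_package.py:
    #   scancode --purl --json-pp purls.json samples/

    from scancode.api import get_package_data

    # parse only the Package-URL fields (type, namespace, name, version,
    # qualifiers) and the declared dependencies from one manifest file
    result = get_package_data('package.json', purl_only=True)
    for package_data in result.get('package_data', []):
        print(package_data.get('purl'))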