From 4eebf469e60e2900f91e0e4f2553983fbd56a6b4 Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Thu, 22 Feb 2024 21:32:50 +0530 Subject: [PATCH] Implement initial debian purl2meta implementation Reference: https://github.com/nexB/purldb/issues/245 Signed-off-by: Ayan Sinha Mahapatra --- minecode/utils.py | 38 ++++ minecode/visitors/debian.py | 341 +++++++++++++++++++++++++++++++++++- minecode/visitors/maven.py | 41 +---- packagedb/api.py | 13 +- packagedb/models.py | 12 +- 5 files changed, 398 insertions(+), 47 deletions(-) diff --git a/minecode/utils.py b/minecode/utils.py index 1bcd023b..40fe0146 100644 --- a/minecode/utils.py +++ b/minecode/utils.py @@ -238,6 +238,44 @@ def get_http_response(uri, timeout=10): return response +def get_package_sha1(package): + """ + Return the sha1 value for `package` by checking if the sha1 file exists for + `package` on maven and returning the contents if it does. + + If the sha1 is invalid, we download the package's JAR and calculate the sha1 + from that. + """ + download_url = package.repository_download_url + sha1_download_url = f'{download_url}.sha1' + response = requests.get(sha1_download_url) + if response.ok: + sha1_contents = response.text.strip().split() + sha1 = sha1_contents[0] + sha1 = validate_sha1(sha1) + if not sha1: + # Download JAR and calculate sha1 if we cannot get it from the repo + response = requests.get(download_url) + if response: + sha1_hash = hashlib.new('sha1', response.content) + sha1 = sha1_hash.hexdigest() + return sha1 + + +def validate_sha1(sha1): + """ + Validate a `sha1` string. + + Return `sha1` if it is valid, None otherwise. + """ + if sha1 and len(sha1) != 40: + logger.warning( + f'Invalid SHA1 length ({len(sha1)}): "{sha1}": SHA1 ignored!' + ) + sha1 = None + return sha1 + + def system_temp_dir(temp_dir=os.getenv('MINECODE_TMP')): """ Return the global temp directory.. 
# Reconstructed from the minecode/visitors/debian.py hunks of a
# whitespace-collapsed git patch ("Implement initial debian purl2meta
# implementation", index 30870f67..f7787f06). Lines marked "[...]" stand for
# unchanged parts of debian.py that the patch does not show.

import attr
import gzip
import json
import logging
import requests

from commoncode import fileutils
import debian_inspector
from debian_inspector import debcon
from debian_inspector import copyright as debcopy
from debian_inspector.version import Version as DebVersion
from packagedcode.models import PackageData
from packagedcode.debian import DebianDscFileHandler
from packageurl import PackageURL

from minecode import ls
from minecode import seed
from minecode import visit_router
from minecode import priority_router
from minecode.visitors import HttpVisitor
from minecode.visitors import NonPersistentHttpVisitor
from minecode.visitors import URI
from minecode.utils import get_temp_dir
from minecode.utils import get_temp_file
from minecode.utils import get_package_sha1
from packagedb.models import make_relationship
from packagedb.models import PackageContentType
from packagedb.models import PackageRelation

logger = logging.getLogger(__name__)
handler = logging.StreamHandler()

# [...]

DEBIAN_BASE_URL = "http://deb.debian.org/debian/pool/main/"
DEBIAN_METADATA_URL = "https://metadata.ftp-master.debian.org/changelogs/main/"
# Other URLs and sources to consider
# 'http://ftp.debian.org/debian/'
# rsync://archive.debian.org/debian-archive
# http://sources.debian.net/doc/api/
# Packages.gz files: https://get.videolan.org/debian/i386/Packages.gz

# [...]


class PackageDirectoryMissingException(Exception):
    """Raised when no pool directory can be found for a Debian package."""
    pass


def _as_package_url(purl):
    """
    Return `purl` as a PackageURL object.

    `purl` may already be a PackageURL or may be a purl string. Raise
    ValueError (from PackageURL.from_string) if the string is not a valid purl.
    """
    if isinstance(purl, PackageURL):
        return purl
    return PackageURL.from_string(purl)


def _get_pool_index_folder(package_name):
    """
    Return the name of the pool index folder that holds `package_name`.

    Names starting with "lib" are indexed by their first four characters
    (e.g. "libf" for "libfoo"); all other names by their first character.
    """
    if package_name.startswith('lib'):
        # Slice instead of str.replace(): replace() would drop every "lib"
        # occurrence in the name and fail on a name that is exactly "lib".
        return package_name[:4]
    return package_name[:1]


@priority_router.route('pkg:deb/.*')
def process_request(purl_str, **kwargs):
    """
    Process `purl_str`, a Debian Package URL (PURL) string.

    This involves obtaining Package information for the PURL from debian and
    using it to create a new PackageDB entry. The binary package is then added
    to the scan queue afterwards. We also get the Package information for the
    accompanying source package and add it to the PackageDB and scan queue, if
    available.

    An optional `source_purl` keyword argument (purl string or PackageURL)
    identifies the source package when its name or version differs from the
    binary package.

    Return an error string for errors that occur, or empty string if there is
    no error.
    """
    source_purl = kwargs.get("source_purl", None)
    try:
        package_url = PackageURL.from_string(purl_str)
    except ValueError as e:
        error = f'error occured when parsing {purl_str}: {e}'
        return error

    # The API layer passes `source_purl` as a string, but DebianPackage reads
    # `.name` and `.version` from it, so it has to be parsed here.
    source_package_url = None
    if source_purl:
        try:
            source_package_url = _as_package_url(source_purl)
        except ValueError as e:
            return f'error occured when parsing {source_purl}: {e}'

    if not package_url.version:
        # Archive and metadata file names are built from the version, so a
        # versionless purl cannot be resolved. Returning a message here also
        # avoids returning an unbound `error`.
        return f'Debian Package URL without a version is not supported: {purl_str}'

    return map_debian_metadata_binary_and_source(
        package_url=package_url,
        source_package_url=source_package_url,
    )


def map_debian_package(debian_package, package_content):
    """
    Add the binary or source archive of `debian_package` to the PackageDB and
    to the scan queue.

    `package_content` is PackageContentType.BINARY or
    PackageContentType.SOURCE_ARCHIVE and selects which archive is mapped.

    Return a (db_package, error) tuple where `db_package` is the created or
    merged Package (or None) and `error` is an error string, empty if no
    error occurred.
    """
    # Function-level imports, as in the original patch.
    from minecode.model_utils import add_package_to_scan_queue
    from minecode.model_utils import merge_or_create_package
    import hashlib

    db_package = None
    error = ''

    if package_content == PackageContentType.BINARY:
        purl = debian_package.package_url
        download_url = debian_package.binary_archive_url
    elif package_content == PackageContentType.SOURCE_ARCHIVE:
        # Without an explicit source purl, the source archive URL is derived
        # from the binary purl, so fall back to that purl here as well.
        purl = debian_package.source_package_url or debian_package.package_url
        download_url = debian_package.source_archive_url
    else:
        msg = f'Unsupported package content type for debian archive: {package_content}'
        error += msg + '\n'
        logger.error(msg)
        return db_package, error

    response = requests.get(download_url)
    if not response.ok:
        msg = f'Package archive does not exist on debian: {download_url}'
        error += msg + '\n'
        logger.error(msg)
        return db_package, error

    package = PackageData(
        type=purl.type,
        namespace=purl.namespace,
        name=purl.name,
        version=purl.version,
        qualifiers=purl.qualifiers,
        download_url=download_url,
    )

    # Set package_content value
    package.extra_data['package_content'] = package_content

    # The archive was just downloaded, so hash that content directly.
    # get_package_sha1() looks for a Maven-style `<url>.sha1` file and reads
    # `repository_download_url`, which is never set on this PackageData.
    package.sha1 = hashlib.sha1(response.content).hexdigest()
    db_package, _, _, _ = merge_or_create_package(package, visit_level=50)

    # Submit package for scanning
    if db_package:
        add_package_to_scan_queue(db_package)

    return db_package, error


def map_debian_package_metadata(debian_package, package_content):
    """
    Fetch the .dsc metadata file of `debian_package`, parse it and add the
    resulting package to the PackageDB and to the scan queue.

    `package_content` is stored in the package `extra_data`; the caller in
    this module passes PackageContentType.METADATA.

    Return a (db_package, error) tuple where `db_package` is the created or
    merged Package (or None) and `error` is an error string, empty if no
    error occurred.
    """
    # Function-level imports, as in the original patch.
    from minecode.model_utils import add_package_to_scan_queue
    from minecode.model_utils import merge_or_create_package
    import hashlib

    db_package = None
    error = ''

    metadata_url = debian_package.package_metadata_url
    response = requests.get(metadata_url)
    if not response.ok:
        msg = f'Package metadata not exist on debian: {metadata_url}'
        error += msg + '\n'
        logger.error(msg)
        return db_package, error

    filename = metadata_url.split("/")[-1]
    file_name, _, extension = filename.rpartition(".")
    temp_metadata_file = get_temp_file(file_name=file_name, extension=extension)

    # The downloaded .dsc must be saved before it is parsed; otherwise the
    # parser is pointed at a file that does not hold the metadata.
    with open(temp_metadata_file, 'wb') as metadata_file:
        metadata_file.write(response.content)

    # NOTE(review): packagedcode handlers' parse() yields PackageData objects
    # rather than returning one -- confirm against the installed version.
    parsed_packages = DebianDscFileHandler.parse(location=temp_metadata_file)
    package = next(iter(parsed_packages), None)
    if not package:
        msg = f'Failed to parse package metadata from: {metadata_url}'
        error += msg + '\n'
        logger.error(msg)
        return db_package, error

    # The package parsed from the .dsc does not carry the qualifiers of the
    # requested purl (e.g. architecture), so set them from the purl.
    package.qualifiers = debian_package.package_url.qualifiers

    # Set package_content value
    package.extra_data['package_content'] = package_content

    # NOTE(review): assumes the .dsc URL is the right download_url for the
    # metadata package when the parser sets none -- confirm this matches how
    # merge_or_create_package identifies packages.
    if not package.download_url:
        package.download_url = metadata_url

    # Hash the .dsc content that was just downloaded instead of looking for a
    # Maven-style `.sha1` file through get_package_sha1().
    package.sha1 = hashlib.sha1(response.content).hexdigest()
    db_package, _, _, _ = merge_or_create_package(package, visit_level=50)

    # Submit package for scanning
    if db_package:
        add_package_to_scan_queue(db_package)

    return db_package, error


def map_debian_metadata_binary_and_source(package_url, source_package_url):
    """
    Get metadata for the binary and source release of the Debian package
    `package_url` and save it to the PackageDB.

    `package_url` is a PackageURL object. `source_package_url` is a PackageURL
    object for the source package, or None. The "repository_url" and
    "api_data_url" purl qualifiers override the default archive and metadata
    base URLs.

    Return an error string for errors that occur, or empty string if there is
    no error.
    """
    qualifiers = package_url.qualifiers or {}
    base_url = qualifiers.get("repository_url", DEBIAN_BASE_URL)
    metadata_base_url = qualifiers.get("api_data_url", DEBIAN_METADATA_URL)

    try:
        # Creating a DebianPackage probes the archive over HTTP and raises if
        # no pool directory exists for the package.
        debian_package = DebianPackage(
            package_url=package_url,
            source_package_url=source_package_url,
            archive_base_url=base_url,
            metadata_base_url=metadata_base_url,
        )
    except PackageDirectoryMissingException as e:
        # Report the failure as an error string, as documented, instead of
        # letting the exception propagate to the router.
        msg = str(e)
        logger.error(msg)
        return msg + '\n'

    error = ''
    metadata_package, emsg = map_debian_package_metadata(
        debian_package,
        PackageContentType.METADATA,
    )
    if emsg:
        error += emsg

    binary_package, emsg = map_debian_package(
        debian_package,
        PackageContentType.BINARY,
    )
    if emsg:
        error += emsg

    # The purl is no longer tagged with the Maven-only `classifier=sources`
    # qualifier here: it has no meaning for Debian and mutated the caller's
    # purl.
    source_package, emsg = map_debian_package(
        debian_package,
        PackageContentType.SOURCE_ARCHIVE,
    )
    if emsg:
        error += emsg

    if metadata_package and binary_package:
        make_relationship(
            from_package=metadata_package,
            to_package=binary_package,
            relationship=PackageRelation.Relationship.BINARY_PACKAGE,
        )

    if metadata_package and source_package:
        make_relationship(
            from_package=metadata_package,
            to_package=source_package,
            relationship=PackageRelation.Relationship.SOURCE_PACKAGE,
        )

    return error


@attr.s
class DebianPackage:
    """
    Hold the purl of a Debian package, the optional purl of its source
    package, and the base URLs needed to compute the URLs of its binary
    archive, source archive and .dsc metadata file.

    Creating an instance performs HTTP requests to locate the package
    directory in the archive and raises PackageDirectoryMissingException if
    none is found.
    """

    archive_base_url = attr.ib(type=str)
    metadata_base_url = attr.ib(type=str)
    # PackageURL objects; `source_package_url` may be None.
    package_url = attr.ib(type=str)
    source_package_url = attr.ib(type=str)
    # Computed after init; the defaults let callers omit them.
    metadata_directory_url = attr.ib(type=str, default=None)
    archive_directory_url = attr.ib(type=str, default=None)

    def __attrs_post_init__(self, *args, **kwargs):
        self.set_debian_archive_directory()

    @property
    def package_archive_version(self):
        """
        Return the version string used in archive file names:
        "<upstream>-<revision>", or "<upstream>" alone when the revision is
        "0". Any epoch is left out.
        """
        debvers = DebVersion.from_string(self.package_url.version)
        if debvers.revision != "0":
            purl_version = f"{debvers.upstream}-{debvers.revision}"
        else:
            purl_version = debvers.upstream
        return purl_version

    @property
    def binary_archive_url(self):
        """
        Return the URL of the binary .deb archive, including the
        "architecture" purl qualifier in the file name when it is present.
        """
        purl_version = self.package_archive_version
        arch = self.package_url.qualifiers.get("architecture")
        if arch:
            archive_name = f"{self.package_url.name}_{purl_version}_{arch}.deb"
        else:
            archive_name = f"{self.package_url.name}_{purl_version}.deb"
        return self.archive_directory_url + archive_name

    @property
    def source_archive_url(self):
        """
        Return the URL of the source archive.

        Each known source archive extension is probed over HTTP in turn and
        the first URL that exists is returned. If none exists, the last
        candidate is returned and the caller's own download reports the
        failure.
        """
        debian_source_archive_formats = [
            ".tar.xz", ".tar.gz", ".orig.tar.xz", ".orig.tar.gz", ".orig.tar.bz2"
        ]

        source_version = self.package_archive_version
        if not self.source_package_url:
            source_package_name = self.package_url.name
        else:
            source_package_name = self.source_package_url.name
            if self.source_package_url.version:
                source_version = self.source_package_url.version

        source_package_url = None
        for archive_format in debian_source_archive_formats:
            if ".orig" in archive_format:
                # ".orig" tarballs are named after the version without the
                # Debian revision.
                base_version_source = source_version.split('-')[0]
                archive_name = f"{source_package_name}_{base_version_source}" + archive_format
            else:
                archive_name = f"{source_package_name}_{source_version}" + archive_format
            source_package_url = self.archive_directory_url + archive_name
            response = requests.get(source_package_url)
            if response.ok:
                break

        return source_package_url

    @property
    def package_metadata_url(self):
        """
        Return the URL of the .dsc metadata file.

        The version without its "+..." suffix is tried first; if that file
        does not exist, the URL built from the full version is returned.
        """
        metadata_version = self.package_archive_version
        if not self.source_package_url:
            metadata_package_name = self.package_url.name
        else:
            metadata_package_name = self.source_package_url.name
            if self.source_package_url.version:
                metadata_version = self.source_package_url.version

        base_version_metadata = metadata_version.split('+')[0]
        metadata_dsc_package_url = (
            self.archive_directory_url
            + f"{metadata_package_name}_{base_version_metadata}.dsc"
        )
        response = requests.get(metadata_dsc_package_url)
        if not response.ok:
            metadata_dsc_package_url = (
                self.archive_directory_url
                + f"{metadata_package_name}_{metadata_version}.dsc"
            )

        return metadata_dsc_package_url

    def set_debian_archive_directory(self):
        """
        Find the archive directory of this package and store its URL in
        `archive_directory_url`.

        The directory named after the binary package is tried first, then the
        one named after the source package when a source purl is available.
        Raise PackageDirectoryMissingException if neither exists.
        """
        # Tolerate a "repository_url" qualifier given without a trailing slash.
        base_url = self.archive_base_url.rstrip('/') + '/'
        msg = "No directory exists for package at: "

        candidate_names = [self.package_url.name]
        if self.source_package_url:
            candidate_names.append(self.source_package_url.name)

        package_directory = None
        for package_name in candidate_names:
            index_folder = _get_pool_index_folder(package_name)
            package_directory = f"{base_url}{index_folder}/{package_name}/"
            response = requests.get(package_directory)
            if response.ok:
                self.archive_directory_url = package_directory
                return

        raise PackageDirectoryMissingException(msg + str(package_directory))


# The patch continues with minecode/visitors/maven.py (index
# 55624772..a6940015): `import hashlib` is removed there, and
# get_package_sha1 and validate_sha1 are imported from minecode.utils instead
# of being defined locally.
base_url=MAVEN_BASE_UR return response.text -def get_package_sha1(package): - """ - Return the sha1 value for `package` by checking if the sha1 file exists for - `package` on maven and returning the contents if it does. - - If the sha1 is invalid, we download the package's JAR and calculate the sha1 - from that. - """ - download_url = package.repository_download_url - sha1_download_url = f'{download_url}.sha1' - response = requests.get(sha1_download_url) - if response.ok: - sha1_contents = response.text.strip().split() - sha1 = sha1_contents[0] - sha1 = validate_sha1(sha1) - if not sha1: - # Download JAR and calculate sha1 if we cannot get it from the repo - response = requests.get(download_url) - if response: - sha1_hash = hashlib.new('sha1', response.content) - sha1 = sha1_hash.hexdigest() - return sha1 - - def fetch_parent(pom_text, base_url=MAVEN_BASE_URL): """ Return the parent pom text of `pom_text`, or None if `pom_text` has no parent. @@ -348,20 +325,6 @@ def map_maven_package(package_url, package_content): return db_package, error -def validate_sha1(sha1): - """ - Validate a `sha1` string. - - Return `sha1` if it is valid, None otherwise. - """ - if sha1 and len(sha1) != 40: - logger.warning( - f'Invalid SHA1 length ({len(sha1)}): "{sha1}": SHA1 ignored!' 
- ) - sha1 = None - return sha1 - - def map_maven_binary_and_source(package_url): """ Get metadata for the binary and source release of the Maven package diff --git a/packagedb/api.py b/packagedb/api.py index d11be44d..5cbdfdf2 100644 --- a/packagedb/api.py +++ b/packagedb/api.py @@ -584,15 +584,21 @@ class CollectViewSet(viewsets.ViewSet): """ serializer_class=None @extend_schema( - parameters=[OpenApiParameter('purl', str, 'query', description='PackageURL')], + parameters=[ + OpenApiParameter('purl', str, 'query', description='PackageURL'), + OpenApiParameter('source_purl', str, 'query', description='Source PackageURL', default=False), + ], responses={200:PackageAPISerializer()}, ) def list(self, request, format=None): purl = request.query_params.get('purl') + source_purl = request.query_params.get('source_purl', None) # validate purl try: package_url = PackageURL.from_string(purl) + if source_purl: + source_package_url = PackageURL.from_string(source_purl) except ValueError as e: message = { 'status': f'purl validation error: {e}' @@ -603,7 +609,10 @@ def list(self, request, format=None): packages = Package.objects.filter(**lookups) if packages.count() == 0: try: - errors = priority_router.process(purl) + kwargs = dict() + if source_purl: + kwargs["source_purl"] = source_purl + errors = priority_router.process(purl, **kwargs) except NoRouteAvailable: message = { 'status': f'cannot fetch Package data for {purl}: no available handler' diff --git a/packagedb/models.py b/packagedb/models.py index 35a35e9a..a4945e61 100644 --- a/packagedb/models.py +++ b/packagedb/models.py @@ -456,11 +456,12 @@ class PackageContentType(models.IntegerChoices): # fields matches with the current package CURATION = 1, 'curation' PATCH = 2, 'patch' - SOURCE_REPO = 3, 'source_repo' - SOURCE_ARCHIVE = 4, 'source_archive' - BINARY = 5, 'binary' - TEST = 6, 'test' - DOC = 7, 'doc' + METADATA = 3, 'metadata' + SOURCE_REPO = 4, 'source_repo' + SOURCE_ARCHIVE = 5, 'source_archive' + BINARY = 6, 
'binary' + TEST = 7, 'test' + DOC = 8, 'doc' def get_class_name(obj): @@ -1183,6 +1184,7 @@ class PackageRelation(models.Model): class Relationship(models.TextChoices): SOURCE_PACKAGE = "source_package" + BINARY_PACKAGE = "binary_package" from_package = models.ForeignKey( Package,