Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add composite unique on name and version #88

Merged
merged 4 commits into from
Jun 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
"""Add composite unique constraint to name and version

Revision ID: 9d70bac89aa5
Revises: 095a3873bec8
Create Date: 2023-06-27 16:21:50.089192

"""

from alembic import op

# revision identifiers, used by Alembic.
revision = "9d70bac89aa5"
down_revision = "095a3873bec8"
branch_labels = None
depends_on = None


def upgrade() -> None:
# ### commands auto generated by Alembic - please adjust! ###
op.create_unique_constraint("name_version_unique", "scans", ["name", "version"])
# ### end Alembic commands ###


def downgrade() -> None:
# ### commands auto generated by Alembic - please adjust! ###
op.drop_constraint("name_version_unique", "scans", type_="unique")
# ### end Alembic commands ###
69 changes: 25 additions & 44 deletions src/mainframe/endpoints/package.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@

import structlog
from fastapi import APIRouter, Depends, HTTPException, Request
from letsbuilda.pypi import PackageMetadata, PyPIServices
from sqlalchemy import select, tuple_
from letsbuilda.pypi import PyPIServices
from sqlalchemy import select
from sqlalchemy.exc import IntegrityError
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.orm import selectinload

Expand All @@ -13,7 +14,6 @@
from mainframe.json_web_token import AuthenticationData
from mainframe.models.orm import DownloadURL, Rule, Scan, Status
from mainframe.models.schemas import (
BatchPackageQueueErr,
Error,
PackageScanResult,
PackageScanResultFail,
Expand Down Expand Up @@ -173,57 +173,40 @@ async def lookup_package_info(
},
)
async def batch_queue_package(
packages: set[PackageSpecifier],
packages: list[PackageSpecifier],
session: Annotated[AsyncSession, Depends(get_db)],
auth: Annotated[AuthenticationData, Depends(validate_token)],
request: Request,
) -> list[BatchPackageQueueErr]:
ok_packages: dict[tuple[str, str], PackageMetadata] = {}
err_packages: dict[tuple[str, str | None], str] = {}

):
pypi_client: PyPIServices = request.app.state.pypi_client
rows: list[Scan] = []

# This step filters out packages that are not even on PyPI
for package in packages:
name = package.name
version = package.version

try:
package_metadata = await pypi_client.get_package_metadata(name, version)
ok_packages[(package_metadata.info.name, package_metadata.info.version)] = package_metadata
except KeyError:
err_packages[(name, version)] = f"Package {name}@{version} was not found on PyPI"

query = select(Scan).where(tuple_(Scan.name, Scan.version).in_(ok_packages))
rows = await session.scalars(query)

# This step filters out packages that are already in the database
for row in rows:
name = row.name
version = row.version
t = (name, version)
continue

ok_packages.pop(t)
err_packages[t] = f"Package {name}@{version} is already queued for scanning"

new_packages = [
Scan(
name=metadata.info.name,
version=metadata.info.version,
scan = Scan(
name=package_metadata.info.name,
version=package_metadata.info.version,
status=Status.QUEUED,
queued_by=auth.subject,
download_urls=[DownloadURL(url=url.url) for url in metadata.urls],
download_urls=[DownloadURL(url=url.url) for url in package_metadata.urls],
)
for metadata in ok_packages.values()
]

session.add_all(new_packages)
await session.commit()
rows.append(scan)

return [
BatchPackageQueueErr(name=name, version=version, detail=detail)
for ((name, version), detail) in err_packages.items()
]
async with session.begin():
for row in rows:
try:
async with session.begin_nested():
session.add(row)
except IntegrityError:
pass
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not logging this? or returning errors?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Didn't figure it was necessary. Attempting to scan duplicate packages is going to be very common - after all the mechanism that's going to be reading from the RSS feed and loading it into the database doesn't do any of the checking. 90-95% of packages batch queued will already be queued for scanning in the database



@router.post(
Expand Down Expand Up @@ -268,13 +251,6 @@ async def queue_package(
version = package_metadata.info.version # Use latest version if not provided
log = logger.bind(package={"name": name, "version": version})

query = select(Scan).where(Scan.name == name).where(Scan.version == version)
row = await session.scalar(query)

if row is not None:
await log.info(f"Package {name}@{version} already queued for scanning.", tag="already_queued")
raise HTTPException(409, f"Package {name}@{version} is already queued for scanning")

new_package = Scan(
name=name,
version=version,
Expand All @@ -284,7 +260,12 @@ async def queue_package(
)

session.add(new_package)
await session.commit()

try:
await session.commit()
except IntegrityError:
await log.warn(f"Package {name}@{version} already queued for scanning.", tag="already_queued")
raise HTTPException(409, f"Package {name}@{version} is already queued for scanning")

await log.ainfo(
"Added new package",
Expand Down
10 changes: 9 additions & 1 deletion src/mainframe/models/orm.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,14 @@
from enum import Enum
from typing import Optional

from sqlalchemy import Column, DateTime, FetchedValue, ForeignKey, Table
from sqlalchemy import (
Column,
DateTime,
FetchedValue,
ForeignKey,
Table,
UniqueConstraint,
)
from sqlalchemy.dialects.postgresql import UUID
from sqlalchemy.ext.associationproxy import AssociationProxy, association_proxy
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship
Expand Down Expand Up @@ -43,6 +50,7 @@ class Scan(Base):
"""The scans."""

__tablename__: str = "scans"
__table_args__ = (UniqueConstraint("name", "version"),)

scan_id: Mapped[uuid.UUID] = mapped_column(
UUID(as_uuid=True),
Expand Down
4 changes: 0 additions & 4 deletions src/mainframe/models/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,3 @@ class QueuePackageResponse(BaseModel):
"""Returned after queueing a package. Contains the UUID"""

id: str


class BatchPackageQueueErr(PackageSpecifier, Error):
pass