Skip to content

Commit

Permalink
refactor: storage-related models are in base storage client
Browse files Browse the repository at this point in the history
  • Loading branch information
vdusek committed Aug 26, 2024
1 parent 1ad2af6 commit 3b6e581
Show file tree
Hide file tree
Showing 38 changed files with 346 additions and 292 deletions.
2 changes: 1 addition & 1 deletion src/crawlee/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from importlib import metadata

from ._models import Request
from ._request import Request
from ._types import ConcurrencySettings, EnqueueStrategy
from ._utils.globs import Glob

Expand Down
246 changes: 16 additions & 230 deletions src/crawlee/_models.py → src/crawlee/_request.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,33 @@
# ruff: noqa: TCH001, TCH002, TCH003
# ruff: noqa: TCH001, TCH002, TCH003 (because of Pydantic)

from __future__ import annotations

from datetime import datetime
from decimal import Decimal
from enum import Enum
from typing import Annotated, Any, Generic
from typing import Annotated, Any

from pydantic import BaseModel, BeforeValidator, ConfigDict, Field
from typing_extensions import Self, TypeVar
from typing_extensions import Self

from crawlee._types import EnqueueStrategy, HttpMethod
from crawlee._utils.requests import compute_unique_key, unique_key_to_request_id
from crawlee._utils.urls import extract_query_params, validate_http_url


class RequestState(Enum):
    """Crawlee-specific request handling state.

    Tracks which stage of the crawling pipeline a request is currently in.
    The numeric values are explicit so they remain stable across releases.
    """

    UNPROCESSED = 0
    BEFORE_NAV = 1
    AFTER_NAV = 2
    REQUEST_HANDLER = 3
    DONE = 4
    ERROR_HANDLER = 5
    ERROR = 6
    SKIPPED = 7


class BaseRequestData(BaseModel):
"""Data needed to create a new crawling request."""

Expand Down Expand Up @@ -205,19 +218,6 @@ class RequestWithLock(Request):
lock_expires_at: Annotated[datetime, Field(alias='lockExpiresAt')]


class RequestState(Enum):
    """Crawlee-specific request handling state.

    Tracks which stage of the crawling pipeline a request is currently in.
    The numeric values are explicit so they remain stable across releases.
    """

    UNPROCESSED = 0
    BEFORE_NAV = 1
    AFTER_NAV = 2
    REQUEST_HANDLER = 3
    DONE = 4
    ERROR_HANDLER = 5
    ERROR = 6
    SKIPPED = 7


class CrawleeRequestData(BaseModel):
"""Crawlee-specific configuration stored in the user_data."""

Expand All @@ -237,217 +237,3 @@ class CrawleeRequestData(BaseModel):
last_proxy_tier: Annotated[int | None, Field(alias='lastProxyTier')] = None

forefront: Annotated[bool, Field()] = False


class BaseStorageMetadata(BaseModel):
    """Base model for storage metadata.

    Holds the metadata fields shared by all storage types; the dataset,
    key-value store and request queue metadata models extend it.
    """

    # Allow constructing the model by field name as well as by camelCase alias.
    model_config = ConfigDict(populate_by_name=True)

    id: Annotated[str, Field(alias='id')]
    # NOTE(review): default is '' although the annotation allows None — confirm intended.
    name: Annotated[str | None, Field(alias='name', default='')]
    accessed_at: Annotated[datetime, Field(alias='accessedAt')]
    created_at: Annotated[datetime, Field(alias='createdAt')]
    modified_at: Annotated[datetime, Field(alias='modifiedAt')]


class DatasetMetadata(BaseStorageMetadata):
    """Model for a dataset metadata."""

    # NOTE(review): redundant — `populate_by_name` is already set on BaseStorageMetadata.
    model_config = ConfigDict(populate_by_name=True)

    item_count: Annotated[int, Field(alias='itemCount')]


class KeyValueStoreMetadata(BaseStorageMetadata):
    """Model for a key-value store metadata."""

    # NOTE(review): redundant — `populate_by_name` is already set on BaseStorageMetadata.
    model_config = ConfigDict(populate_by_name=True)

    user_id: Annotated[str, Field(alias='userId')]


class RequestQueueMetadata(BaseStorageMetadata):
    """Model for a request queue metadata."""

    # NOTE(review): redundant — `populate_by_name` is already set on BaseStorageMetadata.
    model_config = ConfigDict(populate_by_name=True)

    had_multiple_clients: Annotated[bool, Field(alias='hadMultipleClients')]
    handled_request_count: Annotated[int, Field(alias='handledRequestCount')]
    pending_request_count: Annotated[int, Field(alias='pendingRequestCount')]
    stats: Annotated[dict, Field(alias='stats')]
    total_request_count: Annotated[int, Field(alias='totalRequestCount')]
    user_id: Annotated[str, Field(alias='userId')]
    resource_directory: Annotated[str, Field(alias='resourceDirectory')]


# Type of the value held by a KeyValueStoreRecord; falls back to `Any` when
# the generic parameter is not specified (typing_extensions TypeVar default).
ValueType = TypeVar('ValueType', default=Any)


class KeyValueStoreRecord(BaseModel, Generic[ValueType]):
    """Model for a key-value store record."""

    model_config = ConfigDict(populate_by_name=True)

    key: Annotated[str, Field(alias='key')]
    value: Annotated[ValueType, Field(alias='value')]
    content_type: Annotated[str | None, Field(alias='contentType', default=None)]
    filename: Annotated[str | None, Field(alias='filename', default=None)]


class KeyValueStoreRecordMetadata(BaseModel):
    """Model for a key-value store record metadata.

    Unlike `KeyValueStoreRecord`, carries only the key and content type —
    not the value itself.
    """

    model_config = ConfigDict(populate_by_name=True)

    key: Annotated[str, Field(alias='key')]
    content_type: Annotated[str, Field(alias='contentType')]


class KeyValueStoreKeyInfo(BaseModel):
    """Model for a key-value store key info (key plus its stored size)."""

    model_config = ConfigDict(populate_by_name=True)

    key: Annotated[str, Field(alias='key')]
    size: Annotated[int, Field(alias='size')]


class KeyValueStoreListKeysPage(BaseModel):
    """Model for listing keys in the key-value store.

    Represents one page of a paginated key listing; `exclusive_start_key` and
    `next_exclusive_start_key` are the pagination cursors.
    """

    model_config = ConfigDict(populate_by_name=True)

    count: Annotated[int, Field(alias='count')]
    limit: Annotated[int, Field(alias='limit')]
    is_truncated: Annotated[bool, Field(alias='isTruncated')]
    items: Annotated[list[KeyValueStoreKeyInfo], Field(alias='items', default_factory=list)]
    exclusive_start_key: Annotated[str | None, Field(alias='exclusiveStartKey', default=None)]
    next_exclusive_start_key: Annotated[str | None, Field(alias='nextExclusiveStartKey', default=None)]


class RequestQueueHeadState(BaseModel):
    """Model for the request queue head state."""

    model_config = ConfigDict(populate_by_name=True)

    was_limit_reached: Annotated[bool, Field(alias='wasLimitReached')]
    prev_limit: Annotated[int, Field(alias='prevLimit')]
    queue_modified_at: Annotated[datetime, Field(alias='queueModifiedAt')]
    query_started_at: Annotated[datetime, Field(alias='queryStartedAt')]
    had_multiple_clients: Annotated[bool, Field(alias='hadMultipleClients')]


class RequestQueueHead(BaseModel):
    """Model for the request queue head (the requests at the front of the queue)."""

    model_config = ConfigDict(populate_by_name=True)

    limit: Annotated[int | None, Field(alias='limit', default=None)]
    had_multiple_clients: Annotated[bool, Field(alias='hadMultipleClients')]
    queue_modified_at: Annotated[datetime, Field(alias='queueModifiedAt')]
    items: Annotated[list[Request], Field(alias='items', default_factory=list)]


class RequestQueueHeadWithLocks(RequestQueueHead):
    """Model for request queue head with locks.

    Extends `RequestQueueHead` with the lock duration in seconds.
    """

    lock_secs: Annotated[int, Field(alias='lockSecs')]


class BaseListPage(BaseModel):
    """Model for a single page of storage items returned from a collection list method.

    Args:
        count: Count of the returned objects on this page.
        offset: The offset of the first object specified in the API call.
        limit: The limit on the number of returned objects specified in the API call.
        total: Total number of objects matching the API call criteria.
        desc: Whether the listing is descending or not.
    """

    model_config = ConfigDict(populate_by_name=True)

    count: Annotated[int, Field(default=0)]
    offset: Annotated[int, Field(default=0)]
    limit: Annotated[int, Field(default=0)]
    total: Annotated[int, Field(default=0)]
    desc: Annotated[bool, Field(default=False)]


class DatasetListPage(BaseListPage):
    """Model for a single page of datasets returned from a collection list method.

    Args:
        items: List of returned datasets (their metadata) on this page.
    """

    items: Annotated[list[DatasetMetadata], Field(default_factory=list)]


class KeyValueStoreListPage(BaseListPage):
    """Model for a single page of key-value stores returned from a collection list method.

    Args:
        items: List of returned key-value stores (their metadata) on this page.
    """

    items: Annotated[list[KeyValueStoreMetadata], Field(default_factory=list)]


class RequestQueueListPage(BaseListPage):
    """Model for a single page of request queues returned from a collection list method.

    Args:
        items: List of returned request queues (their metadata) on this page.
    """

    items: Annotated[list[RequestQueueMetadata], Field(default_factory=list)]


class DatasetItemsListPage(BaseListPage):
    """Model for a single page of dataset items returned from a collection list method.

    Args:
        items: List of returned dataset items on this page.
    """

    items: Annotated[list[dict], Field(default_factory=list)]


class ProlongRequestLockResponse(BaseModel):
    """Response to prolong request lock calls."""

    model_config = ConfigDict(populate_by_name=True)

    # New expiration time of the prolonged lock.
    lock_expires_at: Annotated[datetime, Field(alias='lockExpiresAt')]


class ProcessedRequest(BaseModel):
    """Represents a processed request."""

    model_config = ConfigDict(populate_by_name=True)

    id: Annotated[str, Field(alias='id')]
    unique_key: Annotated[str, Field(alias='uniqueKey')]
    was_already_present: Annotated[bool, Field(alias='wasAlreadyPresent')]
    was_already_handled: Annotated[bool, Field(alias='wasAlreadyHandled')]


class UnprocessedRequest(BaseModel):
    """Represents an unprocessed request."""

    model_config = ConfigDict(populate_by_name=True)

    # NOTE(review): alias is 'requestUniqueKey' here but 'uniqueKey' on
    # ProcessedRequest — confirm this asymmetry matches the API payloads.
    unique_key: Annotated[str, Field(alias='requestUniqueKey')]
    # URL is validated (must be a valid HTTP(S) URL) before assignment.
    url: Annotated[str, BeforeValidator(validate_http_url), Field()]
    method: Annotated[HttpMethod | None, Field()] = None


class BatchRequestsOperationResponse(BaseModel):
    """Response to batch request deletion calls.

    Splits the batch into the requests that were processed and those that
    were not.
    """

    model_config = ConfigDict(populate_by_name=True)

    processed_requests: Annotated[list[ProcessedRequest], Field(alias='processedRequests')]
    unprocessed_requests: Annotated[list[UnprocessedRequest], Field(alias='unprocessedRequests')]
9 changes: 5 additions & 4 deletions src/crawlee/_types.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
# ruff: noqa: TCH003
from __future__ import annotations

import logging
import re
from collections.abc import Coroutine, Iterator, Mapping, Sequence
from dataclasses import dataclass, field
from enum import Enum
Expand All @@ -11,8 +8,12 @@
from typing_extensions import NotRequired, TypeAlias, TypedDict, Unpack

if TYPE_CHECKING:
import logging
import re

from crawlee import Glob
from crawlee._models import BaseRequestData, DatasetItemsListPage, Request
from crawlee._request import BaseRequestData, Request
from crawlee.base_storage_client._models import DatasetItemsListPage
from crawlee.http_clients import HttpResponse
from crawlee.proxy_configuration import ProxyInfo
from crawlee.sessions._session import Session
Expand Down
3 changes: 2 additions & 1 deletion src/crawlee/_utils/system.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# ruff: noqa: TCH003
# ruff: noqa: TCH001, TCH002, TCH003 (because of Pydantic)

from __future__ import annotations

import os
Expand Down
38 changes: 38 additions & 0 deletions src/crawlee/base_storage_client/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,26 @@
from ._base_request_queue_client import BaseRequestQueueClient
from ._base_request_queue_collection_client import BaseRequestQueueCollectionClient
from ._base_storage_client import BaseStorageClient
from ._models import (
BatchRequestsOperationResponse,
DatasetItemsListPage,
DatasetListPage,
DatasetMetadata,
KeyValueStoreKeyInfo,
KeyValueStoreListKeysPage,
KeyValueStoreListPage,
KeyValueStoreMetadata,
KeyValueStoreRecord,
KeyValueStoreRecordMetadata,
ProcessedRequest,
ProlongRequestLockResponse,
RequestQueueHead,
RequestQueueHeadState,
RequestQueueHeadWithLocks,
RequestQueueListPage,
RequestQueueMetadata,
UnprocessedRequest,
)

__all__ = [
'BaseDatasetClient',
Expand All @@ -14,4 +34,22 @@
'BaseRequestQueueClient',
'BaseRequestQueueCollectionClient',
'BaseStorageClient',
'BatchRequestsOperationResponse',
'DatasetItemsListPage',
'DatasetListPage',
'DatasetMetadata',
'KeyValueStoreKeyInfo',
'KeyValueStoreListKeysPage',
'KeyValueStoreListPage',
'KeyValueStoreMetadata',
'KeyValueStoreRecord',
'KeyValueStoreRecordMetadata',
'ProcessedRequest',
'ProlongRequestLockResponse',
'RequestQueueHead',
'RequestQueueHeadState',
'RequestQueueHeadWithLocks',
'RequestQueueListPage',
'RequestQueueMetadata',
'UnprocessedRequest',
]
2 changes: 1 addition & 1 deletion src/crawlee/base_storage_client/_base_dataset_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
if TYPE_CHECKING:
from httpx import Response

from crawlee._models import DatasetItemsListPage, DatasetMetadata
from crawlee._types import JsonSerializable
from crawlee.base_storage_client._models import DatasetItemsListPage, DatasetMetadata


class BaseDatasetClient(ABC):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from typing import TYPE_CHECKING

if TYPE_CHECKING:
from crawlee._models import DatasetListPage, DatasetMetadata
from crawlee.base_storage_client._models import DatasetListPage, DatasetMetadata


class BaseDatasetCollectionClient(ABC):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,11 @@
if TYPE_CHECKING:
from httpx import Response

from crawlee._models import KeyValueStoreListKeysPage, KeyValueStoreMetadata, KeyValueStoreRecord
from crawlee.base_storage_client._models import (
KeyValueStoreListKeysPage,
KeyValueStoreMetadata,
KeyValueStoreRecord,
)


class BaseKeyValueStoreClient(ABC):
Expand Down
Loading

0 comments on commit 3b6e581

Please sign in to comment.