diff --git a/src/crawlee/__init__.py b/src/crawlee/__init__.py index 4bd62c612..87dbc3c42 100644 --- a/src/crawlee/__init__.py +++ b/src/crawlee/__init__.py @@ -1,6 +1,6 @@ from importlib import metadata -from ._models import Request +from ._request import Request from ._types import ConcurrencySettings, EnqueueStrategy from ._utils.globs import Glob diff --git a/src/crawlee/_models.py b/src/crawlee/_request.py similarity index 52% rename from src/crawlee/_models.py rename to src/crawlee/_request.py index cc060ecf0..aa639a4aa 100644 --- a/src/crawlee/_models.py +++ b/src/crawlee/_request.py @@ -1,20 +1,33 @@ -# ruff: noqa: TCH001, TCH002, TCH003 +# ruff: noqa: TCH001, TCH002, TCH003 (because of Pydantic) from __future__ import annotations from datetime import datetime from decimal import Decimal from enum import Enum -from typing import Annotated, Any, Generic +from typing import Annotated, Any from pydantic import BaseModel, BeforeValidator, ConfigDict, Field -from typing_extensions import Self, TypeVar +from typing_extensions import Self from crawlee._types import EnqueueStrategy, HttpMethod from crawlee._utils.requests import compute_unique_key, unique_key_to_request_id from crawlee._utils.urls import extract_query_params, validate_http_url +class RequestState(Enum): + """Crawlee-specific request handling state.""" + + UNPROCESSED = 0 + BEFORE_NAV = 1 + AFTER_NAV = 2 + REQUEST_HANDLER = 3 + DONE = 4 + ERROR_HANDLER = 5 + ERROR = 6 + SKIPPED = 7 + + class BaseRequestData(BaseModel): """Data needed to create a new crawling request.""" @@ -205,19 +218,6 @@ class RequestWithLock(Request): lock_expires_at: Annotated[datetime, Field(alias='lockExpiresAt')] -class RequestState(Enum): - """Crawlee-specific request handling state.""" - - UNPROCESSED = 0 - BEFORE_NAV = 1 - AFTER_NAV = 2 - REQUEST_HANDLER = 3 - DONE = 4 - ERROR_HANDLER = 5 - ERROR = 6 - SKIPPED = 7 - - class CrawleeRequestData(BaseModel): """Crawlee-specific configuration stored in the user_data.""" @@ -237,217 +237,3 @@ class CrawleeRequestData(BaseModel): last_proxy_tier: Annotated[int | None, Field(alias='lastProxyTier')] = None forefront: Annotated[bool, Field()] = False - - -class BaseStorageMetadata(BaseModel): - """Base model for storage metadata.""" - - model_config = ConfigDict(populate_by_name=True) - - id: Annotated[str, Field(alias='id')] - name: Annotated[str | None, Field(alias='name', default='')] - accessed_at: Annotated[datetime, Field(alias='accessedAt')] - created_at: Annotated[datetime, Field(alias='createdAt')] - modified_at: Annotated[datetime, Field(alias='modifiedAt')] - - -class DatasetMetadata(BaseStorageMetadata): - """Model for a dataset metadata.""" - - model_config = ConfigDict(populate_by_name=True) - - item_count: Annotated[int, Field(alias='itemCount')] - - -class KeyValueStoreMetadata(BaseStorageMetadata): - """Model for a key-value store metadata.""" - - model_config = ConfigDict(populate_by_name=True) - - user_id: Annotated[str, Field(alias='userId')] - - -class RequestQueueMetadata(BaseStorageMetadata): - """Model for a request queue metadata.""" - - model_config = ConfigDict(populate_by_name=True) - - had_multiple_clients: Annotated[bool, Field(alias='hadMultipleClients')] - handled_request_count: Annotated[int, Field(alias='handledRequestCount')] - pending_request_count: Annotated[int, Field(alias='pendingRequestCount')] - stats: Annotated[dict, Field(alias='stats')] - total_request_count: Annotated[int, Field(alias='totalRequestCount')] - user_id: Annotated[str, Field(alias='userId')] - resource_directory: Annotated[str, Field(alias='resourceDirectory')] - - -ValueType = TypeVar('ValueType', default=Any) - - -class KeyValueStoreRecord(BaseModel, Generic[ValueType]): - """Model for a key-value store record.""" - - model_config = ConfigDict(populate_by_name=True) - - key: Annotated[str, Field(alias='key')] - value: Annotated[ValueType, Field(alias='value')] - content_type: Annotated[str | None, Field(alias='contentType', default=None)] - filename: Annotated[str | None, Field(alias='filename', default=None)] - - -class KeyValueStoreRecordMetadata(BaseModel): - """Model for a key-value store record metadata.""" - - model_config = ConfigDict(populate_by_name=True) - - key: Annotated[str, Field(alias='key')] - content_type: Annotated[str, Field(alias='contentType')] - - -class KeyValueStoreKeyInfo(BaseModel): - """Model for a key-value store key info.""" - - model_config = ConfigDict(populate_by_name=True) - - key: Annotated[str, Field(alias='key')] - size: Annotated[int, Field(alias='size')] - - -class KeyValueStoreListKeysPage(BaseModel): - """Model for listing keys in the key-value store.""" - - model_config = ConfigDict(populate_by_name=True) - - count: Annotated[int, Field(alias='count')] - limit: Annotated[int, Field(alias='limit')] - is_truncated: Annotated[bool, Field(alias='isTruncated')] - items: Annotated[list[KeyValueStoreKeyInfo], Field(alias='items', default_factory=list)] - exclusive_start_key: Annotated[str | None, Field(alias='exclusiveStartKey', default=None)] - next_exclusive_start_key: Annotated[str | None, Field(alias='nextExclusiveStartKey', default=None)] - - -class RequestQueueHeadState(BaseModel): - """Model for the request queue head state.""" - - model_config = ConfigDict(populate_by_name=True) - - was_limit_reached: Annotated[bool, Field(alias='wasLimitReached')] - prev_limit: Annotated[int, Field(alias='prevLimit')] - queue_modified_at: Annotated[datetime, Field(alias='queueModifiedAt')] - query_started_at: Annotated[datetime, Field(alias='queryStartedAt')] - had_multiple_clients: Annotated[bool, Field(alias='hadMultipleClients')] - - -class RequestQueueHead(BaseModel): - """Model for the request queue head.""" - - model_config = ConfigDict(populate_by_name=True) - - limit: Annotated[int | None, Field(alias='limit', default=None)] - had_multiple_clients: Annotated[bool, Field(alias='hadMultipleClients')] - queue_modified_at: Annotated[datetime, Field(alias='queueModifiedAt')] - items: Annotated[list[Request], Field(alias='items', default_factory=list)] - - -class RequestQueueHeadWithLocks(RequestQueueHead): - """Model for request queue head with locks.""" - - lock_secs: Annotated[int, Field(alias='lockSecs')] - - -class BaseListPage(BaseModel): - """Model for a single page of storage items returned from a collection list method. - - Args: - count: Count of the returned objects on this page. - offset: The offset of the first object specified in the API call. - limit: The limit on the number of returned objects specified in the API call. - total: Total number of objects matching the API call criteria. - desc: Whether the listing is descending or not. - """ - - model_config = ConfigDict(populate_by_name=True) - - count: Annotated[int, Field(default=0)] - offset: Annotated[int, Field(default=0)] - limit: Annotated[int, Field(default=0)] - total: Annotated[int, Field(default=0)] - desc: Annotated[bool, Field(default=False)] - - -class DatasetListPage(BaseListPage): - """Model for a single page of dataset items returned from a collection list method. - - Args: - items: List of returned dataset items on this page. - """ - - items: Annotated[list[DatasetMetadata], Field(default_factory=list)] - - -class KeyValueStoreListPage(BaseListPage): - """Model for a single page of key-value store items returned from a collection list method. - - Args: - items: List of returned key-value store items on this page. - """ - - items: Annotated[list[KeyValueStoreMetadata], Field(default_factory=list)] - - -class RequestQueueListPage(BaseListPage): - """Model for a single page of request queue items returned from a collection list method. - - Args: - items: List of returned request queue items on this page. - """ - - items: Annotated[list[RequestQueueMetadata], Field(default_factory=list)] - - -class DatasetItemsListPage(BaseListPage): - """Model for a single page of dataset items returned from a collection list method. - - Args: - items: List of returned dataset items on this page. - """ - - items: Annotated[list[dict], Field(default_factory=list)] - - -class ProlongRequestLockResponse(BaseModel): - """Response to prolong request lock calls.""" - - model_config = ConfigDict(populate_by_name=True) - - lock_expires_at: Annotated[datetime, Field(alias='lockExpiresAt')] - - -class ProcessedRequest(BaseModel): - """Represents a processed request.""" - - model_config = ConfigDict(populate_by_name=True) - - id: Annotated[str, Field(alias='id')] - unique_key: Annotated[str, Field(alias='uniqueKey')] - was_already_present: Annotated[bool, Field(alias='wasAlreadyPresent')] - was_already_handled: Annotated[bool, Field(alias='wasAlreadyHandled')] - - -class UnprocessedRequest(BaseModel): - """Represents an unprocessed request.""" - - model_config = ConfigDict(populate_by_name=True) - - unique_key: Annotated[str, Field(alias='requestUniqueKey')] - url: Annotated[str, BeforeValidator(validate_http_url), Field()] - method: Annotated[HttpMethod | None, Field()] = None - - -class BatchRequestsOperationResponse(BaseModel): - """Response to batch request deletion calls.""" - - model_config = ConfigDict(populate_by_name=True) - - processed_requests: Annotated[list[ProcessedRequest], Field(alias='processedRequests')] - unprocessed_requests: Annotated[list[UnprocessedRequest], Field(alias='unprocessedRequests')] diff --git a/src/crawlee/_types.py b/src/crawlee/_types.py index 087f52919..8b9deb180 100644 --- a/src/crawlee/_types.py +++ b/src/crawlee/_types.py @@ -1,8 +1,5 @@ -# ruff: noqa: TCH003 from __future__ import annotations -import logging -import re from collections.abc import Coroutine, Iterator, Mapping, Sequence from dataclasses import dataclass, field from enum import Enum @@ -11,8 +8,12 @@ from typing_extensions import NotRequired, TypeAlias, TypedDict, Unpack if TYPE_CHECKING: + import logging + import re + from crawlee import Glob - from crawlee._models import BaseRequestData, DatasetItemsListPage, Request + from crawlee._request import BaseRequestData, Request + from crawlee.base_storage_client._models import DatasetItemsListPage from crawlee.http_clients import HttpResponse from crawlee.proxy_configuration import ProxyInfo from crawlee.sessions._session import Session diff --git a/src/crawlee/_utils/system.py b/src/crawlee/_utils/system.py index 9ff0d59e9..8d36ad30e 100644 --- a/src/crawlee/_utils/system.py +++ b/src/crawlee/_utils/system.py @@ -1,4 +1,5 @@ -# ruff: noqa: TCH003 +# ruff: noqa: TCH001, TCH002, TCH003 (because of Pydantic) + from __future__ import annotations import os diff --git a/src/crawlee/base_storage_client/__init__.py b/src/crawlee/base_storage_client/__init__.py index a2818925c..633a4406a 100644 --- a/src/crawlee/base_storage_client/__init__.py +++ b/src/crawlee/base_storage_client/__init__.py @@ -5,6 +5,26 @@ from ._base_request_queue_client import BaseRequestQueueClient from ._base_request_queue_collection_client import BaseRequestQueueCollectionClient from ._base_storage_client import BaseStorageClient +from ._models import ( + BatchRequestsOperationResponse, + DatasetItemsListPage, + DatasetListPage, + DatasetMetadata, + KeyValueStoreKeyInfo, + KeyValueStoreListKeysPage, + KeyValueStoreListPage, + KeyValueStoreMetadata, + KeyValueStoreRecord, + KeyValueStoreRecordMetadata, + ProcessedRequest, + ProlongRequestLockResponse, + RequestQueueHead, + RequestQueueHeadState, + RequestQueueHeadWithLocks, + RequestQueueListPage, + RequestQueueMetadata, + UnprocessedRequest, +) __all__ = [ 'BaseDatasetClient', @@ -14,4 +34,22 @@ 'BaseRequestQueueClient', 'BaseRequestQueueCollectionClient', 'BaseStorageClient', + 'BatchRequestsOperationResponse', + 'DatasetItemsListPage', + 'DatasetListPage', + 'DatasetMetadata', + 'KeyValueStoreKeyInfo', + 'KeyValueStoreListKeysPage', + 'KeyValueStoreListPage', + 'KeyValueStoreMetadata', + 'KeyValueStoreRecord', + 'KeyValueStoreRecordMetadata', + 'ProcessedRequest', + 'ProlongRequestLockResponse', + 'RequestQueueHead', + 'RequestQueueHeadState', + 'RequestQueueHeadWithLocks', + 'RequestQueueListPage', + 'RequestQueueMetadata', + 'UnprocessedRequest', ] diff --git a/src/crawlee/base_storage_client/_base_dataset_client.py b/src/crawlee/base_storage_client/_base_dataset_client.py index 645f93d51..86e409e21 100644 --- a/src/crawlee/base_storage_client/_base_dataset_client.py +++ b/src/crawlee/base_storage_client/_base_dataset_client.py @@ -6,8 +6,8 @@ if TYPE_CHECKING: from httpx import Response - from crawlee._models import DatasetItemsListPage, DatasetMetadata from crawlee._types import JsonSerializable + from crawlee.base_storage_client._models import DatasetItemsListPage, DatasetMetadata class BaseDatasetClient(ABC): diff --git a/src/crawlee/base_storage_client/_base_dataset_collection_client.py b/src/crawlee/base_storage_client/_base_dataset_collection_client.py index e811a522e..bab4e834f 100644 --- a/src/crawlee/base_storage_client/_base_dataset_collection_client.py +++ b/src/crawlee/base_storage_client/_base_dataset_collection_client.py @@ -4,7 +4,7 @@ from typing import TYPE_CHECKING if TYPE_CHECKING: - from crawlee._models import DatasetListPage, DatasetMetadata + from crawlee.base_storage_client._models import DatasetListPage, DatasetMetadata class BaseDatasetCollectionClient(ABC): diff --git a/src/crawlee/base_storage_client/_base_key_value_store_client.py b/src/crawlee/base_storage_client/_base_key_value_store_client.py index 4e7513da5..3eed99089 100644 --- a/src/crawlee/base_storage_client/_base_key_value_store_client.py +++ b/src/crawlee/base_storage_client/_base_key_value_store_client.py @@ -6,7 +6,11 @@ if TYPE_CHECKING: from httpx import Response - from crawlee._models import KeyValueStoreListKeysPage, KeyValueStoreMetadata, KeyValueStoreRecord + from crawlee.base_storage_client._models import ( + KeyValueStoreListKeysPage, + KeyValueStoreMetadata, + KeyValueStoreRecord, + ) class BaseKeyValueStoreClient(ABC): diff --git a/src/crawlee/base_storage_client/_base_key_value_store_collection_client.py b/src/crawlee/base_storage_client/_base_key_value_store_collection_client.py index 74df7d47f..68929864b 100644 --- a/src/crawlee/base_storage_client/_base_key_value_store_collection_client.py +++ b/src/crawlee/base_storage_client/_base_key_value_store_collection_client.py @@ -4,7 +4,7 @@ from typing import TYPE_CHECKING if TYPE_CHECKING: - from crawlee._models import KeyValueStoreListPage, KeyValueStoreMetadata + from crawlee.base_storage_client._models import KeyValueStoreListPage, KeyValueStoreMetadata class BaseKeyValueStoreCollectionClient(ABC): diff --git a/src/crawlee/base_storage_client/_base_request_queue_client.py b/src/crawlee/base_storage_client/_base_request_queue_client.py index 666b4ad70..88aa88c12 100644 --- a/src/crawlee/base_storage_client/_base_request_queue_client.py +++ b/src/crawlee/base_storage_client/_base_request_queue_client.py @@ -6,7 +6,7 @@ if TYPE_CHECKING: from collections.abc import Sequence - from crawlee._models import ( + from crawlee.base_storage_client._models import ( BatchRequestsOperationResponse, ProcessedRequest, ProlongRequestLockResponse, diff --git a/src/crawlee/base_storage_client/_base_request_queue_collection_client.py b/src/crawlee/base_storage_client/_base_request_queue_collection_client.py index ffc09dcf1..7ce430ead 100644 --- a/src/crawlee/base_storage_client/_base_request_queue_collection_client.py +++ b/src/crawlee/base_storage_client/_base_request_queue_collection_client.py @@ -4,7 +4,7 @@ from typing import TYPE_CHECKING if TYPE_CHECKING: - from crawlee._models import RequestQueueListPage, RequestQueueMetadata + from crawlee.base_storage_client._models import RequestQueueListPage, RequestQueueMetadata class BaseRequestQueueCollectionClient(ABC): diff --git a/src/crawlee/base_storage_client/_models.py b/src/crawlee/base_storage_client/_models.py new file mode 100644 index 000000000..0f3d77aad --- /dev/null +++ b/src/crawlee/base_storage_client/_models.py @@ -0,0 +1,226 @@ +# ruff: noqa: TCH001, TCH002, TCH003 (because of Pydantic) + +from __future__ import annotations + +from datetime import datetime +from typing import Annotated, Any, Generic + +from pydantic import BaseModel, BeforeValidator, ConfigDict, Field +from typing_extensions import TypeVar + +from crawlee._request import Request +from crawlee._types import HttpMethod +from crawlee._utils.urls import validate_http_url + +KvsValueType = TypeVar('KvsValueType', default=Any) + + +class _BaseStorageMetadata(BaseModel): + """Base model for storage metadata.""" + + model_config = ConfigDict(populate_by_name=True) + + id: Annotated[str, Field(alias='id')] + name: Annotated[str | None, Field(alias='name', default='')] + accessed_at: Annotated[datetime, Field(alias='accessedAt')] + created_at: Annotated[datetime, Field(alias='createdAt')] + modified_at: Annotated[datetime, Field(alias='modifiedAt')] + + +class DatasetMetadata(_BaseStorageMetadata): + """Model for a dataset metadata.""" + + model_config = ConfigDict(populate_by_name=True) + + item_count: Annotated[int, Field(alias='itemCount')] + + +class KeyValueStoreMetadata(_BaseStorageMetadata): + """Model for a key-value store metadata.""" + + model_config = ConfigDict(populate_by_name=True) + + user_id: Annotated[str, Field(alias='userId')] + + +class RequestQueueMetadata(_BaseStorageMetadata): + """Model for a request queue metadata.""" + + model_config = ConfigDict(populate_by_name=True) + + had_multiple_clients: Annotated[bool, Field(alias='hadMultipleClients')] + handled_request_count: Annotated[int, Field(alias='handledRequestCount')] + pending_request_count: Annotated[int, Field(alias='pendingRequestCount')] + stats: Annotated[dict, Field(alias='stats')] + total_request_count: Annotated[int, Field(alias='totalRequestCount')] + user_id: Annotated[str, Field(alias='userId')] + resource_directory: Annotated[str, Field(alias='resourceDirectory')] + + +class KeyValueStoreRecord(BaseModel, Generic[KvsValueType]): + """Model for a key-value store record.""" + + model_config = ConfigDict(populate_by_name=True) + + key: Annotated[str, Field(alias='key')] + value: Annotated[KvsValueType, Field(alias='value')] + content_type: Annotated[str | None, Field(alias='contentType', default=None)] + filename: Annotated[str | None, Field(alias='filename', default=None)] + + +class KeyValueStoreRecordMetadata(BaseModel): + """Model for a key-value store record metadata.""" + + model_config = ConfigDict(populate_by_name=True) + + key: Annotated[str, Field(alias='key')] + content_type: Annotated[str, Field(alias='contentType')] + + +class KeyValueStoreKeyInfo(BaseModel): + """Model for a key-value store key info.""" + + model_config = ConfigDict(populate_by_name=True) + + key: Annotated[str, Field(alias='key')] + size: Annotated[int, Field(alias='size')] + + +class KeyValueStoreListKeysPage(BaseModel): + """Model for listing keys in the key-value store.""" + + model_config = ConfigDict(populate_by_name=True) + + count: Annotated[int, Field(alias='count')] + limit: Annotated[int, Field(alias='limit')] + is_truncated: Annotated[bool, Field(alias='isTruncated')] + items: Annotated[list[KeyValueStoreKeyInfo], Field(alias='items', default_factory=list)] + exclusive_start_key: Annotated[str | None, Field(alias='exclusiveStartKey', default=None)] + next_exclusive_start_key: Annotated[str | None, Field(alias='nextExclusiveStartKey', default=None)] + + +class RequestQueueHeadState(BaseModel): + """Model for the request queue head state.""" + + model_config = ConfigDict(populate_by_name=True) + + was_limit_reached: Annotated[bool, Field(alias='wasLimitReached')] + prev_limit: Annotated[int, Field(alias='prevLimit')] + queue_modified_at: Annotated[datetime, Field(alias='queueModifiedAt')] + query_started_at: Annotated[datetime, Field(alias='queryStartedAt')] + had_multiple_clients: Annotated[bool, Field(alias='hadMultipleClients')] + + +class RequestQueueHead(BaseModel): + """Model for the request queue head.""" + + model_config = ConfigDict(populate_by_name=True) + + limit: Annotated[int | None, Field(alias='limit', default=None)] + had_multiple_clients: Annotated[bool, Field(alias='hadMultipleClients')] + queue_modified_at: Annotated[datetime, Field(alias='queueModifiedAt')] + items: Annotated[list[Request], Field(alias='items', default_factory=list)] + + +class RequestQueueHeadWithLocks(RequestQueueHead): + """Model for request queue head with locks.""" + + lock_secs: Annotated[int, Field(alias='lockSecs')] + + +class _BaseListPage(BaseModel): + """Model for a single page of storage items returned from a collection list method. + + Args: + count: Count of the returned objects on this page. + offset: The offset of the first object specified in the API call. + limit: The limit on the number of returned objects specified in the API call. + total: Total number of objects matching the API call criteria. + desc: Whether the listing is descending or not. + """ + + model_config = ConfigDict(populate_by_name=True) + + count: Annotated[int, Field(default=0)] + offset: Annotated[int, Field(default=0)] + limit: Annotated[int, Field(default=0)] + total: Annotated[int, Field(default=0)] + desc: Annotated[bool, Field(default=False)] + + +class DatasetListPage(_BaseListPage): + """Model for a single page of dataset items returned from a collection list method. + + Args: + items: List of returned dataset items on this page. + """ + + items: Annotated[list[DatasetMetadata], Field(default_factory=list)] + + +class KeyValueStoreListPage(_BaseListPage): + """Model for a single page of key-value store items returned from a collection list method. + + Args: + items: List of returned key-value store items on this page. + """ + + items: Annotated[list[KeyValueStoreMetadata], Field(default_factory=list)] + + +class RequestQueueListPage(_BaseListPage): + """Model for a single page of request queue items returned from a collection list method. + + Args: + items: List of returned request queue items on this page. + """ + + items: Annotated[list[RequestQueueMetadata], Field(default_factory=list)] + + +class DatasetItemsListPage(_BaseListPage): + """Model for a single page of dataset items returned from a collection list method. + + Args: + items: List of returned dataset items on this page. + """ + + items: Annotated[list[dict], Field(default_factory=list)] + + +class ProlongRequestLockResponse(BaseModel): + """Response to prolong request lock calls.""" + + model_config = ConfigDict(populate_by_name=True) + + lock_expires_at: Annotated[datetime, Field(alias='lockExpiresAt')] + + +class ProcessedRequest(BaseModel): + """Represents a processed request.""" + + model_config = ConfigDict(populate_by_name=True) + + id: Annotated[str, Field(alias='id')] + unique_key: Annotated[str, Field(alias='uniqueKey')] + was_already_present: Annotated[bool, Field(alias='wasAlreadyPresent')] + was_already_handled: Annotated[bool, Field(alias='wasAlreadyHandled')] + + +class UnprocessedRequest(BaseModel): + """Represents an unprocessed request.""" + + model_config = ConfigDict(populate_by_name=True) + + unique_key: Annotated[str, Field(alias='requestUniqueKey')] + url: Annotated[str, BeforeValidator(validate_http_url), Field()] + method: Annotated[HttpMethod | None, Field()] = None + + +class BatchRequestsOperationResponse(BaseModel): + """Response to batch request deletion calls.""" + + model_config = ConfigDict(populate_by_name=True) + + processed_requests: Annotated[list[ProcessedRequest], Field(alias='processedRequests')] + unprocessed_requests: Annotated[list[UnprocessedRequest], Field(alias='unprocessedRequests')] diff --git a/src/crawlee/basic_crawler/_basic_crawler.py b/src/crawlee/basic_crawler/_basic_crawler.py index 35a44af37..04c7ccb47 100644 --- a/src/crawlee/basic_crawler/_basic_crawler.py +++ b/src/crawlee/basic_crawler/_basic_crawler.py @@ -23,7 +23,7 @@ from crawlee._autoscaling.snapshotter import Snapshotter from crawlee._autoscaling.system_status import SystemStatus from crawlee._log_config import CrawleeLogFormatter -from crawlee._models import BaseRequestData, DatasetItemsListPage, Request, RequestState +from crawlee._request import BaseRequestData, Request, RequestState from crawlee._types import BasicCrawlingContext, HttpHeaders, RequestHandlerRunResult, SendRequestFunction from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute from crawlee._utils.wait import wait_for @@ -45,6 +45,7 @@ import re from crawlee._types import ConcurrencySettings, HttpMethod, JsonSerializable + from crawlee.base_storage_client._models import DatasetItemsListPage from crawlee.configuration import Configuration from crawlee.events._event_manager import EventManager from crawlee.http_clients import BaseHttpClient, HttpResponse diff --git a/src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawler.py b/src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawler.py index aff41c741..883756783 100644 --- a/src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawler.py +++ b/src/crawlee/beautifulsoup_crawler/_beautifulsoup_crawler.py @@ -9,7 +9,7 @@ from typing_extensions import Unpack from crawlee import EnqueueStrategy -from crawlee._models import BaseRequestData +from crawlee._request import BaseRequestData from crawlee._utils.blocked import RETRY_CSS_SELECTORS from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute from crawlee.basic_crawler import BasicCrawler, BasicCrawlerOptions, ContextPipeline diff --git a/src/crawlee/http_clients/_base.py b/src/crawlee/http_clients/_base.py index 428ef62cb..53fe0687e 100644 --- a/src/crawlee/http_clients/_base.py +++ b/src/crawlee/http_clients/_base.py @@ -5,8 +5,8 @@ from typing import TYPE_CHECKING, Protocol if TYPE_CHECKING: - from crawlee._models import Request from crawlee._types import HttpHeaders, HttpMethod + from crawlee.base_storage_client._models import Request from crawlee.proxy_configuration import ProxyInfo from crawlee.sessions import Session from crawlee.statistics import Statistics diff --git a/src/crawlee/http_clients/_httpx.py b/src/crawlee/http_clients/_httpx.py index bc28d4a0a..227766dfa 100644 --- a/src/crawlee/http_clients/_httpx.py +++ b/src/crawlee/http_clients/_httpx.py @@ -13,8 +13,8 @@ if TYPE_CHECKING: from collections.abc import Iterable - from crawlee._models import Request from crawlee._types import HttpHeaders, HttpMethod + from crawlee.base_storage_client._models import Request from crawlee.proxy_configuration import ProxyInfo from crawlee.statistics import Statistics diff --git a/src/crawlee/http_clients/curl_impersonate.py b/src/crawlee/http_clients/curl_impersonate.py index 484ce9532..f9bd997eb 100644 --- a/src/crawlee/http_clients/curl_impersonate.py +++ b/src/crawlee/http_clients/curl_impersonate.py @@ -22,8 +22,8 @@ from curl_cffi.requests import Response - from crawlee._models import Request from crawlee._types import HttpHeaders, HttpMethod + from crawlee.base_storage_client._models import Request from crawlee.proxy_configuration import ProxyInfo from crawlee.sessions import Session from crawlee.statistics import Statistics diff --git a/src/crawlee/memory_storage_client/_creation_management.py b/src/crawlee/memory_storage_client/_creation_management.py index e6210e33d..8d7ba40f2 100644 --- a/src/crawlee/memory_storage_client/_creation_management.py +++ b/src/crawlee/memory_storage_client/_creation_management.py @@ -11,7 +11,9 @@ from typing import TYPE_CHECKING from crawlee._consts import METADATA_FILENAME -from crawlee._models import ( +from crawlee._utils.data_processing import maybe_parse_body +from crawlee._utils.file import json_dumps +from crawlee.base_storage_client._models import ( DatasetMetadata, KeyValueStoreMetadata, KeyValueStoreRecord, @@ -19,8 +21,6 @@ Request, RequestQueueMetadata, ) -from crawlee._utils.data_processing import maybe_parse_body -from crawlee._utils.file import json_dumps from crawlee.storages._dataset import Dataset from crawlee.storages._key_value_store import KeyValueStore from crawlee.storages._request_queue import RequestQueue diff --git a/src/crawlee/memory_storage_client/_dataset_client.py b/src/crawlee/memory_storage_client/_dataset_client.py index 9170cd967..e8cf7e700 100644 --- a/src/crawlee/memory_storage_client/_dataset_client.py +++ b/src/crawlee/memory_storage_client/_dataset_client.py @@ -10,12 +10,12 @@ from typing_extensions import override -from crawlee._models import DatasetItemsListPage, DatasetMetadata from crawlee._types import StorageTypes from crawlee._utils.crypto import crypto_random_object_id from crawlee._utils.data_processing import raise_on_duplicate_storage, raise_on_non_existing_storage from crawlee._utils.file import force_rename, json_dumps from crawlee.base_storage_client import BaseDatasetClient +from crawlee.base_storage_client._models import DatasetItemsListPage, DatasetMetadata from crawlee.memory_storage_client._creation_management import find_or_create_client_by_id_or_name_inner if TYPE_CHECKING: diff --git a/src/crawlee/memory_storage_client/_dataset_collection_client.py b/src/crawlee/memory_storage_client/_dataset_collection_client.py index 59b3b5aac..f6c5954de 100644 --- a/src/crawlee/memory_storage_client/_dataset_collection_client.py +++ b/src/crawlee/memory_storage_client/_dataset_collection_client.py @@ -4,8 +4,8 @@ from typing_extensions import override -from crawlee._models import DatasetListPage, DatasetMetadata from crawlee.base_storage_client import BaseDatasetCollectionClient +from crawlee.base_storage_client._models import DatasetListPage, DatasetMetadata from crawlee.memory_storage_client._creation_management import get_or_create_inner from crawlee.memory_storage_client._dataset_client import DatasetClient diff --git a/src/crawlee/memory_storage_client/_key_value_store_client.py b/src/crawlee/memory_storage_client/_key_value_store_client.py index db979b5eb..907070ef6 100644 --- a/src/crawlee/memory_storage_client/_key_value_store_client.py +++ b/src/crawlee/memory_storage_client/_key_value_store_client.py @@ -10,18 +10,18 @@ from typing_extensions import override -from crawlee._models import ( +from crawlee._types import StorageTypes +from crawlee._utils.crypto import crypto_random_object_id +from crawlee._utils.data_processing import maybe_parse_body, raise_on_duplicate_storage, raise_on_non_existing_storage +from crawlee._utils.file import determine_file_extension, force_remove, force_rename, is_file_or_bytes, json_dumps +from crawlee.base_storage_client import BaseKeyValueStoreClient +from crawlee.base_storage_client._models import ( KeyValueStoreKeyInfo, KeyValueStoreListKeysPage, KeyValueStoreMetadata, KeyValueStoreRecord, KeyValueStoreRecordMetadata, ) -from crawlee._types import StorageTypes -from crawlee._utils.crypto import crypto_random_object_id -from crawlee._utils.data_processing import maybe_parse_body, raise_on_duplicate_storage, raise_on_non_existing_storage -from crawlee._utils.file import determine_file_extension, force_remove, force_rename, is_file_or_bytes, json_dumps -from crawlee.base_storage_client import BaseKeyValueStoreClient from crawlee.memory_storage_client._creation_management import ( find_or_create_client_by_id_or_name_inner, persist_metadata_if_enabled, diff --git a/src/crawlee/memory_storage_client/_key_value_store_collection_client.py b/src/crawlee/memory_storage_client/_key_value_store_collection_client.py index 550a44b6b..70d4f6c46 100644 --- a/src/crawlee/memory_storage_client/_key_value_store_collection_client.py +++ b/src/crawlee/memory_storage_client/_key_value_store_collection_client.py @@ -4,8 +4,8 @@ from typing_extensions import override -from crawlee._models import KeyValueStoreListPage, KeyValueStoreMetadata from crawlee.base_storage_client import BaseKeyValueStoreCollectionClient +from crawlee.base_storage_client._models import KeyValueStoreListPage, KeyValueStoreMetadata from crawlee.memory_storage_client._creation_management import get_or_create_inner from crawlee.memory_storage_client._key_value_store_client import KeyValueStoreClient diff --git a/src/crawlee/memory_storage_client/_request_queue_client.py b/src/crawlee/memory_storage_client/_request_queue_client.py index 065b0f5d2..8f6dbb0d7 100644 --- a/src/crawlee/memory_storage_client/_request_queue_client.py +++ b/src/crawlee/memory_storage_client/_request_queue_client.py @@ -12,16 +12,6 @@ from sortedcollections import ValueSortedDict # type: ignore from typing_extensions import override -from crawlee._models import ( - BatchRequestsOperationResponse, - ProcessedRequest, - ProlongRequestLockResponse, - Request, - RequestQueueHead, - RequestQueueHeadWithLocks, - RequestQueueMetadata, - UnprocessedRequest, -) from crawlee._types import StorageTypes from crawlee._utils.crypto import crypto_random_object_id from crawlee._utils.data_processing import ( @@ -32,6 +22,16 @@ from crawlee._utils.file import force_remove, force_rename, json_dumps from crawlee._utils.requests import unique_key_to_request_id from crawlee.base_storage_client import BaseRequestQueueClient +from crawlee.base_storage_client._models import ( + BatchRequestsOperationResponse, + ProcessedRequest, + ProlongRequestLockResponse, + Request, + RequestQueueHead, + RequestQueueHeadWithLocks, + RequestQueueMetadata, + UnprocessedRequest, +) from crawlee.memory_storage_client._creation_management import ( find_or_create_client_by_id_or_name_inner, persist_metadata_if_enabled, diff --git a/src/crawlee/memory_storage_client/_request_queue_collection_client.py b/src/crawlee/memory_storage_client/_request_queue_collection_client.py index c410ecc29..fbebcf19d 100644 --- a/src/crawlee/memory_storage_client/_request_queue_collection_client.py +++ b/src/crawlee/memory_storage_client/_request_queue_collection_client.py @@ -4,8 +4,8 @@ from typing_extensions import override -from crawlee._models import RequestQueueListPage, RequestQueueMetadata from crawlee.base_storage_client import BaseRequestQueueCollectionClient +from crawlee.base_storage_client._models import RequestQueueListPage, RequestQueueMetadata from crawlee.memory_storage_client._creation_management import get_or_create_inner from crawlee.memory_storage_client._request_queue_client import RequestQueueClient diff --git a/src/crawlee/parsel_crawler/_parsel_crawler.py b/src/crawlee/parsel_crawler/_parsel_crawler.py index 116c2258d..e560d7f8e 100644 --- a/src/crawlee/parsel_crawler/_parsel_crawler.py +++ b/src/crawlee/parsel_crawler/_parsel_crawler.py @@ -9,7 +9,7 @@ from typing_extensions import Unpack from crawlee import EnqueueStrategy -from crawlee._models import BaseRequestData +from crawlee._request import BaseRequestData from crawlee._utils.blocked import RETRY_CSS_SELECTORS from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute from crawlee.basic_crawler import BasicCrawler, BasicCrawlerOptions, ContextPipeline diff --git a/src/crawlee/playwright_crawler/_playwright_crawler.py b/src/crawlee/playwright_crawler/_playwright_crawler.py index 17a0d6684..5968b5463 100644 --- a/src/crawlee/playwright_crawler/_playwright_crawler.py +++ b/src/crawlee/playwright_crawler/_playwright_crawler.py @@ -7,7 +7,7 @@ from typing_extensions import Unpack from crawlee import EnqueueStrategy -from crawlee._models import BaseRequestData +from crawlee._request import BaseRequestData from crawlee._utils.blocked import RETRY_CSS_SELECTORS from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute from crawlee.basic_crawler import BasicCrawler, BasicCrawlerOptions, ContextPipeline diff --git a/src/crawlee/proxy_configuration.py b/src/crawlee/proxy_configuration.py index f5e51f291..20baef495 100644 --- a/src/crawlee/proxy_configuration.py +++ b/src/crawlee/proxy_configuration.py @@ -16,7 +16,7 @@ if TYPE_CHECKING: from collections.abc import Awaitable, Sequence - from crawlee._models import Request + from crawlee.base_storage_client._models import Request __all__ = ['ProxyInfo', 'ProxyConfiguration'] diff --git a/src/crawlee/sessions/_models.py b/src/crawlee/sessions/_models.py index 926b25ced..ad3364267 100644 --- a/src/crawlee/sessions/_models.py +++ b/src/crawlee/sessions/_models.py @@ -1,4 +1,4 @@ -# ruff: noqa: TCH002 TCH003 +# ruff: noqa: TCH001, TCH002, TCH003 (because of Pydantic) from __future__ import annotations diff --git a/src/crawlee/statistics/_models.py b/src/crawlee/statistics/_models.py index 5e5c58789..57e5af1c8 100644 --- a/src/crawlee/statistics/_models.py +++ b/src/crawlee/statistics/_models.py @@ -1,4 +1,5 @@ -# ruff: noqa: TCH001 TCH003 +# ruff: noqa: TCH001, TCH002, TCH003 (because of Pydantic) + from __future__ import annotations import json diff --git a/src/crawlee/storages/_dataset.py b/src/crawlee/storages/_dataset.py index 4dd7e3a82..0a02258e0 100644 --- a/src/crawlee/storages/_dataset.py +++ b/src/crawlee/storages/_dataset.py @@ -8,16 +8,16 @@ from typing_extensions import NotRequired, Required, Unpack, override -from crawlee._models import DatasetMetadata from crawlee._utils.byte_size import ByteSize from crawlee._utils.file import json_dumps +from crawlee.base_storage_client._models import DatasetMetadata from crawlee.storages._base_storage import BaseStorage from crawlee.storages._key_value_store import KeyValueStore if TYPE_CHECKING: - from crawlee._models import DatasetItemsListPage from crawlee._types import JsonSerializable from crawlee.base_storage_client import BaseStorageClient + from crawlee.base_storage_client._models import DatasetItemsListPage from crawlee.configuration import Configuration diff --git a/src/crawlee/storages/_key_value_store.py b/src/crawlee/storages/_key_value_store.py index d54875f41..27f629965 100644 --- a/src/crawlee/storages/_key_value_store.py +++ b/src/crawlee/storages/_key_value_store.py @@ -4,7 +4,7 @@ from typing_extensions import override -from crawlee._models import KeyValueStoreKeyInfo, KeyValueStoreMetadata +from crawlee.base_storage_client._models import KeyValueStoreKeyInfo, KeyValueStoreMetadata from crawlee.storages._base_storage import BaseStorage if TYPE_CHECKING: diff --git a/src/crawlee/storages/_request_list.py b/src/crawlee/storages/_request_list.py index eb9d11498..f1105f301 100644 --- a/src/crawlee/storages/_request_list.py +++ b/src/crawlee/storages/_request_list.py @@ -11,7 +11,7 @@ if TYPE_CHECKING: from collections.abc import Sequence - from crawlee._models import BaseRequestData, Request + from crawlee._request import BaseRequestData, Request class RequestList(RequestProvider): diff --git a/src/crawlee/storages/_request_provider.py b/src/crawlee/storages/_request_provider.py index f08567baa..01b252662 100644 --- a/src/crawlee/storages/_request_provider.py +++ b/src/crawlee/storages/_request_provider.py @@ -4,12 +4,12 @@ from datetime import timedelta from typing import TYPE_CHECKING -from crawlee._models import BaseRequestData, Request +from crawlee._request import BaseRequestData, Request if TYPE_CHECKING: from collections.abc import Sequence - from crawlee._models import ProcessedRequest + from crawlee.base_storage_client._models import ProcessedRequest class RequestProvider(ABC): diff --git a/src/crawlee/storages/_request_queue.py b/src/crawlee/storages/_request_queue.py index 775974215..58fd1c831 100644 --- a/src/crawlee/storages/_request_queue.py +++ b/src/crawlee/storages/_request_queue.py @@ -9,16 +9,11 @@ from typing_extensions import override -from crawlee._models import ( - BaseRequestData, - ProcessedRequest, - Request, - RequestQueueMetadata, -) from crawlee._utils.crypto import crypto_random_object_id from crawlee._utils.lru_cache import LRUCache from crawlee._utils.requests import unique_key_to_request_id from crawlee._utils.wait import wait_for_all_tasks_for_finish +from crawlee.base_storage_client._models import ProcessedRequest, RequestQueueMetadata from crawlee.events._types import Event from crawlee.storages._base_storage import BaseStorage from crawlee.storages._request_provider import RequestProvider @@ -26,9 +21,10 @@ if TYPE_CHECKING: from collections.abc import Sequence + from crawlee._request import BaseRequestData, Request from crawlee.base_storage_client import BaseStorageClient from crawlee.configuration import Configuration - from crawlee.events._event_manager import EventManager + from crawlee.events import EventManager logger = getLogger(__name__) diff --git a/tests/unit/_memory_storage_client/test_key_value_store_client.py b/tests/unit/_memory_storage_client/test_key_value_store_client.py index 85727abf3..34306dece 100644 --- a/tests/unit/_memory_storage_client/test_key_value_store_client.py +++ b/tests/unit/_memory_storage_client/test_key_value_store_client.py @@ -10,10 +10,10 @@ import pytest from crawlee._consts import METADATA_FILENAME -from crawlee._models import KeyValueStoreMetadata, KeyValueStoreRecordMetadata from crawlee._utils.crypto import crypto_random_object_id from crawlee._utils.data_processing import maybe_parse_body from crawlee._utils.file import json_dumps +from crawlee.base_storage_client._models import KeyValueStoreMetadata, KeyValueStoreRecordMetadata if TYPE_CHECKING: from pathlib import Path diff --git a/tests/unit/basic_crawler/test_basic_crawler.py b/tests/unit/basic_crawler/test_basic_crawler.py index 51b3fbd9f..e8f0104f0 100644 --- a/tests/unit/basic_crawler/test_basic_crawler.py +++ b/tests/unit/basic_crawler/test_basic_crawler.py @@ -15,7 +15,7 @@ import pytest from crawlee import ConcurrencySettings, EnqueueStrategy, Glob -from crawlee._models import BaseRequestData, Request +from crawlee._request import BaseRequestData, Request from crawlee._types import AddRequestsKwargs, BasicCrawlingContext, HttpHeaders from crawlee.basic_crawler import BasicCrawler from crawlee.configuration import Configuration diff --git a/tests/unit/parsel_crawler/test_parsel_crawler.py b/tests/unit/parsel_crawler/test_parsel_crawler.py index e9d8ae954..82b8d2484 100644 --- a/tests/unit/parsel_crawler/test_parsel_crawler.py +++ b/tests/unit/parsel_crawler/test_parsel_crawler.py @@ -9,7 +9,7 @@ from httpx import Response from crawlee import ConcurrencySettings -from crawlee._models import BaseRequestData +from crawlee._request import BaseRequestData from crawlee.parsel_crawler import ParselCrawler from crawlee.storages import RequestList diff --git a/tests/unit/storages/test_request_queue.py b/tests/unit/storages/test_request_queue.py index fe3f6d1dc..f89c40358 100644 --- a/tests/unit/storages/test_request_queue.py +++ b/tests/unit/storages/test_request_queue.py @@ -6,7 +6,7 @@ import pytest -from crawlee._models import BaseRequestData, Request +from crawlee._request import BaseRequestData, Request from crawlee.storages import RequestQueue if TYPE_CHECKING: