refactor!: declare private and public interface #456

Merged 10 commits on Aug 23, 2024
@@ -1,7 +1,7 @@
 import asyncio
 
+from crawlee import EnqueueStrategy
 from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
-from crawlee.enqueue_strategy import EnqueueStrategy
 
 
 async def main() -> None:
@@ -1,7 +1,7 @@
 import asyncio
 
+from crawlee import EnqueueStrategy
 from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
-from crawlee.enqueue_strategy import EnqueueStrategy
 
 
 async def main() -> None:
@@ -1,7 +1,7 @@
 import asyncio
 
+from crawlee import EnqueueStrategy
 from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
-from crawlee.enqueue_strategy import EnqueueStrategy
 
 
 async def main() -> None:
@@ -1,7 +1,7 @@
 import asyncio
 
+from crawlee import EnqueueStrategy
 from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
-from crawlee.enqueue_strategy import EnqueueStrategy
 
 
 async def main() -> None:
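The four documentation snippets above all switch to importing EnqueueStrategy from the package root. For context, a minimal sketch of what such an example could look like after the change (the target URL and handler body are illustrative, not taken from the docs):

import asyncio

from crawlee import EnqueueStrategy
from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler()

    @crawler.router.default_handler
    async def handler(context: BeautifulSoupCrawlingContext) -> None:
        # Follow only links that stay on the same hostname as the current page.
        await context.enqueue_links(strategy=EnqueueStrategy.SAME_HOSTNAME)

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())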
2 changes: 1 addition & 1 deletion docs/examples/code/fill_and_submit_web_form_crawler.py
@@ -1,7 +1,7 @@
 import asyncio
 
+from crawlee import Request
 from crawlee.http_crawler import HttpCrawler, HttpCrawlingContext
-from crawlee.models import Request
 
 
 async def main() -> None:
2 changes: 1 addition & 1 deletion docs/examples/code/fill_and_submit_web_form_request.py
@@ -1,4 +1,4 @@
-from crawlee.models import Request
+from crawlee import Request
 
 # Prepare a POST request to the form endpoint.
 request = Request.from_url(
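With Request re-exported from the package root, the form-submission example might be completed roughly as follows; the endpoint URL, the form data, and the exact keyword arguments accepted by Request.from_url (method, payload) are assumptions to verify against the current API:

from crawlee import Request

# Prepare a POST request to the form endpoint (URL and payload are placeholders).
request = Request.from_url(
    url='https://httpbin.org/post',
    method='POST',
    payload='custname=John+Doe&custtel=1234567890',
)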
2 changes: 1 addition & 1 deletion docs/introduction/code/08_routes.py
@@ -1,5 +1,5 @@
+from crawlee.basic_crawler import Router
 from crawlee.playwright_crawler import PlaywrightCrawlingContext
-from crawlee.router import Router
 
 router = Router[PlaywrightCrawlingContext]()
 
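Router is now imported from crawlee.basic_crawler instead of the now-private crawlee.router module. A small sketch of how handlers are typically attached to it, assuming the usual default_handler and label-based handler decorators:

from crawlee.basic_crawler import Router
from crawlee.playwright_crawler import PlaywrightCrawlingContext

router = Router[PlaywrightCrawlingContext]()


@router.default_handler
async def default_handler(context: PlaywrightCrawlingContext) -> None:
    # Handles requests that carry no label.
    context.log.info(f'Processing {context.request.url}')


@router.handler('DETAIL')
async def detail_handler(context: PlaywrightCrawlingContext) -> None:
    # Handles requests enqueued with label='DETAIL'.
    await context.push_data({'url': context.request.url})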
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -98,7 +98,7 @@ playwright = ["playwright"]
 parsel = ["parsel"]
 
 [tool.poetry.scripts]
-crawlee = "crawlee.cli:cli"
+crawlee = "crawlee._cli:cli"
 
 [tool.ruff]
 line-length = 120
2 changes: 2 additions & 0 deletions src/crawlee/__init__.py
@@ -1,5 +1,7 @@
 from importlib import metadata
 
+from ._models import Request
+from ._types import ConcurrencySettings, EnqueueStrategy
 from ._utils.globs import Glob
 
 __version__ = metadata.version('crawlee')
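With these two added lines, Request, ConcurrencySettings, and EnqueueStrategy join Glob in the package's public root. A quick sketch of the resulting import surface (values are illustrative):

from crawlee import ConcurrencySettings, EnqueueStrategy, Glob, Request

settings = ConcurrencySettings(min_concurrency=2, max_concurrency=16)
strategy = EnqueueStrategy.SAME_DOMAIN
docs_only = Glob('https://example.com/docs/**')
request = Request.from_url('https://example.com')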
5 changes: 5 additions & 0 deletions src/crawlee/_autoscaling/__init__.py
@@ -0,0 +1,5 @@
+from .autoscaled_pool import AutoscaledPool
+from .snapshotter import Snapshotter
+from .system_status import SystemStatus
+
+__all__ = ['AutoscaledPool', 'Snapshotter', 'SystemStatus']
@@ -9,12 +9,11 @@
 from logging import getLogger
 from typing import TYPE_CHECKING, Awaitable, Callable
 
+from crawlee._types import ConcurrencySettings
 from crawlee._utils.recurring_task import RecurringTask
 
 if TYPE_CHECKING:
-    from crawlee.autoscaling import SystemStatus
-
-__all__ = ['ConcurrencySettings', 'AutoscaledPool']
+    from crawlee._autoscaling import SystemStatus
 
 logger = getLogger(__name__)
 
@@ -23,48 +22,6 @@ class AbortError(Exception):
     """Raised when an AutoscaledPool run is aborted. Not for direct use."""
 
 
-class ConcurrencySettings:
-    """Concurrency settings for AutoscaledPool."""
-
-    def __init__(
-        self,
-        min_concurrency: int = 1,
-        max_concurrency: int = 200,
-        max_tasks_per_minute: float = float('inf'),
-        desired_concurrency: int | None = None,
-    ) -> None:
-        """Initialize the ConcurrencySettings.
-
-        Args:
-            min_concurrency: The minimum number of tasks running in parallel. If you set this value too high
-                with respect to the available system memory and CPU, your code might run extremely slow or crash.
-
-            max_concurrency: The maximum number of tasks running in parallel.
-
-            max_tasks_per_minute: The maximum number of tasks per minute the pool can run. By default, this is set
-                to infinity, but you can pass any positive, non-zero number.
-
-            desired_concurrency: The desired number of tasks that should be running parallel on the start of the pool,
-                if there is a large enough supply of them. By default, it is `min_concurrency`.
-        """
-        if desired_concurrency is not None and desired_concurrency < 1:
-            raise ValueError('desired_concurrency must be 1 or larger')
-
-        if min_concurrency < 1:
-            raise ValueError('min_concurrency must be 1 or larger')
-
-        if max_concurrency < min_concurrency:
-            raise ValueError('max_concurrency cannot be less than min_concurrency')
-
-        if max_tasks_per_minute <= 0:
-            raise ValueError('max_tasks_per_minute must be positive')
-
-        self.min_concurrency = min_concurrency
-        self.max_concurrency = max_concurrency
-        self.desired_concurrency = desired_concurrency if desired_concurrency is not None else min_concurrency
-        self.max_tasks_per_minute = max_tasks_per_minute
-
-
 class _AutoscaledPoolRun:
     def __init__(self) -> None:
         self.worker_tasks = list[asyncio.Task]()
File renamed without changes.
@@ -8,10 +8,10 @@
 
 import psutil
 
+from crawlee._autoscaling.types import ClientSnapshot, CpuSnapshot, EventLoopSnapshot, MemorySnapshot, Snapshot
 from crawlee._utils.byte_size import ByteSize
 from crawlee._utils.recurring_task import RecurringTask
-from crawlee.autoscaling.types import ClientSnapshot, CpuSnapshot, EventLoopSnapshot, MemorySnapshot, Snapshot
-from crawlee.events.types import Event, EventSystemInfoData
+from crawlee.events._types import Event, EventSystemInfoData
 
 if TYPE_CHECKING:
     from types import TracebackType
@@ -8,11 +8,11 @@
 
 from more_itertools import pairwise
 
+from crawlee._autoscaling.types import LoadRatioInfo, Snapshot, SystemInfo
 from crawlee._utils.math import compute_weighted_avg
-from crawlee.autoscaling.types import LoadRatioInfo, Snapshot, SystemInfo
 
 if TYPE_CHECKING:
-    from crawlee.autoscaling import Snapshotter
+    from crawlee._autoscaling import Snapshotter
 
 logger = getLogger(__name__)
 
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
3 changes: 1 addition & 2 deletions src/crawlee/models.py → src/crawlee/_models.py
@@ -10,10 +10,9 @@
 from pydantic import BaseModel, BeforeValidator, ConfigDict, Field
 from typing_extensions import Self, TypeVar
 
+from crawlee._types import EnqueueStrategy, HttpMethod
 from crawlee._utils.requests import compute_unique_key, unique_key_to_request_id
 from crawlee._utils.urls import extract_query_params, validate_http_url
-from crawlee.enqueue_strategy import EnqueueStrategy
-from crawlee.types import HttpMethod
 
 
 class BaseRequestData(BaseModel):
73 changes: 60 additions & 13 deletions src/crawlee/types.py → src/crawlee/_types.py
@@ -12,21 +12,68 @@
 
 if TYPE_CHECKING:
     from crawlee import Glob
-    from crawlee.enqueue_strategy import EnqueueStrategy
-    from crawlee.http_clients.base import HttpResponse
-    from crawlee.models import BaseRequestData, DatasetItemsListPage, Request
+    from crawlee._models import BaseRequestData, DatasetItemsListPage, Request
+    from crawlee.http_clients import HttpResponse
     from crawlee.proxy_configuration import ProxyInfo
-    from crawlee.sessions.session import Session
-    from crawlee.storages.dataset import ExportToKwargs, GetDataKwargs, PushDataKwargs
+    from crawlee.sessions._session import Session
+    from crawlee.storages._dataset import ExportToKwargs, GetDataKwargs, PushDataKwargs
 
 # Type for representing json-serializable values. It's close enough to the real thing supported
 # by json.parse, and the best we can do until mypy supports recursive types. It was suggested
 # in a discussion with (and approved by) Guido van Rossum, so I'd consider it correct enough.
-JSONSerializable: TypeAlias = Union[str, int, float, bool, None, dict[str, Any], list[Any]]
+JsonSerializable: TypeAlias = Union[str, int, float, bool, None, dict[str, Any], list[Any]]
 
 HttpMethod: TypeAlias = Literal['GET', 'HEAD', 'POST', 'PUT', 'DELETE', 'CONNECT', 'OPTIONS', 'TRACE', 'PATCH']
 
 
+class EnqueueStrategy(str, Enum):
+    """Strategy for deciding which links should be followed and which ones should be ignored."""
+
+    ALL = 'all'
+    SAME_DOMAIN = 'same-domain'
+    SAME_HOSTNAME = 'same-hostname'
+    SAME_ORIGIN = 'same-origin'
+
+
+class ConcurrencySettings:
+    """Concurrency settings for AutoscaledPool."""
+
+    def __init__(
+        self,
+        min_concurrency: int = 1,
+        max_concurrency: int = 200,
+        max_tasks_per_minute: float = float('inf'),
+        desired_concurrency: int | None = None,
+    ) -> None:
+        """Creates a new instance.
+
+        Args:
+            min_concurrency: The minimum number of tasks running in parallel. If you set this value too high
+                with respect to the available system memory and CPU, your code might run extremely slow or crash.
+            max_concurrency: The maximum number of tasks running in parallel.
+            max_tasks_per_minute: The maximum number of tasks per minute the pool can run. By default, this is set
+                to infinity, but you can pass any positive, non-zero number.
+            desired_concurrency: The desired number of tasks that should be running parallel on the start of the pool,
+                if there is a large enough supply of them. By default, it is `min_concurrency`.
+        """
+        if desired_concurrency is not None and desired_concurrency < 1:
+            raise ValueError('desired_concurrency must be 1 or larger')
+
+        if min_concurrency < 1:
+            raise ValueError('min_concurrency must be 1 or larger')
+
+        if max_concurrency < min_concurrency:
+            raise ValueError('max_concurrency cannot be less than min_concurrency')
+
+        if max_tasks_per_minute <= 0:
+            raise ValueError('max_tasks_per_minute must be positive')
+
+        self.min_concurrency = min_concurrency
+        self.max_concurrency = max_concurrency
+        self.desired_concurrency = desired_concurrency if desired_concurrency is not None else min_concurrency
+        self.max_tasks_per_minute = max_tasks_per_minute
+
+
 class StorageTypes(str, Enum):
     """Possible Crawlee storage types."""
 
@@ -58,7 +105,7 @@ class AddRequestsFunction(Protocol):
     request provider and adds the requests to it.
     """
 
-    def __call__(  # noqa: D102
+    def __call__(
         self,
         requests: Sequence[str | BaseRequestData | Request],
         **kwargs: Unpack[AddRequestsKwargs],
@@ -72,7 +119,7 @@ class GetDataFunction(Protocol):
     dataset and then retrieves the data based on the provided parameters.
     """
 
-    def __call__(  # noqa: D102
+    def __call__(
         self,
         dataset_id: str | None = None,
         dataset_name: str | None = None,
@@ -87,9 +134,9 @@ class PushDataFunction(Protocol):
     dataset and then pushes the provided data to it.
     """
 
-    def __call__(  # noqa: D102
+    def __call__(
         self,
-        data: JSONSerializable,
+        data: JsonSerializable,
         dataset_id: str | None = None,
         dataset_name: str | None = None,
         **kwargs: Unpack[PushDataKwargs],
@@ -103,7 +150,7 @@ class ExportToFunction(Protocol):
     dataset and then exports its content to the key-value store.
     """
 
-    def __call__(  # noqa: D102
+    def __call__(
         self,
         dataset_id: str | None = None,
         dataset_name: str | None = None,
@@ -121,7 +168,7 @@ class EnqueueLinksFunction(Protocol):
         **kwargs: Additional arguments for the `add_requests` method.
     """
 
-    def __call__(  # noqa: D102
+    def __call__(
         self,
         *,
         selector: str = 'a',
@@ -134,7 +181,7 @@ def __call__(  # noqa: D102
 class SendRequestFunction(Protocol):
     """Type of a function for performing an HTTP request."""
 
-    def __call__(  # noqa: D102
+    def __call__(
         self,
         url: str,
         *,
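ConcurrencySettings and EnqueueStrategy now live in the private crawlee._types module and are re-exported from the package root (see the src/crawlee/__init__.py change above). A short sketch of the constructor validation defined here, using illustrative values:

from crawlee import ConcurrencySettings

# desired_concurrency falls back to min_concurrency when not provided.
settings = ConcurrencySettings(min_concurrency=4, max_concurrency=32, max_tasks_per_minute=600)
assert settings.desired_concurrency == 4

# Inconsistent values are rejected at construction time.
try:
    ConcurrencySettings(min_concurrency=10, max_concurrency=5)
except ValueError as error:
    print(error)  # max_concurrency cannot be less than min_concurrency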
2 changes: 1 addition & 1 deletion src/crawlee/_utils/data_processing.py
@@ -7,7 +7,7 @@
 from crawlee._utils.file import ContentType, is_content_type
 
 if TYPE_CHECKING:
-    from crawlee.types import StorageTypes
+    from crawlee._types import StorageTypes
 
 
 def filter_out_none_values_recursively(dictionary: dict, *, remove_empty_dicts: bool = False) -> dict | None:
5 changes: 0 additions & 5 deletions src/crawlee/autoscaling/__init__.py

This file was deleted.

14 changes: 7 additions & 7 deletions src/crawlee/base_storage_client/__init__.py
@@ -1,10 +1,10 @@
-from .base_dataset_client import BaseDatasetClient
-from .base_dataset_collection_client import BaseDatasetCollectionClient
-from .base_key_value_store_client import BaseKeyValueStoreClient
-from .base_key_value_store_collection_client import BaseKeyValueStoreCollectionClient
-from .base_request_queue_client import BaseRequestQueueClient
-from .base_request_queue_collection_client import BaseRequestQueueCollectionClient
-from .base_storage_client import BaseStorageClient
+from ._base_dataset_client import BaseDatasetClient
+from ._base_dataset_collection_client import BaseDatasetCollectionClient
+from ._base_key_value_store_client import BaseKeyValueStoreClient
+from ._base_key_value_store_collection_client import BaseKeyValueStoreCollectionClient
+from ._base_request_queue_client import BaseRequestQueueClient
+from ._base_request_queue_collection_client import BaseRequestQueueCollectionClient
+from ._base_storage_client import BaseStorageClient
 
 __all__ = [
     'BaseDatasetClient',
@@ -6,8 +6,8 @@
 if TYPE_CHECKING:
     from httpx import Response
 
-    from crawlee.models import DatasetItemsListPage, DatasetMetadata
-    from crawlee.types import JSONSerializable
+    from crawlee._models import DatasetItemsListPage, DatasetMetadata
+    from crawlee._types import JsonSerializable
 
 
 class BaseDatasetClient(ABC):
@@ -212,7 +212,7 @@ async def stream_items(
         """
 
     @abstractmethod
-    async def push_items(self, items: JSONSerializable) -> None:
+    async def push_items(self, items: JsonSerializable) -> None:
        """Push items to the dataset.
 
        Args:
@@ -4,7 +4,7 @@
 from typing import TYPE_CHECKING
 
 if TYPE_CHECKING:
-    from crawlee.models import DatasetListPage, DatasetMetadata
+    from crawlee._models import DatasetListPage, DatasetMetadata
 
 
 class BaseDatasetCollectionClient(ABC):
@@ -6,7 +6,7 @@
 
 if TYPE_CHECKING:
     from httpx import Response
 
-    from crawlee.models import KeyValueStoreListKeysPage, KeyValueStoreMetadata, KeyValueStoreRecord
+    from crawlee._models import KeyValueStoreListKeysPage, KeyValueStoreMetadata, KeyValueStoreRecord
 
 
 class BaseKeyValueStoreClient(ABC):
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from typing import TYPE_CHECKING

if TYPE_CHECKING:
from crawlee.models import KeyValueStoreListPage, KeyValueStoreMetadata
from crawlee._models import KeyValueStoreListPage, KeyValueStoreMetadata


class BaseKeyValueStoreCollectionClient(ABC):
Expand Down