refactor!: declare private and public interface #456

Merged 10 commits on Aug 23, 2024
@@ -1,7 +1,7 @@
 import asyncio
 
+from crawlee import EnqueueStrategy
 from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
-from crawlee.enqueue_strategy import EnqueueStrategy
 
 
 async def main() -> None:
@@ -1,7 +1,7 @@
 import asyncio
 
+from crawlee import EnqueueStrategy
 from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
-from crawlee.enqueue_strategy import EnqueueStrategy
 
 
 async def main() -> None:
@@ -1,7 +1,7 @@
 import asyncio
 
+from crawlee import EnqueueStrategy
 from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
-from crawlee.enqueue_strategy import EnqueueStrategy
 
 
 async def main() -> None:
@@ -1,7 +1,7 @@
 import asyncio
 
+from crawlee import EnqueueStrategy
 from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
-from crawlee.enqueue_strategy import EnqueueStrategy
 
 
 async def main() -> None:
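The four documentation snippets above all switch to importing EnqueueStrategy from the package root. For context, a minimal sketch of what such an example could look like after the change (the target URL and handler body are illustrative, not taken from the docs):

import asyncio

from crawlee import EnqueueStrategy
from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler()

    @crawler.router.default_handler
    async def handler(context: BeautifulSoupCrawlingContext) -> None:
        # Follow only links that stay on the same hostname as the current page.
        await context.enqueue_links(strategy=EnqueueStrategy.SAME_HOSTNAME)

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())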
2 changes: 1 addition & 1 deletion docs/examples/code/fill_and_submit_web_form_crawler.py
@@ -1,7 +1,7 @@
 import asyncio
 
+from crawlee import Request
 from crawlee.http_crawler import HttpCrawler, HttpCrawlingContext
-from crawlee.models import Request
 
 
 async def main() -> None:
2 changes: 1 addition & 1 deletion docs/examples/code/fill_and_submit_web_form_request.py
@@ -1,4 +1,4 @@
-from crawlee.models import Request
+from crawlee import Request
 
 # Prepare a POST request to the form endpoint.
 request = Request.from_url(
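With Request re-exported from the package root, the form-submission example might be completed roughly as follows; the endpoint URL, the form data, and the exact keyword arguments accepted by Request.from_url (method, payload) are assumptions to verify against the current API:

from crawlee import Request

# Prepare a POST request to the form endpoint (URL and payload are placeholders).
request = Request.from_url(
    url='https://httpbin.org/post',
    method='POST',
    payload='custname=John+Doe&custtel=1234567890',
)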
2 changes: 1 addition & 1 deletion docs/introduction/code/08_routes.py
@@ -1,5 +1,5 @@
+from crawlee.basic_crawler import Router
 from crawlee.playwright_crawler import PlaywrightCrawlingContext
-from crawlee.router import Router
 
 router = Router[PlaywrightCrawlingContext]()
 
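Router is now imported from crawlee.basic_crawler instead of the now-private crawlee.router module. A small sketch of how handlers are typically attached to it, assuming the usual default_handler and label-based handler decorators:

from crawlee.basic_crawler import Router
from crawlee.playwright_crawler import PlaywrightCrawlingContext

router = Router[PlaywrightCrawlingContext]()


@router.default_handler
async def default_handler(context: PlaywrightCrawlingContext) -> None:
    # Handles requests that carry no label.
    context.log.info(f'Processing {context.request.url}')


@router.handler('DETAIL')
async def detail_handler(context: PlaywrightCrawlingContext) -> None:
    # Handles requests enqueued with label='DETAIL'.
    await context.push_data({'url': context.request.url})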
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -98,7 +98,7 @@ playwright = ["playwright"]
 parsel = ["parsel"]
 
 [tool.poetry.scripts]
-crawlee = "crawlee.cli:cli"
+crawlee = "crawlee._cli:cli"
 
 [tool.ruff]
 line-length = 120
2 changes: 2 additions & 0 deletions src/crawlee/__init__.py
@@ -1,5 +1,7 @@
 from importlib import metadata
 
+from ._models import Request
+from ._types import ConcurrencySettings, EnqueueStrategy
 from ._utils.globs import Glob
 
 __version__ = metadata.version('crawlee')
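With these two added lines, Request, ConcurrencySettings, and EnqueueStrategy join Glob in the package's public root. A quick sketch of the resulting import surface (values are illustrative):

from crawlee import ConcurrencySettings, EnqueueStrategy, Glob, Request

settings = ConcurrencySettings(min_concurrency=2, max_concurrency=16)
strategy = EnqueueStrategy.SAME_DOMAIN
docs_only = Glob('https://example.com/docs/**')
request = Request.from_url('https://example.com')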
5 changes: 5 additions & 0 deletions src/crawlee/_autoscaling/__init__.py
@@ -0,0 +1,5 @@
+from .autoscaled_pool import AutoscaledPool
+from .snapshotter import Snapshotter
+from .system_status import SystemStatus
+
+__all__ = ['AutoscaledPool', 'Snapshotter', 'SystemStatus']
@@ -9,12 +9,11 @@
 from logging import getLogger
 from typing import TYPE_CHECKING, Awaitable, Callable
 
+from crawlee._types import ConcurrencySettings
 from crawlee._utils.recurring_task import RecurringTask
 
 if TYPE_CHECKING:
-    from crawlee.autoscaling import SystemStatus
-
-__all__ = ['ConcurrencySettings', 'AutoscaledPool']
+    from crawlee._autoscaling import SystemStatus
 
 logger = getLogger(__name__)
 
@@ -23,48 +22,6 @@ class AbortError(Exception):
     """Raised when an AutoscaledPool run is aborted. Not for direct use."""
 
 
-class ConcurrencySettings:
-    """Concurrency settings for AutoscaledPool."""
-
-    def __init__(
-        self,
-        min_concurrency: int = 1,
-        max_concurrency: int = 200,
-        max_tasks_per_minute: float = float('inf'),
-        desired_concurrency: int | None = None,
-    ) -> None:
-        """Initialize the ConcurrencySettings.
-
-        Args:
-            min_concurrency: The minimum number of tasks running in parallel. If you set this value too high
-                with respect to the available system memory and CPU, your code might run extremely slow or crash.
-
-            max_concurrency: The maximum number of tasks running in parallel.
-
-            max_tasks_per_minute: The maximum number of tasks per minute the pool can run. By default, this is set
-                to infinity, but you can pass any positive, non-zero number.
-
-            desired_concurrency: The desired number of tasks that should be running parallel on the start of the pool,
-                if there is a large enough supply of them. By default, it is `min_concurrency`.
-        """
-        if desired_concurrency is not None and desired_concurrency < 1:
-            raise ValueError('desired_concurrency must be 1 or larger')
-
-        if min_concurrency < 1:
-            raise ValueError('min_concurrency must be 1 or larger')
-
-        if max_concurrency < min_concurrency:
-            raise ValueError('max_concurrency cannot be less than min_concurrency')
-
-        if max_tasks_per_minute <= 0:
-            raise ValueError('max_tasks_per_minute must be positive')
-
-        self.min_concurrency = min_concurrency
-        self.max_concurrency = max_concurrency
-        self.desired_concurrency = desired_concurrency if desired_concurrency is not None else min_concurrency
-        self.max_tasks_per_minute = max_tasks_per_minute
-
-
 class _AutoscaledPoolRun:
     def __init__(self) -> None:
         self.worker_tasks = list[asyncio.Task]()
File renamed without changes.
@@ -8,10 +8,10 @@
 
 import psutil
 
+from crawlee._autoscaling.types import ClientSnapshot, CpuSnapshot, EventLoopSnapshot, MemorySnapshot, Snapshot
 from crawlee._utils.byte_size import ByteSize
 from crawlee._utils.recurring_task import RecurringTask
-from crawlee.autoscaling.types import ClientSnapshot, CpuSnapshot, EventLoopSnapshot, MemorySnapshot, Snapshot
-from crawlee.events.types import Event, EventSystemInfoData
+from crawlee.events._types import Event, EventSystemInfoData
 
 if TYPE_CHECKING:
     from types import TracebackType
@@ -8,11 +8,11 @@
 
 from more_itertools import pairwise
 
+from crawlee._autoscaling.types import LoadRatioInfo, Snapshot, SystemInfo
 from crawlee._utils.math import compute_weighted_avg
-from crawlee.autoscaling.types import LoadRatioInfo, Snapshot, SystemInfo
 
 if TYPE_CHECKING:
-    from crawlee.autoscaling import Snapshotter
+    from crawlee._autoscaling import Snapshotter
 
 logger = getLogger(__name__)
 
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
3 changes: 1 addition & 2 deletions src/crawlee/models.py → src/crawlee/_models.py
@@ -10,10 +10,9 @@
 from pydantic import BaseModel, BeforeValidator, ConfigDict, Field
 from typing_extensions import Self, TypeVar
 
+from crawlee._types import EnqueueStrategy, HttpMethod
 from crawlee._utils.requests import compute_unique_key, unique_key_to_request_id
 from crawlee._utils.urls import extract_query_params, validate_http_url
-from crawlee.enqueue_strategy import EnqueueStrategy
-from crawlee.types import HttpMethod
 
 
 class BaseRequestData(BaseModel):
73 changes: 60 additions & 13 deletions src/crawlee/types.py → src/crawlee/_types.py
@@ -12,21 +12,68 @@
 
 if TYPE_CHECKING:
     from crawlee import Glob
-    from crawlee.enqueue_strategy import EnqueueStrategy
-    from crawlee.http_clients.base import HttpResponse
-    from crawlee.models import BaseRequestData, DatasetItemsListPage, Request
+    from crawlee._models import BaseRequestData, DatasetItemsListPage, Request
+    from crawlee.http_clients import HttpResponse
     from crawlee.proxy_configuration import ProxyInfo
-    from crawlee.sessions.session import Session
-    from crawlee.storages.dataset import ExportToKwargs, GetDataKwargs, PushDataKwargs
+    from crawlee.sessions._session import Session
+    from crawlee.storages._dataset import ExportToKwargs, GetDataKwargs, PushDataKwargs
 
 # Type for representing json-serializable values. It's close enough to the real thing supported
 # by json.parse, and the best we can do until mypy supports recursive types. It was suggested
 # in a discussion with (and approved by) Guido van Rossum, so I'd consider it correct enough.
-JSONSerializable: TypeAlias = Union[str, int, float, bool, None, dict[str, Any], list[Any]]
+JsonSerializable: TypeAlias = Union[str, int, float, bool, None, dict[str, Any], list[Any]]
 
 HttpMethod: TypeAlias = Literal['GET', 'HEAD', 'POST', 'PUT', 'DELETE', 'CONNECT', 'OPTIONS', 'TRACE', 'PATCH']
 
 
+class EnqueueStrategy(str, Enum):
+    """Strategy for deciding which links should be followed and which ones should be ignored."""
+
+    ALL = 'all'
+    SAME_DOMAIN = 'same-domain'
+    SAME_HOSTNAME = 'same-hostname'
+    SAME_ORIGIN = 'same-origin'
+
+
+class ConcurrencySettings:
+    """Concurrency settings for AutoscaledPool."""
+
+    def __init__(
+        self,
+        min_concurrency: int = 1,
+        max_concurrency: int = 200,
+        max_tasks_per_minute: float = float('inf'),
+        desired_concurrency: int | None = None,
+    ) -> None:
+        """Creates a new instance.
+
+        Args:
+            min_concurrency: The minimum number of tasks running in parallel. If you set this value too high
+                with respect to the available system memory and CPU, your code might run extremely slow or crash.
+            max_concurrency: The maximum number of tasks running in parallel.
+            max_tasks_per_minute: The maximum number of tasks per minute the pool can run. By default, this is set
+                to infinity, but you can pass any positive, non-zero number.
+            desired_concurrency: The desired number of tasks that should be running parallel on the start of the pool,
+                if there is a large enough supply of them. By default, it is `min_concurrency`.
+        """
+        if desired_concurrency is not None and desired_concurrency < 1:
+            raise ValueError('desired_concurrency must be 1 or larger')
+
+        if min_concurrency < 1:
+            raise ValueError('min_concurrency must be 1 or larger')
+
+        if max_concurrency < min_concurrency:
+            raise ValueError('max_concurrency cannot be less than min_concurrency')
+
+        if max_tasks_per_minute <= 0:
+            raise ValueError('max_tasks_per_minute must be positive')
+
+        self.min_concurrency = min_concurrency
+        self.max_concurrency = max_concurrency
+        self.desired_concurrency = desired_concurrency if desired_concurrency is not None else min_concurrency
+        self.max_tasks_per_minute = max_tasks_per_minute
+
+
 class StorageTypes(str, Enum):
     """Possible Crawlee storage types."""
 
@@ -58,7 +105,7 @@ class AddRequestsFunction(Protocol):
     request provider and adds the requests to it.
     """
 
-    def __call__(  # noqa: D102
+    def __call__(
         self,
         requests: Sequence[str | BaseRequestData | Request],
         **kwargs: Unpack[AddRequestsKwargs],
@@ -72,7 +119,7 @@ class GetDataFunction(Protocol):
     dataset and then retrieves the data based on the provided parameters.
     """
 
-    def __call__(  # noqa: D102
+    def __call__(
         self,
         dataset_id: str | None = None,
         dataset_name: str | None = None,
@@ -87,9 +134,9 @@ class PushDataFunction(Protocol):
     dataset and then pushes the provided data to it.
     """
 
-    def __call__(  # noqa: D102
+    def __call__(
         self,
-        data: JSONSerializable,
+        data: JsonSerializable,
         dataset_id: str | None = None,
         dataset_name: str | None = None,
         **kwargs: Unpack[PushDataKwargs],
@@ -103,7 +150,7 @@ class ExportToFunction(Protocol):
     dataset and then exports its content to the key-value store.
     """
 
-    def __call__(  # noqa: D102
+    def __call__(
         self,
         dataset_id: str | None = None,
         dataset_name: str | None = None,
@@ -121,7 +168,7 @@ class EnqueueLinksFunction(Protocol):
         **kwargs: Additional arguments for the `add_requests` method.
     """
 
-    def __call__(  # noqa: D102
+    def __call__(
         self,
         *,
         selector: str = 'a',
@@ -134,7 +181,7 @@ def __call__(  # noqa: D102
 class SendRequestFunction(Protocol):
     """Type of a function for performing an HTTP request."""
 
-    def __call__(  # noqa: D102
+    def __call__(
         self,
         url: str,
         *,
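ConcurrencySettings and EnqueueStrategy now live in the private crawlee._types module and are re-exported from the package root (see the src/crawlee/__init__.py change above). A short sketch of the constructor validation defined here, using illustrative values:

from crawlee import ConcurrencySettings

# desired_concurrency falls back to min_concurrency when not provided.
settings = ConcurrencySettings(min_concurrency=4, max_concurrency=32, max_tasks_per_minute=600)
assert settings.desired_concurrency == 4

# Inconsistent values are rejected at construction time.
try:
    ConcurrencySettings(min_concurrency=10, max_concurrency=5)
except ValueError as error:
    print(error)  # max_concurrency cannot be less than min_concurrency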
2 changes: 1 addition & 1 deletion src/crawlee/_utils/data_processing.py
@@ -7,7 +7,7 @@
 from crawlee._utils.file import ContentType, is_content_type
 
 if TYPE_CHECKING:
-    from crawlee.types import StorageTypes
+    from crawlee._types import StorageTypes
 
 
 def filter_out_none_values_recursively(dictionary: dict, *, remove_empty_dicts: bool = False) -> dict | None:
5 changes: 0 additions & 5 deletions src/crawlee/autoscaling/__init__.py

This file was deleted.

14 changes: 7 additions & 7 deletions src/crawlee/base_storage_client/__init__.py
@@ -1,10 +1,10 @@
-from .base_dataset_client import BaseDatasetClient
-from .base_dataset_collection_client import BaseDatasetCollectionClient
-from .base_key_value_store_client import BaseKeyValueStoreClient
-from .base_key_value_store_collection_client import BaseKeyValueStoreCollectionClient
-from .base_request_queue_client import BaseRequestQueueClient
-from .base_request_queue_collection_client import BaseRequestQueueCollectionClient
-from .base_storage_client import BaseStorageClient
+from ._base_dataset_client import BaseDatasetClient
+from ._base_dataset_collection_client import BaseDatasetCollectionClient
+from ._base_key_value_store_client import BaseKeyValueStoreClient
+from ._base_key_value_store_collection_client import BaseKeyValueStoreCollectionClient
+from ._base_request_queue_client import BaseRequestQueueClient
+from ._base_request_queue_collection_client import BaseRequestQueueCollectionClient
+from ._base_storage_client import BaseStorageClient
 
 __all__ = [
     'BaseDatasetClient',
@@ -6,8 +6,8 @@
 if TYPE_CHECKING:
     from httpx import Response
 
-    from crawlee.models import DatasetItemsListPage, DatasetMetadata
-    from crawlee.types import JSONSerializable
+    from crawlee._models import DatasetItemsListPage, DatasetMetadata
+    from crawlee._types import JsonSerializable
 
 
 class BaseDatasetClient(ABC):
@@ -212,7 +212,7 @@ async def stream_items(
         """
 
     @abstractmethod
-    async def push_items(self, items: JSONSerializable) -> None:
+    async def push_items(self, items: JsonSerializable) -> None:
        """Push items to the dataset.
 
        Args:
@@ -4,7 +4,7 @@
 from typing import TYPE_CHECKING
 
 if TYPE_CHECKING:
-    from crawlee.models import DatasetListPage, DatasetMetadata
+    from crawlee._models import DatasetListPage, DatasetMetadata
 
 
 class BaseDatasetCollectionClient(ABC):
@@ -6,7 +6,7 @@
 
 if TYPE_CHECKING:
     from httpx import Response
 
-    from crawlee.models import KeyValueStoreListKeysPage, KeyValueStoreMetadata, KeyValueStoreRecord
+    from crawlee._models import KeyValueStoreListKeysPage, KeyValueStoreMetadata, KeyValueStoreRecord
 
 
 class BaseKeyValueStoreClient(ABC):
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from typing import TYPE_CHECKING

if TYPE_CHECKING:
from crawlee.models import KeyValueStoreListPage, KeyValueStoreMetadata
from crawlee._models import KeyValueStoreListPage, KeyValueStoreMetadata


class BaseKeyValueStoreCollectionClient(ABC):
Expand Down