From 456b10c08f3995a366db6a17d18d86e2da92cad3 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Tue, 24 Sep 2024 13:32:12 +0200 Subject: [PATCH] use payload, not data --- src/crawlee/_request.py | 32 ++++++++++---------- src/crawlee/_types.py | 2 +- src/crawlee/_utils/requests.py | 12 ++++---- src/crawlee/http_clients/_base.py | 6 +++- src/crawlee/http_clients/_httpx.py | 10 +++--- src/crawlee/http_clients/curl_impersonate.py | 10 +++--- tests/unit/_utils/test_requests.py | 4 +-- 7 files changed, 40 insertions(+), 36 deletions(-) diff --git a/src/crawlee/_request.py b/src/crawlee/_request.py index 504bd8ecd..e78b4699e 100644 --- a/src/crawlee/_request.py +++ b/src/crawlee/_request.py @@ -20,7 +20,7 @@ ) from typing_extensions import Self -from crawlee._types import EnqueueStrategy, HttpData, HttpHeaders, HttpMethod, HttpQueryParams +from crawlee._types import EnqueueStrategy, HttpHeaders, HttpMethod, HttpPayload, HttpQueryParams from crawlee._utils.requests import compute_unique_key, unique_key_to_request_id from crawlee._utils.urls import extract_query_params, validate_http_url @@ -130,8 +130,8 @@ class BaseRequestData(BaseModel): query_params: Annotated[HttpQueryParams, Field(alias='queryParams', default_factory=dict)] = {} """URL query parameters.""" - data: Annotated[HttpData, Field(default_factory=dict)] = {} - """Data to be sent in the request body (payload).""" + payload: Annotated[HttpPayload, Field(default_factory=dict)] = {} + """Data to be sent in the request body.""" user_data: Annotated[ dict[str, JsonValue], # Internally, the model contains `UserData`, this is just for convenience @@ -167,7 +167,7 @@ def from_url( method: HttpMethod = 'GET', headers: HttpHeaders | None = None, query_params: HttpQueryParams | None = None, - data: HttpData | None = None, + payload: HttpPayload | None = None, label: str | None = None, unique_key: str | None = None, id: str | None = None, @@ -178,12 +178,12 @@ def from_url( """Create a new `BaseRequestData` instance from a URL. See `Request.from_url` for more details.""" headers = headers or HttpHeaders() query_params = query_params or {} - data = data or {} + payload = payload or {} unique_key = unique_key or compute_unique_key( url, method=method, - data=data, + payload=payload, keep_url_fragment=keep_url_fragment, use_extended_unique_key=use_extended_unique_key, ) @@ -195,7 +195,7 @@ def from_url( unique_key=unique_key, id=id, method=method, - data=data, + payload=payload, **kwargs, ) @@ -216,7 +216,7 @@ class Request(BaseRequestData): The `Request` class is one of the core components in Crawlee, utilized by various components such as request providers, HTTP clients, crawlers, and more. It encapsulates the essential data for executing web requests, - including the URL, HTTP method, headers, data (payload), and user data. The user data allows custom information + including the URL, HTTP method, headers, payload, and user data. The user data allows custom information to be stored and persisted throughout the request lifecycle, including its retries. Key functionalities include managing the request's identifier (`id`), unique key (`unique_key`) that is used @@ -247,7 +247,7 @@ def from_url( method: HttpMethod = 'GET', headers: HttpHeaders | None = None, query_params: HttpQueryParams | None = None, - data: HttpData | None = None, + payload: HttpPayload | None = None, label: str | None = None, unique_key: str | None = None, id: str | None = None, @@ -258,16 +258,16 @@ def from_url( """Create a new `Request` instance from a URL. 
This is the recommended constructor for creating new `Request` instances. It generates a `Request` object from
-        a given URL with additional options to customize HTTP method, data (payload), unique key, and other request
+        a given URL with additional options to customize HTTP method, payload, unique key, and other request
         properties. If no `unique_key` or `id` is provided, they are computed automatically based on the URL,
-        method and data (payload). It depends on the `keep_url_fragment` and `use_extended_unique_key` flags.
+        method and payload. The computation depends on the `keep_url_fragment` and `use_extended_unique_key` flags.

         Args:
             url: The URL of the request.
             method: The HTTP method of the request.
             headers: The HTTP headers of the request.
             query_params: The query parameters of the URL.
-            data: The data to be sent as the request body (payload). Typically used with 'POST' or 'PUT' requests.
+            payload: The data to be sent as the request body. Typically used with 'POST' or 'PUT' requests.
             label: A custom label to differentiate between request types. This is stored in `user_data`, and it is
                 used for request routing (different requests go to different handlers).
             unique_key: A unique key identifying the request. If not provided, it is automatically computed based on
                 `unique_key`.
             keep_url_fragment: Determines whether the URL fragment (e.g., `#section`) should be included in the
                 `unique_key` computation. This is only relevant when `unique_key` is not provided.
-            use_extended_unique_key: Determines whether to include the HTTP method and data in the `unique_key`
+            use_extended_unique_key: Determines whether to include the HTTP method and payload in the `unique_key`
                 computation. This is only relevant when `unique_key` is not provided.
             **kwargs: Additional request properties.
         """
         headers = headers or HttpHeaders()
         query_params = query_params or {}
-        data = data or {}
+        payload = payload or {}

         unique_key = unique_key or compute_unique_key(
             url,
             method=method,
-            data=data,
+            payload=payload,
             keep_url_fragment=keep_url_fragment,
             use_extended_unique_key=use_extended_unique_key,
         )
@@ -299,7 +299,7 @@
         request = cls(
             url=url,
             unique_key=unique_key,
             id=id,
             method=method,
-            data=data,
+            payload=payload,
             **kwargs,
         )

diff --git a/src/crawlee/_types.py b/src/crawlee/_types.py
index 1505dd09a..f9107c1ef 100644
--- a/src/crawlee/_types.py
+++ b/src/crawlee/_types.py
@@ -28,7 +28,7 @@

 HttpQueryParams: TypeAlias = dict[str, Any]

-HttpData: TypeAlias = dict[str, Any]
+HttpPayload: TypeAlias = dict[str, Any]

 class EnqueueStrategy(str, Enum):

diff --git a/src/crawlee/_utils/requests.py b/src/crawlee/_utils/requests.py
index 4122319ab..fd6c0bfaa 100644
--- a/src/crawlee/_utils/requests.py
+++ b/src/crawlee/_utils/requests.py
@@ -10,7 +10,7 @@
 from crawlee._utils.crypto import compute_short_hash

 if TYPE_CHECKING:
-    from crawlee._types import HttpData, HttpMethod
+    from crawlee._types import HttpMethod, HttpPayload

 logger = getLogger(__name__)

@@ -87,7 +87,7 @@ def normalize_url(url: str, *, keep_url_fragment: bool = False) -> str:
 def compute_unique_key(
     url: str,
     method: HttpMethod = 'GET',
-    data: HttpData | None = None,
+    payload: HttpPayload | None = None,
     *,
     keep_url_fragment: bool = False,
     use_extended_unique_key: bool = False,
@@ -95,13 +95,13 @@
     """Computes a unique key for caching & deduplication of requests.

     This function computes a unique key by normalizing the provided URL and method.
- If 'use_extended_unique_key' is True and a payload is provided, the payload is hashed and + If `use_extended_unique_key` is True and a payload is provided, the payload is hashed and included in the key. Otherwise, the unique key is just the normalized URL. Args: url: The request URL. method: The HTTP method, defaults to 'GET'. - data: The request data (payload), defaults to None. + payload: The data to be sent as the request body, defaults to None. keep_url_fragment: A flag indicating whether to keep the URL fragment, defaults to False. use_extended_unique_key: A flag indicating whether to include a hashed payload in the key, defaults to False. @@ -119,12 +119,12 @@ def compute_unique_key( # Compute and return the extended unique key if required. if use_extended_unique_key: - payload_in_bytes = b'' if data is None else str(data).encode('utf-8') + payload_in_bytes = b'' if payload is None else str(payload).encode('utf-8') payload_hash = compute_short_hash(payload_in_bytes) return f'{normalized_method}({payload_hash}):{normalized_url}' # Log information if there is a non-GET request with a payload. - if normalized_method != 'GET' and data: + if normalized_method != 'GET' and payload: logger.info( f'We have encountered a {normalized_method} Request with a payload. This is fine. Just letting you know ' 'that if your requests point to the same URL and differ only in method and payload, you should consider ' diff --git a/src/crawlee/http_clients/_base.py b/src/crawlee/http_clients/_base.py index 4324348e9..c754a3c7f 100644 --- a/src/crawlee/http_clients/_base.py +++ b/src/crawlee/http_clients/_base.py @@ -10,7 +10,7 @@ if TYPE_CHECKING: from collections.abc import Iterable - from crawlee._types import HttpHeaders, HttpMethod + from crawlee._types import HttpHeaders, HttpMethod, HttpPayload, HttpQueryParams from crawlee.base_storage_client._models import Request from crawlee.proxy_configuration import ProxyInfo from crawlee.sessions import Session @@ -114,6 +114,8 @@ async def send_request( *, method: HttpMethod = 'GET', headers: HttpHeaders | None = None, + query_params: HttpQueryParams | None = None, + payload: HttpPayload | None = None, session: Session | None = None, proxy_info: ProxyInfo | None = None, ) -> HttpResponse: @@ -125,6 +127,8 @@ async def send_request( url: The URL to send the request to. method: The HTTP method to use. headers: The headers to include in the request. + query_params: The query parameters to include in the request. + payload: The data to be sent as the request body. session: The session associated with the request. proxy_info: The information about the proxy to be used. 
diff --git a/src/crawlee/http_clients/_httpx.py b/src/crawlee/http_clients/_httpx.py index be7e39121..f0636212c 100644 --- a/src/crawlee/http_clients/_httpx.py +++ b/src/crawlee/http_clients/_httpx.py @@ -6,7 +6,7 @@ import httpx from typing_extensions import override -from crawlee._types import HttpHeaders +from crawlee._types import HttpHeaders, HttpPayload, HttpQueryParams from crawlee._utils.blocked import ROTATE_PROXY_ERRORS from crawlee.errors import ProxyError from crawlee.fingerprint_suite import HeaderGenerator @@ -132,7 +132,7 @@ async def crawl( method=request.method, headers=headers, params=request.query_params, - data=request.data, + data=request.payload, cookies=session.cookies if session else None, extensions={'crawlee_session': session if self._persist_cookies_per_session else None}, ) @@ -166,8 +166,8 @@ async def send_request( *, method: HttpMethod = 'GET', headers: HttpHeaders | None = None, - query_params: dict[str, Any] | None = None, - data: dict[str, Any] | None = None, + query_params: HttpQueryParams | None = None, + payload: HttpPayload | None = None, session: Session | None = None, proxy_info: ProxyInfo | None = None, ) -> HttpResponse: @@ -179,7 +179,7 @@ async def send_request( method=method, headers=headers, params=query_params, - data=data, + data=payload, extensions={'crawlee_session': session if self._persist_cookies_per_session else None}, ) diff --git a/src/crawlee/http_clients/curl_impersonate.py b/src/crawlee/http_clients/curl_impersonate.py index ac1e22dc8..6cd985eda 100644 --- a/src/crawlee/http_clients/curl_impersonate.py +++ b/src/crawlee/http_clients/curl_impersonate.py @@ -25,7 +25,7 @@ from curl_cffi.requests import Response - from crawlee._types import HttpHeaders, HttpMethod + from crawlee._types import HttpHeaders, HttpMethod, HttpPayload, HttpQueryParams from crawlee.base_storage_client._models import Request from crawlee.proxy_configuration import ProxyInfo from crawlee.sessions import Session @@ -120,7 +120,7 @@ async def crawl( method=str(request.method.upper()), # type: ignore headers=request.headers, params=request.query_params, - data=request.data, + data=request.payload, cookies=session.cookies if session else None, allow_redirects=True, ) @@ -151,8 +151,8 @@ async def send_request( *, method: HttpMethod = 'GET', headers: HttpHeaders | None = None, - query_params: dict[str, Any] | None = None, - data: dict[str, Any] | None = None, + query_params: HttpQueryParams | None = None, + payload: HttpPayload | None = None, session: Session | None = None, proxy_info: ProxyInfo | None = None, ) -> HttpResponse: @@ -165,7 +165,7 @@ async def send_request( method=method.upper(), # type: ignore headers=headers, params=query_params, - data=data, + data=payload, cookies=session.cookies if session else None, allow_redirects=True, ) diff --git a/tests/unit/_utils/test_requests.py b/tests/unit/_utils/test_requests.py index 5dfee6aab..ba744b37d 100644 --- a/tests/unit/_utils/test_requests.py +++ b/tests/unit/_utils/test_requests.py @@ -7,7 +7,7 @@ from crawlee._utils.requests import compute_unique_key, normalize_url, unique_key_to_request_id if TYPE_CHECKING: - from crawlee._types import HttpData, HttpMethod + from crawlee._types import HttpMethod, HttpPayload def test_unique_key_to_request_id_length() -> None: @@ -107,7 +107,7 @@ def test_normalize_url(url: str, expected_output: str, *, keep_url_fragment: boo def test_compute_unique_key( url: str, method: HttpMethod, - data: HttpData, + data: HttpPayload, *, keep_url_fragment: bool, 
use_extended_unique_key: bool,
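
For reference, the renamed `compute_unique_key` keeps its documented behavior. A minimal sketch (not part of the patch, using the internal `crawlee._utils.requests` module path shown in the hunks above):

    from crawlee._utils.requests import compute_unique_key

    url = 'https://example.com/api'

    # Without the extended key, two POSTs that differ only in payload collapse
    # into the same key (the normalized URL); the function merely logs a hint.
    key_a = compute_unique_key(url, method='POST', payload={'q': 'a'})
    key_b = compute_unique_key(url, method='POST', payload={'q': 'b'})
    assert key_a == key_b

    # With use_extended_unique_key=True, the method and a short hash of the
    # payload are folded in as 'POST(<payload_hash>):<normalized_url>', so the
    # two requests keep distinct keys.
    ext_a = compute_unique_key(url, method='POST', payload={'q': 'a'}, use_extended_unique_key=True)
    ext_b = compute_unique_key(url, method='POST', payload={'q': 'b'}, use_extended_unique_key=True)
    assert ext_a != ext_b
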