use payload, not data

vdusek committed Sep 24, 2024
1 parent 8d54da3 commit 456b10c
Showing 7 changed files with 40 additions and 36 deletions.
32 changes: 16 additions & 16 deletions src/crawlee/_request.py

@@ -20,7 +20,7 @@
 )
 from typing_extensions import Self

-from crawlee._types import EnqueueStrategy, HttpData, HttpHeaders, HttpMethod, HttpQueryParams
+from crawlee._types import EnqueueStrategy, HttpHeaders, HttpMethod, HttpPayload, HttpQueryParams
 from crawlee._utils.requests import compute_unique_key, unique_key_to_request_id
 from crawlee._utils.urls import extract_query_params, validate_http_url

@@ -130,8 +130,8 @@ class BaseRequestData(BaseModel):
     query_params: Annotated[HttpQueryParams, Field(alias='queryParams', default_factory=dict)] = {}
     """URL query parameters."""

-    data: Annotated[HttpData, Field(default_factory=dict)] = {}
-    """Data to be sent in the request body (payload)."""
+    payload: Annotated[HttpPayload, Field(default_factory=dict)] = {}
+    """Data to be sent in the request body."""

     user_data: Annotated[
         dict[str, JsonValue],  # Internally, the model contains `UserData`, this is just for convenience
@@ -167,7 +167,7 @@ def from_url(
         method: HttpMethod = 'GET',
         headers: HttpHeaders | None = None,
         query_params: HttpQueryParams | None = None,
-        data: HttpData | None = None,
+        payload: HttpPayload | None = None,
         label: str | None = None,
         unique_key: str | None = None,
         id: str | None = None,
@@ -178,12 +178,12 @@
         """Create a new `BaseRequestData` instance from a URL. See `Request.from_url` for more details."""
         headers = headers or HttpHeaders()
         query_params = query_params or {}
-        data = data or {}
+        payload = payload or {}

         unique_key = unique_key or compute_unique_key(
             url,
             method=method,
-            data=data,
+            payload=payload,
             keep_url_fragment=keep_url_fragment,
             use_extended_unique_key=use_extended_unique_key,
         )
@@ -195,7 +195,7 @@
             unique_key=unique_key,
             id=id,
             method=method,
-            data=data,
+            payload=payload,
             **kwargs,
         )
@@ -216,7 +216,7 @@ class Request(BaseRequestData):
     The `Request` class is one of the core components in Crawlee, utilized by various components such as request
     providers, HTTP clients, crawlers, and more. It encapsulates the essential data for executing web requests,
-    including the URL, HTTP method, headers, data (payload), and user data. The user data allows custom information
+    including the URL, HTTP method, headers, payload, and user data. The user data allows custom information
     to be stored and persisted throughout the request lifecycle, including its retries.

     Key functionalities include managing the request's identifier (`id`), unique key (`unique_key`) that is used
@@ -247,7 +247,7 @@ def from_url(
         method: HttpMethod = 'GET',
         headers: HttpHeaders | None = None,
         query_params: HttpQueryParams | None = None,
-        data: HttpData | None = None,
+        payload: HttpPayload | None = None,
         label: str | None = None,
         unique_key: str | None = None,
         id: str | None = None,
@@ -258,16 +258,16 @@
         """Create a new `Request` instance from a URL.

         This is recommended constructor for creating new `Request` instances. It generates a `Request` object from
-        a given URL with additional options to customize HTTP method, data (payload), unique key, and other request
+        a given URL with additional options to customize HTTP method, payload, unique key, and other request
         properties. If no `unique_key` or `id` is provided, they are computed automatically based on the URL,
-        method and data (payload). It depends on the `keep_url_fragment` and `use_extended_unique_key` flags.
+        method and payload. It depends on the `keep_url_fragment` and `use_extended_unique_key` flags.

         Args:
             url: The URL of the request.
             method: The HTTP method of the request.
             headers: The HTTP headers of the request.
             query_params: The query parameters of the URL.
-            data: The data to be sent as the request body (payload). Typically used with 'POST' or 'PUT' requests.
+            payload: The data to be sent as the request body. Typically used with 'POST' or 'PUT' requests.
             label: A custom label to differentiate between request types. This is stored in `user_data`, and it is
                 used for request routing (different requests go to different handlers).
             unique_key: A unique key identifying the request. If not provided, it is automatically computed based on
@@ -276,18 +276,18 @@
                 `unique_key`.
             keep_url_fragment: Determines whether the URL fragment (e.g., `#section`) should be included in
                 the `unique_key` computation. This is only relevant when `unique_key` is not provided.
-            use_extended_unique_key: Determines whether to include the HTTP method and data in the `unique_key`
+            use_extended_unique_key: Determines whether to include the HTTP method and payload in the `unique_key`
                 computation. This is only relevant when `unique_key` is not provided.
             **kwargs: Additional request properties.
         """
         headers = headers or HttpHeaders()
         query_params = query_params or {}
-        data = data or {}
+        payload = payload or {}

         unique_key = unique_key or compute_unique_key(
             url,
             method=method,
-            data=data,
+            payload=payload,
             keep_url_fragment=keep_url_fragment,
             use_extended_unique_key=use_extended_unique_key,
         )
@@ -299,7 +299,7 @@
             unique_key=unique_key,
             id=id,
             method=method,
-            data=data,
+            payload=payload,
             **kwargs,
         )
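On the caller side only the keyword changes. A minimal sketch of building a request against this revision; the URL and body values are illustrative, and `Request` is imported from the module this diff touches:

    from crawlee._request import Request

    # The body now travels as `payload` (formerly `data`).
    request = Request.from_url(
        'https://example.com/submit',  # illustrative URL
        method='POST',
        payload={'key': 'value'},
        # Also fold the method and a payload hash into the unique key.
        use_extended_unique_key=True,
    )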
2 changes: 1 addition & 1 deletion src/crawlee/_types.py

@@ -28,7 +28,7 @@

 HttpQueryParams: TypeAlias = dict[str, Any]

-HttpData: TypeAlias = dict[str, Any]
+HttpPayload: TypeAlias = dict[str, Any]


 class EnqueueStrategy(str, Enum):
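The alias keeps the same underlying shape, so annotations migrate mechanically. A trivial, hypothetical illustration:

    from crawlee._types import HttpPayload

    payload: HttpPayload = {'key': 'value'}  # still a dict[str, Any] at this revision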
12 changes: 6 additions & 6 deletions src/crawlee/_utils/requests.py

@@ -10,7 +10,7 @@
 from crawlee._utils.crypto import compute_short_hash

 if TYPE_CHECKING:
-    from crawlee._types import HttpData, HttpMethod
+    from crawlee._types import HttpMethod, HttpPayload

 logger = getLogger(__name__)

@@ -87,21 +87,21 @@ def normalize_url(url: str, *, keep_url_fragment: bool = False) -> str:
 def compute_unique_key(
     url: str,
     method: HttpMethod = 'GET',
-    data: HttpData | None = None,
+    payload: HttpPayload | None = None,
     *,
     keep_url_fragment: bool = False,
     use_extended_unique_key: bool = False,
 ) -> str:
     """Computes a unique key for caching & deduplication of requests.

     This function computes a unique key by normalizing the provided URL and method.
-    If 'use_extended_unique_key' is True and a payload is provided, the payload is hashed and
+    If `use_extended_unique_key` is True and a payload is provided, the payload is hashed and
     included in the key. Otherwise, the unique key is just the normalized URL.

     Args:
         url: The request URL.
         method: The HTTP method, defaults to 'GET'.
-        data: The request data (payload), defaults to None.
+        payload: The data to be sent as the request body, defaults to None.
         keep_url_fragment: A flag indicating whether to keep the URL fragment, defaults to False.
         use_extended_unique_key: A flag indicating whether to include a hashed payload in the key, defaults to False.
@@ -119,12 +119,12 @@ def compute_unique_key(

     # Compute and return the extended unique key if required.
     if use_extended_unique_key:
-        payload_in_bytes = b'' if data is None else str(data).encode('utf-8')
+        payload_in_bytes = b'' if payload is None else str(payload).encode('utf-8')
         payload_hash = compute_short_hash(payload_in_bytes)
         return f'{normalized_method}({payload_hash}):{normalized_url}'

     # Log information if there is a non-GET request with a payload.
-    if normalized_method != 'GET' and data:
+    if normalized_method != 'GET' and payload:
         logger.info(
             f'We have encountered a {normalized_method} Request with a payload. This is fine. Just letting you know '
             'that if your requests point to the same URL and differ only in method and payload, you should consider '
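With `use_extended_unique_key=True`, a short hash of the payload is folded into the key, so requests that differ only in body deduplicate separately. A sketch against this revision (URLs and bodies are illustrative):

    from crawlee._utils.requests import compute_unique_key

    key_a = compute_unique_key(
        'https://example.com/api',
        method='POST',
        payload={'a': 1},
        use_extended_unique_key=True,
    )
    key_b = compute_unique_key(
        'https://example.com/api',
        method='POST',
        payload={'a': 2},
        use_extended_unique_key=True,
    )

    # Extended keys have the form 'POST(<payload_hash>):https://example.com/api'.
    assert key_a != key_b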
6 changes: 5 additions & 1 deletion src/crawlee/http_clients/_base.py

@@ -10,7 +10,7 @@
 if TYPE_CHECKING:
     from collections.abc import Iterable

-    from crawlee._types import HttpHeaders, HttpMethod
+    from crawlee._types import HttpHeaders, HttpMethod, HttpPayload, HttpQueryParams
     from crawlee.base_storage_client._models import Request
     from crawlee.proxy_configuration import ProxyInfo
     from crawlee.sessions import Session
@@ -114,6 +114,8 @@ async def send_request(
         *,
         method: HttpMethod = 'GET',
         headers: HttpHeaders | None = None,
+        query_params: HttpQueryParams | None = None,
+        payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
     ) -> HttpResponse:
@@ -125,6 +127,8 @@ async def send_request(
             url: The URL to send the request to.
             method: The HTTP method to use.
             headers: The headers to include in the request.
+            query_params: The query parameters to include in the request.
+            payload: The data to be sent as the request body.
             session: The session associated with the request.
             proxy_info: The information about the proxy to be used.
10 changes: 5 additions & 5 deletions src/crawlee/http_clients/_httpx.py

@@ -6,7 +6,7 @@
 import httpx
 from typing_extensions import override

-from crawlee._types import HttpHeaders
+from crawlee._types import HttpHeaders, HttpPayload, HttpQueryParams
 from crawlee._utils.blocked import ROTATE_PROXY_ERRORS
 from crawlee.errors import ProxyError
 from crawlee.fingerprint_suite import HeaderGenerator
@@ -132,7 +132,7 @@ async def crawl(
             method=request.method,
             headers=headers,
             params=request.query_params,
-            data=request.data,
+            data=request.payload,
             cookies=session.cookies if session else None,
             extensions={'crawlee_session': session if self._persist_cookies_per_session else None},
         )
@@ -166,8 +166,8 @@ async def send_request(
         *,
         method: HttpMethod = 'GET',
         headers: HttpHeaders | None = None,
-        query_params: dict[str, Any] | None = None,
-        data: dict[str, Any] | None = None,
+        query_params: HttpQueryParams | None = None,
+        payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
     ) -> HttpResponse:
@@ -179,7 +179,7 @@ async def send_request(
             method=method,
             headers=headers,
             params=query_params,
-            data=data,
+            data=payload,
             extensions={'crawlee_session': session if self._persist_cookies_per_session else None},
         )
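A hedged usage sketch for the updated signature, assuming the `HttpxHttpClient` name exported from `crawlee.http_clients` (the endpoint and body are illustrative):

    import asyncio

    from crawlee.http_clients import HttpxHttpClient


    async def main() -> None:
        client = HttpxHttpClient()
        # The body is passed as `payload`; internally the client maps it to httpx's `data=`.
        response = await client.send_request(
            'https://example.com/api',
            method='POST',
            payload={'key': 'value'},
        )
        print(response.status_code)


    asyncio.run(main())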
10 changes: 5 additions & 5 deletions src/crawlee/http_clients/curl_impersonate.py

@@ -25,7 +25,7 @@

     from curl_cffi.requests import Response

-    from crawlee._types import HttpHeaders, HttpMethod
+    from crawlee._types import HttpHeaders, HttpMethod, HttpPayload, HttpQueryParams
     from crawlee.base_storage_client._models import Request
     from crawlee.proxy_configuration import ProxyInfo
     from crawlee.sessions import Session
@@ -120,7 +120,7 @@ async def crawl(
             method=str(request.method.upper()),  # type: ignore
             headers=request.headers,
             params=request.query_params,
-            data=request.data,
+            data=request.payload,
             cookies=session.cookies if session else None,
             allow_redirects=True,
         )
@@ -151,8 +151,8 @@ async def send_request(
         *,
         method: HttpMethod = 'GET',
         headers: HttpHeaders | None = None,
-        query_params: dict[str, Any] | None = None,
-        data: dict[str, Any] | None = None,
+        query_params: HttpQueryParams | None = None,
+        payload: HttpPayload | None = None,
         session: Session | None = None,
         proxy_info: ProxyInfo | None = None,
     ) -> HttpResponse:
@@ -165,7 +165,7 @@ async def send_request(
             method=method.upper(),  # type: ignore
             headers=headers,
             params=query_params,
-            data=data,
+            data=payload,
             cookies=session.cookies if session else None,
             allow_redirects=True,
         )
4 changes: 2 additions & 2 deletions tests/unit/_utils/test_requests.py

@@ -7,7 +7,7 @@
 from crawlee._utils.requests import compute_unique_key, normalize_url, unique_key_to_request_id

 if TYPE_CHECKING:
-    from crawlee._types import HttpData, HttpMethod
+    from crawlee._types import HttpMethod, HttpPayload


 def test_unique_key_to_request_id_length() -> None:
@@ -107,7 +107,7 @@ def test_normalize_url(url: str, expected_output: str, *, keep_url_fragment: boo
 def test_compute_unique_key(
     url: str,
     method: HttpMethod,
-    data: HttpData,
+    data: HttpPayload,
     *,
     keep_url_fragment: bool,
     use_extended_unique_key: bool,
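A complementary parametrized case one might add for the rename (a hypothetical sketch, not part of this commit): without `use_extended_unique_key`, the key stays the normalized URL regardless of method and payload.

    import pytest

    from crawlee._utils.requests import compute_unique_key, normalize_url


    @pytest.mark.parametrize(
        ('method', 'payload'),
        [
            ('GET', None),
            ('POST', {'key': 'value'}),  # logs an informational message, key is unchanged
        ],
    )
    def test_key_defaults_to_normalized_url(method, payload) -> None:
        url = 'https://example.com/path'
        assert compute_unique_key(url, method, payload) == normalize_url(url)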
