Skip to content

Commit

Permalink
Mainly improve Request.from_url constructor
Browse files Browse the repository at this point in the history
  • Loading branch information
vdusek committed Sep 11, 2024
1 parent 66f5253 commit 5eca0bd
Show file tree
Hide file tree
Showing 6 changed files with 129 additions and 44 deletions.
81 changes: 71 additions & 10 deletions src/crawlee/_request.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,18 +76,39 @@ def from_url(
cls,
url: str,
*,
method: HttpMethod = 'GET',
payload: str | None = None,
label: str | None = None,
unique_key: str | None = None,
id: str | None = None,
keep_url_fragment: bool = False,
use_extended_unique_key: bool = False,
**kwargs: Any,
) -> Self:
"""Create a new `RequestData` instance from a URL."""
unique_key = unique_key or compute_unique_key(url)
result = cls(url=url, unique_key=unique_key, **kwargs)
"""Create a new `BaseRequestData` instance from a URL. See `Request.from_url` for more details."""
unique_key = unique_key or compute_unique_key(
url,
method=method,
payload=payload,
keep_url_fragment=keep_url_fragment,
use_extended_unique_key=use_extended_unique_key,
)

id = id or unique_key_to_request_id(unique_key)

request = cls(
url=url,
unique_key=unique_key,
id=id,
method=method,
payload=payload,
**kwargs,
)

if label is not None:
result.user_data['label'] = label
request.user_data['label'] = label

return result
return request

def get_query_param_from_url(self, param: str, *, default: str | None = None) -> str | None:
"""Get the value of a specific query parameter from the URL."""
Expand All @@ -112,21 +133,61 @@ def from_url(
cls,
url: str,
*,
method: HttpMethod = 'GET',
payload: str | None = None,
label: str | None = None,
unique_key: str | None = None,
id: str | None = None,
keep_url_fragment: bool = False,
use_extended_unique_key: bool = False,
**kwargs: Any,
) -> Self:
"""Create a new `RequestData` instance from a URL."""
unique_key = unique_key or compute_unique_key(url)
"""Create a new `Request` instance from a URL.
This is the recommended constructor for creating new `Request` instances. It generates a `Request` object from
a given URL with additional options to customize the HTTP method, payload, unique key, and other request
properties. If no `unique_key` or `id` is provided, they are computed automatically based on the URL,
method, and payload. The computation depends on the `keep_url_fragment` and `use_extended_unique_key` flags.
Args:
url: The URL of the request.
method: The HTTP method of the request.
payload: The data to be sent as the request body. Typically used with 'POST' or 'PUT' requests.
label: A custom label to differentiate between request types. This is stored in `user_data`, and it is
used for request routing (different requests go to different handlers).
unique_key: A unique key identifying the request. If not provided, it is automatically computed based on
the URL and other parameters. Requests with the same `unique_key` are treated as identical.
id: A unique identifier for the request. If not provided, it is automatically generated from the
`unique_key`.
keep_url_fragment: Determines whether the URL fragment (e.g., `#section`) should be included in
the `unique_key` computation. This is only relevant when `unique_key` is not provided.
use_extended_unique_key: Determines whether to include the HTTP method and payload in the `unique_key`
computation. This is only relevant when `unique_key` is not provided.
**kwargs: Additional request properties.
"""
unique_key = unique_key or compute_unique_key(
url,
method=method,
payload=payload,
keep_url_fragment=keep_url_fragment,
use_extended_unique_key=use_extended_unique_key,
)

id = id or unique_key_to_request_id(unique_key)

result = cls(url=url, unique_key=unique_key, id=id, **kwargs)
request = cls(
url=url,
unique_key=unique_key,
id=id,
method=method,
payload=payload,
**kwargs,
)

if label is not None:
result.user_data['label'] = label
request.user_data['label'] = label

return result
return request

@classmethod
def from_base_request_data(cls, base_request_data: BaseRequestData, *, id: str | None = None) -> Self:
Expand Down
5 changes: 3 additions & 2 deletions src/crawlee/_utils/requests.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def normalize_url(url: str, *, keep_url_fragment: bool = False) -> str:
def compute_unique_key(
url: str,
method: str = 'GET',
payload: bytes | None = None,
payload: str | None = None,
*,
keep_url_fragment: bool = False,
use_extended_unique_key: bool = False,
Expand Down Expand Up @@ -115,7 +115,8 @@ def compute_unique_key(

# Compute and return the extended unique key if required.
if use_extended_unique_key:
payload_hash = compute_short_hash(payload) if payload else ''
payload_in_bytes = payload.encode() if payload else b''
payload_hash = compute_short_hash(payload_in_bytes)
return f'{normalized_method}({payload_hash}):{normalized_url}'

# Log information if there is a non-GET request with a payload.
Expand Down
34 changes: 31 additions & 3 deletions src/crawlee/storages/_request_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from typing_extensions import override

from crawlee.base_storage_client._models import ProcessedRequest
from crawlee.storages._request_provider import RequestProvider

if TYPE_CHECKING:
Expand All @@ -15,7 +16,13 @@


class RequestList(RequestProvider):
"""Represents a (potentially very large) list of URLs to crawl."""
"""Represents a (potentially very large) list of URLs to crawl.
Disclaimer: The `RequestList` class is in an early, alpha version and is not fully implemented. It is currently
intended for testing purposes and small-scale projects. The current implementation is only in-memory storage
and is very limited. It will be (re)implemented in the future. For more details, see the GitHub issue:
https://github.com/apify/crawlee-python/issues/99. For production usage, we recommend using the `RequestQueue`.
"""

def __init__(
self,
Expand All @@ -25,8 +32,8 @@ def __init__(
"""Initialize the RequestList.
Args:
requests: the URLs (or crawling requests) to crawl
name: a name of the request list
requests: The request objects (or their string representations) to be added to the provider.
name: A name of the request list.
"""
self._name = name or ''
self._handled_count = 0
Expand Down Expand Up @@ -83,6 +90,27 @@ async def mark_request_as_handled(self, request: Request) -> None:
async def get_handled_count(self) -> int:
return self._handled_count

@override
async def add_request(
    self,
    request: str | Request,
    *,
    forefront: bool = False,
) -> ProcessedRequest:
    """Add a single request to this request list.

    The input is first passed through `self._transform_request`, and the result is placed at the
    beginning or the end of the internal `_requests` container. No deduplication is performed here:
    the returned `ProcessedRequest` always reports `was_already_handled=False` and
    `was_already_present=False`.

    Args:
        request: The request object (or its string representation) to be added.
        forefront: If True, the request is inserted at the beginning of the list; otherwise it is
            appended to the end.

    Returns:
        Information about the added request (its `id` and `unique_key`).
    """
    transformed = self._transform_request(request)

    # Pick the insertion end once, then perform the insertion.
    enqueue = self._requests.appendleft if forefront else self._requests.append
    enqueue(transformed)

    return ProcessedRequest(
        id=transformed.id,
        unique_key=transformed.unique_key,
        was_already_handled=False,
        was_already_present=False,
    )

@override
async def add_requests_batched(
self,
Expand Down
18 changes: 18 additions & 0 deletions src/crawlee/storages/_request_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,24 @@ async def mark_request_as_handled(self, request: Request) -> ProcessedRequest |
async def get_handled_count(self) -> int:
"""Returns the number of handled requests."""

@abstractmethod
async def add_request(
    self,
    request: str | Request,
    *,
    forefront: bool = False,
) -> ProcessedRequest:
    """Add a single request to the provider and store it in the underlying resource client.

    Args:
        request: The request object (or its string representation) to be added to the provider.
        forefront: If True, the request is added to the beginning of the provider; if False, it is
            added to the end.

    Returns:
        Information about the request addition to the provider.
    """

@abstractmethod
async def add_requests_batched(
self,
Expand Down
25 changes: 1 addition & 24 deletions src/crawlee/storages/_request_queue.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,36 +177,13 @@ async def drop(self, *, timeout: timedelta | None = None) -> None:
await self._resource_client.delete()
remove_storage_from_cache(storage_class=self.__class__, id=self._id, name=self._name)

@override
async def add_request(
self,
request: str | Request,
*,
forefront: bool = False,
) -> ProcessedRequest:
"""Adds a request to the `RequestQueue` while managing deduplication and positioning within the queue.
The deduplication of requests relies on the `unique_key` field within the request dictionary. If `unique_key`
exists, it remains unchanged; if it does not, it is generated based on the request's `url`, `method`,
and `payload` fields. The generation of `unique_key` can be influenced by the `keep_url_fragment` and
`use_extended_unique_key` flags, which dictate whether to include the URL fragment and the request's method
and payload, respectively, in its computation.
The request can be added to the forefront (beginning) or the back of the queue based on the `forefront`
parameter. Information about the request's addition to the queue, including whether it was already present or
handled, is returned in an output dictionary.
Args:
request: The request object to be added to the queue. Must include at least the `url` key.
Optionally, it can include the `method`, `payload` and `unique_key` keys.
forefront: If True, adds the request to the forefront of the queue; otherwise, adds it to the end.
keep_url_fragment: Determines whether the URL fragment (the part of the URL after '#') should be retained
in the `unique_key` computation.
use_extended_unique_key: Determines whether to use an extended `unique_key`, incorporating the request's
method and payload into the `unique_key` computation.
Returns:
Information about the processed request.
"""
request = self._transform_request(request)
self._last_activity = datetime.now(timezone.utc)

Expand Down
10 changes: 5 additions & 5 deletions tests/unit/_utils/test_requests.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,12 +77,12 @@ def test_normalize_url(url: str, expected_output: str, *, keep_url_fragment: boo
[
('http://example.com', 'GET', None, False, False, 'http://example.com'),
('http://example.com', 'POST', None, False, False, 'http://example.com'),
('http://example.com', 'GET', b'data', False, False, 'http://example.com'),
('http://example.com', 'GET', b'data', False, True, 'GET(3a6eb079):http://example.com'),
('http://example.com', 'POST', b'data', False, True, 'POST(3a6eb079):http://example.com'),
('http://example.com', 'GET', 'data', False, False, 'http://example.com'),
('http://example.com', 'GET', 'data', False, True, 'GET(3a6eb079):http://example.com'),
('http://example.com', 'POST', 'data', False, True, 'POST(3a6eb079):http://example.com'),
('http://example.com#fragment', 'GET', None, True, False, 'http://example.com#fragment'),
('http://example.com#fragment', 'GET', None, False, False, 'http://example.com'),
('http://example.com', 'DELETE', b'test', False, True, 'DELETE(9f86d081):http://example.com'),
('http://example.com', 'DELETE', 'test', False, True, 'DELETE(9f86d081):http://example.com'),
('https://example.com?utm_content=test', 'GET', None, False, False, 'https://example.com'),
('https://example.com?utm_content=test', 'GET', None, True, False, 'https://example.com'),
],
Expand All @@ -102,7 +102,7 @@ def test_normalize_url(url: str, expected_output: str, *, keep_url_fragment: boo
def test_compute_unique_key(
url: str,
method: str,
payload: bytes | None,
payload: str | None,
*,
keep_url_fragment: bool,
use_extended_unique_key: bool,
Expand Down

0 comments on commit 5eca0bd

Please sign in to comment.