diff --git a/src/crawlee/_request.py b/src/crawlee/_request.py index e6b821178..d1f4000f5 100644 --- a/src/crawlee/_request.py +++ b/src/crawlee/_request.py @@ -76,18 +76,39 @@ def from_url( cls, url: str, *, + method: HttpMethod = 'GET', + payload: str | None = None, label: str | None = None, unique_key: str | None = None, + id: str | None = None, + keep_url_fragment: bool = False, + use_extended_unique_key: bool = False, **kwargs: Any, ) -> Self: - """Create a new `RequestData` instance from a URL.""" - unique_key = unique_key or compute_unique_key(url) - result = cls(url=url, unique_key=unique_key, **kwargs) + """Create a new `BaseRequestData` instance from a URL. See `Request.from_url` for more details.""" + unique_key = unique_key or compute_unique_key( + url, + method=method, + payload=payload, + keep_url_fragment=keep_url_fragment, + use_extended_unique_key=use_extended_unique_key, + ) + + id = id or unique_key_to_request_id(unique_key) + + request = cls( + url=url, + unique_key=unique_key, + id=id, + method=method, + payload=payload, + **kwargs, + ) if label is not None: - result.user_data['label'] = label + request.user_data['label'] = label - return result + return request def get_query_param_from_url(self, param: str, *, default: str | None = None) -> str | None: """Get the value of a specific query parameter from the URL.""" @@ -112,21 +133,61 @@ def from_url( cls, url: str, *, + method: HttpMethod = 'GET', + payload: str | None = None, label: str | None = None, unique_key: str | None = None, id: str | None = None, + keep_url_fragment: bool = False, + use_extended_unique_key: bool = False, **kwargs: Any, ) -> Self: - """Create a new `RequestData` instance from a URL.""" - unique_key = unique_key or compute_unique_key(url) + """Create a new `Request` instance from a URL. + + This is the recommended constructor for creating new `Request` instances. 
It generates a `Request` object from + a given URL with additional options to customize HTTP method, payload, unique key, and other request + properties. If no `unique_key` or `id` is provided, they are computed automatically based on the URL, + method and payload. It depends on the `keep_url_fragment` and `use_extended_unique_key` flags. + + Args: + url: The URL of the request. + method: The HTTP method of the request. + payload: The data to be sent as the request body. Typically used with 'POST' or 'PUT' requests. + label: A custom label to differentiate between request types. This is stored in `user_data`, and it is + used for request routing (different requests go to different handlers). + unique_key: A unique key identifying the request. If not provided, it is automatically computed based on + the URL and other parameters. Requests with the same `unique_key` are treated as identical. + id: A unique identifier for the request. If not provided, it is automatically generated from the + `unique_key`. + keep_url_fragment: Determines whether the URL fragment (e.g., `#section`) should be included in + the `unique_key` computation. This is only relevant when `unique_key` is not provided. + use_extended_unique_key: Determines whether to include the HTTP method and payload in the `unique_key` + computation. This is only relevant when `unique_key` is not provided. + **kwargs: Additional request properties. 
+ """ + unique_key = unique_key or compute_unique_key( + url, + method=method, + payload=payload, + keep_url_fragment=keep_url_fragment, + use_extended_unique_key=use_extended_unique_key, + ) + id = id or unique_key_to_request_id(unique_key) - result = cls(url=url, unique_key=unique_key, id=id, **kwargs) + request = cls( + url=url, + unique_key=unique_key, + id=id, + method=method, + payload=payload, + **kwargs, + ) if label is not None: - result.user_data['label'] = label + request.user_data['label'] = label - return result + return request @classmethod def from_base_request_data(cls, base_request_data: BaseRequestData, *, id: str | None = None) -> Self: diff --git a/src/crawlee/_utils/requests.py b/src/crawlee/_utils/requests.py index 51b16cf29..d18eb38c6 100644 --- a/src/crawlee/_utils/requests.py +++ b/src/crawlee/_utils/requests.py @@ -83,7 +83,7 @@ def normalize_url(url: str, *, keep_url_fragment: bool = False) -> str: def compute_unique_key( url: str, method: str = 'GET', - payload: bytes | None = None, + payload: str | None = None, *, keep_url_fragment: bool = False, use_extended_unique_key: bool = False, @@ -115,7 +115,8 @@ def compute_unique_key( # Compute and return the extended unique key if required. if use_extended_unique_key: - payload_hash = compute_short_hash(payload) if payload else '' + payload_in_bytes = payload.encode() if payload else b'' + payload_hash = compute_short_hash(payload_in_bytes) return f'{normalized_method}({payload_hash}):{normalized_url}' # Log information if there is a non-GET request with a payload. 
diff --git a/src/crawlee/storages/_request_list.py b/src/crawlee/storages/_request_list.py index e7fbc0c59..224a49112 100644 --- a/src/crawlee/storages/_request_list.py +++ b/src/crawlee/storages/_request_list.py @@ -6,6 +6,7 @@ from typing_extensions import override +from crawlee.base_storage_client._models import ProcessedRequest from crawlee.storages._request_provider import RequestProvider if TYPE_CHECKING: @@ -15,7 +16,13 @@ class RequestList(RequestProvider): - """Represents a (potentially very large) list of URLs to crawl.""" + """Represents a (potentially very large) list of URLs to crawl. + + Disclaimer: The `RequestList` class is in an early, alpha version and is not fully implemented. It is currently + intended for testing purposes and small-scale projects. The current implementation is only in-memory storage + and is very limited. It will be (re)implemented in the future. For more details, see the GitHub issue: + https://github.com/apify/crawlee-python/issues/99. For production usage we recommend using the `RequestQueue`. + """ def __init__( self, @@ -25,8 +32,8 @@ def __init__( """Initialize the RequestList. Args: - requests: the URLs (or crawling requests) to crawl - name: a name of the request list + requests: The request objects (or their string representations) to be added to the provider. + name: A name of the request list. 
""" self._name = name or '' self._handled_count = 0 @@ -83,6 +90,27 @@ async def mark_request_as_handled(self, request: Request) -> None: async def get_handled_count(self) -> int: return self._handled_count + @override + async def add_request( + self, + request: str | Request, + *, + forefront: bool = False, + ) -> ProcessedRequest: + request = self._transform_request(request) + + if forefront: + self._requests.appendleft(request) + else: + self._requests.append(request) + + return ProcessedRequest( + id=request.id, + unique_key=request.unique_key, + was_already_handled=False, + was_already_present=False, + ) + @override async def add_requests_batched( self, diff --git a/src/crawlee/storages/_request_provider.py b/src/crawlee/storages/_request_provider.py index 6b589bb7e..7b1e15221 100644 --- a/src/crawlee/storages/_request_provider.py +++ b/src/crawlee/storages/_request_provider.py @@ -55,6 +55,24 @@ async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | async def get_handled_count(self) -> int: """Returns the number of handled requests.""" + @abstractmethod + async def add_request( + self, + request: str | Request, + *, + forefront: bool = False, + ) -> ProcessedRequest: + """Add a single request to the provider and store it in underlying resource client. + + Args: + request: The request object (or its string representation) to be added to the provider. + forefront: Determines whether the request should be added to the beginning (if True) or the end (if False) + of the provider. + + Returns: + Information about the request addition to the provider. 
+ """ + @abstractmethod async def add_requests_batched( self, diff --git a/src/crawlee/storages/_request_queue.py b/src/crawlee/storages/_request_queue.py index 308843741..11d47a08e 100644 --- a/src/crawlee/storages/_request_queue.py +++ b/src/crawlee/storages/_request_queue.py @@ -177,36 +177,13 @@ async def drop(self, *, timeout: timedelta | None = None) -> None: await self._resource_client.delete() remove_storage_from_cache(storage_class=self.__class__, id=self._id, name=self._name) + @override async def add_request( self, request: str | Request, *, forefront: bool = False, ) -> ProcessedRequest: - """Adds a request to the `RequestQueue` while managing deduplication and positioning within the queue. - - The deduplication of requests relies on the `unique_key` field within the request dictionary. If `unique_key` - exists, it remains unchanged; if it does not, it is generated based on the request's `url`, `method`, - and `payload` fields. The generation of `unique_key` can be influenced by the `keep_url_fragment` and - `use_extended_unique_key` flags, which dictate whether to include the URL fragment and the request's method - and payload, respectively, in its computation. - - The request can be added to the forefront (beginning) or the back of the queue based on the `forefront` - parameter. Information about the request's addition to the queue, including whether it was already present or - handled, is returned in an output dictionary. - - Args: - request: The request object to be added to the queue. Must include at least the `url` key. - Optionaly it can include the `method`, `payload` and `unique_key` keys. - forefront: If True, adds the request to the forefront of the queue; otherwise, adds it to the end. - keep_url_fragment: Determines whether the URL fragment (the part of the URL after '#') should be retained - in the `unique_key` computation. 
- use_extended_unique_key: Determines whether to use an extended `unique_key`, incorporating the request's - method and payload into the `unique_key` computation. - - Returns: - Information about the processed request. - """ request = self._transform_request(request) self._last_activity = datetime.now(timezone.utc) diff --git a/tests/unit/_utils/test_requests.py b/tests/unit/_utils/test_requests.py index 63cbb3908..26f117421 100644 --- a/tests/unit/_utils/test_requests.py +++ b/tests/unit/_utils/test_requests.py @@ -77,12 +77,12 @@ def test_normalize_url(url: str, expected_output: str, *, keep_url_fragment: boo [ ('http://example.com', 'GET', None, False, False, 'http://example.com'), ('http://example.com', 'POST', None, False, False, 'http://example.com'), - ('http://example.com', 'GET', b'data', False, False, 'http://example.com'), - ('http://example.com', 'GET', b'data', False, True, 'GET(3a6eb079):http://example.com'), - ('http://example.com', 'POST', b'data', False, True, 'POST(3a6eb079):http://example.com'), + ('http://example.com', 'GET', 'data', False, False, 'http://example.com'), + ('http://example.com', 'GET', 'data', False, True, 'GET(3a6eb079):http://example.com'), + ('http://example.com', 'POST', 'data', False, True, 'POST(3a6eb079):http://example.com'), ('http://example.com#fragment', 'GET', None, True, False, 'http://example.com#fragment'), ('http://example.com#fragment', 'GET', None, False, False, 'http://example.com'), - ('http://example.com', 'DELETE', b'test', False, True, 'DELETE(9f86d081):http://example.com'), + ('http://example.com', 'DELETE', 'test', False, True, 'DELETE(9f86d081):http://example.com'), ('https://example.com?utm_content=test', 'GET', None, False, False, 'https://example.com'), ('https://example.com?utm_content=test', 'GET', None, True, False, 'https://example.com'), ], @@ -102,7 +102,7 @@ def test_normalize_url(url: str, expected_output: str, *, keep_url_fragment: boo def test_compute_unique_key( url: str, method: str, - 
payload: bytes | None, + payload: str | None, *, keep_url_fragment: bool, use_extended_unique_key: bool,