diff --git a/pyproject.toml b/pyproject.toml index 286638750..411b3414c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,7 +52,7 @@ curl-cffi = { version = ">=0.7.0", optional = true } docutils = ">=0.21.0" eval-type-backport = ">=0.2.0" html5lib = { version = ">=1.0", optional = true } -httpx = { version = ">=0.27.0", extras = ["brotli"] } +httpx = { version = ">=0.27.0", extras = ["brotli", "http2"] } inquirer = ">=3.3.0" lxml = { version = ">=5.2.0", optional = true } more_itertools = ">=10.2.0" diff --git a/src/crawlee/http_clients/_base.py b/src/crawlee/http_clients/_base.py index 82bb7d317..4324348e9 100644 --- a/src/crawlee/http_clients/_base.py +++ b/src/crawlee/http_clients/_base.py @@ -20,8 +20,9 @@ class HttpResponse(Protocol): """This protocol defines the interface that any HTTP response object must implement.""" - def read(self) -> bytes: - """Read the content of the response body.""" + @property + def http_version(self) -> str: + """The HTTP version used in the response.""" @property def status_code(self) -> int: @@ -31,11 +32,17 @@ def status_code(self) -> int: def headers(self) -> dict[str, str]: """The HTTP headers received in the response.""" + def read(self) -> bytes: + """Read the content of the response body.""" + @dataclass(frozen=True) class HttpCrawlingResult: """Result of a HTTP-only crawl. + Mainly for the purpose of composing specific crawling contexts (e.g. `BeautifulSoupCrawlingContext`, + `ParselCrawlingContext`, ...). + Args: http_response: The HTTP response received from the server. """ diff --git a/src/crawlee/http_clients/_httpx.py b/src/crawlee/http_clients/_httpx.py index efa27082a..08577a38b 100644 --- a/src/crawlee/http_clients/_httpx.py +++ b/src/crawlee/http_clients/_httpx.py @@ -1,5 +1,6 @@ from __future__ import annotations +from logging import getLogger from typing import TYPE_CHECKING, Any, Optional, cast import httpx @@ -18,6 +19,8 @@ from crawlee.proxy_configuration import ProxyInfo from crawlee.statistics import Statistics +logger = getLogger(__name__) + class _HttpxResponse: """Adapter class for `httpx.Response` to conform to the `HttpResponse` protocol.""" @@ -25,8 +28,9 @@ class _HttpxResponse: def __init__(self, response: httpx.Response) -> None: self._response = response - def read(self) -> bytes: - return self._response.read() + @property + def http_version(self) -> str: + return self._response.http_version @property def status_code(self) -> int: @@ -36,6 +40,9 @@ def status_code(self) -> int: def headers(self) -> dict[str, str]: return dict(self._response.headers.items()) + def read(self) -> bytes: + return self._response.read() + class _HttpxTransport(httpx.AsyncHTTPTransport): """HTTP transport adapter that stores response cookies in a `Session`. @@ -76,6 +83,8 @@ def __init__( persist_cookies_per_session: bool = True, additional_http_error_status_codes: Iterable[int] = (), ignore_http_error_status_codes: Iterable[int] = (), + http1: bool = True, + http2: bool = True, **async_client_kwargs: Any, ) -> None: """Create a new instance. @@ -84,6 +93,8 @@ def __init__( persist_cookies_per_session: Whether to persist cookies per HTTP session. additional_http_error_status_codes: Additional HTTP status codes to treat as errors. ignore_http_error_status_codes: HTTP status codes to ignore as errors. + http1: Whether to enable HTTP/1.1 support. + http2: Whether to enable HTTP/2 support. async_client_kwargs: Additional keyword arguments for `httpx.AsyncClient`. """ super().__init__( @@ -91,6 +102,8 @@ def __init__( additional_http_error_status_codes=additional_http_error_status_codes, ignore_http_error_status_codes=ignore_http_error_status_codes, ) + self._http1 = http1 + self._http2 = http2 self._async_client_kwargs = async_client_kwargs self._client_by_proxy_url = dict[Optional[str], httpx.AsyncClient]() @@ -182,11 +195,23 @@ def _get_client(self, proxy_url: str | None) -> httpx.AsyncClient: If the client for the given proxy URL doesn't exist, it will be created and stored. """ if proxy_url not in self._client_by_proxy_url: - self._client_by_proxy_url[proxy_url] = httpx.AsyncClient( - transport=_HttpxTransport(), - proxy=proxy_url, - **self._async_client_kwargs, - ) + # Prepare a default kwargs for the new client. + kwargs: dict[str, Any] = { + 'transport': _HttpxTransport( + proxy=proxy_url, + http1=self._http1, + http2=self._http2, + ), + 'proxy': proxy_url, + 'http1': self._http1, + 'http2': self._http2, + } + + # Update the default kwargs with any additional user-provided kwargs. + kwargs.update(self._async_client_kwargs) + + client = httpx.AsyncClient(**kwargs) + self._client_by_proxy_url[proxy_url] = client return self._client_by_proxy_url[proxy_url] diff --git a/src/crawlee/http_clients/curl_impersonate.py b/src/crawlee/http_clients/curl_impersonate.py index 12c59fd90..650930188 100644 --- a/src/crawlee/http_clients/curl_impersonate.py +++ b/src/crawlee/http_clients/curl_impersonate.py @@ -12,6 +12,7 @@ "For example, if you use pip, run `pip install 'crawlee[curl-impersonate]'`.", ) from exc +from curl_cffi.const import CurlHttpVersion from typing_extensions import override from crawlee._utils.blocked import ROTATE_PROXY_ERRORS @@ -36,8 +37,24 @@ class _CurlImpersonateResponse: def __init__(self, response: Response) -> None: self._response = response - def read(self) -> bytes: - return self._response.content + @property + def http_version(self) -> str: + if self._response.http_version == CurlHttpVersion.NONE: + return 'NONE' + if self._response.http_version == CurlHttpVersion.V1_0: + return 'HTTP/1.0' + if self._response.http_version == CurlHttpVersion.V1_1: + return 'HTTP/1.1' + if self._response.http_version in { + CurlHttpVersion.V2_0, + CurlHttpVersion.V2TLS, + CurlHttpVersion.V2_PRIOR_KNOWLEDGE, + }: + return 'HTTP/2' + if self._response.http_version == CurlHttpVersion.V3: + return 'HTTP/3' + + raise ValueError(f'Unknown HTTP version: {self._response.http_version}') @property def status_code(self) -> int: @@ -47,6 +64,9 @@ def status_code(self) -> int: def headers(self) -> dict[str, str]: return dict(self._response.headers.items()) + def read(self) -> bytes: + return self._response.content + class CurlImpersonateHttpClient(BaseHttpClient): """HTTP client based on the `curl-cffi` library. diff --git a/tests/unit/http_clients/test_httpx.py b/tests/unit/http_clients/test_httpx.py index 3edb4f191..9e9a61ce5 100644 --- a/tests/unit/http_clients/test_httpx.py +++ b/tests/unit/http_clients/test_httpx.py @@ -19,6 +19,18 @@ def http_client() -> HttpxHttpClient: return HttpxHttpClient() +async def test_http_1(httpbin: str) -> None: + http_client = HttpxHttpClient(http1=True, http2=False) + response = await http_client.send_request(httpbin) + assert response.http_version == 'HTTP/1.1' + + +async def test_http_2(httpbin: str) -> None: + http_client = HttpxHttpClient(http2=True) + response = await http_client.send_request(httpbin) + assert response.http_version == 'HTTP/2' + + @pytest.mark.skipif(os.name == 'nt', reason='Skipped on Windows') async def test_proxy( http_client: HttpxHttpClient,