Skip to content

Commit

Permalink
feat: add HTTP/2 support for HTTPX client (#513)
Browse files Browse the repository at this point in the history
### Description

- Add HTTP/2 support for HTTPX client.

### Issues

- Closes #512

### Testing

- New unit tests were added.

### Checklist

- [x] CI passed
  • Loading branch information
vdusek authored Sep 10, 2024
1 parent 1313317 commit 0eb0a33
Show file tree
Hide file tree
Showing 5 changed files with 76 additions and 12 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ curl-cffi = { version = ">=0.7.0", optional = true }
docutils = ">=0.21.0"
eval-type-backport = ">=0.2.0"
html5lib = { version = ">=1.0", optional = true }
httpx = { version = ">=0.27.0", extras = ["brotli"] }
httpx = { version = ">=0.27.0", extras = ["brotli", "http2"] }
inquirer = ">=3.3.0"
lxml = { version = ">=5.2.0", optional = true }
more_itertools = ">=10.2.0"
Expand Down
11 changes: 9 additions & 2 deletions src/crawlee/http_clients/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,9 @@
class HttpResponse(Protocol):
"""This protocol defines the interface that any HTTP response object must implement."""

def read(self) -> bytes:
"""Read the content of the response body."""
@property
def http_version(self) -> str:
"""The HTTP version used in the response."""

@property
def status_code(self) -> int:
Expand All @@ -31,11 +32,17 @@ def status_code(self) -> int:
def headers(self) -> dict[str, str]:
"""The HTTP headers received in the response."""

def read(self) -> bytes:
"""Read the content of the response body."""


@dataclass(frozen=True)
class HttpCrawlingResult:
"""Result of a HTTP-only crawl.
Mainly for the purpose of composing specific crawling contexts (e.g. `BeautifulSoupCrawlingContext`,
`ParselCrawlingContext`, ...).
Args:
http_response: The HTTP response received from the server.
"""
Expand Down
39 changes: 32 additions & 7 deletions src/crawlee/http_clients/_httpx.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

from logging import getLogger
from typing import TYPE_CHECKING, Any, Optional, cast

import httpx
Expand All @@ -18,15 +19,18 @@
from crawlee.proxy_configuration import ProxyInfo
from crawlee.statistics import Statistics

logger = getLogger(__name__)


class _HttpxResponse:
"""Adapter class for `httpx.Response` to conform to the `HttpResponse` protocol."""

def __init__(self, response: httpx.Response) -> None:
    """Wrap the given `httpx.Response` so it conforms to the `HttpResponse` protocol."""
    self._response = response

def read(self) -> bytes:
return self._response.read()
@property
def http_version(self) -> str:
    """The HTTP protocol version of the response, as reported by httpx (e.g. 'HTTP/1.1', 'HTTP/2')."""
    return self._response.http_version

@property
def status_code(self) -> int:
Expand All @@ -36,6 +40,9 @@ def status_code(self) -> int:
def headers(self) -> dict[str, str]:
return dict(self._response.headers.items())

def read(self) -> bytes:
    """Read and return the full response body as bytes, delegating to the wrapped `httpx.Response`."""
    return self._response.read()


class _HttpxTransport(httpx.AsyncHTTPTransport):
"""HTTP transport adapter that stores response cookies in a `Session`.
Expand Down Expand Up @@ -76,6 +83,8 @@ def __init__(
persist_cookies_per_session: bool = True,
additional_http_error_status_codes: Iterable[int] = (),
ignore_http_error_status_codes: Iterable[int] = (),
http1: bool = True,
http2: bool = True,
**async_client_kwargs: Any,
) -> None:
"""Create a new instance.
Expand All @@ -84,13 +93,17 @@ def __init__(
persist_cookies_per_session: Whether to persist cookies per HTTP session.
additional_http_error_status_codes: Additional HTTP status codes to treat as errors.
ignore_http_error_status_codes: HTTP status codes to ignore as errors.
http1: Whether to enable HTTP/1.1 support.
http2: Whether to enable HTTP/2 support.
async_client_kwargs: Additional keyword arguments for `httpx.AsyncClient`.
"""
super().__init__(
persist_cookies_per_session=persist_cookies_per_session,
additional_http_error_status_codes=additional_http_error_status_codes,
ignore_http_error_status_codes=ignore_http_error_status_codes,
)
self._http1 = http1
self._http2 = http2
self._async_client_kwargs = async_client_kwargs

self._client_by_proxy_url = dict[Optional[str], httpx.AsyncClient]()
Expand Down Expand Up @@ -182,11 +195,23 @@ def _get_client(self, proxy_url: str | None) -> httpx.AsyncClient:
If the client for the given proxy URL doesn't exist, it will be created and stored.
"""
if proxy_url not in self._client_by_proxy_url:
self._client_by_proxy_url[proxy_url] = httpx.AsyncClient(
transport=_HttpxTransport(),
proxy=proxy_url,
**self._async_client_kwargs,
)
# Prepare a default kwargs for the new client.
kwargs: dict[str, Any] = {
'transport': _HttpxTransport(
proxy=proxy_url,
http1=self._http1,
http2=self._http2,
),
'proxy': proxy_url,
'http1': self._http1,
'http2': self._http2,
}

# Update the default kwargs with any additional user-provided kwargs.
kwargs.update(self._async_client_kwargs)

client = httpx.AsyncClient(**kwargs)
self._client_by_proxy_url[proxy_url] = client

return self._client_by_proxy_url[proxy_url]

Expand Down
24 changes: 22 additions & 2 deletions src/crawlee/http_clients/curl_impersonate.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
"For example, if you use pip, run `pip install 'crawlee[curl-impersonate]'`.",
) from exc

from curl_cffi.const import CurlHttpVersion
from typing_extensions import override

from crawlee._utils.blocked import ROTATE_PROXY_ERRORS
Expand All @@ -36,8 +37,24 @@ class _CurlImpersonateResponse:
def __init__(self, response: Response) -> None:
self._response = response

def read(self) -> bytes:
return self._response.content
@property
def http_version(self) -> str:
    """The HTTP protocol version of the response as a human-readable string.

    Translates `curl_cffi`'s `CurlHttpVersion` constants into the textual form
    (e.g. 'HTTP/1.1', 'HTTP/2') used by the `HttpResponse` protocol. All three
    HTTP/2 negotiation variants map to the single label 'HTTP/2'.

    Raises:
        ValueError: If the wrapped response reports an unrecognized version constant.
    """
    reported = self._response.http_version
    version_labels = {
        CurlHttpVersion.NONE: 'NONE',
        CurlHttpVersion.V1_0: 'HTTP/1.0',
        CurlHttpVersion.V1_1: 'HTTP/1.1',
        CurlHttpVersion.V2_0: 'HTTP/2',
        CurlHttpVersion.V2TLS: 'HTTP/2',
        CurlHttpVersion.V2_PRIOR_KNOWLEDGE: 'HTTP/2',
        CurlHttpVersion.V3: 'HTTP/3',
    }
    if reported in version_labels:
        return version_labels[reported]
    raise ValueError(f'Unknown HTTP version: {self._response.http_version}')

@property
def status_code(self) -> int:
Expand All @@ -47,6 +64,9 @@ def status_code(self) -> int:
def headers(self) -> dict[str, str]:
return dict(self._response.headers.items())

def read(self) -> bytes:
    """Return the response body as bytes (curl-cffi exposes the already-read body via `.content`)."""
    return self._response.content


class CurlImpersonateHttpClient(BaseHttpClient):
"""HTTP client based on the `curl-cffi` library.
Expand Down
12 changes: 12 additions & 0 deletions tests/unit/http_clients/test_httpx.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,18 @@ def http_client() -> HttpxHttpClient:
return HttpxHttpClient()


async def test_http_1(httpbin: str) -> None:
    """A client with HTTP/2 disabled should negotiate HTTP/1.1."""
    client = HttpxHttpClient(http1=True, http2=False)
    result = await client.send_request(httpbin)
    assert result.http_version == 'HTTP/1.1'

async def test_http_2(httpbin: str) -> None:
    """A client with HTTP/2 enabled should negotiate HTTP/2."""
    client = HttpxHttpClient(http2=True)
    result = await client.send_request(httpbin)
    assert result.http_version == 'HTTP/2'


@pytest.mark.skipif(os.name == 'nt', reason='Skipped on Windows')
async def test_proxy(
http_client: HttpxHttpClient,
Expand Down

0 comments on commit 0eb0a33

Please sign in to comment.