feat: Proxy configuration (#156)
### Description

- closes #136

### TODO

- [x] copy applicable tests from SDK
- [x] add the proxy info to context
- [x] use the configured proxy in HTTP clients
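
A rough end-to-end sketch of how the new option is meant to be used (not taken from this commit; the `ProxyConfiguration(proxy_urls=...)` constructor call, the import paths, and the router/handler wiring are assumptions about the surrounding Crawlee API, while `proxy_configuration=` and `context.proxy_info` come from the diff below):

```python
import asyncio

from crawlee.http_crawler import HttpCrawler, HttpCrawlingContext
from crawlee.proxy_configuration import ProxyConfiguration


async def main() -> None:
    # Assumption: ProxyConfiguration accepts a static list of proxy URLs.
    proxy_configuration = ProxyConfiguration(proxy_urls=['http://proxy.example.com:8000'])

    # `proxy_configuration` is the new crawler option introduced by this commit.
    crawler = HttpCrawler(proxy_configuration=proxy_configuration)

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext) -> None:
        # `proxy_info` is the new crawling-context attribute; it is None when no proxy is configured.
        proxy_url = context.proxy_info.url if context.proxy_info else None
        print(f'Fetched {context.request.url} via {proxy_url}')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```
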
janbuchar authored May 24, 2024
1 parent eeebe9b commit 5c3753a
Showing 14 changed files with 616 additions and 17 deletions.
1 change: 1 addition & 0 deletions pyproject.toml
@@ -83,6 +83,7 @@ types-beautifulsoup4 = "^4.12.0.20240229"
types-colorama = "~0.4.15.20240106"
types-psutil = "~5.9.5.20240205"
types-python-dateutil = "^2.9.0.20240316"
proxy-py = "^2.4.4"

[tool.poetry.extras]
beautifulsoup = ["beautifulsoup4", "lxml", "html5lib"]
37 changes: 32 additions & 5 deletions src/crawlee/basic_crawler/basic_crawler.py
@@ -2,11 +2,12 @@
from __future__ import annotations

import tempfile
from collections.abc import AsyncGenerator, Awaitable, Sequence
from contextlib import AsyncExitStack
from datetime import timedelta
from functools import partial
from logging import getLogger
from typing import TYPE_CHECKING, Any, AsyncGenerator, Awaitable, Callable, Generic, Sequence, Union, cast
from typing import TYPE_CHECKING, Any, Callable, Generic, Union, cast

import httpx
from tldextract import TLDExtract
@@ -46,6 +47,7 @@
import re

from crawlee.http_clients.base_http_client import BaseHttpClient, HttpResponse
from crawlee.proxy_configuration import ProxyConfiguration, ProxyInfo
from crawlee.sessions.session import Session
from crawlee.statistics.models import FinalStatistics, StatisticsState
from crawlee.storages.request_provider import RequestProvider
@@ -71,6 +73,7 @@ class BasicCrawlerOptions(TypedDict, Generic[TCrawlingContext]):
session_pool: NotRequired[SessionPool]
use_session_pool: NotRequired[bool]
retry_on_blocked: NotRequired[bool]
proxy_configuration: NotRequired[ProxyConfiguration]
statistics: NotRequired[Statistics[StatisticsState]]
_context_pipeline: NotRequired[ContextPipeline[TCrawlingContext]]

@@ -100,6 +103,7 @@ def __init__(
session_pool: SessionPool | None = None,
use_session_pool: bool = True,
retry_on_blocked: bool = True,
proxy_configuration: ProxyConfiguration | None = None,
statistics: Statistics | None = None,
_context_pipeline: ContextPipeline[TCrawlingContext] | None = None,
) -> None:
@@ -119,6 +123,7 @@ def __init__(
use_session_pool: Enables using the session pool for crawling
session_pool: A preconfigured `SessionPool` instance if you wish to use non-default configuration
retry_on_blocked: If set to True, the crawler will try to automatically bypass any detected bot protection
proxy_configuration: A HTTP proxy configuration to be used for making requests
statistics: A preconfigured `Statistics` instance if you wish to use non-default configuration
_context_pipeline: Allows extending the request lifecycle and modifying the crawling context.
This parameter is meant to be used by child classes, not when BasicCrawler is instantiated directly.
@@ -169,6 +174,7 @@ def __init__(

self._retry_on_blocked = retry_on_blocked

self._proxy_configuration = proxy_configuration
self._statistics = statistics or Statistics(
event_manager=self._event_manager,
log_message=f'{logger.name} request statistics',
@@ -211,6 +217,17 @@ async def _get_session(self) -> Session | None:
logger=logger,
)

async def _get_proxy_info(self, request: Request, session: Session | None) -> ProxyInfo | None:
"""Retrieve a new ProxyInfo object based on crawler configuration and the current request and session."""
if not self._proxy_configuration:
return None

return await self._proxy_configuration.new_proxy_info(
session_id=session.id if session else None,
request=request,
proxy_tier=None,
)

async def get_request_provider(self) -> RequestProvider:
"""Return the configured request provider. If none is configured, open and return the default request queue."""
if not self._request_provider:
@@ -411,15 +428,23 @@ async def _handle_failed_request(self, crawling_context: TCrawlingContext, error
except Exception as e:
raise UserDefinedErrorHandlerError('Exception thrown in user-defined failed request handler') from e

def _prepare_send_request_function(self, session: Session | None) -> SendRequestFunction:
def _prepare_send_request_function(
self,
session: Session | None,
proxy_info: ProxyInfo | None,
) -> SendRequestFunction:
async def send_request(
url: str,
*,
method: str = 'get',
headers: dict[str, str] | None = None,
) -> HttpResponse:
return await self._http_client.send_request(
url, method=method, headers=httpx.Headers(headers), session=session
url,
method=method,
headers=httpx.Headers(headers),
session=session,
proxy_info=proxy_info,
)

return send_request
@@ -461,7 +486,7 @@ async def __is_task_ready_function(self) -> bool:
request_provider = await self.get_request_provider()
return not await request_provider.is_empty()

async def __run_task_function(self) -> None: # noqa: PLR0912
async def __run_task_function(self) -> None:
request_provider = await self.get_request_provider()

request = await wait_for(
@@ -476,12 +501,14 @@ async def __run_task_function(self) -> None: # noqa: PLR0912
return

session = await self._get_session()
proxy_info = await self._get_proxy_info(request, session)
result = RequestHandlerRunResult()

crawling_context = BasicCrawlingContext(
request=request,
session=session,
send_request=self._prepare_send_request_function(session),
proxy_info=proxy_info,
send_request=self._prepare_send_request_function(session, proxy_info),
add_requests=result.add_requests,
)
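
The `_get_proxy_info` helper added above resolves one `ProxyInfo` per request before the crawling context is built; the same object is then exposed as `context.proxy_info` and handed to the HTTP client. A standalone sketch of that resolution step (`new_proxy_info` and its keyword arguments appear in this diff; the `proxy_urls` constructor argument and `Request.from_url` are assumptions):

```python
import asyncio

from crawlee.models import Request
from crawlee.proxy_configuration import ProxyConfiguration


async def main() -> None:
    # Assumption: a static proxy list; new_proxy_info() and its keyword arguments
    # (session_id, request, proxy_tier) are taken from the diff above.
    proxy_configuration = ProxyConfiguration(
        proxy_urls=['http://proxy-a.example.com:8000', 'http://proxy-b.example.com:8000'],
    )

    request = Request.from_url('https://crawlee.dev')  # assumed helper, not part of this diff

    # Mirrors BasicCrawler._get_proxy_info: one ProxyInfo per request, keyed by the session.
    proxy_info = await proxy_configuration.new_proxy_info(
        session_id='session-1',
        request=request,
        proxy_tier=None,
    )
    print(proxy_info.url if proxy_info else 'no proxy configured')


asyncio.run(main())
```
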

5 changes: 4 additions & 1 deletion src/crawlee/basic_crawler/types.py
@@ -2,8 +2,9 @@
from __future__ import annotations

import re
from collections.abc import Coroutine, Sequence
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Any, Coroutine, Protocol, Sequence
from typing import TYPE_CHECKING, Any, Protocol

from typing_extensions import NotRequired, TypedDict, Unpack

@@ -12,6 +13,7 @@
from crawlee.enqueue_strategy import EnqueueStrategy
from crawlee.http_clients.base_http_client import HttpResponse
from crawlee.models import BaseRequestData, Request
from crawlee.proxy_configuration import ProxyInfo
from crawlee.sessions.session import Session


@@ -64,6 +66,7 @@ class BasicCrawlingContext:

request: Request
session: Session | None
proxy_info: ProxyInfo | None
send_request: SendRequestFunction
add_requests: AddRequestsFunction

9 changes: 8 additions & 1 deletion src/crawlee/beautifulsoup_crawler/beautifulsoup_crawler.py
@@ -62,11 +62,17 @@ def __init__(
super().__init__(**kwargs)

async def _make_http_request(self, context: BasicCrawlingContext) -> AsyncGenerator[HttpCrawlingContext, None]:
result = await self._http_client.crawl(context.request, context.session, self._statistics)
result = await self._http_client.crawl(
context.request,
context.session,
context.proxy_info,
self._statistics,
)

yield HttpCrawlingContext(
request=context.request,
session=context.session,
proxy_info=context.proxy_info,
send_request=context.send_request,
add_requests=context.add_requests,
http_response=result.http_response,
@@ -128,6 +134,7 @@ async def enqueue_links(
yield BeautifulSoupCrawlingContext(
request=context.request,
session=context.session,
proxy_info=context.proxy_info,
send_request=context.send_request,
add_requests=context.add_requests,
enqueue_links=enqueue_links,
10 changes: 9 additions & 1 deletion src/crawlee/http_clients/base_http_client.py
@@ -8,6 +8,7 @@
from httpx import Headers # Type from `httpx` is used here because it is lightweight and convenient

from crawlee.models import Request
from crawlee.proxy_configuration import ProxyInfo
from crawlee.sessions.session import Session
from crawlee.statistics.statistics import Statistics

@@ -53,12 +54,19 @@ async def crawl(
self,
request: Request,
session: Session | None,
proxy_info: ProxyInfo | None,
statistics: Statistics,
) -> HttpCrawlingResult:
"""Perform a crawl of an URL."""

@abstractmethod
async def send_request(
self, url: str, *, method: str, headers: Headers | dict[str, str], session: Session | None = None
self,
url: str,
*,
method: str,
headers: Headers | dict[str, str],
session: Session | None,
proxy_info: ProxyInfo | None,
) -> HttpResponse:
"""Perform an HTTP request."""
38 changes: 30 additions & 8 deletions src/crawlee/http_clients/httpx_client.py
@@ -1,6 +1,6 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Iterable, cast
from typing import TYPE_CHECKING, Optional, cast

import httpx
from typing_extensions import override
@@ -11,9 +11,14 @@
from crawlee.sessions.session import Session

if TYPE_CHECKING:
from collections.abc import Iterable

from crawlee.models import Request
from crawlee.proxy_configuration import ProxyInfo
from crawlee.statistics.statistics import Statistics

__all__ = ['HttpxClient']


class HttpTransport(httpx.AsyncHTTPTransport):
"""A modified HTTP transport adapter that stores response cookies in a `Session` instead of the httpx client."""
@@ -62,11 +67,25 @@ def __init__(
additional_http_error_status_codes=additional_http_error_status_codes,
ignore_http_error_status_codes=ignore_http_error_status_codes,
)
self._client = httpx.AsyncClient(transport=HttpTransport())

self._client_by_proxy_url = dict[Optional[str], httpx.AsyncClient]()

def _get_client(self, proxy_url: str | None) -> httpx.AsyncClient:
if proxy_url not in self._client_by_proxy_url:
self._client_by_proxy_url[proxy_url] = httpx.AsyncClient(transport=HttpTransport(), proxy=proxy_url)

return self._client_by_proxy_url[proxy_url]

@override
async def crawl(self, request: Request, session: Session | None, statistics: Statistics) -> HttpCrawlingResult:
http_request = self._client.build_request(
async def crawl(
self,
request: Request,
session: Session | None,
proxy_info: ProxyInfo | None,
statistics: Statistics,
) -> HttpCrawlingResult:
client = self._get_client(proxy_info.url if proxy_info else None)
http_request = client.build_request(
method=request.method,
url=request.url,
headers=request.headers,
@@ -75,7 +94,7 @@ async def crawl(self, request: Request, session: Session | None, statistics: Sta
)

try:
response = await self._client.send(http_request, follow_redirects=True)
response = await client.send(http_request, follow_redirects=True)
except httpx.TransportError as e:
if _is_proxy_error(e):
raise ProxyError from e
@@ -110,17 +129,20 @@ async def send_request(
*,
method: str,
headers: httpx.Headers | dict[str, str],
session: Session | None = None,
session: Session | None,
proxy_info: ProxyInfo | None,
) -> HttpResponse:
http_request = self._client.build_request(
client = self._get_client(proxy_info.url if proxy_info else None)

http_request = client.build_request(
url=url,
method=method,
headers=headers,
extensions={'crawlee_session': session if self._persist_cookies_per_session else None},
)

try:
response = await self._client.send(http_request)
response = await client.send(http_request)
except httpx.TransportError as e:
if _is_proxy_error(e):
raise ProxyError from e
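
The core change in `HttpxClient` is replacing the single shared `httpx.AsyncClient` with a cache of clients keyed by proxy URL, since httpx fixes the proxy when a client is constructed. A stripped-down sketch of the same pattern using plain httpx (hypothetical proxy URL):

```python
from __future__ import annotations

import asyncio

import httpx


class ProxyAwareClientPool:
    """Keeps one httpx.AsyncClient per proxy URL (None means a direct connection)."""

    def __init__(self) -> None:
        self._client_by_proxy_url: dict[str | None, httpx.AsyncClient] = {}

    def get(self, proxy_url: str | None) -> httpx.AsyncClient:
        # httpx binds the proxy at client construction time, so each distinct
        # proxy URL needs its own AsyncClient instance.
        if proxy_url not in self._client_by_proxy_url:
            self._client_by_proxy_url[proxy_url] = httpx.AsyncClient(proxy=proxy_url)
        return self._client_by_proxy_url[proxy_url]


async def main() -> None:
    pool = ProxyAwareClientPool()

    direct = pool.get(None)
    proxied = pool.get('http://proxy.example.com:8000')  # hypothetical proxy URL
    assert pool.get(None) is direct and direct is not proxied  # clients are cached per proxy URL

    response = await direct.get('https://crawlee.dev')
    print(response.status_code)


asyncio.run(main())
```
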
8 changes: 7 additions & 1 deletion src/crawlee/http_crawler/http_crawler.py
@@ -51,11 +51,17 @@ def __init__(
async def _make_http_request(
self, crawling_context: BasicCrawlingContext
) -> AsyncGenerator[HttpCrawlingContext, None]:
result = await self._http_client.crawl(crawling_context.request, crawling_context.session, self._statistics)
result = await self._http_client.crawl(
crawling_context.request,
crawling_context.session,
crawling_context.proxy_info,
self._statistics,
)

yield HttpCrawlingContext(
request=crawling_context.request,
session=crawling_context.session,
proxy_info=crawling_context.proxy_info,
send_request=crawling_context.send_request,
add_requests=crawling_context.add_requests,
http_response=result.http_response,
24 changes: 24 additions & 0 deletions src/crawlee/models.py
@@ -167,6 +167,26 @@ def enqueue_strategy(self, new_enqueue_strategy: EnqueueStrategy) -> None:
self.user_data.setdefault('__crawlee', {})
self.user_data['__crawlee']['enqueueStrategy'] = str(new_enqueue_strategy)

@property
def last_proxy_tier(self) -> int | None:
"""The last proxy tier used to process the request."""
return self.crawlee_data.last_proxy_tier

@last_proxy_tier.setter
def last_proxy_tier(self, new_value: int) -> None:
self.user_data.setdefault('__crawlee', {})
self.user_data['__crawlee']['lastProxyTier'] = new_value

@property
def forefront(self) -> bool:
"""Should the request be enqueued at the start of the queue?"""
return self.crawlee_data.forefront

@forefront.setter
def forefront(self, new_value: bool) -> None:
self.user_data.setdefault('__crawlee', {})
self.user_data['__crawlee']['forefront'] = new_value


class RequestState(Enum):
"""Crawlee-specific request handling state."""
@@ -197,6 +217,10 @@ class CrawleeRequestData(BaseModel):

skip_navigation: Annotated[bool, Field(alias='skipNavigation')] = False

last_proxy_tier: Annotated[int | None, Field(alias='lastProxyTier')] = None

forefront: Annotated[bool, Field()] = False


class BaseStorageMetadata(BaseModel):
"""Base model for storage metadata."""
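
The new `last_proxy_tier` and `forefront` properties mirror their values into `user_data['__crawlee']` under the aliases declared on `CrawleeRequestData`, so they round-trip through request serialization. A small illustration (assuming `Request.from_url` is available; it is not part of this diff):

```python
from crawlee.models import Request

request = Request.from_url('https://crawlee.dev')  # assumed helper, not part of this diff

# The property setters added above write through to the '__crawlee' section of user_data.
request.last_proxy_tier = 2
request.forefront = True

print(request.user_data['__crawlee'])  # e.g. {'lastProxyTier': 2, 'forefront': True}
```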