## feat: Implement ParselCrawler that adds support for Parsel (#348)
### Description

- Implemented `ParselCrawler`, which adds support for [Parsel](https://github.com/scrapy/parsel)
- Added unit tests for `ParselCrawler`
- Added an example of `ParselCrawler` usage to the docs

### Issues

- Closes: #335

### Testing

- A testing example is included in the docs.

### Checklist

- [x] Changes are described in the `CHANGELOG.md`
- [x] CI passed

Co-authored-by: Jan Buchar <[email protected]>
Showing 8 changed files with 466 additions and 3 deletions.
@@ -0,0 +1,50 @@
---
id: parsel-crawler
title: Parsel crawler
---

This example shows how to use `ParselCrawler` to crawl a website or a list of URLs. Each URL is loaded using a plain HTTP request, and the response is parsed using the [Parsel](https://pypi.org/project/parsel/) library, which supports CSS and XPath selectors for HTML responses and JMESPath for JSON responses. XPath lets us extract data from all kinds of complex HTML structures. In this example, we use Parsel to crawl github.com and extract the page title, URL, and any email addresses found on the page. The default handler scrapes data from the current page and enqueues all links found on it for continuous scraping.

```python
import asyncio

from crawlee.parsel_crawler import ParselCrawler, ParselCrawlingContext


async def main() -> None:
    crawler = ParselCrawler(
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
    )

    # Regex for identifying email addresses on a webpage.
    EMAIL_REGEX = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: ParselCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Extract data from the page using the Parsel selector.
        data = {
            'url': context.request.url,
            'title': context.selector.xpath('//title/text()').get(),
            'email_address_list': context.selector.re(EMAIL_REGEX),
        }

        # Push the extracted data to the default dataset.
        await context.push_data(data)

        # Enqueue all links found on the page.
        await context.enqueue_links()

    # Run the crawler with the initial list of URLs.
    await crawler.run(['https://github.com'])

    # Export the entire dataset to a JSON file.
    await crawler.export_data('results.json')


if __name__ == '__main__':
    asyncio.run(main())
```
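The example above exercises only XPath and regex extraction. Since the intro also mentions JMESPath for JSON responses, here is a minimal standalone sketch of that capability, independent of the crawler (the JSON payload is made up; JMESPath support requires parsel >= 1.8):

```python
from parsel import Selector

# Hypothetical JSON payload; Parsel detects JSON input and enables JMESPath queries.
json_text = '{"users": [{"name": "Alice"}, {"name": "Bob"}]}'
selector = Selector(text=json_text)

# Extract all user names with a JMESPath expression.
print(selector.jmespath('users[*].name').getall())  # ['Alice', 'Bob']
```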
@@ -0,0 +1,10 @@
```python
try:
    from .parsel_crawler import ParselCrawler
    from .types import ParselCrawlingContext
except ImportError as exc:
    raise ImportError(
        "To import anything from this subpackage, you need to install the 'parsel' extra. "
        "For example, if you use pip, run `pip install 'crawlee[parsel]'`.",
    ) from exc

__all__ = ['ParselCrawler', 'ParselCrawlingContext']
```
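For context, a quick sketch of the intended flow from the error message above: install the extra, after which the guarded imports succeed (the install command comes straight from the message; the snippet itself is illustrative):

```python
# Install first, e.g.: pip install 'crawlee[parsel]'
from crawlee.parsel_crawler import ParselCrawler, ParselCrawlingContext

# Both public names re-exported via __all__ are now importable.
crawler = ParselCrawler()
```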
@@ -0,0 +1,151 @@
```python
from __future__ import annotations

import asyncio
import logging
from typing import TYPE_CHECKING, Any, AsyncGenerator, Iterable

from parsel import Selector
from typing_extensions import Unpack

from crawlee._utils.blocked import RETRY_CSS_SELECTORS
from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute
from crawlee.basic_crawler import BasicCrawler, BasicCrawlerOptions, ContextPipeline
from crawlee.enqueue_strategy import EnqueueStrategy
from crawlee.errors import SessionError
from crawlee.http_clients.httpx import HttpxHttpClient
from crawlee.http_crawler import HttpCrawlingContext
from crawlee.models import BaseRequestData
from crawlee.parsel_crawler.types import ParselCrawlingContext

if TYPE_CHECKING:
    from crawlee.types import AddRequestsKwargs, BasicCrawlingContext


class ParselCrawler(BasicCrawler[ParselCrawlingContext]):
    """A crawler that fetches the request URL using `httpx` and parses the result with `Parsel`."""

    def __init__(
        self,
        *,
        additional_http_error_status_codes: Iterable[int] = (),
        ignore_http_error_status_codes: Iterable[int] = (),
        **kwargs: Unpack[BasicCrawlerOptions[ParselCrawlingContext]],
    ) -> None:
        """Initialize the ParselCrawler.

        Args:
            additional_http_error_status_codes: HTTP status codes that should be considered errors
                (and trigger a retry).
            ignore_http_error_status_codes: HTTP status codes that are normally considered errors
                but should be treated as successful.
            kwargs: Arguments to be forwarded to the underlying BasicCrawler.
        """
        kwargs['_context_pipeline'] = (
            ContextPipeline()
            .compose(self._make_http_request)
            .compose(self._parse_http_response)
            .compose(self._handle_blocked_request)
        )

        kwargs.setdefault(
            'http_client',
            HttpxHttpClient(
                additional_http_error_status_codes=additional_http_error_status_codes,
                ignore_http_error_status_codes=ignore_http_error_status_codes,
            ),
        )

        kwargs.setdefault('_logger', logging.getLogger(__name__))

        super().__init__(**kwargs)

    async def _make_http_request(self, context: BasicCrawlingContext) -> AsyncGenerator[HttpCrawlingContext, None]:
        result = await self._http_client.crawl(
            request=context.request,
            session=context.session,
            proxy_info=context.proxy_info,
            statistics=self._statistics,
        )

        yield HttpCrawlingContext(
            request=context.request,
            session=context.session,
            proxy_info=context.proxy_info,
            add_requests=context.add_requests,
            send_request=context.send_request,
            push_data=context.push_data,
            log=context.log,
            http_response=result.http_response,
        )

    async def _handle_blocked_request(
        self, crawling_context: ParselCrawlingContext
    ) -> AsyncGenerator[ParselCrawlingContext, None]:
        if self._retry_on_blocked:
            status_code = crawling_context.http_response.status_code

            if crawling_context.session and crawling_context.session.is_blocked_status_code(status_code=status_code):
                raise SessionError(f'Assuming the session is blocked based on HTTP status code {status_code}')

            matched_selectors = [
                selector
                for selector in RETRY_CSS_SELECTORS
                if crawling_context.selector.css(selector).get() is not None
            ]

            if matched_selectors:
                raise SessionError(
                    'Assuming the session is blocked - '
                    f"HTTP response matched the following selectors: {'; '.join(matched_selectors)}"
                )

        yield crawling_context

    async def _parse_http_response(
        self,
        context: HttpCrawlingContext,
    ) -> AsyncGenerator[ParselCrawlingContext, None]:
        # Parsing can be CPU-heavy, so run it off the event loop thread.
        parsel_selector = await asyncio.to_thread(lambda: Selector(body=context.http_response.read()))

        async def enqueue_links(
            *,
            selector: str = 'a',
            label: str | None = None,
            user_data: dict[str, Any] | None = None,
            **kwargs: Unpack[AddRequestsKwargs],
        ) -> None:
            kwargs.setdefault('strategy', EnqueueStrategy.SAME_HOSTNAME)

            requests = list[BaseRequestData]()
            user_data = user_data or {}

            link: Selector
            for link in parsel_selector.css(selector):
                link_user_data = user_data

                if label is not None:
                    link_user_data.setdefault('label', label)

                if (url := link.xpath('@href').get()) is not None:
                    url = url.strip()

                    if not is_url_absolute(url):
                        url = str(convert_to_absolute_url(context.request.url, url))

                    requests.append(BaseRequestData.from_url(url, user_data=link_user_data))

            await context.add_requests(requests, **kwargs)

        yield ParselCrawlingContext(
            request=context.request,
            session=context.session,
            proxy_info=context.proxy_info,
            enqueue_links=enqueue_links,
            add_requests=context.add_requests,
            send_request=context.send_request,
            push_data=context.push_data,
            log=context.log,
            http_response=context.http_response,
            selector=parsel_selector,
        )
```
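As a usage sketch of the `enqueue_links` closure defined in `_parse_http_response`: a handler can narrow the default `'a'` selector, tag enqueued requests with a label, and attach shared user data. The selector, label, and start URL below are hypothetical:

```python
import asyncio

from crawlee.parsel_crawler import ParselCrawler, ParselCrawlingContext


async def main() -> None:
    crawler = ParselCrawler(max_requests_per_crawl=20)

    @crawler.router.default_handler
    async def request_handler(context: ParselCrawlingContext) -> None:
        await context.enqueue_links(
            selector='nav a',             # hypothetical CSS selector; defaults to 'a'
            label='LISTING',              # stored as user_data['label'] on each request
            user_data={'source': 'nav'},  # merged into each enqueued request
        )
        # Unless a strategy is passed explicitly, only same-hostname links are
        # enqueued (EnqueueStrategy.SAME_HOSTNAME is set as the default above).

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
```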
@@ -0,0 +1,18 @@
```python
from __future__ import annotations

from dataclasses import dataclass
from typing import TYPE_CHECKING

from crawlee.http_crawler import HttpCrawlingResult
from crawlee.types import BasicCrawlingContext, EnqueueLinksFunction

if TYPE_CHECKING:
    from parsel import Selector


@dataclass(frozen=True)
class ParselCrawlingContext(HttpCrawlingResult, BasicCrawlingContext):
    """Crawling context used by ParselCrawler."""

    selector: Selector
    enqueue_links: EnqueueLinksFunction
```
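To make the inherited surface concrete: a handler receives fields from `BasicCrawlingContext` (request, log, push_data, ...), from `HttpCrawlingResult` (http_response), plus the `selector` and `enqueue_links` members added here. A brief sketch of field access (the handler body is illustrative):

```python
from crawlee.parsel_crawler import ParselCrawlingContext


async def request_handler(context: ParselCrawlingContext) -> None:
    # From BasicCrawlingContext: request metadata and logging.
    context.log.info(f'Fetched {context.request.url}')

    # From HttpCrawlingResult: the raw HTTP response.
    status = context.http_response.status_code

    # Added by this dataclass: the parsel.Selector over the response body.
    title = context.selector.xpath('//title/text()').get()

    await context.push_data({'url': context.request.url, 'status': status, 'title': title})
```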