feat: add first version of browser pool and playwright crawler (#161)
### Description

- Introduce the initial version of `BrowserPool` and `PlaywrightCrawler`.
- It still lacks several features: fingerprinting, hooks, managing multiple browser instances, browser abstraction, ...
- Those will be added later, see #131.
- `BrowserPool` is responsible for managing browser-related resources, but currently it only supports handling a single browser instance through the plugin.
- A very first version of `PlaywrightCrawler` is also introduced, primarily to enhance testing and to provide a clear view of the end-user interface and results.

### Related issues

- #79

### Testing

- Unit tests for new modules were written.
- For ad-hoc test code samples see `README.md`.

### TODO

- [x] playwright install in CI
- [x] update related issues
vdusek authored May 29, 2024
1 parent 5c3753a commit 2d2a050
Showing 25 changed files with 744 additions and 8 deletions.
1 change: 1 addition & 0 deletions Makefile
@@ -12,6 +12,7 @@ install-dev:
	python3 -m pip install --upgrade pip poetry
	poetry install --all-extras
	poetry run pre-commit install
	poetry run playwright install

build:
	poetry build --no-interaction -vv
111 changes: 110 additions & 1 deletion README.md
@@ -206,7 +206,108 @@ from crawlee.enqueue_strategy import EnqueueStrategy

#### PlaywrightCrawler

- TODO
[`PlaywrightCrawler`](https://github.com/apify/crawlee-py/tree/master/src/crawlee/playwright_crawler) extends
the `BasicCrawler`. It provides the same features and, on top of that, uses the
[Playwright](https://playwright.dev/python) browser automation tool.

This crawler provides a straightforward framework for parallel web page crawling using headless versions of Chromium,
Firefox, and WebKit browsers through Playwright. URLs to be crawled are supplied by a request provider, which can be
either a `RequestList` containing a static list of URLs or a dynamic `RequestQueue`.
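
If only a static list of start URLs is needed, a `RequestList` can serve as the request provider instead of a `RequestQueue`. A minimal sketch, assuming `RequestList` is importable from `crawlee.storages` and accepts a plain list of URLs (neither detail is confirmed by this diff):

```python
from crawlee.storages import RequestList

# A static, in-memory list of start URLs; unlike a RequestQueue, new requests
# cannot be added to it while the crawl is running.
request_list = RequestList(['https://crawlee.dev', 'https://apify.com'])

# The crawler would then receive it the same way as a queue:
# crawler = PlaywrightCrawler(request_provider=request_list)
```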

Using a headless browser to download web pages and extract data, `PlaywrightCrawler` is ideal for crawling
websites that require JavaScript execution. For websites that do not require JavaScript, consider using
the `BeautifulSoupCrawler`, which utilizes raw HTTP requests and will be much faster.

Example usage:

```python
import asyncio

from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext
from crawlee.storages import Dataset, RequestQueue


async def main() -> None:
    # Open a default request queue and add requests to it
    rq = await RequestQueue.open()
    await rq.add_request('https://crawlee.dev')

    # Open a default dataset for storing results
    dataset = await Dataset.open()

    # Create a crawler instance and provide a request provider (and other optional arguments)
    crawler = PlaywrightCrawler(
        request_provider=rq,
        # headless=False,
        # browser_type='firefox',
    )

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        record = {
            'request_url': context.request.url,
            'page_url': context.page.url,
            'page_title': await context.page.title(),
            'page_content': (await context.page.content())[:10000],
        }
        await dataset.push_data(record)

    await crawler.run()


if __name__ == '__main__':
    asyncio.run(main())
```

Example usage with a custom browser pool:

```python
import asyncio

from crawlee.browsers import BrowserPool, PlaywrightBrowserPlugin
from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext
from crawlee.storages import Dataset, RequestQueue


async def main() -> None:
    # Open a default request queue and add requests to it
    rq = await RequestQueue.open()
    await rq.add_request('https://crawlee.dev')
    await rq.add_request('https://apify.com')

    # Open a default dataset for storing results
    dataset = await Dataset.open()

    # Create a browser pool with a Playwright browser plugin
    browser_pool = BrowserPool(
        plugins=[
            PlaywrightBrowserPlugin(
                browser_type='firefox',
                browser_options={'headless': False},
                page_options={'viewport': {'width': 1920, 'height': 1080}},
            )
        ]
    )

    # Create a crawler instance and provide a browser pool and request provider
    crawler = PlaywrightCrawler(request_provider=rq, browser_pool=browser_pool)

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        record = {
            'request_url': context.request.url,
            'page_url': context.page.url,
            'page_title': await context.page.title(),
            'page_content': (await context.page.content())[:10000],
        }
        await dataset.push_data(record)

    await crawler.run()


if __name__ == '__main__':
    asyncio.run(main())
```

### Storages

@@ -416,6 +517,14 @@ if __name__ == '__main__':
    asyncio.run(main())
```

<!--
### Browser Management
- TODO
- Write once browser rotation and/or other features are ready
- Update PlaywrightCrawler according to this
-->

## Running on the Apify platform

Crawlee is open-source and runs anywhere, but since it's developed by [Apify](https://apify.com), it's easy to set up on the Apify platform and run in the cloud. Visit the [Apify SDK website](https://sdk.apify.com) to learn more about deploying Crawlee to the Apify platform.
3 changes: 3 additions & 0 deletions pyproject.toml
@@ -55,6 +55,7 @@ html5lib = { version = "^1.1", optional = true }
httpx = "^0.27.0"
lxml = { version = "^5.2.1", optional = true }
more_itertools = "^10.2.0"
playwright = { version = "^1.43.0", optional = true }
psutil = "^5.9.8"
pydantic = "^2.6.3"
pydantic-settings = "^2.2.1"
@@ -78,6 +79,7 @@ pytest-timeout = "~2.3.0"
pytest-xdist = "~3.6.0"
respx = "~0.21.0"
ruff = "~0.4.0"
setuptools = "^70.0.0" # setuptools are used by pytest, but not explicitly required
types-aiofiles = "^23.2.0.20240106"
types-beautifulsoup4 = "^4.12.0.20240229"
types-colorama = "~0.4.15.20240106"
@@ -87,6 +89,7 @@ proxy-py = "^2.4.4"

[tool.poetry.extras]
beautifulsoup = ["beautifulsoup4", "lxml", "html5lib"]
playwright = ["playwright"]

[tool.ruff]
line-length = 120
2 changes: 2 additions & 0 deletions src/crawlee/_utils/blocked.py
@@ -1,3 +1,5 @@
from __future__ import annotations

# Inspiration: https://github.com/apify/crawlee/blob/v3.9.2/packages/utils/src/internals/blocked.ts

CLOUDFLARE_RETRY_CSS_SELECTORS = [
6 changes: 4 additions & 2 deletions src/crawlee/basic_crawler/__init__.py
@@ -1,3 +1,5 @@
from .basic_crawler import BasicCrawler, UserDefinedErrorHandlerError
from .context_pipeline import BasicCrawlingContext, ContextPipeline, RequestHandlerError
from .basic_crawler import BasicCrawler, BasicCrawlerOptions
from .context_pipeline import ContextPipeline
from .errors import RequestHandlerError, UserDefinedErrorHandlerError
from .router import Router
from .types import BasicCrawlingContext
14 changes: 14 additions & 0 deletions src/crawlee/basic_crawler/basic_crawler.py
@@ -34,6 +34,7 @@
    RequestHandlerRunResult,
    SendRequestFunction,
)
from crawlee.browsers import BrowserPool
from crawlee.configuration import Configuration
from crawlee.enqueue_strategy import EnqueueStrategy
from crawlee.events.local_event_manager import LocalEventManager
@@ -75,6 +76,8 @@ class BasicCrawlerOptions(TypedDict, Generic[TCrawlingContext]):
    retry_on_blocked: NotRequired[bool]
    proxy_configuration: NotRequired[ProxyConfiguration]
    statistics: NotRequired[Statistics[StatisticsState]]
    browser_pool: NotRequired[BrowserPool]
    use_browser_pool: NotRequired[bool]
    _context_pipeline: NotRequired[ContextPipeline[TCrawlingContext]]


@@ -105,6 +108,8 @@ def __init__(
        retry_on_blocked: bool = True,
        proxy_configuration: ProxyConfiguration | None = None,
        statistics: Statistics | None = None,
        browser_pool: BrowserPool | None = None,
        use_browser_pool: bool = False,
        _context_pipeline: ContextPipeline[TCrawlingContext] | None = None,
    ) -> None:
        """Initialize the BasicCrawler.
@@ -125,6 +130,8 @@ def __init__(
            retry_on_blocked: If set to True, the crawler will try to automatically bypass any detected bot protection
            proxy_configuration: A HTTP proxy configuration to be used for making requests
            statistics: A preconfigured `Statistics` instance if you wish to use non-default configuration
            browser_pool: A preconfigured `BrowserPool` instance for browser crawling.
            use_browser_pool: Enables using the browser pool for crawling.
            _context_pipeline: Allows extending the request lifecycle and modifying the crawling context.
                This parameter is meant to be used by child classes, not when BasicCrawler is instantiated directly.
        """
@@ -180,6 +187,10 @@ def __init__(
            log_message=f'{logger.name} request statistics',
        )

        self._use_browser_pool = use_browser_pool
        if self._use_browser_pool:
            self._browser_pool = browser_pool or BrowserPool()

        self._running = False
        self._has_finished_before = False

@@ -293,6 +304,9 @@ async def run(self, requests: list[str | BaseRequestData] | None = None) -> Fina
            if self._use_session_pool:
                await exit_stack.enter_async_context(self._session_pool)

            if self._use_browser_pool:
                await exit_stack.enter_async_context(self._browser_pool)

            await self._pool.run()

        if self._statistics.error_tracker.total > 0:
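
For illustration, a minimal sketch of how a browser-based subclass is presumably expected to opt into the new pool handling; the subclass name is hypothetical, and only `browser_pool`, `use_browser_pool`, and the exported `BasicCrawler`/`BasicCrawlerOptions` names come from this diff:

```python
from typing_extensions import Unpack

from crawlee.basic_crawler import BasicCrawler, BasicCrawlerOptions
from crawlee.browsers import BrowserPool


class MyBrowserCrawler(BasicCrawler):
    """Hypothetical subclass illustrating how a browser-based crawler enables the pool."""

    def __init__(self, browser_pool: BrowserPool | None = None, **kwargs: Unpack[BasicCrawlerOptions]) -> None:
        # With use_browser_pool=True, BasicCrawler.run() enters the pool on its
        # exit stack, so the pool's browsers start and stop together with the crawl.
        super().__init__(browser_pool=browser_pool, use_browser_pool=True, **kwargs)
```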
2 changes: 2 additions & 0 deletions src/crawlee/basic_crawler/errors.py
@@ -1,3 +1,5 @@
from __future__ import annotations

from typing import Generic

from typing_extensions import TypeVar
3 changes: 2 additions & 1 deletion src/crawlee/beautifulsoup_crawler/__init__.py
@@ -1 +1,2 @@
from .beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
from .beautifulsoup_crawler import BeautifulSoupCrawler
from .types import BeautifulSoupCrawlingContext
2 changes: 2 additions & 0 deletions src/crawlee/browsers/__init__.py
@@ -0,0 +1,2 @@
from .browser_pool import BrowserPool
from .playwright_browser_plugin import PlaywrightBrowserPlugin
46 changes: 46 additions & 0 deletions src/crawlee/browsers/base_browser_plugin.py
@@ -0,0 +1,46 @@
# Inspiration: https://github.com/apify/crawlee/blob/v3.10.0/packages/browser-pool/src/abstract-classes/browser-plugin.ts

from __future__ import annotations

from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, Literal

if TYPE_CHECKING:
    from types import TracebackType

    from playwright.async_api import Browser, Page


class BaseBrowserPlugin(ABC):
    """An abstract base class for browser plugins.

    Browser plugins act as wrappers around browser automation tools like Playwright,
    providing a unified interface for interacting with browsers.
    """

    @property
    @abstractmethod
    def browser(self) -> Browser | None:
        """Return the browser instance."""

    @property
    @abstractmethod
    def browser_type(self) -> Literal['chromium', 'firefox', 'webkit']:
        """Return the browser type name."""

    @abstractmethod
    async def __aenter__(self) -> BaseBrowserPlugin:
        """Enter the context manager and initialize the browser plugin."""

    @abstractmethod
    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        exc_traceback: TracebackType | None,
    ) -> None:
        """Exit the context manager and close the browser plugin."""

    @abstractmethod
    async def new_page(self) -> Page:
        """Get a new page in a browser."""
