-
Notifications
You must be signed in to change notification settings - Fork 264
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: add first version of browser pool and playwright crawler (#161)
### Description

- Introduce the initial version of `BrowserPool` and `PlaywrightCrawler`.
- It lacks several features - fingerprinting, hooks, managing multiple instances of browsers, browser abstraction, ...
- Those will be done later, see #131.
- `BrowserPool` is responsible for managing browser-related resources, but currently, it only supports handling a single browser instance through the plugin.
- Also a very first version of `PlaywrightCrawler` is introduced, primarily to enhance testing and to provide a clear view of the end-user interface and results.

### Related issues

- #79

### Testing

- Unit tests for new modules were written.
- For ad-hoc test code samples see `README.md`.

### TODO

- [x] playwright install in CI
- [x] update related issues
- Loading branch information
Showing 25 changed files with 744 additions and 8 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,5 @@ | ||
from .basic_crawler import BasicCrawler, UserDefinedErrorHandlerError | ||
from .context_pipeline import BasicCrawlingContext, ContextPipeline, RequestHandlerError | ||
from .basic_crawler import BasicCrawler, BasicCrawlerOptions | ||
from .context_pipeline import ContextPipeline | ||
from .errors import RequestHandlerError, UserDefinedErrorHandlerError | ||
from .router import Router | ||
from .types import BasicCrawlingContext |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,5 @@ | ||
from __future__ import annotations | ||
|
||
from typing import Generic | ||
|
||
from typing_extensions import TypeVar | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,2 @@ | ||
from .beautifulsoup_crawler import BeautifulSoupCrawler, BeautifulSoupCrawlingContext | ||
from .beautifulsoup_crawler import BeautifulSoupCrawler | ||
from .types import BeautifulSoupCrawlingContext |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
from .browser_pool import BrowserPool | ||
from .playwright_browser_plugin import PlaywrightBrowserPlugin |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
# Inspiration: https://github.com/apify/crawlee/blob/v3.10.0/packages/browser-pool/src/abstract-classes/browser-plugin.ts | ||
|
||
from __future__ import annotations | ||
|
||
from abc import ABC, abstractmethod | ||
from typing import TYPE_CHECKING, Literal | ||
|
||
if TYPE_CHECKING: | ||
from types import TracebackType | ||
|
||
from playwright.async_api import Browser, Page | ||
|
||
|
||
class BaseBrowserPlugin(ABC): | ||
"""An abstract base class for browser plugins. | ||
Browser plugins act as wrappers around browser automation tools like Playwright, | ||
providing a unified interface for interacting with browsers. | ||
""" | ||
|
||
@property | ||
@abstractmethod | ||
def browser(self) -> Browser | None: | ||
"""Return the browser instance.""" | ||
|
||
@property | ||
@abstractmethod | ||
def browser_type(self) -> Literal['chromium', 'firefox', 'webkit']: | ||
"""Return the browser type name.""" | ||
|
||
@abstractmethod | ||
async def __aenter__(self) -> BaseBrowserPlugin: | ||
"""Enter the context manager and initialize the browser plugin.""" | ||
|
||
@abstractmethod | ||
async def __aexit__( | ||
self, | ||
exc_type: type[BaseException] | None, | ||
exc_value: BaseException | None, | ||
exc_traceback: TracebackType | None, | ||
) -> None: | ||
"""Exit the context manager and close the browser plugin.""" | ||
|
||
@abstractmethod | ||
async def new_page(self) -> Page: | ||
"""Get a new page in a browser.""" |
Oops, something went wrong.