From 200ebfa63d6e20e17c8ca29544ef7229ed0df308 Mon Sep 17 00:00:00 2001
From: Vlada Dusek
Date: Fri, 31 May 2024 17:22:34 +0200
Subject: [PATCH] fix: add explicit err msgs for missing pckg extras during
 import (#165)

## Description

- Add explicit error messages for missing package extras during import.
- This applies to `BrowserPool`, `BeautifulSoupCrawler`, and `PlaywrightCrawler`.
- Until now, Playwright was also a mandatory dependency even for `BasicCrawler`; this is fixed here as well.

## Related issues

- Closes #155

## Testing

### PlaywrightCrawler

```
ImportError                               Traceback (most recent call last)
Cell In[3], line 1
----> 1 from crawlee.playwright_crawler import PlaywrightCrawler

File ~/Projects/crawlee-py/src/crawlee/playwright_crawler/__init__.py:5
      3 from .types import PlaywrightCrawlingContext
      4 except ImportError as exc:
----> 5 raise ImportError(
      6     'To use this module, you need to install the "playwright" extra. Run "pip install crawlee[playwright]".',
      7 ) from exc

ImportError: To use this module, you need to install the "playwright" extra. Run "pip install crawlee[playwright]".
```

### BeautifulSoupCrawler

```
ImportError                               Traceback (most recent call last)
Cell In[1], line 1
----> 1 from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler

File ~/Projects/crawlee-py/src/crawlee/beautifulsoup_crawler/__init__.py:5
      3 from .types import BeautifulSoupCrawlingContext
      4 except ImportError as exc:
----> 5 raise ImportError(
      6     'To use this module, you need to install the "beautifulsoup" extra. Run "pip install crawlee[beautifulsoup]".',
      7 ) from exc

ImportError: To use this module, you need to install the "beautifulsoup" extra. Run "pip install crawlee[beautifulsoup]".
```

### BrowserPool

```
ImportError                               Traceback (most recent call last)
Cell In[2], line 1
----> 1 from crawlee.browsers import BrowserPool

File ~/Projects/crawlee-py/src/crawlee/browsers/__init__.py:5
      3 from .playwright_browser_plugin import PlaywrightBrowserPlugin
      4 except ImportError as exc:
----> 5 raise ImportError(
      6     'To use this module, you need to install the "playwright" extra. Run "pip install crawlee[playwright]".',
      7 ) from exc

ImportError: To use this module, you need to install the "playwright" extra. Run "pip install crawlee[playwright]".
```
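The guard added to each optional subpackage's `__init__.py` follows the same pattern. A minimal generic sketch (the module, class, and extra names here are placeholders; the real ones are in the diff below):

```python
# Guarded re-export in an optional subpackage's __init__.py (illustrative names).
try:
    # Importing the submodule transitively imports the optional dependency.
    from .some_module import SomeClass
except ImportError as exc:
    # Replace the bare "No module named ..." with an actionable message,
    # chaining the original error so the missing module stays visible.
    raise ImportError(
        'To import anything from this subpackage, you need to install the "some-extra" extra. '
        'For example, if you use pip, run "pip install crawlee[some-extra]".',
    ) from exc
```

Re-raising with `from exc` keeps the original `ImportError` chained, so the underlying missing module is still visible in the traceback.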
## Checklist

- [x] Changes are described in the `CHANGELOG.md`
- [x] CI passed
---
 CHANGELOG.md                                  |  4 +-
 pyproject.toml                                |  3 ++
 src/crawlee/basic_crawler/basic_crawler.py    | 31 +++++---------
 src/crawlee/beautifulsoup_crawler/__init__.py | 10 ++++-
 .../beautifulsoup_crawler.py                  |  3 +-
 src/crawlee/browsers/__init__.py              | 10 ++++-
 src/crawlee/browsers/browser_pool.py          | 29 +++++++++++++-
 src/crawlee/playwright_crawler/__init__.py    | 10 ++++-
 .../playwright_crawler/playwright_crawler.py  | 39 +++++++-------------
 9 files changed, 83 insertions(+), 56 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index dddded82d..7f362931c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,7 +2,9 @@

 ## [0.0.5](../../releases/tag/v0.0.5) - Unreleased

-- ...
+### Adds
+
+- Add explicit error messages for missing package extras during import

 ## [0.0.4](../../releases/tag/v0.0.4) - 2024-05-30

diff --git a/pyproject.toml b/pyproject.toml
index 1cb4ccd7b..db1230ce9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -208,3 +208,6 @@ reportUnusedVariable = false
 reportCallInDefaultInitializer = false
 reportImplicitStringConcatenation = false
 reportAny = false
+
+[tool.ipdb]
+context = 7
diff --git a/src/crawlee/basic_crawler/basic_crawler.py b/src/crawlee/basic_crawler/basic_crawler.py
index 3cfd77bca..e5eb67377 100644
--- a/src/crawlee/basic_crawler/basic_crawler.py
+++ b/src/crawlee/basic_crawler/basic_crawler.py
@@ -7,7 +7,7 @@
 from datetime import timedelta
 from functools import partial
 from logging import getLogger
-from typing import TYPE_CHECKING, Any, Callable, Generic, Union, cast
+from typing import TYPE_CHECKING, Any, AsyncContextManager, Callable, Generic, Union, cast

 import httpx
 from tldextract import TLDExtract
@@ -18,9 +18,7 @@
 from crawlee.autoscaling import AutoscaledPool, ConcurrencySettings
 from crawlee.autoscaling.snapshotter import Snapshotter
 from crawlee.autoscaling.system_status import SystemStatus
-from crawlee.basic_crawler.context_pipeline import (
-    ContextPipeline,
-)
+from crawlee.basic_crawler.context_pipeline import ContextPipeline
 from crawlee.basic_crawler.errors import (
     ContextPipelineInitializationError,
     ContextPipelineInterruptedError,
@@ -29,12 +27,7 @@
     UserDefinedErrorHandlerError,
 )
 from crawlee.basic_crawler.router import Router
-from crawlee.basic_crawler.types import (
-    BasicCrawlingContext,
-    RequestHandlerRunResult,
-    SendRequestFunction,
-)
-from crawlee.browsers import BrowserPool
+from crawlee.basic_crawler.types import BasicCrawlingContext, RequestHandlerRunResult, SendRequestFunction
 from crawlee.configuration import Configuration
 from crawlee.enqueue_strategy import EnqueueStrategy
 from crawlee.events.local_event_manager import LocalEventManager
@@ -76,9 +69,8 @@ class BasicCrawlerOptions(TypedDict, Generic[TCrawlingContext]):
     retry_on_blocked: NotRequired[bool]
     proxy_configuration: NotRequired[ProxyConfiguration]
     statistics: NotRequired[Statistics[StatisticsState]]
-    browser_pool: NotRequired[BrowserPool]
-    use_browser_pool: NotRequired[bool]
     _context_pipeline: NotRequired[ContextPipeline[TCrawlingContext]]
+    _additional_context_managers: NotRequired[Sequence[AsyncContextManager]]


 class BasicCrawler(Generic[TCrawlingContext]):
@@ -108,9 +100,8 @@ def __init__(
         retry_on_blocked: bool = True,
         proxy_configuration: ProxyConfiguration | None = None,
         statistics: Statistics | None = None,
-        browser_pool: BrowserPool | None = None,
-        use_browser_pool: bool = False,
         _context_pipeline: ContextPipeline[TCrawlingContext] | None = None,
+        _additional_context_managers: Sequence[AsyncContextManager] | None = None,
     ) -> None:
         """Initialize the BasicCrawler.

@@ -131,9 +122,8 @@ def __init__(
             proxy_configuration: A HTTP proxy configuration to be used for making requests
             statistics: A preconfigured `Statistics` instance if you wish to use non-default configuration
-            browser_pool: A preconfigured `BrowserPool` instance for browser crawling.
-            use_browser_pool: Enables using the browser pool for crawling.
             _context_pipeline: Allows extending the request lifecycle and modifying the crawling context.
                 This parameter is meant to be used by child classes, not when BasicCrawler is instantiated directly.
+            _additional_context_managers: Additional context managers to be used in the crawler lifecycle.
""" self._router: Router[TCrawlingContext] | None = None @@ -186,10 +177,7 @@ def __init__( event_manager=self._event_manager, log_message=f'{logger.name} request statistics', ) - - self._use_browser_pool = use_browser_pool - if self._use_browser_pool: - self._browser_pool = browser_pool or BrowserPool() + self._additional_context_managers = _additional_context_managers or [] self._running = False self._has_finished_before = False @@ -304,8 +292,8 @@ async def run(self, requests: list[str | BaseRequestData] | None = None) -> Fina if self._use_session_pool: await exit_stack.enter_async_context(self._session_pool) - if self._use_browser_pool: - await exit_stack.enter_async_context(self._browser_pool) + for context_manager in self._additional_context_managers: + await exit_stack.enter_async_context(context_manager) await self._pool.run() diff --git a/src/crawlee/beautifulsoup_crawler/__init__.py b/src/crawlee/beautifulsoup_crawler/__init__.py index 13b1f7c47..29dbb443d 100644 --- a/src/crawlee/beautifulsoup_crawler/__init__.py +++ b/src/crawlee/beautifulsoup_crawler/__init__.py @@ -1,2 +1,8 @@ -from .beautifulsoup_crawler import BeautifulSoupCrawler -from .types import BeautifulSoupCrawlingContext +try: + from .beautifulsoup_crawler import BeautifulSoupCrawler + from .types import BeautifulSoupCrawlingContext +except ImportError as exc: + raise ImportError( + 'To import anything from this subpacakge, you need to install the "beautifulsoup" extra. ' + 'For example, if you use pip, run "pip install crawlee[beautifulsoup]".', + ) from exc diff --git a/src/crawlee/beautifulsoup_crawler/beautifulsoup_crawler.py b/src/crawlee/beautifulsoup_crawler/beautifulsoup_crawler.py index 0f9cfafb8..4ce61eee1 100644 --- a/src/crawlee/beautifulsoup_crawler/beautifulsoup_crawler.py +++ b/src/crawlee/beautifulsoup_crawler/beautifulsoup_crawler.py @@ -3,6 +3,7 @@ import asyncio from typing import TYPE_CHECKING, Any, AsyncGenerator, Iterable, Literal +from bs4 import BeautifulSoup, Tag from typing_extensions import Unpack from crawlee._utils.blocked import RETRY_CSS_SELECTORS @@ -102,8 +103,6 @@ async def _handle_blocked_request( async def _parse_http_response( self, context: HttpCrawlingContext ) -> AsyncGenerator[BeautifulSoupCrawlingContext, None]: - from bs4 import BeautifulSoup, Tag - soup = await asyncio.to_thread(lambda: BeautifulSoup(context.http_response.read(), self._parser)) async def enqueue_links( diff --git a/src/crawlee/browsers/__init__.py b/src/crawlee/browsers/__init__.py index 4713b4eaf..8d4d642e2 100644 --- a/src/crawlee/browsers/__init__.py +++ b/src/crawlee/browsers/__init__.py @@ -1,2 +1,8 @@ -from .browser_pool import BrowserPool -from .playwright_browser_plugin import PlaywrightBrowserPlugin +try: + from .browser_pool import BrowserPool + from .playwright_browser_plugin import PlaywrightBrowserPlugin +except ImportError as exc: + raise ImportError( + 'To import anything from this subpacakge, you need to install the "playwright" extra. 
+        'For example, if you use pip, run "pip install crawlee[playwright]".',
+    ) from exc
diff --git a/src/crawlee/browsers/browser_pool.py b/src/crawlee/browsers/browser_pool.py
index ca32616b3..35a1932ba 100644
--- a/src/crawlee/browsers/browser_pool.py
+++ b/src/crawlee/browsers/browser_pool.py
@@ -4,9 +4,10 @@

 import asyncio
 import itertools
+from collections import defaultdict
 from datetime import timedelta
 from logging import getLogger
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Any, Literal
 from weakref import WeakValueDictionary

 from crawlee._utils.crypto import crypto_random_object_id
@@ -55,6 +56,32 @@ def __init__(
         self._pages = WeakValueDictionary[str, CrawleePage]()  # Track the pages in the pool
         self._plugins_cycle = itertools.cycle(self._plugins)  # Cycle through the plugins

+    @classmethod
+    def with_default_plugin(
+        cls,
+        *,
+        headless: bool | None = None,
+        browser_type: Literal['chromium', 'firefox', 'webkit'] | None = None,
+        **kwargs: Any,
+    ) -> BrowserPool:
+        """Create a new instance with a single `PlaywrightBrowserPlugin` configured with the provided options.
+
+        Args:
+            headless: Whether to run the browser in headless mode.
+            browser_type: The type of browser to launch ('chromium', 'firefox', or 'webkit').
+            kwargs: Additional arguments for the default constructor.
+        """
+        plugin_options: dict = defaultdict(dict)
+
+        if headless is not None:
+            plugin_options['browser_options']['headless'] = headless
+
+        if browser_type:
+            plugin_options['browser_type'] = browser_type
+
+        plugin = PlaywrightBrowserPlugin(**plugin_options)
+        return cls(plugins=[plugin], **kwargs)
+
     @property
     def plugins(self) -> Sequence[BaseBrowserPlugin]:
         """Return the browser plugins."""
diff --git a/src/crawlee/playwright_crawler/__init__.py b/src/crawlee/playwright_crawler/__init__.py
index 79103680c..38449256f 100644
--- a/src/crawlee/playwright_crawler/__init__.py
+++ b/src/crawlee/playwright_crawler/__init__.py
@@ -1,2 +1,8 @@
-from .playwright_crawler import PlaywrightCrawler
-from .types import PlaywrightCrawlingContext
+try:
+    from .playwright_crawler import PlaywrightCrawler
+    from .types import PlaywrightCrawlingContext
+except ImportError as exc:
+    raise ImportError(
+        'To import anything from this subpackage, you need to install the "playwright" extra. '
+        'For example, if you use pip, run "pip install crawlee[playwright]".',
+    ) from exc
diff --git a/src/crawlee/playwright_crawler/playwright_crawler.py b/src/crawlee/playwright_crawler/playwright_crawler.py
index de840fc37..c173dab3c 100644
--- a/src/crawlee/playwright_crawler/playwright_crawler.py
+++ b/src/crawlee/playwright_crawler/playwright_crawler.py
@@ -1,17 +1,11 @@
 from __future__ import annotations

-from collections import defaultdict
 from typing import TYPE_CHECKING, Literal

 from typing_extensions import Unpack

-from crawlee.basic_crawler import (
-    BasicCrawler,
-    BasicCrawlerOptions,
-    BasicCrawlingContext,
-    ContextPipeline,
-)
-from crawlee.browsers import BrowserPool, PlaywrightBrowserPlugin
+from crawlee.basic_crawler import BasicCrawler, BasicCrawlerOptions, BasicCrawlingContext, ContextPipeline
+from crawlee.browsers import BrowserPool
 from crawlee.playwright_crawler.types import PlaywrightCrawlingContext

 if TYPE_CHECKING:
@@ -23,21 +17,21 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext]):
     def __init__(
         self,
-        headless: bool | None = None,
+        browser_pool: BrowserPool | None = None,
         browser_type: Literal['chromium', 'firefox', 'webkit'] | None = None,
+        headless: bool | None = None,
         **kwargs: Unpack[BasicCrawlerOptions[PlaywrightCrawlingContext]],
     ) -> None:
         """Create a new instance.

         Args:
-            headless: Whether to run the browser in headless mode.
-                This option should not be used if `browser_pool` is provided.
+            browser_pool: A `BrowserPool` instance to be used for launching the browsers and getting pages.
             browser_type: The type of browser to launch ('chromium', 'firefox', or 'webkit').
                 This option should not be used if `browser_pool` is provided.
-            kwargs: Additional arguments to be forwarded to the underlying BasicCrawler.
+            headless: Whether to run the browser in headless mode.
+                This option should not be used if `browser_pool` is provided.
+            kwargs: Additional arguments to be forwarded to the underlying `BasicCrawler`.
         """
-        browser_pool = kwargs.get('browser_pool')
-
         if browser_pool:
             # Raise an exception if browser_pool is provided together with headless or browser_type arguments.
             if headless is not None or browser_type is not None:
@@ -47,25 +41,22 @@ def __init__(
         # If browser_pool is not provided, create a new instance of BrowserPool with specified arguments.
         else:
-            plugin_options: dict = defaultdict(dict)
-
-            if headless is not None:
-                plugin_options['browser_options']['headless'] = headless
-
-            if browser_type:
-                plugin_options['browser_type'] = browser_type
+            browser_pool = BrowserPool.with_default_plugin(headless=headless, browser_type=browser_type)

-            browser_pool = BrowserPool(plugins=[PlaywrightBrowserPlugin(**plugin_options)])
-            kwargs['browser_pool'] = browser_pool
+        self._browser_pool = browser_pool

-        kwargs['use_browser_pool'] = True
         kwargs['_context_pipeline'] = ContextPipeline().compose(self._page_goto)
+        kwargs['_additional_context_managers'] = [self._browser_pool]
+
         super().__init__(**kwargs)

     async def _page_goto(
         self,
         context: BasicCrawlingContext,
     ) -> AsyncGenerator[PlaywrightCrawlingContext, None]:
+        if self._browser_pool is None:
+            raise ValueError('Browser pool is not initialized.')
+
         crawlee_page = await self._browser_pool.new_page()
         await crawlee_page.page.goto(context.request.url)
         context.request.loaded_url = crawlee_page.page.url
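
As a quick usage illustration (not part of the patch): `PlaywrightCrawler` now either accepts a preconfigured pool or builds one through the new `BrowserPool.with_default_plugin()` factory. A minimal sketch, assuming the "playwright" extra is installed:

```python
from crawlee.browsers import BrowserPool
from crawlee.playwright_crawler import PlaywrightCrawler

# Either let the crawler assemble a default pool from the keyword arguments...
crawler = PlaywrightCrawler(headless=True, browser_type='firefox')

# ...or build the pool up front and pass it in. Combining browser_pool with
# headless/browser_type is rejected by the constructor (see the guard above).
pool = BrowserPool.with_default_plugin(headless=True)
crawler_with_pool = PlaywrightCrawler(browser_pool=pool)
```

Either way, the pool enters the crawler's lifecycle through the new `_additional_context_managers` hook, which is why `BasicCrawler` no longer needs to import anything from `crawlee.browsers`.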