From 200ebfa63d6e20e17c8ca29544ef7229ed0df308 Mon Sep 17 00:00:00 2001
From: Vlada Dusek
Date: Fri, 31 May 2024 17:22:34 +0200
Subject: [PATCH] fix: add explicit err msgs for missing pckg extras during
 import (#165)

## Description

- Add explicit error messages for missing package extras during import.
- This applies to `BrowserPool`, `BeautifulSoupCrawler`, and `PlaywrightCrawler`.
- Until now, Playwright was also a mandatory dependency even for `BasicCrawler`; this is fixed here as well.

## Related issues

- Closes #155

## Testing

### PlaywrightCrawler

```
ImportError                               Traceback (most recent call last)
Cell In[3], line 1
----> 1 from crawlee.playwright_crawler import PlaywrightCrawler

File ~/Projects/crawlee-py/src/crawlee/playwright_crawler/__init__.py:5
      3 from .types import PlaywrightCrawlingContext
      4 except ImportError as exc:
----> 5 raise ImportError(
      6     'To use this module, you need to install the "playwright" extra. Run "pip install crawlee[playwright]".',
      7 ) from exc

ImportError: To use this module, you need to install the "playwright" extra. Run "pip install crawlee[playwright]".
```

### BeautifulSoupCrawler

```
ImportError                               Traceback (most recent call last)
Cell In[1], line 1
----> 1 from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler

File ~/Projects/crawlee-py/src/crawlee/beautifulsoup_crawler/__init__.py:5
      3 from .types import BeautifulSoupCrawlingContext
      4 except ImportError as exc:
----> 5 raise ImportError(
      6     'To use this module, you need to install the "beautifulsoup" extra. Run "pip install crawlee[beautifulsoup]".',
      7 ) from exc

ImportError: To use this module, you need to install the "beautifulsoup" extra. Run "pip install crawlee[beautifulsoup]".
```

### BrowserPool

```
ImportError                               Traceback (most recent call last)
Cell In[2], line 1
----> 1 from crawlee.browsers import BrowserPool

File ~/Projects/crawlee-py/src/crawlee/browsers/__init__.py:5
      3 from .playwright_browser_plugin import PlaywrightBrowserPlugin
      4 except ImportError as exc:
----> 5 raise ImportError(
      6     'To use this module, you need to install the "playwright" extra. Run "pip install crawlee[playwright]".',
      7 ) from exc

ImportError: To use this module, you need to install the "playwright" extra. Run "pip install crawlee[playwright]".
```
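The guard added to each optional subpackage's `__init__.py` follows the same pattern. A minimal generic sketch (the module, class, and extra names here are placeholders; the real ones are in the diff below):

```python
# Guarded re-export in an optional subpackage's __init__.py (illustrative names).
try:
    # Importing the submodule transitively imports the optional dependency.
    from .some_module import SomeClass
except ImportError as exc:
    # Replace the bare "No module named ..." with an actionable message,
    # chaining the original error so the missing module stays visible.
    raise ImportError(
        'To import anything from this subpackage, you need to install the "some-extra" extra. '
        'For example, if you use pip, run "pip install crawlee[some-extra]".',
    ) from exc
```

Re-raising with `from exc` keeps the original `ImportError` chained, so the underlying missing module is still visible in the traceback.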
## Checklist

- [x] Changes are described in the `CHANGELOG.md`
- [x] CI passed
---
 CHANGELOG.md                                  |  4 +-
 pyproject.toml                                |  3 ++
 src/crawlee/basic_crawler/basic_crawler.py    | 31 +++++---------
 src/crawlee/beautifulsoup_crawler/__init__.py | 10 ++++-
 .../beautifulsoup_crawler.py                  |  3 +-
 src/crawlee/browsers/__init__.py              | 10 ++++-
 src/crawlee/browsers/browser_pool.py          | 29 +++++++++++++-
 src/crawlee/playwright_crawler/__init__.py    | 10 ++++-
 .../playwright_crawler/playwright_crawler.py  | 39 +++++++-------------
 9 files changed, 83 insertions(+), 56 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index dddded82d..7f362931c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,7 +2,9 @@

 ## [0.0.5](../../releases/tag/v0.0.5) - Unreleased

-- ...
+### Adds
+
+- Add explicit error messages for missing package extras during import

 ## [0.0.4](../../releases/tag/v0.0.4) - 2024-05-30

diff --git a/pyproject.toml b/pyproject.toml
index 1cb4ccd7b..db1230ce9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -208,3 +208,6 @@ reportUnusedVariable = false
 reportCallInDefaultInitializer = false
 reportImplicitStringConcatenation = false
 reportAny = false
+
+[tool.ipdb]
+context = 7
diff --git a/src/crawlee/basic_crawler/basic_crawler.py b/src/crawlee/basic_crawler/basic_crawler.py
index 3cfd77bca..e5eb67377 100644
--- a/src/crawlee/basic_crawler/basic_crawler.py
+++ b/src/crawlee/basic_crawler/basic_crawler.py
@@ -7,7 +7,7 @@
 from datetime import timedelta
 from functools import partial
 from logging import getLogger
-from typing import TYPE_CHECKING, Any, Callable, Generic, Union, cast
+from typing import TYPE_CHECKING, Any, AsyncContextManager, Callable, Generic, Union, cast

 import httpx
 from tldextract import TLDExtract
@@ -18,9 +18,7 @@
 from crawlee.autoscaling import AutoscaledPool, ConcurrencySettings
 from crawlee.autoscaling.snapshotter import Snapshotter
 from crawlee.autoscaling.system_status import SystemStatus
-from crawlee.basic_crawler.context_pipeline import (
-    ContextPipeline,
-)
+from crawlee.basic_crawler.context_pipeline import ContextPipeline
 from crawlee.basic_crawler.errors import (
     ContextPipelineInitializationError,
     ContextPipelineInterruptedError,
@@ -29,12 +27,7 @@
     UserDefinedErrorHandlerError,
 )
 from crawlee.basic_crawler.router import Router
-from crawlee.basic_crawler.types import (
-    BasicCrawlingContext,
-    RequestHandlerRunResult,
-    SendRequestFunction,
-)
-from crawlee.browsers import BrowserPool
+from crawlee.basic_crawler.types import BasicCrawlingContext, RequestHandlerRunResult, SendRequestFunction
 from crawlee.configuration import Configuration
 from crawlee.enqueue_strategy import EnqueueStrategy
 from crawlee.events.local_event_manager import LocalEventManager
@@ -76,9 +69,8 @@ class BasicCrawlerOptions(TypedDict, Generic[TCrawlingContext]):
     retry_on_blocked: NotRequired[bool]
     proxy_configuration: NotRequired[ProxyConfiguration]
     statistics: NotRequired[Statistics[StatisticsState]]
-    browser_pool: NotRequired[BrowserPool]
-    use_browser_pool: NotRequired[bool]
     _context_pipeline: NotRequired[ContextPipeline[TCrawlingContext]]
+    _additional_context_managers: NotRequired[Sequence[AsyncContextManager]]


 class BasicCrawler(Generic[TCrawlingContext]):
@@ -108,9 +100,8 @@ def __init__(
         retry_on_blocked: bool = True,
         proxy_configuration: ProxyConfiguration | None = None,
         statistics: Statistics | None = None,
-        browser_pool: BrowserPool | None = None,
-        use_browser_pool: bool = False,
         _context_pipeline: ContextPipeline[TCrawlingContext] | None = None,
+        _additional_context_managers: Sequence[AsyncContextManager] | None = None,
     ) -> None:
         """Initialize the BasicCrawler.

@@ -131,9 +122,8 @@ def __init__(
             proxy_configuration: A HTTP proxy configuration to be used for making requests
             statistics: A preconfigured `Statistics` instance if you wish to use non-default configuration
-            browser_pool: A preconfigured `BrowserPool` instance for browser crawling.
-            use_browser_pool: Enables using the browser pool for crawling.
             _context_pipeline: Allows extending the request lifecycle and modifying the crawling context.
                 This parameter is meant to be used by child classes, not when BasicCrawler is instantiated directly.
+            _additional_context_managers: Additional context managers to be used in the crawler lifecycle.
""" self._router: Router[TCrawlingContext] | None = None @@ -186,10 +177,7 @@ def __init__( event_manager=self._event_manager, log_message=f'{logger.name} request statistics', ) - - self._use_browser_pool = use_browser_pool - if self._use_browser_pool: - self._browser_pool = browser_pool or BrowserPool() + self._additional_context_managers = _additional_context_managers or [] self._running = False self._has_finished_before = False @@ -304,8 +292,8 @@ async def run(self, requests: list[str | BaseRequestData] | None = None) -> Fina if self._use_session_pool: await exit_stack.enter_async_context(self._session_pool) - if self._use_browser_pool: - await exit_stack.enter_async_context(self._browser_pool) + for context_manager in self._additional_context_managers: + await exit_stack.enter_async_context(context_manager) await self._pool.run() diff --git a/src/crawlee/beautifulsoup_crawler/__init__.py b/src/crawlee/beautifulsoup_crawler/__init__.py index 13b1f7c47..29dbb443d 100644 --- a/src/crawlee/beautifulsoup_crawler/__init__.py +++ b/src/crawlee/beautifulsoup_crawler/__init__.py @@ -1,2 +1,8 @@ -from .beautifulsoup_crawler import BeautifulSoupCrawler -from .types import BeautifulSoupCrawlingContext +try: + from .beautifulsoup_crawler import BeautifulSoupCrawler + from .types import BeautifulSoupCrawlingContext +except ImportError as exc: + raise ImportError( + 'To import anything from this subpacakge, you need to install the "beautifulsoup" extra. ' + 'For example, if you use pip, run "pip install crawlee[beautifulsoup]".', + ) from exc diff --git a/src/crawlee/beautifulsoup_crawler/beautifulsoup_crawler.py b/src/crawlee/beautifulsoup_crawler/beautifulsoup_crawler.py index 0f9cfafb8..4ce61eee1 100644 --- a/src/crawlee/beautifulsoup_crawler/beautifulsoup_crawler.py +++ b/src/crawlee/beautifulsoup_crawler/beautifulsoup_crawler.py @@ -3,6 +3,7 @@ import asyncio from typing import TYPE_CHECKING, Any, AsyncGenerator, Iterable, Literal +from bs4 import BeautifulSoup, Tag from typing_extensions import Unpack from crawlee._utils.blocked import RETRY_CSS_SELECTORS @@ -102,8 +103,6 @@ async def _handle_blocked_request( async def _parse_http_response( self, context: HttpCrawlingContext ) -> AsyncGenerator[BeautifulSoupCrawlingContext, None]: - from bs4 import BeautifulSoup, Tag - soup = await asyncio.to_thread(lambda: BeautifulSoup(context.http_response.read(), self._parser)) async def enqueue_links( diff --git a/src/crawlee/browsers/__init__.py b/src/crawlee/browsers/__init__.py index 4713b4eaf..8d4d642e2 100644 --- a/src/crawlee/browsers/__init__.py +++ b/src/crawlee/browsers/__init__.py @@ -1,2 +1,8 @@ -from .browser_pool import BrowserPool -from .playwright_browser_plugin import PlaywrightBrowserPlugin +try: + from .browser_pool import BrowserPool + from .playwright_browser_plugin import PlaywrightBrowserPlugin +except ImportError as exc: + raise ImportError( + 'To import anything from this subpacakge, you need to install the "playwright" extra. 
+        'For example, if you use pip, run "pip install crawlee[playwright]".',
+    ) from exc
diff --git a/src/crawlee/browsers/browser_pool.py b/src/crawlee/browsers/browser_pool.py
index ca32616b3..35a1932ba 100644
--- a/src/crawlee/browsers/browser_pool.py
+++ b/src/crawlee/browsers/browser_pool.py
@@ -4,9 +4,10 @@

 import asyncio
 import itertools
+from collections import defaultdict
 from datetime import timedelta
 from logging import getLogger
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Any, Literal
 from weakref import WeakValueDictionary

 from crawlee._utils.crypto import crypto_random_object_id
@@ -55,6 +56,32 @@ def __init__(
         self._pages = WeakValueDictionary[str, CrawleePage]()  # Track the pages in the pool
         self._plugins_cycle = itertools.cycle(self._plugins)  # Cycle through the plugins

+    @classmethod
+    def with_default_plugin(
+        cls,
+        *,
+        headless: bool | None = None,
+        browser_type: Literal['chromium', 'firefox', 'webkit'] | None = None,
+        **kwargs: Any,
+    ) -> BrowserPool:
+        """Create a new instance with a single `PlaywrightBrowserPlugin` configured with the provided options.
+
+        Args:
+            headless: Whether to run the browser in headless mode.
+            browser_type: The type of browser to launch ('chromium', 'firefox', or 'webkit').
+            kwargs: Additional arguments for the default constructor.
+        """
+        plugin_options: dict = defaultdict(dict)
+
+        if headless is not None:
+            plugin_options['browser_options']['headless'] = headless
+
+        if browser_type:
+            plugin_options['browser_type'] = browser_type
+
+        plugin = PlaywrightBrowserPlugin(**plugin_options)
+        return cls(plugins=[plugin], **kwargs)
+
     @property
     def plugins(self) -> Sequence[BaseBrowserPlugin]:
         """Return the browser plugins."""
diff --git a/src/crawlee/playwright_crawler/__init__.py b/src/crawlee/playwright_crawler/__init__.py
index 79103680c..38449256f 100644
--- a/src/crawlee/playwright_crawler/__init__.py
+++ b/src/crawlee/playwright_crawler/__init__.py
@@ -1,2 +1,8 @@
-from .playwright_crawler import PlaywrightCrawler
-from .types import PlaywrightCrawlingContext
+try:
+    from .playwright_crawler import PlaywrightCrawler
+    from .types import PlaywrightCrawlingContext
+except ImportError as exc:
+    raise ImportError(
+        'To import anything from this subpackage, you need to install the "playwright" extra. '
+        'For example, if you use pip, run "pip install crawlee[playwright]".',
+    ) from exc
diff --git a/src/crawlee/playwright_crawler/playwright_crawler.py b/src/crawlee/playwright_crawler/playwright_crawler.py
index de840fc37..c173dab3c 100644
--- a/src/crawlee/playwright_crawler/playwright_crawler.py
+++ b/src/crawlee/playwright_crawler/playwright_crawler.py
@@ -1,17 +1,11 @@
 from __future__ import annotations

-from collections import defaultdict
 from typing import TYPE_CHECKING, Literal

 from typing_extensions import Unpack

-from crawlee.basic_crawler import (
-    BasicCrawler,
-    BasicCrawlerOptions,
-    BasicCrawlingContext,
-    ContextPipeline,
-)
-from crawlee.browsers import BrowserPool, PlaywrightBrowserPlugin
+from crawlee.basic_crawler import BasicCrawler, BasicCrawlerOptions, BasicCrawlingContext, ContextPipeline
+from crawlee.browsers import BrowserPool
 from crawlee.playwright_crawler.types import PlaywrightCrawlingContext

 if TYPE_CHECKING:
@@ -23,21 +17,21 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext]):
     def __init__(
         self,
-        headless: bool | None = None,
+        browser_pool: BrowserPool | None = None,
         browser_type: Literal['chromium', 'firefox', 'webkit'] | None = None,
+        headless: bool | None = None,
         **kwargs: Unpack[BasicCrawlerOptions[PlaywrightCrawlingContext]],
     ) -> None:
         """Create a new instance.

         Args:
-            headless: Whether to run the browser in headless mode.
-                This option should not be used if `browser_pool` is provided.
+            browser_pool: A `BrowserPool` instance to be used for launching the browsers and getting pages.
             browser_type: The type of browser to launch ('chromium', 'firefox', or 'webkit').
                 This option should not be used if `browser_pool` is provided.
-            kwargs: Additional arguments to be forwarded to the underlying BasicCrawler.
+            headless: Whether to run the browser in headless mode.
+                This option should not be used if `browser_pool` is provided.
+            kwargs: Additional arguments to be forwarded to the underlying `BasicCrawler`.
         """
-        browser_pool = kwargs.get('browser_pool')
-
         if browser_pool:
             # Raise an exception if browser_pool is provided together with headless or browser_type arguments.
             if headless is not None or browser_type is not None:
@@ -47,25 +41,22 @@ def __init__(
         # If browser_pool is not provided, create a new instance of BrowserPool with specified arguments.
         else:
-            plugin_options: dict = defaultdict(dict)
-
-            if headless is not None:
-                plugin_options['browser_options']['headless'] = headless
-
-            if browser_type:
-                plugin_options['browser_type'] = browser_type
+            browser_pool = BrowserPool.with_default_plugin(headless=headless, browser_type=browser_type)

-            browser_pool = BrowserPool(plugins=[PlaywrightBrowserPlugin(**plugin_options)])
-            kwargs['browser_pool'] = browser_pool
+        self._browser_pool = browser_pool

-        kwargs['use_browser_pool'] = True
         kwargs['_context_pipeline'] = ContextPipeline().compose(self._page_goto)
+        kwargs['_additional_context_managers'] = [self._browser_pool]
+
         super().__init__(**kwargs)

     async def _page_goto(
         self,
         context: BasicCrawlingContext,
     ) -> AsyncGenerator[PlaywrightCrawlingContext, None]:
+        if self._browser_pool is None:
+            raise ValueError('Browser pool is not initialized.')
+
         crawlee_page = await self._browser_pool.new_page()
         await crawlee_page.page.goto(context.request.url)
         context.request.loaded_url = crawlee_page.page.url
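
As a quick usage illustration (not part of the patch): `PlaywrightCrawler` now either accepts a preconfigured pool or builds one through the new `BrowserPool.with_default_plugin()` factory. A minimal sketch, assuming the "playwright" extra is installed:

```python
from crawlee.browsers import BrowserPool
from crawlee.playwright_crawler import PlaywrightCrawler

# Either let the crawler assemble a default pool from the keyword arguments...
crawler = PlaywrightCrawler(headless=True, browser_type='firefox')

# ...or build the pool up front and pass it in. Combining browser_pool with
# headless/browser_type is rejected by the constructor (see the guard above).
pool = BrowserPool.with_default_plugin(headless=True)
crawler_with_pool = PlaywrightCrawler(browser_pool=pool)
```

Either way, the pool enters the crawler's lifecycle through the new `_additional_context_managers` hook, which is why `BasicCrawler` no longer needs to import anything from `crawlee.browsers`.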