# fix: add explicit error messages for missing package extras during import (#165)
## Description

- Add explicit error messages for missing package extras during import.
- This applies to `BrowserPool`, `BeautifulSoupCrawler`, and `PlaywrightCrawler`.
- Until now, Playwright was a mandatory dependency even for `BasicCrawler`; this PR fixes that as well (see the sketch below).
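
The fix uses the standard optional-dependency pattern: each subpackage `__init__.py` wraps its imports in `try`/`except ImportError` and re-raises with a message naming the missing extra (see the diffs below). A consumer who prefers to degrade gracefully rather than crash can catch the error; a minimal sketch (the fallback choice is illustrative):

```python
# Sketch: fall back to the dependency-free BasicCrawler when the
# "beautifulsoup" extra is not installed. After this PR, importing
# BasicCrawler itself no longer requires Playwright.
try:
    from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler as Crawler
except ImportError:
    from crawlee.basic_crawler import BasicCrawler as Crawler
```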

## Related issues

- Closes #155 

## Testing

### PlaywrightCrawler

```
ImportError                               Traceback (most recent call last)
Cell In[3], line 1
----> 1 from crawlee.playwright_crawler import PlaywrightCrawler

File ~/Projects/crawlee-py/src/crawlee/playwright_crawler/__init__.py:5
      3     from .types import PlaywrightCrawlingContext
      4 except ImportError as exc:
----> 5     raise ImportError(
      6         'To use this module, you need to install the "playwright" extra. Run "pip install crawlee[playwright]".',
      7     ) from exc

ImportError: To use this module, you need to install the "playwright" extra. Run "pip install crawlee[playwright]".
```

### BeautifulsoupCrawler

```
ImportError                               Traceback (most recent call last)
Cell In[1], line 1
----> 1 from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler

File ~/Projects/crawlee-py/src/crawlee/beautifulsoup_crawler/__init__.py:5
      3     from .types import BeautifulSoupCrawlingContext
      4 except ImportError as exc:
----> 5     raise ImportError(
      6         'To use this module, you need to install the "beautifulsoup" extra. Run "pip install crawlee[beautifulsoup]".',
      7     ) from exc

ImportError: To use this module, you need to install the "beautifulsoup" extra. Run "pip install crawlee[beautifulsoup]".
```

### BrowserPool

```
ImportError                               Traceback (most recent call last)
Cell In[2], line 1
----> 1 from crawlee.browsers import BrowserPool

File ~/Projects/crawlee-py/src/crawlee/browsers/__init__.py:5
      3     from .playwright_browser_plugin import PlaywrightBrowserPlugin
      4 except ImportError as exc:
----> 5     raise ImportError(
      6         'To use this module, you need to install the "playwright" extra. Run "pip install crawlee[playwright]".',
      7     ) from exc

ImportError: To use this module, you need to install the "playwright" extra. Run "pip install crawlee[playwright]".
```

## Checklist

- [x] Changes are described in the `CHANGELOG.md`
- [x] CI passed
vdusek authored May 31, 2024
1 parent 382b6f4 commit 200ebfa
Showing 9 changed files with 83 additions and 55 deletions.

### `CHANGELOG.md` (3 additions & 1 deletion)

```diff
@@ -2,7 +2,9 @@
 
 ## [0.0.5](../../releases/tag/v0.0.5) - Unreleased
 
-- ...
+### Adds
+
+- Add explicit error messages for missing package extras during import
 
 ## [0.0.4](../../releases/tag/v0.0.4) - 2024-05-30
 
```

### `pyproject.toml` (3 additions & 0 deletions)

```diff
@@ -208,3 +208,6 @@ reportUnusedVariable = false
 reportCallInDefaultInitializer = false
 reportImplicitStringConcatenation = false
 reportAny = false
+
+[tool.ipdb]
+context = 7
```

### `src/crawlee/basic_crawler/basic_crawler.py` (9 additions & 21 deletions)

```diff
@@ -7,7 +7,7 @@
 from datetime import timedelta
 from functools import partial
 from logging import getLogger
-from typing import TYPE_CHECKING, Any, Callable, Generic, Union, cast
+from typing import TYPE_CHECKING, Any, AsyncContextManager, Callable, Generic, Union, cast
 
 import httpx
 from tldextract import TLDExtract
@@ -18,9 +18,7 @@
 from crawlee.autoscaling import AutoscaledPool, ConcurrencySettings
 from crawlee.autoscaling.snapshotter import Snapshotter
 from crawlee.autoscaling.system_status import SystemStatus
-from crawlee.basic_crawler.context_pipeline import (
-    ContextPipeline,
-)
+from crawlee.basic_crawler.context_pipeline import ContextPipeline
 from crawlee.basic_crawler.errors import (
     ContextPipelineInitializationError,
     ContextPipelineInterruptedError,
@@ -29,12 +27,7 @@
     UserDefinedErrorHandlerError,
 )
 from crawlee.basic_crawler.router import Router
-from crawlee.basic_crawler.types import (
-    BasicCrawlingContext,
-    RequestHandlerRunResult,
-    SendRequestFunction,
-)
-from crawlee.browsers import BrowserPool
+from crawlee.basic_crawler.types import BasicCrawlingContext, RequestHandlerRunResult, SendRequestFunction
 from crawlee.configuration import Configuration
 from crawlee.enqueue_strategy import EnqueueStrategy
 from crawlee.events.local_event_manager import LocalEventManager
@@ -76,9 +69,8 @@ class BasicCrawlerOptions(TypedDict, Generic[TCrawlingContext]):
     retry_on_blocked: NotRequired[bool]
     proxy_configuration: NotRequired[ProxyConfiguration]
     statistics: NotRequired[Statistics[StatisticsState]]
-    browser_pool: NotRequired[BrowserPool]
-    use_browser_pool: NotRequired[bool]
     _context_pipeline: NotRequired[ContextPipeline[TCrawlingContext]]
+    _additional_context_managers: NotRequired[Sequence[AsyncContextManager]]
 
 
 class BasicCrawler(Generic[TCrawlingContext]):
@@ -108,9 +100,8 @@ def __init__(
         retry_on_blocked: bool = True,
         proxy_configuration: ProxyConfiguration | None = None,
         statistics: Statistics | None = None,
-        browser_pool: BrowserPool | None = None,
-        use_browser_pool: bool = False,
         _context_pipeline: ContextPipeline[TCrawlingContext] | None = None,
+        _additional_context_managers: Sequence[AsyncContextManager] | None = None,
     ) -> None:
         """Initialize the BasicCrawler.
@@ -131,9 +122,9 @@ def __init__(
             proxy_configuration: A HTTP proxy configuration to be used for making requests
             statistics: A preconfigured `Statistics` instance if you wish to use non-default configuration
-            browser_pool: A preconfigured `BrowserPool` instance for browser crawling.
-            use_browser_pool: Enables using the browser pool for crawling.
             _context_pipeline: Allows extending the request lifecycle and modifying the crawling context.
                 This parameter is meant to be used by child classes, not when BasicCrawler is instantiated directly.
+            _additional_context_managers: Additional context managers to be used in the crawler lifecycle.
         """
         self._router: Router[TCrawlingContext] | None = None
 
@@ -186,10 +177,7 @@ def __init__(
             event_manager=self._event_manager,
             log_message=f'{logger.name} request statistics',
         )
-
-        self._use_browser_pool = use_browser_pool
-        if self._use_browser_pool:
-            self._browser_pool = browser_pool or BrowserPool()
+        self._additional_context_managers = _additional_context_managers or []
 
         self._running = False
         self._has_finished_before = False
@@ -304,8 +292,8 @@ async def run(
             if self._use_session_pool:
                 await exit_stack.enter_async_context(self._session_pool)
 
-            if self._use_browser_pool:
-                await exit_stack.enter_async_context(self._browser_pool)
+            for context_manager in self._additional_context_managers:
+                await exit_stack.enter_async_context(context_manager)
 
             await self._pool.run()
```
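
With this change, browser management moves out of `BasicCrawler`: instead of a hard dependency on `BrowserPool`, the base class accepts arbitrary async context managers and enters them on its exit stack in `run()`. A minimal sketch of that mechanism, with a hypothetical `Resource` standing in for a browser pool:

```python
import asyncio
from contextlib import AsyncExitStack


class Resource:
    """Hypothetical async context manager, standing in for e.g. a BrowserPool."""

    async def __aenter__(self) -> 'Resource':
        print('resource opened')
        return self

    async def __aexit__(self, *exc_info: object) -> None:
        print('resource closed')


async def run(additional_context_managers: list) -> None:
    # Mirrors the loop added to BasicCrawler.run(): every registered context
    # manager is entered on the exit stack and closed when the crawl ends.
    async with AsyncExitStack() as exit_stack:
        for context_manager in additional_context_managers:
            await exit_stack.enter_async_context(context_manager)
        print('crawling...')


asyncio.run(run([Resource()]))
```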

### `src/crawlee/beautifulsoup_crawler/__init__.py` (8 additions & 2 deletions)

```diff
@@ -1,2 +1,8 @@
-from .beautifulsoup_crawler import BeautifulSoupCrawler
-from .types import BeautifulSoupCrawlingContext
+try:
+    from .beautifulsoup_crawler import BeautifulSoupCrawler
+    from .types import BeautifulSoupCrawlingContext
+except ImportError as exc:
+    raise ImportError(
+        'To import anything from this subpackage, you need to install the "beautifulsoup" extra. '
+        'For example, if you use pip, run "pip install crawlee[beautifulsoup]".',
+    ) from exc
```

### `src/crawlee/beautifulsoup_crawler/beautifulsoup_crawler.py` (1 addition & 2 deletions)

```diff
@@ -3,6 +3,7 @@
 import asyncio
 from typing import TYPE_CHECKING, Any, AsyncGenerator, Iterable, Literal
 
+from bs4 import BeautifulSoup, Tag
 from typing_extensions import Unpack
 
 from crawlee._utils.blocked import RETRY_CSS_SELECTORS
@@ -102,8 +103,6 @@ async def _handle_blocked_request(
     async def _parse_http_response(
         self, context: HttpCrawlingContext
     ) -> AsyncGenerator[BeautifulSoupCrawlingContext, None]:
-        from bs4 import BeautifulSoup, Tag
-
         soup = await asyncio.to_thread(lambda: BeautifulSoup(context.http_response.read(), self._parser))
 
         async def enqueue_links(
```
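
Moving the `bs4` import from inside `_parse_http_response` to module level is deliberate: a function-local import only fails mid-crawl, while a module-level one fails as soon as the subpackage is imported, exactly where the new `__init__.py` guard can translate it. A small illustration (the function name is invented):

```python
# Eager, module-level import (new behaviour): a missing bs4 now fails at
# import time, where the package-level try/except re-raises it with a
# message naming the "beautifulsoup" extra.
from bs4 import BeautifulSoup


# Lazy, function-local import (old behaviour): the ImportError surfaced only
# when parsing first ran, mid-crawl.
def parse_lazily(html: str) -> str:
    from bs4 import BeautifulSoup  # raised here, at call time, if bs4 is missing
    return BeautifulSoup(html, 'html.parser').get_text()
```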

### `src/crawlee/browsers/__init__.py` (8 additions & 2 deletions)

```diff
@@ -1,2 +1,8 @@
-from .browser_pool import BrowserPool
-from .playwright_browser_plugin import PlaywrightBrowserPlugin
+try:
+    from .browser_pool import BrowserPool
+    from .playwright_browser_plugin import PlaywrightBrowserPlugin
+except ImportError as exc:
+    raise ImportError(
+        'To import anything from this subpackage, you need to install the "playwright" extra. '
+        'For example, if you use pip, run "pip install crawlee[playwright]".',
+    ) from exc
```

### `src/crawlee/browsers/browser_pool.py` (28 additions & 1 deletion)

```diff
@@ -4,9 +4,10 @@
 
 import asyncio
 import itertools
+from collections import defaultdict
 from datetime import timedelta
 from logging import getLogger
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Any, Literal
 from weakref import WeakValueDictionary
 
 from crawlee._utils.crypto import crypto_random_object_id
@@ -55,6 +56,32 @@ def __init__(
         self._pages = WeakValueDictionary[str, CrawleePage]()  # Track the pages in the pool
         self._plugins_cycle = itertools.cycle(self._plugins)  # Cycle through the plugins
 
+    @classmethod
+    def with_default_plugin(
+        cls,
+        *,
+        headless: bool | None = None,
+        browser_type: Literal['chromium', 'firefox', 'webkit'] | None = None,
+        **kwargs: Any,
+    ) -> BrowserPool:
+        """Create a new instance with a single `PlaywrightBrowserPlugin` configured with the provided options.
+
+        Args:
+            headless: Whether to run the browser in headless mode.
+            browser_type: The type of browser to launch ('chromium', 'firefox', or 'webkit').
+            kwargs: Additional arguments for default constructor.
+        """
+        plugin_options: dict = defaultdict(dict)
+
+        if headless is not None:
+            plugin_options['browser_options']['headless'] = headless
+
+        if browser_type:
+            plugin_options['browser_type'] = browser_type
+
+        plugin = PlaywrightBrowserPlugin(**plugin_options)
+        return cls(plugins=[plugin], **kwargs)
+
     @property
     def plugins(self) -> Sequence[BaseBrowserPlugin]:
         """Return the browser plugins."""
```
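
The new `with_default_plugin()` classmethod packages the plugin-options plumbing that previously lived inside `PlaywrightCrawler`. Usage might look like this (a sketch; it assumes the "playwright" extra is installed and browsers are provisioned via `playwright install`):

```python
import asyncio

from crawlee.browsers import BrowserPool


async def main() -> None:
    # One-liner for the common case: a pool backed by a single
    # PlaywrightBrowserPlugin with the given launch options.
    pool = BrowserPool.with_default_plugin(headless=True, browser_type='firefox')

    # BrowserPool is an async context manager; BasicCrawler now enters it
    # the same way via _additional_context_managers.
    async with pool:
        crawlee_page = await pool.new_page()
        await crawlee_page.page.goto('https://example.com')
        print(crawlee_page.page.url)


asyncio.run(main())
```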

### `src/crawlee/playwright_crawler/__init__.py` (8 additions & 2 deletions)

```diff
@@ -1,2 +1,8 @@
-from .playwright_crawler import PlaywrightCrawler
-from .types import PlaywrightCrawlingContext
+try:
+    from .playwright_crawler import PlaywrightCrawler
+    from .types import PlaywrightCrawlingContext
+except ImportError as exc:
+    raise ImportError(
+        'To import anything from this subpackage, you need to install the "playwright" extra. '
+        'For example, if you use pip, run "pip install crawlee[playwright]".',
+    ) from exc
```

### `src/crawlee/playwright_crawler/playwright_crawler.py` (15 additions & 24 deletions)

```diff
@@ -1,17 +1,11 @@
 from __future__ import annotations
 
-from collections import defaultdict
 from typing import TYPE_CHECKING, Literal
 
 from typing_extensions import Unpack
 
-from crawlee.basic_crawler import (
-    BasicCrawler,
-    BasicCrawlerOptions,
-    BasicCrawlingContext,
-    ContextPipeline,
-)
-from crawlee.browsers import BrowserPool, PlaywrightBrowserPlugin
+from crawlee.basic_crawler import BasicCrawler, BasicCrawlerOptions, BasicCrawlingContext, ContextPipeline
+from crawlee.browsers import BrowserPool
 from crawlee.playwright_crawler.types import PlaywrightCrawlingContext
 
 if TYPE_CHECKING:
@@ -23,21 +17,21 @@ class PlaywrightCrawler(BasicCrawler[PlaywrightCrawlingContext]):
 
     def __init__(
         self,
-        headless: bool | None = None,
+        browser_pool: BrowserPool | None = None,
         browser_type: Literal['chromium', 'firefox', 'webkit'] | None = None,
+        headless: bool | None = None,
         **kwargs: Unpack[BasicCrawlerOptions[PlaywrightCrawlingContext]],
     ) -> None:
         """Create a new instance.
 
         Args:
-            headless: Whether to run the browser in headless mode.
-                This option should not be used if `browser_pool` is provided.
+            browser_pool: A `BrowserPool` instance to be used for launching the browsers and getting pages.
             browser_type: The type of browser to launch ('chromium', 'firefox', or 'webkit').
                 This option should not be used if `browser_pool` is provided.
-            kwargs: Additional arguments to be forwarded to the underlying BasicCrawler.
+            headless: Whether to run the browser in headless mode.
+                This option should not be used if `browser_pool` is provided.
+            kwargs: Additional arguments to be forwarded to the underlying `BasicCrawler`.
         """
-        browser_pool = kwargs.get('browser_pool')
-
         if browser_pool:
             # Raise an exception if browser_pool is provided together with headless or browser_type arguments.
             if headless is not None or browser_type is not None:
@@ -47,25 +41,22 @@ def __init__(
 
         # If browser_pool is not provided, create a new instance of BrowserPool with specified arguments.
         else:
-            plugin_options: dict = defaultdict(dict)
-
-            if headless is not None:
-                plugin_options['browser_options']['headless'] = headless
-
-            if browser_type:
-                plugin_options['browser_type'] = browser_type
+            browser_pool = BrowserPool.with_default_plugin(headless=headless, browser_type=browser_type)
 
-            browser_pool = BrowserPool(plugins=[PlaywrightBrowserPlugin(**plugin_options)])
-        kwargs['browser_pool'] = browser_pool
+        self._browser_pool = browser_pool
 
-        kwargs['use_browser_pool'] = True
         kwargs['_context_pipeline'] = ContextPipeline().compose(self._page_goto)
+        kwargs['_additional_context_managers'] = [self._browser_pool]
 
         super().__init__(**kwargs)
 
     async def _page_goto(
         self,
         context: BasicCrawlingContext,
     ) -> AsyncGenerator[PlaywrightCrawlingContext, None]:
-        if self._browser_pool is None:
-            raise ValueError('Browser pool is not initialized.')
-
         crawlee_page = await self._browser_pool.new_page()
         await crawlee_page.page.goto(context.request.url)
         context.request.loaded_url = crawlee_page.page.url
```
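
After the refactor, `PlaywrightCrawler` callers pick one of two mutually exclusive configuration paths; a sketch:

```python
from crawlee.browsers import BrowserPool
from crawlee.playwright_crawler import PlaywrightCrawler

# Path 1: pass launch options and let the crawler build a default pool
# internally via BrowserPool.with_default_plugin().
crawler = PlaywrightCrawler(headless=True, browser_type='chromium')

# Path 2: supply a preconfigured pool instead. Combining browser_pool with
# headless or browser_type raises an exception, per the check above.
pool = BrowserPool.with_default_plugin(headless=True)
crawler_with_pool = PlaywrightCrawler(browser_pool=pool)
```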
