fix: purge request queue on repeated crawler runs #377

Merged · 4 commits · Jul 30, 2024
Changes from 3 commits
src/crawlee/basic_crawler/basic_crawler.py (20 changes: 18 additions & 2 deletions)
@@ -324,8 +324,19 @@ def failed_request_handler(
         self._failed_request_handler = handler
         return handler

-    async def run(self, requests: Sequence[str | BaseRequestData | Request] | None = None) -> FinalStatistics:
-        """Run the crawler until all requests are processed."""
+    async def run(
+        self,
+        requests: Sequence[str | BaseRequestData | Request] | None = None,
+        *,
+        purge_request_queue: bool = True,
+    ) -> FinalStatistics:
+        """Run the crawler until all requests are processed.
+
+        Args:
+            requests: The requests to be enqueued before the crawler starts
+            purge_request_queue: If this is `True` and the crawler is not being run for the first time, the default
+                request queue will be purged
+        """
         if self._running:
             raise RuntimeError(
                 'This crawler instance is already running, you can add more requests to it via `crawler.add_requests()`'
@@ -339,6 +350,11 @@ async def run(self, requests: Sequence[str | BaseRequestData | Request] | None =
         if self._use_session_pool:
             await self._session_pool.reset_store()

+        request_provider = await self.get_request_provider()
+        if purge_request_queue and isinstance(request_provider, RequestQueue):
+            await request_provider.drop()
+            self._request_provider = await RequestQueue.open(configuration=self._configuration)
+
         if requests is not None:
             await self.add_requests(requests)

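For context, here is a minimal usage sketch of the new flag. The run() signature, the default-handler decorator, and context.request.url are taken from the diff above; the import path and URLs are assumptions for illustration.

import asyncio

# Import path is an assumption; adjust to your crawlee version.
from crawlee.basic_crawler import BasicCrawler, BasicCrawlingContext


async def main() -> None:
    crawler = BasicCrawler()
    visited: list[str] = []

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
        visited.append(context.request.url)

    # Default behaviour with this change: every run() purges the default request
    # queue first, so repeated runs process the same start URLs again.
    await crawler.run(['http://a.com', 'http://b.com'])
    await crawler.run(['http://a.com', 'http://b.com'])
    print(visited)  # expected: each URL appears twice

    # Opting out keeps the existing queue; requests already handled in earlier
    # runs are not reprocessed (assumes the queue's usual deduplication).
    await crawler.run(['http://a.com', 'http://b.com'], purge_request_queue=False)


asyncio.run(main())

Passing purge_request_queue=False is the opt-out for callers that deliberately want to resume from the queue left over by a previous run.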
tests/unit/basic_crawler/test_basic_crawler.py (23 changes: 22 additions & 1 deletion)
Expand Up @@ -4,6 +4,7 @@
import asyncio
import json
import logging
from collections import Counter
from dataclasses import dataclass
from datetime import timedelta
from typing import TYPE_CHECKING, Any
Expand Down Expand Up @@ -589,6 +590,26 @@ def test_crawler_log() -> None:
crawler.log.info('Test log message')


async def test_consecutive_runs_purge_request_queue() -> None:
crawler = BasicCrawler()
visit = Mock()

@crawler.router.default_handler
async def handler(context: BasicCrawlingContext) -> None:
visit(context.request.url)

await crawler.run(['http://a.com', 'http://b.com', 'http://c.com'])
await crawler.run(['http://a.com', 'http://b.com', 'http://c.com'])
await crawler.run(['http://a.com', 'http://b.com', 'http://c.com'])

counter = Counter(args[0][0] for args in visit.call_args_list)
assert counter == {
'http://a.com': 3,
'http://b.com': 3,
'http://c.com': 3,
}


async def test_passes_configuration_to_storages() -> None:
configuration = Configuration(persist_storage=False, purge_on_start=True)

@@ -602,4 +623,4 @@ async def test_passes_configuration_to_storages() -> None:

     request_provider = await crawler.get_request_provider()
     assert isinstance(request_provider, RequestQueue)
-    assert request_provider._configuration is configuration
+    assert request_provider._configuration is configuration
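For reference, the purge step in run() amounts to dropping the default request queue and reopening it. A sketch of doing the same thing directly against the storage API follows; RequestQueue.open() and drop() are used exactly as in the diff above, while the import paths and the helper name are assumptions.

from crawlee.configuration import Configuration  # import paths are assumptions
from crawlee.storages import RequestQueue


async def reset_default_request_queue(configuration: Configuration) -> RequestQueue:
    # drop() removes the queue together with any pending or handled requests;
    # opening it again under the default name yields an empty queue, which is
    # what run() now does on repeated runs when purge_request_queue=True.
    queue = await RequestQueue.open(configuration=configuration)
    await queue.drop()
    return await RequestQueue.open(configuration=configuration)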