diff --git a/packages/basic-crawler/src/internals/basic-crawler.ts b/packages/basic-crawler/src/internals/basic-crawler.ts index da07be54fde5..95a9834f05c0 100644 --- a/packages/basic-crawler/src/internals/basic-crawler.ts +++ b/packages/basic-crawler/src/internals/basic-crawler.ts @@ -14,9 +14,9 @@ import type { DatasetExportOptions, FinalStatistics, GetUserDataFromRequest, + IRequestList, ProxyInfo, Request, - RequestList, RequestOptions, RequestProvider, RouterHandler, @@ -170,7 +170,7 @@ export interface BasicCrawlerOptions Alternatively, `requests` parameter of {@apilink BasicCrawler.run|`crawler.run()`} could be used to enqueue the initial requests - * it is a shortcut for running `crawler.addRequests()` before the `crawler.run()`. */ - requestList?: RequestList; + requestList?: IRequestList; /** * Dynamic queue of URLs to be processed. This is useful for recursive crawling of websites. @@ -444,7 +444,7 @@ export class BasicCrawler { const { request } = crawlingContext; request.pushErrorMessage(error); diff --git a/packages/core/src/storages/request_list.ts b/packages/core/src/storages/request_list.ts index a46c32d5ef5f..a96b0241e80b 100644 --- a/packages/core/src/storages/request_list.ts +++ b/packages/core/src/storages/request_list.ts @@ -20,6 +20,64 @@ export const REQUESTS_PERSISTENCE_KEY = 'REQUEST_LIST_REQUESTS'; const CONTENT_TYPE_BINARY = 'application/octet-stream'; +/** + * Represents a static list of URLs to crawl. + */ +export interface IRequestList { + /** + * Returns the total number of unique requests present in the list. + */ + length(): number; + + /** + * Returns `true` if all requests were already handled and there are no more left. + */ + isFinished(): Promise; + + /** + * Resolves to `true` if the next call to {@apilink IRequestList.fetchNextRequest} function + * would return `null`, otherwise it resolves to `false`. + * Note that even if the list is empty, there might be some pending requests currently being processed. 
+ */ + isEmpty(): Promise; + + /** + * Returns the number of handled requests. + */ + handledCount(): number; + + /** + * If supported, persists the current state of the request list into the default {@apilink KeyValueStore}. + */ + persistState(): Promise; + + /** + * Gets the next {@apilink Request} to process. First, the function gets a request previously reclaimed + * using the {@apilink IRequestList.reclaimRequest} function, if there is any. + * Otherwise it gets the next request from sources. + * + * The function's `Promise` resolves to `null` if there are no more + * requests to process. + */ + fetchNextRequest(): Promise; + + /** + * Reclaims request to the list if its processing failed. + * The request will become available in the next `this.fetchNextRequest()`. + */ + reclaimRequest(request: Request): Promise; + + /** + * Marks request as handled after successful processing. + */ + markRequestHandled(request: Request): Promise; + + /** + * @internal + */ + inProgress: Set; +} + export interface RequestListOptions { /** * An array of sources of URLs for the {@apilink RequestList}. It can be either an array of strings, @@ -229,7 +287,7 @@ export interface RequestListOptions { * ``` * @category Sources */ -export class RequestList { +export class RequestList implements IRequestList { private log = log.child({ prefix: 'RequestList' }); /** @@ -570,9 +628,7 @@ export class RequestList { } /** - * Resolves to `true` if the next call to {@apilink RequestList.fetchNextRequest} function - * would return `null`, otherwise it resolves to `false`. - * Note that even if the list is empty, there might be some pending requests currently being processed. + * @inheritdoc */ async isEmpty(): Promise { this._ensureIsInitialized(); @@ -581,7 +637,7 @@ export class RequestList { } /** - * Returns `true` if all requests were already handled and there are no more left. 
+ * @inheritdoc */ async isFinished(): Promise { this._ensureIsInitialized(); @@ -590,12 +646,7 @@ export class RequestList { } /** - * Gets the next {@apilink Request} to process. First, the function gets a request previously reclaimed - * using the {@apilink RequestList.reclaimRequest} function, if there is any. - * Otherwise it gets the next request from sources. - * - * The function's `Promise` resolves to `null` if there are no more - * requests to process. + * @inheritdoc */ async fetchNextRequest(): Promise { this._ensureIsInitialized(); @@ -631,7 +682,7 @@ export class RequestList { } /** - * Marks request as handled after successful processing. + * @inheritdoc */ async markRequestHandled(request: Request): Promise { const { uniqueKey } = request; @@ -645,8 +696,7 @@ export class RequestList { } /** - * Reclaims request to the list if its processing failed. - * The request will become available in the next `this.fetchNextRequest()`. + * @inheritdoc */ async reclaimRequest(request: Request): Promise { const { uniqueKey } = request; @@ -798,7 +848,7 @@ export class RequestList { } /** - * Returns number of handled requests. + * @inheritdoc */ handledCount(): number { this._ensureIsInitialized();