Skip to content

Commit

Permalink
refactor: Extract IRequestList interface from the RequestList class
Browse files Browse the repository at this point in the history
  • Loading branch information
janbuchar committed May 24, 2024
1 parent a5dca80 commit 265df3f
Show file tree
Hide file tree
Showing 2 changed files with 70 additions and 20 deletions.
10 changes: 5 additions & 5 deletions packages/basic-crawler/src/internals/basic-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@ import type {
DatasetExportOptions,
FinalStatistics,
GetUserDataFromRequest,
IRequestList,
ProxyInfo,
Request,
RequestList,
RequestOptions,
RequestProvider,
RouterHandler,
Expand Down Expand Up @@ -170,7 +170,7 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
* > Alternatively, `requests` parameter of {@apilink BasicCrawler.run|`crawler.run()`} could be used to enqueue the initial requests -
* it is a shortcut for running `crawler.addRequests()` before the `crawler.run()`.
*/
requestList?: RequestList;
requestList?: IRequestList;

/**
* Dynamic queue of URLs to be processed. This is useful for recursive crawling of websites.
Expand Down Expand Up @@ -444,7 +444,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
* A reference to the underlying {@apilink RequestList} class that manages the crawler's {@apilink Request|requests}.
* Only available if used by the crawler.
*/
requestList?: RequestList;
requestList?: IRequestList;

/**
* Dynamic queue of URLs to be processed. This is useful for recursive crawling of websites.
Expand Down Expand Up @@ -1166,7 +1166,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
* adding it back to the queue after the timeout passes. Returns `true` if the request
* should be ignored and will be reclaimed to the queue once ready.
*/
protected delayRequest(request: Request, source: RequestList | RequestProvider) {
protected delayRequest(request: Request, source: IRequestList | RequestProvider) {
const domain = getDomain(request.url);

if (!domain || !request) {
Expand Down Expand Up @@ -1410,7 +1410,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
protected async _requestFunctionErrorHandler(
error: Error,
crawlingContext: Context,
source: RequestList | RequestProvider,
source: IRequestList | RequestProvider,
): Promise<void> {
const { request } = crawlingContext;
request.pushErrorMessage(error);
Expand Down
80 changes: 65 additions & 15 deletions packages/core/src/storages/request_list.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,64 @@ export const REQUESTS_PERSISTENCE_KEY = 'REQUEST_LIST_REQUESTS';

const CONTENT_TYPE_BINARY = 'application/octet-stream';

/**
 * Represents a static list of URLs to crawl.
 *
 * This is the contract a request-list implementation exposes to its consumers:
 * fetching the next request, reporting progress, and reclaiming or completing
 * requests that were handed out.
 */
export interface IRequestList {
/**
 * Returns the total number of unique requests present in the list.
 *
 * @returns The number of unique requests in the list.
 */
length(): number;

/**
 * Returns `true` if all requests were already handled and there are no more left.
 *
 * @returns A promise resolving to `true` once every request has been handled.
 */
isFinished(): Promise<boolean>;

/**
 * Resolves to `true` if the next call to {@apilink IRequestList.fetchNextRequest} function
 * would return `null`, otherwise it resolves to `false`.
 * Note that even if the list is empty, there might be some pending requests currently being processed.
 *
 * @returns A promise resolving to `true` when there is nothing left to fetch.
 */
isEmpty(): Promise<boolean>;

/**
 * Returns number of handled requests.
 *
 * @returns The count of requests that were handled so far.
 */
handledCount(): number;

/**
 * If supported, persists the current state of the request list into the default {@apilink KeyValueStore}.
 *
 * @returns A promise that resolves when the operation completes.
 */
persistState(): Promise<void>;

/**
 * Gets the next {@apilink Request} to process. First, the function gets a request previously reclaimed
 * using the {@apilink IRequestList.reclaimRequest} function, if there is any.
 * Otherwise it gets the next request from sources.
 *
 * The function's `Promise` resolves to `null` if there are no more
 * requests to process.
 *
 * @returns A promise resolving to the next request, or `null` when none remain.
 */
fetchNextRequest(): Promise<Request | null>;

/**
 * Reclaims request to the list if its processing failed.
 * The request will become available in the next `this.fetchNextRequest()`.
 *
 * @param request The request whose processing failed and that should be offered again.
 */
reclaimRequest(request: Request): Promise<void>;

/**
 * Marks request as handled after successful processing.
 *
 * @param request The request that was processed successfully.
 */
markRequestHandled(request: Request): Promise<void>;

/**
 * NOTE(review): presumably the set of unique keys of requests that were fetched
 * but not yet marked as handled or reclaimed - confirm against the implementations.
 *
 * @internal
 */
inProgress: Set<string>;
}

export interface RequestListOptions {
/**
* An array of sources of URLs for the {@apilink RequestList}. It can be either an array of strings,
Expand Down Expand Up @@ -229,7 +287,7 @@ export interface RequestListOptions {
* ```
* @category Sources
*/
export class RequestList {
export class RequestList implements IRequestList {
private log = log.child({ prefix: 'RequestList' });

/**
Expand Down Expand Up @@ -570,9 +628,7 @@ export class RequestList {
}

/**
* Resolves to `true` if the next call to {@apilink RequestList.fetchNextRequest} function
* would return `null`, otherwise it resolves to `false`.
* Note that even if the list is empty, there might be some pending requests currently being processed.
* @inheritdoc
*/
async isEmpty(): Promise<boolean> {
this._ensureIsInitialized();
Expand All @@ -581,7 +637,7 @@ export class RequestList {
}

/**
* Returns `true` if all requests were already handled and there are no more left.
* @inheritdoc
*/
async isFinished(): Promise<boolean> {
this._ensureIsInitialized();
Expand All @@ -590,12 +646,7 @@ export class RequestList {
}

/**
* Gets the next {@apilink Request} to process. First, the function gets a request previously reclaimed
* using the {@apilink RequestList.reclaimRequest} function, if there is any.
* Otherwise it gets the next request from sources.
*
* The function's `Promise` resolves to `null` if there are no more
* requests to process.
* @inheritdoc
*/
async fetchNextRequest(): Promise<Request | null> {
this._ensureIsInitialized();
Expand Down Expand Up @@ -631,7 +682,7 @@ export class RequestList {
}

/**
* Marks request as handled after successful processing.
* @inheritdoc
*/
async markRequestHandled(request: Request): Promise<void> {
const { uniqueKey } = request;
Expand All @@ -645,8 +696,7 @@ export class RequestList {
}

/**
* Reclaims request to the list if its processing failed.
* The request will become available in the next `this.fetchNextRequest()`.
* @inheritdoc
*/
async reclaimRequest(request: Request): Promise<void> {
const { uniqueKey } = request;
Expand Down Expand Up @@ -798,7 +848,7 @@ export class RequestList {
}

/**
* Returns number of handled requests.
* @inheritdoc
*/
handledCount(): number {
this._ensureIsInitialized();
Expand Down

0 comments on commit 265df3f

Please sign in to comment.