feat: exceeding maxSessionRotations calls failedRequestHandler (#2029)
Any request exceeding the `maxSessionRotations` limit currently kills
the crawler. This was intended as an early exit on too many hard proxy
errors, but it proved confusing for users of `retryOnBlocked` (any page
that the crawler cannot access due to bot protection kills the whole
run).

With this PR, requests that cross the `maxSessionRotations` limit are
now processed by `failedRequestHandler` instead. Not sure if this counts
as breaking (it might confuse a dev or two), but it's in line with
how the crawlers worked before the `SessionError` update (so... it's
actually reverting a hidden breaking change?)
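
For context, here's a minimal sketch of how the new behaviour surfaces to users (the proxy URLs, target URL, and handler bodies below are made-up placeholders, not taken from this diff):

```ts
import { CheerioCrawler, ProxyConfiguration } from 'crawlee';

// Hypothetical proxies that keep failing, forcing session rotations.
const proxyConfiguration = new ProxyConfiguration({
    proxyUrls: ['http://localhost:8000', 'http://localhost:8001'],
});

const crawler = new CheerioCrawler({
    proxyConfiguration,
    // Rotate the session at most 5 times per request on proxy/blocking errors.
    maxSessionRotations: 5,
    requestHandler: async ({ request, log }) => {
        log.info(`Processed ${request.url}`);
    },
    // With this PR, a request that exhausts its session rotations is handed
    // to this handler instead of killing the whole crawler run.
    failedRequestHandler: async ({ request, log }, error) => {
        log.warning(`Giving up on ${request.url}: ${error.message}`);
    },
});

await crawler.run(['https://example.com']);
```

Previously the same setup would make `crawler.run()` reject as soon as a single request exhausted its rotations; now the run finishes and only that request is routed to `failedRequestHandler`.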

Closes #2028
barjin authored Aug 9, 2023
1 parent b01056d commit b1cb108
Showing 3 changed files with 35 additions and 24 deletions.
17 changes: 8 additions & 9 deletions packages/basic-crawler/src/internals/basic-crawler.ts
@@ -1188,10 +1188,6 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
private async _rotateSession(crawlingContext: Context) {
const { request } = crawlingContext;

if ((request.sessionRotationCount ?? 0) >= this.maxSessionRotations) {
throw new Error(`Request failed because of proxy-related errors ${request.sessionRotationCount} times. `
+ 'This might be caused by a misconfigured proxy or an invalid session pool configuration.');
}
request.sessionRotationCount ??= 0;
request.sessionRotationCount++;
crawlingContext.session?.retire();
@@ -1302,16 +1298,19 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
}

protected _canRequestBeRetried(request: Request, error: Error) {
// Request should never be retried, or the error encountered makes it not able to be retried, or the session rotation limit has been reached
if (request.noRetry
|| (error instanceof NonRetryableError)
|| (error instanceof SessionError && (this.maxSessionRotations <= (request.sessionRotationCount ?? 0)))
) {
return false;
}

// User requested retry (we ignore retry count here as its explicitly told by the user to retry)
if (error instanceof RetryRequestError) {
return true;
}

// Request should never be retried, or the error encountered makes it not able to be retried
if (request.noRetry || (error instanceof NonRetryableError)) {
return false;
}

// Ensure there are more retries available for the request
const maxRequestRetries = request.maxRetries ?? this.maxRequestRetries;
return request.retryCount < maxRequestRetries;
17 changes: 11 additions & 6 deletions test/core/crawlers/browser_crawler.test.ts
@@ -790,6 +790,7 @@ describe('BrowserCrawler', () => {
test('proxy rotation on error works as expected', async () => {
const goodProxyUrl = 'http://good.proxy';
const proxyConfiguration = new ProxyConfiguration({ proxyUrls: ['http://localhost', 'http://localhost:1234', goodProxyUrl] });
const requestHandler = jest.fn();

const browserCrawler = new class extends BrowserCrawlerTest {
protected override async _navigationHandler(ctx: PuppeteerCrawlingContext): Promise<HTTPResponse | null | undefined> {
@@ -811,19 +812,21 @@
maxConcurrency: 1,
useSessionPool: true,
proxyConfiguration,
requestHandler: async () => {},
requestHandler,
});

await expect(browserCrawler.run()).resolves.not.toThrow();
expect(requestHandler).toHaveBeenCalledTimes(requestList.length());
});

test('proxy rotation on error stops after maxSessionRotations limit', async () => {
test('proxy rotation on error respects maxSessionRotations, calls failedRequestHandler', async () => {
const proxyConfiguration = new ProxyConfiguration({ proxyUrls: ['http://localhost', 'http://localhost:1234'] });
const failedRequestHandler = jest.fn();

/**
* The first increment is the base case when the proxy is retrieved for the first time.
*/
let numberOfRotations = -1;
let numberOfRotations = -requestList.length();
const browserCrawler = new class extends BrowserCrawlerTest {
protected override async _navigationHandler(ctx: PuppeteerCrawlingContext): Promise<HTTPResponse | null | undefined> {
const { session } = ctx;
@@ -846,10 +849,12 @@
maxConcurrency: 1,
proxyConfiguration,
requestHandler: async () => {},
failedRequestHandler,
});

await expect(browserCrawler.run()).rejects.toThrow();
expect(numberOfRotations).toBe(5);
await browserCrawler.run();
expect(failedRequestHandler).toBeCalledTimes(requestList.length());
expect(numberOfRotations).toBe(requestList.length() * 5);
});

test('proxy rotation logs the original proxy error', async () => {
@@ -881,7 +886,7 @@

const spy = jest.spyOn((crawler as any).log, 'warning' as any).mockImplementation(() => {});

await expect(crawler.run([serverAddress])).rejects.toThrow();
await crawler.run([serverAddress]);

expect(spy).toBeCalled();
expect(spy.mock.calls[0][0]).toEqual(expect.stringContaining(proxyError));
25 changes: 16 additions & 9 deletions test/core/crawlers/cheerio_crawler.test.ts
@@ -698,37 +698,43 @@ describe('CheerioCrawler', () => {
test('proxy rotation on error works as expected', async () => {
const goodProxyUrl = 'http://good.proxy';
const proxyConfiguration = new ProxyConfiguration({ proxyUrls: ['http://localhost', 'http://localhost:1234', goodProxyUrl] });
const check = jest.fn();

const crawler = new class extends CheerioCrawler {
protected override async _requestFunction({ proxyUrl }: any): Promise<any> {
if (proxyUrl !== goodProxyUrl) {
throw new Error('Proxy responded with 400 - Bad request');
protected override async _requestFunction(...args: any[]): Promise<any> {
check(...args);

if (args[0].proxyUrl === goodProxyUrl) {
return null;
}

return null;
throw new Error('Proxy responded with 400 - Bad request');
}
}({
maxRequestRetries: 0,
maxSessionRotations: 2,
maxConcurrency: 1,
useSessionPool: true,
proxyConfiguration,
requestHandler: async () => {},
requestHandler: () => {},
});

await expect(crawler.run([serverAddress])).resolves.not.toThrow();
expect(check).toBeCalledWith(expect.objectContaining({ proxyUrl: goodProxyUrl }));
});

test('proxy rotation on error stops after maxSessionRotations limit', async () => {
test('proxy rotation on error respects maxSessionRotations, calls failedRequestHandler', async () => {
const proxyConfiguration = new ProxyConfiguration({ proxyUrls: ['http://localhost', 'http://localhost:1234'] });

/**
* The first increment is the base case when the proxy is retrieved for the first time.
*/
let numberOfRotations = -1;
const failedRequestHandler = jest.fn();
const crawler = new CheerioCrawler({
proxyConfiguration,
maxSessionRotations: 5,
requestHandler: async () => {},
failedRequestHandler,
});

jest.spyOn(crawler, '_requestAsBrowser' as any).mockImplementation(async ({ proxyUrl }: any) => {
@@ -740,7 +746,8 @@
return null;
});

await expect(crawler.run([serverAddress])).rejects.toThrow();
await crawler.run([serverAddress]);
expect(failedRequestHandler).toBeCalledTimes(1);
expect(numberOfRotations).toBe(5);
});

@@ -765,7 +772,7 @@

const spy = jest.spyOn((crawler as any).log, 'warning' as any).mockImplementation(() => {});

await expect(crawler.run([serverAddress])).rejects.toThrow();
await crawler.run([serverAddress]);

expect(spy).toBeCalled();
expect(spy.mock.calls[0][0]).toEqual(expect.stringContaining(proxyError));
