From b1cb108882ab28d956adfc3d77ba9813507823f6 Mon Sep 17 00:00:00 2001
From: Jindřich Bär
Date: Wed, 9 Aug 2023 17:16:22 +0200
Subject: [PATCH] feat: exceeding maxSessionRotations calls
 failedRequestHandler (#2029)

Any request exceeding the `maxSessionRotations` limit currently kills the crawler. This was intended as an early exit on too many hard proxy errors, but it proved confusing for users of `retryOnBlocked`: any page the crawler cannot access due to bot protection kills the entire run.

With this PR, requests that cross the `maxSessionRotations` limit are processed by `failedRequestHandler` instead. Not sure if this counts as breaking (it might confuse a dev or two), but it's in line with how the crawlers worked before the `SessionError` update (so... it's actually reverting a hidden breaking change?). A usage sketch of the new behavior follows the diff below.

Closes #2028
---
 .../src/internals/basic-crawler.ts         | 17 ++++++-------
 test/core/crawlers/browser_crawler.test.ts | 17 ++++++++-----
 test/core/crawlers/cheerio_crawler.test.ts | 25 ++++++++++++-------
 3 files changed, 35 insertions(+), 24 deletions(-)

diff --git a/packages/basic-crawler/src/internals/basic-crawler.ts b/packages/basic-crawler/src/internals/basic-crawler.ts
index af05c13dbef1..80a87b20d06c 100644
--- a/packages/basic-crawler/src/internals/basic-crawler.ts
+++ b/packages/basic-crawler/src/internals/basic-crawler.ts
@@ -1188,10 +1188,6 @@ export class BasicCrawler<

     protected async _rotateSession(crawlingContext: Context) {
         const { request } = crawlingContext;
-        if ((request.sessionRotationCount ?? 0) >= this.maxSessionRotations) {
-            throw new Error(`Request failed because of proxy-related errors ${request.sessionRotationCount} times. `
-                + 'This might be caused by a misconfigured proxy or an invalid session pool configuration.');
-        }
         request.sessionRotationCount ??= 0;
         request.sessionRotationCount++;
         crawlingContext.session?.retire();
@@ -1302,16 +1298,19 @@ export class BasicCrawler<
[...]

diff --git a/test/core/crawlers/browser_crawler.test.ts b/test/core/crawlers/browser_crawler.test.ts
--- a/test/core/crawlers/browser_crawler.test.ts
+++ b/test/core/crawlers/browser_crawler.test.ts
@@ ... @@ describe('BrowserCrawler', () => {
     test('proxy rotation on error works as expected', async () => {
         const goodProxyUrl = 'http://good.proxy';
         const proxyConfiguration = new ProxyConfiguration({ proxyUrls: ['http://localhost', 'http://localhost:1234', goodProxyUrl] });
+        const requestHandler = jest.fn();

         const browserCrawler = new class extends BrowserCrawlerTest {
             protected override async _navigationHandler(ctx: PuppeteerCrawlingContext): Promise<void> {
@@ -811,19 +812,21 @@ describe('BrowserCrawler', () => {
             maxConcurrency: 1,
             useSessionPool: true,
             proxyConfiguration,
-            requestHandler: async () => {},
+            requestHandler,
         });

         await expect(browserCrawler.run()).resolves.not.toThrow();
+        expect(requestHandler).toHaveBeenCalledTimes(requestList.length());
     });

-    test('proxy rotation on error stops after maxSessionRotations limit', async () => {
+    test('proxy rotation on error respects maxSessionRotations, calls failedRequestHandler', async () => {
         const proxyConfiguration = new ProxyConfiguration({ proxyUrls: ['http://localhost', 'http://localhost:1234'] });
+        const failedRequestHandler = jest.fn();

         /**
          * The first increment is the base case when the proxy is retrieved for the first time.
          */
-        let numberOfRotations = -1;
+        let numberOfRotations = -requestList.length();
         const browserCrawler = new class extends BrowserCrawlerTest {
             protected override async _navigationHandler(ctx: PuppeteerCrawlingContext): Promise<void> {
                 const { session } = ctx;
@@ -846,10 +849,12 @@ describe('BrowserCrawler', () => {
             maxConcurrency: 1,
             proxyConfiguration,
             requestHandler: async () => {},
+            failedRequestHandler,
         });

-        await expect(browserCrawler.run()).rejects.toThrow();
-        expect(numberOfRotations).toBe(5);
+        await browserCrawler.run();
+        expect(failedRequestHandler).toBeCalledTimes(requestList.length());
+        expect(numberOfRotations).toBe(requestList.length() * 5);
     });

     test('proxy rotation logs the original proxy error', async () => {
@@ -881,7 +886,7 @@ describe('BrowserCrawler', () => {
         const spy = jest.spyOn((crawler as any).log, 'warning' as any).mockImplementation(() => {});

-        await expect(crawler.run([serverAddress])).rejects.toThrow();
+        await crawler.run([serverAddress]);

         expect(spy).toBeCalled();
         expect(spy.mock.calls[0][0]).toEqual(expect.stringContaining(proxyError));

diff --git a/test/core/crawlers/cheerio_crawler.test.ts b/test/core/crawlers/cheerio_crawler.test.ts
index e77f8ec3509b..6d1097a9db47 100644
--- a/test/core/crawlers/cheerio_crawler.test.ts
+++ b/test/core/crawlers/cheerio_crawler.test.ts
@@ -698,37 +698,43 @@ describe('CheerioCrawler', () => {
     test('proxy rotation on error works as expected', async () => {
         const goodProxyUrl = 'http://good.proxy';
         const proxyConfiguration = new ProxyConfiguration({ proxyUrls: ['http://localhost', 'http://localhost:1234', goodProxyUrl] });
+        const check = jest.fn();

         const crawler = new class extends CheerioCrawler {
-            protected override async _requestFunction({ proxyUrl }: any): Promise<void> {
-                if (proxyUrl !== goodProxyUrl) {
-                    throw new Error('Proxy responded with 400 - Bad request');
+            protected override async _requestFunction(...args: any[]): Promise<void> {
+                check(...args);
+
+                if (args[0].proxyUrl === goodProxyUrl) {
+                    return null;
                 }

-                return null;
+                throw new Error('Proxy responded with 400 - Bad request');
             }
         }({
-            maxRequestRetries: 0,
+            maxSessionRotations: 2,
             maxConcurrency: 1,
             useSessionPool: true,
             proxyConfiguration,
-            requestHandler: async () => {},
+            requestHandler: () => {},
         });

         await expect(crawler.run([serverAddress])).resolves.not.toThrow();
+        expect(check).toBeCalledWith(expect.objectContaining({ proxyUrl: goodProxyUrl }));
     });

-    test('proxy rotation on error stops after maxSessionRotations limit', async () => {
+    test('proxy rotation on error respects maxSessionRotations, calls failedRequestHandler', async () => {
         const proxyConfiguration = new ProxyConfiguration({ proxyUrls: ['http://localhost', 'http://localhost:1234'] });

         /**
          * The first increment is the base case when the proxy is retrieved for the first time.
          */
         let numberOfRotations = -1;
+        const failedRequestHandler = jest.fn();

         const crawler = new CheerioCrawler({
             proxyConfiguration,
             maxSessionRotations: 5,
             requestHandler: async () => {},
+            failedRequestHandler,
         });

         jest.spyOn(crawler, '_requestAsBrowser' as any).mockImplementation(async ({ proxyUrl }: any) => {
@@ -740,7 +746,8 @@ describe('CheerioCrawler', () => {
             return null;
         });

-        await expect(crawler.run([serverAddress])).rejects.toThrow();
+        await crawler.run([serverAddress]);
+        expect(failedRequestHandler).toBeCalledTimes(1);
         expect(numberOfRotations).toBe(5);
     });

@@ -765,7 +772,7 @@ describe('CheerioCrawler', () => {
         const spy = jest.spyOn((crawler as any).log, 'warning' as any).mockImplementation(() => {});

-        await expect(crawler.run([serverAddress])).rejects.toThrow();
+        await crawler.run([serverAddress]);

         expect(spy).toBeCalled();
         expect(spy.mock.calls[0][0]).toEqual(expect.stringContaining(proxyError));
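
For context, here is a minimal sketch of the user-facing effect. The URL and handler bodies are hypothetical; `maxSessionRotations`, `retryOnBlocked`, `requestHandler`, and `failedRequestHandler` are the existing crawler options this patch touches. After this change, a request that exhausts its session rotations lands in `failedRequestHandler` and `run()` resolves, where it previously rejected and killed the whole crawl:

```ts
import { CheerioCrawler } from 'crawlee';

const crawler = new CheerioCrawler({
    // Treat detected bot-protection blocks as session errors and rotate sessions.
    retryOnBlocked: true,
    // After this PR, crossing this limit no longer throws and kills the run;
    // the request is handed to failedRequestHandler instead.
    maxSessionRotations: 5,
    requestHandler: async ({ request, log }) => {
        log.info(`Handled ${request.url}`);
    },
    failedRequestHandler: async ({ request, log }, error) => {
        // Now reached for requests that exceeded the rotation limit; previously
        // the crawler threw 'Request failed because of proxy-related errors...'
        // here and aborted everything.
        log.error(`Giving up on ${request.url}: ${error.message}`);
    },
});

// Hypothetical URL; the run now resolves even if every page is hard-blocked.
await crawler.run(['https://example.com/protected']);
```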