feat: exceeding maxSessionRotations calls failedRequestHandler (#2029)
Any request exceeding the `maxSessionRotations` limit currently kills
the crawler. This was intended as an early exit on too many hard proxy
errors, but it proved confusing for users of `retryOnBlocked` (any page
that the crawler cannot access due to bot protection kills the whole
run).

With this PR, requests that cross the `maxSessionRotations` limit are
now processed by `failedRequestHandler` instead. Not sure if this counts
as breaking (it might confuse a dev or two), but it's in line with
how the crawlers worked before the `SessionError` update (so... it's
actually reverting a hidden breaking change?)
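
For context, here's a minimal sketch of how the new behaviour surfaces to users (the proxy URLs, target URL, and handler bodies below are made-up placeholders, not taken from this diff):

```ts
import { CheerioCrawler, ProxyConfiguration } from 'crawlee';

// Hypothetical proxies that keep failing, forcing session rotations.
const proxyConfiguration = new ProxyConfiguration({
    proxyUrls: ['http://localhost:8000', 'http://localhost:8001'],
});

const crawler = new CheerioCrawler({
    proxyConfiguration,
    // Rotate the session at most 5 times per request on proxy/blocking errors.
    maxSessionRotations: 5,
    requestHandler: async ({ request, log }) => {
        log.info(`Processed ${request.url}`);
    },
    // With this PR, a request that exhausts its session rotations is handed
    // to this handler instead of killing the whole crawler run.
    failedRequestHandler: async ({ request, log }, error) => {
        log.warning(`Giving up on ${request.url}: ${error.message}`);
    },
});

await crawler.run(['https://example.com']);
```

Previously the same setup would make `crawler.run()` reject as soon as a single request exhausted its rotations; now the run finishes and only that request is routed to `failedRequestHandler`.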

Closes #2028
barjin authored Aug 9, 2023
1 parent b01056d commit b1cb108
Showing 3 changed files with 35 additions and 24 deletions.
17 changes: 8 additions & 9 deletions packages/basic-crawler/src/internals/basic-crawler.ts
@@ -1188,10 +1188,6 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
private async _rotateSession(crawlingContext: Context) {
const { request } = crawlingContext;

if ((request.sessionRotationCount ?? 0) >= this.maxSessionRotations) {
throw new Error(`Request failed because of proxy-related errors ${request.sessionRotationCount} times. `
+ 'This might be caused by a misconfigured proxy or an invalid session pool configuration.');
}
request.sessionRotationCount ??= 0;
request.sessionRotationCount++;
crawlingContext.session?.retire();
@@ -1302,16 +1298,19 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
}

protected _canRequestBeRetried(request: Request, error: Error) {
// Request should never be retried, or the error encountered makes it not able to be retried, or the session rotation limit has been reached
if (request.noRetry
|| (error instanceof NonRetryableError)
|| (error instanceof SessionError && (this.maxSessionRotations <= (request.sessionRotationCount ?? 0)))
) {
return false;
}

// User requested retry (we ignore retry count here as its explicitly told by the user to retry)
if (error instanceof RetryRequestError) {
return true;
}

// Request should never be retried, or the error encountered makes it not able to be retried
if (request.noRetry || (error instanceof NonRetryableError)) {
return false;
}

// Ensure there are more retries available for the request
const maxRequestRetries = request.maxRetries ?? this.maxRequestRetries;
return request.retryCount < maxRequestRetries;
17 changes: 11 additions & 6 deletions test/core/crawlers/browser_crawler.test.ts
@@ -790,6 +790,7 @@ describe('BrowserCrawler', () => {
test('proxy rotation on error works as expected', async () => {
const goodProxyUrl = 'http://good.proxy';
const proxyConfiguration = new ProxyConfiguration({ proxyUrls: ['http://localhost', 'http://localhost:1234', goodProxyUrl] });
const requestHandler = jest.fn();

const browserCrawler = new class extends BrowserCrawlerTest {
protected override async _navigationHandler(ctx: PuppeteerCrawlingContext): Promise<HTTPResponse | null | undefined> {
@@ -811,19 +812,21 @@
maxConcurrency: 1,
useSessionPool: true,
proxyConfiguration,
requestHandler: async () => {},
requestHandler,
});

await expect(browserCrawler.run()).resolves.not.toThrow();
expect(requestHandler).toHaveBeenCalledTimes(requestList.length());
});

test('proxy rotation on error stops after maxSessionRotations limit', async () => {
test('proxy rotation on error respects maxSessionRotations, calls failedRequestHandler', async () => {
const proxyConfiguration = new ProxyConfiguration({ proxyUrls: ['http://localhost', 'http://localhost:1234'] });
const failedRequestHandler = jest.fn();

/**
* The first increment is the base case when the proxy is retrieved for the first time.
*/
let numberOfRotations = -1;
let numberOfRotations = -requestList.length();
const browserCrawler = new class extends BrowserCrawlerTest {
protected override async _navigationHandler(ctx: PuppeteerCrawlingContext): Promise<HTTPResponse | null | undefined> {
const { session } = ctx;
@@ -846,10 +849,12 @@
maxConcurrency: 1,
proxyConfiguration,
requestHandler: async () => {},
failedRequestHandler,
});

await expect(browserCrawler.run()).rejects.toThrow();
expect(numberOfRotations).toBe(5);
await browserCrawler.run();
expect(failedRequestHandler).toBeCalledTimes(requestList.length());
expect(numberOfRotations).toBe(requestList.length() * 5);
});

test('proxy rotation logs the original proxy error', async () => {
@@ -881,7 +886,7 @@

const spy = jest.spyOn((crawler as any).log, 'warning' as any).mockImplementation(() => {});

await expect(crawler.run([serverAddress])).rejects.toThrow();
await crawler.run([serverAddress]);

expect(spy).toBeCalled();
expect(spy.mock.calls[0][0]).toEqual(expect.stringContaining(proxyError));
25 changes: 16 additions & 9 deletions test/core/crawlers/cheerio_crawler.test.ts
@@ -698,37 +698,43 @@ describe('CheerioCrawler', () => {
test('proxy rotation on error works as expected', async () => {
const goodProxyUrl = 'http://good.proxy';
const proxyConfiguration = new ProxyConfiguration({ proxyUrls: ['http://localhost', 'http://localhost:1234', goodProxyUrl] });
const check = jest.fn();

const crawler = new class extends CheerioCrawler {
protected override async _requestFunction({ proxyUrl }: any): Promise<any> {
if (proxyUrl !== goodProxyUrl) {
throw new Error('Proxy responded with 400 - Bad request');
protected override async _requestFunction(...args: any[]): Promise<any> {
check(...args);

if (args[0].proxyUrl === goodProxyUrl) {
return null;
}

return null;
throw new Error('Proxy responded with 400 - Bad request');
}
}({
maxRequestRetries: 0,
maxSessionRotations: 2,
maxConcurrency: 1,
useSessionPool: true,
proxyConfiguration,
requestHandler: async () => {},
requestHandler: () => {},
});

await expect(crawler.run([serverAddress])).resolves.not.toThrow();
expect(check).toBeCalledWith(expect.objectContaining({ proxyUrl: goodProxyUrl }));
});

test('proxy rotation on error stops after maxSessionRotations limit', async () => {
test('proxy rotation on error respects maxSessionRotations, calls failedRequestHandler', async () => {
const proxyConfiguration = new ProxyConfiguration({ proxyUrls: ['http://localhost', 'http://localhost:1234'] });

/**
* The first increment is the base case when the proxy is retrieved for the first time.
*/
let numberOfRotations = -1;
const failedRequestHandler = jest.fn();
const crawler = new CheerioCrawler({
proxyConfiguration,
maxSessionRotations: 5,
requestHandler: async () => {},
failedRequestHandler,
});

jest.spyOn(crawler, '_requestAsBrowser' as any).mockImplementation(async ({ proxyUrl }: any) => {
@@ -740,7 +746,8 @@
return null;
});

await expect(crawler.run([serverAddress])).rejects.toThrow();
await crawler.run([serverAddress]);
expect(failedRequestHandler).toBeCalledTimes(1);
expect(numberOfRotations).toBe(5);
});

@@ -765,7 +772,7 @@

const spy = jest.spyOn((crawler as any).log, 'warning' as any).mockImplementation(() => {});

await expect(crawler.run([serverAddress])).rejects.toThrow();
await crawler.run([serverAddress]);

expect(spy).toBeCalled();
expect(spy.mock.calls[0][0]).toEqual(expect.stringContaining(proxyError));
