From c79dc790f876664812915f4365b3c0475a634727 Mon Sep 17 00:00:00 2001
From: Jan Buchar
Date: Fri, 1 Mar 2024 13:22:55 +0100
Subject: [PATCH] Reformat everything using biome formatter

---
 .eslintrc.json | 140 ++--
 docs/examples/.eslintrc.json | 22 +-
 docs/examples/cheerio_crawler.ts | 4 +-
 docs/examples/crawl_multiple_urls_cheerio.ts | 6 +-
 .../crawl_multiple_urls_playwright.ts | 6 +-
 .../examples/crawl_multiple_urls_puppeteer.ts | 6 +-
 docs/examples/forms.ts | 4 +-
 docs/examples/http_crawler.ts | 4 +-
 docs/examples/jsdom_crawler.ts | 4 +-
 docs/examples/jsdom_crawler_react.ts | 12 +-
 docs/examples/map.ts | 4 +-
 docs/examples/reduce.ts | 4 +-
 docs/guides/avoid_blocking_playwright.ts | 16 +-
 docs/guides/avoid_blocking_puppeteer.ts | 13 +-
 .../modified-detail-route.mjs | 4 +-
 .../parallel-scraping/parallel-scraper.mjs | 47 +-
 .../proxy_management_inspection_cheerio.ts | 4 +-
 .../proxy_management_inspection_http.ts | 4 +-
 .../proxy_management_inspection_jsdom.ts | 4 +-
 .../proxy_management_inspection_playwright.ts | 4 +-
 .../proxy_management_inspection_puppeteer.ts | 4 +-
 .../proxy_management_integration_cheerio.ts | 5 +-
 .../proxy_management_integration_http.ts | 5 +-
 .../proxy_management_integration_jsdom.ts | 5 +-
 ...proxy_management_integration_playwright.ts | 5 +-
 .../proxy_management_integration_puppeteer.ts | 5 +-
 .../proxy_management_session_cheerio.ts | 4 +-
 docs/guides/proxy_management_session_http.ts | 4 +-
 docs/guides/proxy_management_session_jsdom.ts | 4 +-
 .../proxy_management_session_playwright.ts | 4 +-
 .../proxy_management_session_puppeteer.ts | 4 +-
 .../proxy_management_session_standalone.ts | 8 +-
 docs/guides/session_management_basic.ts | 4 +-
 docs/guides/session_management_cheerio.ts | 4 +-
 docs/guides/session_management_http.ts | 4 +-
 docs/guides/session_management_jsdom.ts | 4 +-
 docs/guides/session_management_playwright.ts | 4 +-
 docs/guides/session_management_puppeteer.ts | 4 +-
 docs/introduction/03-filter-without-el.ts | 4 +-
 docs/introduction/03-find-without-el.ts | 4 +-
 lerna.json | 35 +-
 .../src/internals/basic-crawler.ts | 320 +++++---
 packages/basic-crawler/test/migration.test.ts | 66 +-
 packages/basic-crawler/tsconfig.build.json | 10 +-
 .../src/internals/browser-crawler.ts | 159 ++--
 .../src/internals/browser-launcher.ts | 26 +-
 .../browser-crawler/test/migration.test.ts | 44 +-
 packages/browser-crawler/tsconfig.build.json | 10 +-
 packages/browser-pool/src/anonymize-proxy.ts | 10 +-
 packages/browser-pool/src/browser-pool.ts | 154 ++--
 .../browser-pool/src/fingerprinting/hooks.ts | 14 +-
 .../browser-pool/src/fingerprinting/types.ts | 58 +-
 .../browser-pool/src/fingerprinting/utils.ts | 3 +-
 .../src/playwright/playwright-controller.ts | 24 +-
 .../src/playwright/playwright-plugin.ts | 50 +-
 .../src/puppeteer/puppeteer-controller.ts | 6 +-
 .../src/puppeteer/puppeteer-plugin.ts | 64 +-
 packages/browser-pool/src/utils.ts | 52 +-
 .../tab-as-a-container/background.js | 58 +-
 .../tab-as-a-container/content.js | 55 +-
 .../tab-as-a-container/manifest.json | 8 +-
 .../test/changing-page-options.test.ts | 31 +-
 .../test/multiple-plugins.test.ts | 15 +-
 .../test/no-hybrid-plugins.test.ts | 20 +-
 packages/browser-pool/tsconfig.build.json | 10 +-
 .../src/internals/cheerio-crawler.ts | 69 +-
 .../cheerio-crawler/test/migration.test.ts | 44 +-
 packages/cheerio-crawler/tsconfig.build.json | 10 +-
 .../cli/src/commands/CreateProjectCommand.ts | 90 ++-
 .../InstallPlaywrightBrowsersCommand.ts | 9 +-
 packages/cli/src/index.ts | 16 +-
 packages/cli/tsconfig.build.json | 10 +-
 .../core/src/autoscaling/autoscaled_pool.ts | 49 +-
 packages/core/src/autoscaling/snapshotter.ts | 82 +-
 .../core/src/autoscaling/system_status.ts | 32 +-
 packages/core/src/configuration.ts | 4 +-
 packages/core/src/cookie_utils.ts | 21 +-
 packages/core/src/crawlers/crawler_commons.ts | 63 +-
 packages/core/src/crawlers/statistics.ts | 30 +-
 .../core/src/enqueue_links/enqueue_links.ts | 95 ++-
 packages/core/src/enqueue_links/shared.ts | 44 +-
 .../core/src/events/local_event_manager.ts | 21 +-
 packages/core/src/log.ts | 10 +-
 packages/core/src/proxy_configuration.ts | 21 +-
 packages/core/src/request.ts | 41 +-
 packages/core/src/router.ts | 15 +-
 packages/core/src/serialization.ts | 22 +-
 packages/core/src/session_pool/session.ts | 52 +-
 .../core/src/session_pool/session_pool.ts | 44 +-
 packages/core/src/storages/access_checking.ts | 5 +-
 packages/core/src/storages/dataset.ts | 55 +-
 packages/core/src/storages/key_value_store.ts | 82 +-
 packages/core/src/storages/request_list.ts | 94 ++-
 .../core/src/storages/request_provider.ts | 209 +++--
 packages/core/src/storages/request_queue.ts | 52 +-
 .../core/src/storages/request_queue_v2.ts | 41 +-
 packages/core/src/storages/storage_manager.ts | 16 +-
 packages/core/src/storages/utils.ts | 26 +-
 packages/core/src/typedefs.ts | 11 +-
 .../core/test/enqueue_links/userData.test.ts | 4 +-
 ...me-request-should-not-call-the-api.test.ts | 9 +-
 .../request-queue/request-queue-v2.test.ts | 19 +-
 packages/core/tsconfig.build.json | 10 +-
 packages/crawlee/tsconfig.build.json | 10 +-
 .../src/internals/http-crawler.ts | 154 ++--
 packages/http-crawler/tsconfig.build.json | 10 +-
 .../src/internals/jsdom-crawler.ts | 63 +-
 packages/jsdom-crawler/tsconfig.build.json | 10 +-
 .../src/internals/linkedom-crawler.ts | 38 +-
 packages/linkedom-crawler/tsconfig.build.json | 10 +-
 .../src/background-handler/fs-utils.ts | 37 +-
 .../src/background-handler/index.ts | 11 +-
 packages/memory-storage/src/body-parser.ts | 15 +-
 packages/memory-storage/src/cache-helpers.ts | 24 +-
 .../memory-storage/src/fs/dataset/index.ts | 4 +-
 .../src/fs/key-value-store/fs.ts | 12 +-
 .../src/fs/key-value-store/index.ts | 4 +-
 .../src/fs/request-queue/index.ts | 4 +-
 packages/memory-storage/src/memory-storage.ts | 39 +-
 .../resource-clients/dataset-collection.ts | 4 +-
 .../src/resource-clients/dataset.ts | 54 +-
 .../key-value-store-collection.ts | 10 +-
 .../src/resource-clients/key-value-store.ts | 45 +-
 .../request-queue-collection.ts | 10 +-
 .../src/resource-clients/request-queue.ts | 80 +-
 packages/memory-storage/src/utils.ts | 12 +-
 .../memory-storage/test/fs-fallback.test.ts | 55 +-
 .../test/no-crash-on-big-buffers.test.ts | 2 +-
 .../test/no-writing-to-disk.test.ts | 4 +-
 .../handledRequestCount-should-update.test.ts | 12 +-
 .../ignore-non-json-files.test.ts | 46 +-
 .../test/write-metadata.test.ts | 5 +-
 packages/memory-storage/tsconfig.build.json | 10 +-
 .../internals/adaptive-playwright-crawler.ts | 171 ++--
 .../internals/enqueue-links/click-elements.ts | 117 ++-
 .../src/internals/playwright-crawler.ts | 43 +-
 .../src/internals/playwright-launcher.ts | 43 +-
 .../src/internals/utils/playwright-utils.ts | 201 +++--
 .../utils/rendering-type-prediction.ts | 29 +-
 .../src/logistic-regression.d.ts | 30 +-
 .../playwright-crawler/tsconfig.build.json | 10 +-
 .../internals/enqueue-links/click-elements.ts | 112 ++-
 .../src/internals/puppeteer-crawler.ts | 44 +-
 .../src/internals/puppeteer-launcher.ts | 18 +-
 .../utils/puppeteer_request_interception.ts | 7 +-
 .../src/internals/utils/puppeteer_utils.ts | 194 +++--
 .../puppeteer-crawler/tsconfig.build.json | 10 +-
 packages/templates/.eslintrc.json | 14 +-
 packages/templates/manifest.json | 216 ++---
 .../templates/scripts/validate-manifest.mjs | 13 +-
 packages/templates/src/index.ts | 34 +-
 packages/templates/tsconfig.build.json | 12 +-
 packages/types/src/storages.ts | 2 -
 packages/types/tsconfig.build.json | 10 +-
 packages/utils/src/internals/blocked.ts | 4 +-
 packages/utils/src/internals/cheerio.ts | 18 +-
 packages/utils/src/internals/debug.ts | 5 +-
 packages/utils/src/internals/error_tracker.ts | 31 +-
 packages/utils/src/internals/extract-urls.ts | 28 +-
 packages/utils/src/internals/general.ts | 32 +-
 packages/utils/src/internals/memory-info.ts | 13 +-
 .../utils/src/internals/open_graph_parser.ts | 37 +-
 packages/utils/src/internals/robots.ts | 12 +-
 packages/utils/src/internals/sitemap.ts | 11 +-
 packages/utils/src/internals/social.ts | 67 +-
 packages/utils/test/robots.test.ts | 35 +-
 packages/utils/test/sitemap.test.ts | 256 +++---
 packages/utils/tsconfig.build.json | 12 +-
 renovate.json | 65 +-
 scripts/actions/docker-images/main.ts | 13 +-
 scripts/actions/docker-images/state.json | 18 +-
 scripts/copy.ts | 4 +-
 scripts/typescript_fixes.mjs | 12 +-
 .../browser-plugins/plugins.test.ts | 82 +-
 test/browser-pool/browser-pool.test.ts | 118 +--
 test/core/autoscaling/autoscaled_pool.test.ts | 36 +-
 test/core/autoscaling/snapshotter.test.ts | 28 +-
 test/core/autoscaling/system_status.test.ts | 7 +-
 .../adaptive_playwright_crawler.test.ts | 141 ++--
 test/core/crawlers/basic_browser_crawler.ts | 11 +-
 test/core/crawlers/basic_crawler.test.ts | 246 +++---
 test/core/crawlers/browser_crawler.test.ts | 144 ++--
 test/core/crawlers/cheerio_crawler.test.ts | 163 ++--
 test/core/crawlers/crawler_extension.test.ts | 6 +-
 test/core/crawlers/dom_crawler.test.ts | 15 +-
 test/core/crawlers/http_crawler.test.ts | 44 +-
 test/core/crawlers/playwright_crawler.test.ts | 22 +-
 test/core/crawlers/puppeteer_crawler.test.ts | 76 +-
 .../core/enqueue_links/click_elements.test.ts | 53 +-
 test/core/enqueue_links/enqueue_links.test.ts | 174 ++---
 test/core/enqueue_links/shared.test.ts | 8 +-
 test/core/error_tracker.test.ts | 246 ++++--
 test/core/playwright_utils.test.ts | 535 +++++++------
 test/core/proxy_configuration.test.ts | 20 +-
 .../puppeteer_request_interception.test.ts | 53 +-
 test/core/puppeteer_utils.test.ts | 735 +++++++++---------
 test/core/request_list.test.ts | 90 +--
 test/core/router.test.ts | 14 +-
 test/core/session_pool/session.test.ts | 13 +-
 test/core/session_pool/session_pool.test.ts | 33 +-
 test/core/session_pool/session_utils.test.ts | 5 +-
 test/core/storages/dataset.test.ts | 201 ++---
 test/core/storages/key_value_store.test.ts | 105 ++-
 test/core/storages/request_queue.test.ts | 82 +-
 test/e2e/.eslintrc.json | 10 +-
 .../e2e/automatic-persist-value/actor/main.js | 5 +-
 test/e2e/automatic-persist-value/test.mjs | 5 +-
 .../actor/main.js | 5 +-
 .../cheerio-default-ts/actor/.eslintrc.json | 2 +-
 test/e2e/cheerio-default/actor/main.js | 9 +-
 .../cheerio-enqueue-links-base/actor/main.js | 5 +-
 test/e2e/cheerio-enqueue-links/actor/main.js | 5 +-
 .../cheerio-ignore-ssl-errors/actor/main.js | 10 +-
 test/e2e/cheerio-ignore-ssl-errors/test.mjs | 5 +-
 .../e2e/cheerio-initial-cookies/actor/main.js | 40 +-
 test/e2e/cheerio-initial-cookies/test.mjs | 4 +-
 test/e2e/cheerio-max-requests/actor/main.js | 24 +-
 test/e2e/cheerio-page-info/actor/main.js | 5 +-
 .../cheerio-request-queue-v2/actor/main.js | 5 +-
 .../cheerio-throw-on-ssl-errors/actor/main.js | 9 +-
 test/e2e/input-json5/actor/main.js | 5 +-
 test/e2e/input-json5/test.mjs | 14 +-
 .../e2e/jsdom-default-ts/actor/.eslintrc.json | 2 +-
 test/e2e/jsdom-react-ts/actor/.eslintrc.json | 2 +-
 test/e2e/jsdom-react-ts/actor/main.ts | 4 +-
 .../linkedom-default-ts/actor/.eslintrc.json | 2 +-
 test/e2e/migration/actor/main.js | 5 +-
 .../actor/main.js | 13 +-
 test/e2e/playwright-default/actor/main.js | 13 +-
 .../actor/main.js | 5 +-
 .../playwright-enqueue-links/actor/main.js | 5 +-
 .../actor/main.js | 13 +-
 .../playwright-initial-cookies/actor/main.js | 38 +-
 test/e2e/playwright-initial-cookies/test.mjs | 4 +-
 .../actor/main.js | 11 +-
 test/e2e/proxy-rotation/actor/main.js | 12 +-
 test/e2e/puppeteer-default/actor/main.js | 13 +-
 .../e2e/puppeteer-enqueue-links/actor/main.js | 5 +-
 .../puppeteer-ignore-ssl-errors/actor/main.js | 18 +-
 .../puppeteer-initial-cookies/actor/main.js | 38 +-
 test/e2e/puppeteer-initial-cookies/test.mjs | 4 +-
 test/e2e/puppeteer-page-info/actor/main.js | 34 +-
 .../actor/main.js | 36 +-
 .../puppeteer-store-pagination/actor/main.js | 37 +-
 .../actor/main.js | 18 +-
 .../actor/main.js | 5 +-
 .../e2e/request-skip-navigation/actor/main.js | 11 +-
 test/e2e/run.mjs | 17 +-
 test/e2e/session-rotation/actor/main.js | 5 +-
 test/e2e/session-rotation/test.mjs | 6 +-
 test/e2e/tools.mjs | 42 +-
 test/shared/_helper.ts | 32 +-
 test/shared/data/html_to_text_test_data.ts | 15 +-
 test/utils/cheerio.test.ts | 5 +-
 test/utils/extract-urls.test.ts | 36 +-
 test/utils/general.test.ts | 32 +-
 test/utils/social.test.ts | 534 +++++++------
 tsconfig.build.json | 37 +-
 turbo.json | 36 +-
 vitest.config.ts | 6 +-
 260 files changed, 6182 insertions(+), 4669 deletions(-)

diff --git a/.eslintrc.json b/.eslintrc.json
index 5e7a61e610c7..3d27e7e2c400 100644
--- a/.eslintrc.json
+++ b/.eslintrc.json
@@ -1,66 +1,78 @@
 {
- "root": true,
- "env": {
- "browser": true,
- "es2020": true,
- "node": true
- },
- "extends": ["@apify/eslint-config-ts", "prettier"],
- "parserOptions": {
- "project": "./tsconfig.eslint.json",
- "ecmaVersion": 2022
- },
- "ignorePatterns": [
- "node_modules",
- "dist",
- "coverage",
- "**/*.d.ts"
- ],
- "overrides": [
- {
- "plugins": [
- "@typescript-eslint"
- ],
- "files": [
- "*.ts"
- ],
- "rules": {
- "@typescript-eslint/array-type": "error",
- "@typescript-eslint/ban-ts-comment": 0,
- "@typescript-eslint/consistent-type-imports": ["error", {
- "disallowTypeAnnotations": false
- }],
- "@typescript-eslint/consistent-type-definitions": ["error", "interface"],
- "@typescript-eslint/member-delimiter-style": ["error", {
- "multiline": { "delimiter": "semi", "requireLast": true },
- "singleline": { "delimiter": "semi", "requireLast": false }
- }],
- "@typescript-eslint/no-empty-interface": "off",
- "no-empty-function": "off",
- "@typescript-eslint/no-empty-function": "off",
- "@typescript-eslint/no-explicit-any": "off",
- "@typescript-eslint/no-floating-promises": "error",
- "@typescript-eslint/no-unused-vars": "off",
- "@typescript-eslint/comma-dangle": ["error", "always-multiline"]
- }
- }
- ],
- "rules": {
- "quote-props": ["error", "consistent"],
- "import/no-extraneous-dependencies": "off",
- "max-classes-per-file": 0,
- "no-console": "error",
- "no-underscore-dangle": 0,
- "no-void": 0,
- "max-len": ["error", {
- "code": 160,
- "ignoreUrls": true,
- "ignoreComments": true
- }],
- "import/order": ["error", {
- "groups": ["builtin", "external", ["parent", "sibling"], "index", "object"],
- "alphabetize": { "order": "asc", "caseInsensitive": true },
- "newlines-between": "always"
- }]
- }
+
"root": true, + "env": { + "browser": true, + "es2020": true, + "node": true + }, + "extends": ["@apify/eslint-config-ts", "prettier"], + "parserOptions": { + "project": "./tsconfig.eslint.json", + "ecmaVersion": 2022 + }, + "ignorePatterns": ["node_modules", "dist", "coverage", "**/*.d.ts"], + "overrides": [ + { + "plugins": ["@typescript-eslint"], + "files": ["*.ts"], + "rules": { + "@typescript-eslint/array-type": "error", + "@typescript-eslint/ban-ts-comment": 0, + "@typescript-eslint/consistent-type-imports": [ + "error", + { + "disallowTypeAnnotations": false + } + ], + "@typescript-eslint/consistent-type-definitions": [ + "error", + "interface" + ], + "@typescript-eslint/member-delimiter-style": [ + "error", + { + "multiline": { "delimiter": "semi", "requireLast": true }, + "singleline": { "delimiter": "semi", "requireLast": false } + } + ], + "@typescript-eslint/no-empty-interface": "off", + "no-empty-function": "off", + "@typescript-eslint/no-empty-function": "off", + "@typescript-eslint/no-explicit-any": "off", + "@typescript-eslint/no-floating-promises": "error", + "@typescript-eslint/no-unused-vars": "off", + "@typescript-eslint/comma-dangle": ["error", "always-multiline"] + } + } + ], + "rules": { + "quote-props": ["error", "consistent"], + "import/no-extraneous-dependencies": "off", + "max-classes-per-file": 0, + "no-console": "error", + "no-underscore-dangle": 0, + "no-void": 0, + "max-len": [ + "error", + { + "code": 160, + "ignoreUrls": true, + "ignoreComments": true + } + ], + "import/order": [ + "error", + { + "groups": [ + "builtin", + "external", + ["parent", "sibling"], + "index", + "object" + ], + "alphabetize": { "order": "asc", "caseInsensitive": true }, + "newlines-between": "always" + } + ] + } } diff --git a/docs/examples/.eslintrc.json b/docs/examples/.eslintrc.json index d41f6dbc8164..f14aaaa0cbf6 100644 --- a/docs/examples/.eslintrc.json +++ b/docs/examples/.eslintrc.json @@ -1,13 +1,13 @@ { - "root": true, - "extends": "@apify/ts", - "parserOptions": { - "project": "./tsconfig.eslint.json", - "ecmaVersion": 2022 - }, - "rules": { - "import/extensions": 0, - "import/no-extraneous-dependencies": 0, - "no-console": "off" - } + "root": true, + "extends": "@apify/ts", + "parserOptions": { + "project": "./tsconfig.eslint.json", + "ecmaVersion": 2022 + }, + "rules": { + "import/extensions": 0, + "import/no-extraneous-dependencies": 0, + "no-console": "off" + } } diff --git a/docs/examples/cheerio_crawler.ts b/docs/examples/cheerio_crawler.ts index c496c15727da..2181333db797 100644 --- a/docs/examples/cheerio_crawler.ts +++ b/docs/examples/cheerio_crawler.ts @@ -57,8 +57,6 @@ const crawler = new CheerioCrawler({ }); // Run the crawler and wait for it to finish. 
-await crawler.run([ - 'https://crawlee.dev', -]); +await crawler.run(['https://crawlee.dev']); log.debug('Crawler finished.'); diff --git a/docs/examples/crawl_multiple_urls_cheerio.ts b/docs/examples/crawl_multiple_urls_cheerio.ts index 011d8b7d6ca6..9312a5f83e4c 100644 --- a/docs/examples/crawl_multiple_urls_cheerio.ts +++ b/docs/examples/crawl_multiple_urls_cheerio.ts @@ -9,8 +9,4 @@ const crawler = new CheerioCrawler({ }); // Run the crawler with initial request -await crawler.run([ - 'http://www.example.com/page-1', - 'http://www.example.com/page-2', - 'http://www.example.com/page-3', -]); +await crawler.run(['http://www.example.com/page-1', 'http://www.example.com/page-2', 'http://www.example.com/page-3']); diff --git a/docs/examples/crawl_multiple_urls_playwright.ts b/docs/examples/crawl_multiple_urls_playwright.ts index 9581bea3f248..f175277e09c3 100644 --- a/docs/examples/crawl_multiple_urls_playwright.ts +++ b/docs/examples/crawl_multiple_urls_playwright.ts @@ -9,8 +9,4 @@ const crawler = new PlaywrightCrawler({ }); // Run the crawler with initial request -await crawler.run([ - 'http://www.example.com/page-1', - 'http://www.example.com/page-2', - 'http://www.example.com/page-3', -]); +await crawler.run(['http://www.example.com/page-1', 'http://www.example.com/page-2', 'http://www.example.com/page-3']); diff --git a/docs/examples/crawl_multiple_urls_puppeteer.ts b/docs/examples/crawl_multiple_urls_puppeteer.ts index d7c6a99a5fc7..7cb5bf70ece4 100644 --- a/docs/examples/crawl_multiple_urls_puppeteer.ts +++ b/docs/examples/crawl_multiple_urls_puppeteer.ts @@ -9,8 +9,4 @@ const crawler = new PuppeteerCrawler({ }); // Run the crawler with initial request -await crawler.run([ - 'http://www.example.com/page-1', - 'http://www.example.com/page-2', - 'http://www.example.com/page-3', -]); +await crawler.run(['http://www.example.com/page-1', 'http://www.example.com/page-2', 'http://www.example.com/page-3']); diff --git a/docs/examples/forms.ts b/docs/examples/forms.ts index 29677f91d101..818ae1e42782 100644 --- a/docs/examples/forms.ts +++ b/docs/examples/forms.ts @@ -23,8 +23,8 @@ await Promise.all([ ]); // Obtain and print list of search results -const results = await page.$$eval('[data-testid="results-list"] div.search-title > a', - (nodes) => nodes.map((node) => ({ +const results = await page.$$eval('[data-testid="results-list"] div.search-title > a', (nodes) => + nodes.map((node) => ({ url: node.href, name: node.innerText, })), diff --git a/docs/examples/http_crawler.ts b/docs/examples/http_crawler.ts index 4052df13136b..e573a1533234 100644 --- a/docs/examples/http_crawler.ts +++ b/docs/examples/http_crawler.ts @@ -47,8 +47,6 @@ const crawler = new HttpCrawler({ }); // Run the crawler and wait for it to finish. -await crawler.run([ - 'https://crawlee.dev', -]); +await crawler.run(['https://crawlee.dev']); log.debug('Crawler finished.'); diff --git a/docs/examples/jsdom_crawler.ts b/docs/examples/jsdom_crawler.ts index e72034113c2d..8f75f4b7c10e 100644 --- a/docs/examples/jsdom_crawler.ts +++ b/docs/examples/jsdom_crawler.ts @@ -57,8 +57,6 @@ const crawler = new JSDOMCrawler({ }); // Run the crawler and wait for it to finish. 
-await crawler.run([ - 'https://crawlee.dev', -]); +await crawler.run(['https://crawlee.dev']); log.debug('Crawler finished.'); diff --git a/docs/examples/jsdom_crawler_react.ts b/docs/examples/jsdom_crawler_react.ts index 8b3ec163bc6d..67e55e175042 100644 --- a/docs/examples/jsdom_crawler_react.ts +++ b/docs/examples/jsdom_crawler_react.ts @@ -1,9 +1,9 @@ import { JSDOMCrawler, log } from 'crawlee'; -// Create an instance of the JSDOMCrawler class - crawler that automatically +// Create an instance of the JSDOMCrawler class - crawler that automatically // loads the URLs and parses their HTML using the jsdom library. const crawler = new JSDOMCrawler({ - // Setting the `runScripts` option to `true` allows the crawler to execute client-side + // Setting the `runScripts` option to `true` allows the crawler to execute client-side // JavaScript code on the page. This is required for some websites (such as the React application in this example), but may pose a security risk. runScripts: true, // This function will be called for each crawled URL. @@ -18,15 +18,13 @@ const crawler = new JSDOMCrawler({ document.querySelectorAll('button')[18].click(); // = const result = document.querySelectorAll('.component-display')[0].childNodes[0] as Element; - // The result is passed to the console. Unlike with Playwright or Puppeteer crawlers, + // The result is passed to the console. Unlike with Playwright or Puppeteer crawlers, // this console call goes to the Node.js console, not the browser console. All the code here runs right in Node.js! log.info(result.innerHTML); // 2 }, }); // Run the crawler and wait for it to finish. -await crawler.run([ - 'https://ahfarmer.github.io/calculator/', -]); +await crawler.run(['https://ahfarmer.github.io/calculator/']); -log.debug('Crawler finished.'); \ No newline at end of file +log.debug('Crawler finished.'); diff --git a/docs/examples/map.ts b/docs/examples/map.ts index a06142a912f1..55c1ab03a9b5 100644 --- a/docs/examples/map.ts +++ b/docs/examples/map.ts @@ -1,8 +1,8 @@ import { Dataset, KeyValueStore } from 'crawlee'; const dataset = await Dataset.open<{ - url: string, - headingCount: number, + url: string; + headingCount: number; }>(); // Seeding the dataset with some data diff --git a/docs/examples/reduce.ts b/docs/examples/reduce.ts index 243966c90703..cb2b2848781f 100644 --- a/docs/examples/reduce.ts +++ b/docs/examples/reduce.ts @@ -1,8 +1,8 @@ import { Dataset, KeyValueStore } from 'crawlee'; const dataset = await Dataset.open<{ - url: string, - headingCount: number, + url: string; + headingCount: number; }>(); // Seeding the dataset with some data diff --git a/docs/guides/avoid_blocking_playwright.ts b/docs/guides/avoid_blocking_playwright.ts index 8a8543504c75..1e5e23aeae42 100644 --- a/docs/guides/avoid_blocking_playwright.ts +++ b/docs/guides/avoid_blocking_playwright.ts @@ -6,16 +6,14 @@ const crawler = new PlaywrightCrawler({ useFingerprints: true, // this is the default fingerprintOptions: { fingerprintGeneratorOptions: { - browsers: [{ - name: BrowserName.edge, - minVersion: 96, - }], - devices: [ - DeviceCategory.desktop, - ], - operatingSystems: [ - OperatingSystemsName.windows, + browsers: [ + { + name: BrowserName.edge, + minVersion: 96, + }, ], + devices: [DeviceCategory.desktop], + operatingSystems: [OperatingSystemsName.windows], }, }, }, diff --git a/docs/guides/avoid_blocking_puppeteer.ts b/docs/guides/avoid_blocking_puppeteer.ts index 6a4a2ea2eec7..5af3ca7fa3b5 100644 --- a/docs/guides/avoid_blocking_puppeteer.ts +++ 
b/docs/guides/avoid_blocking_puppeteer.ts @@ -6,16 +6,9 @@ const crawler = new PuppeteerCrawler({ useFingerprints: true, // this is the default fingerprintOptions: { fingerprintGeneratorOptions: { - browsers: [ - BrowserName.chrome, - BrowserName.firefox, - ], - devices: [ - DeviceCategory.mobile, - ], - locales: [ - 'en-US', - ], + browsers: [BrowserName.chrome, BrowserName.firefox], + devices: [DeviceCategory.mobile], + locales: ['en-US'], }, }, }, diff --git a/docs/guides/parallel-scraping/modified-detail-route.mjs b/docs/guides/parallel-scraping/modified-detail-route.mjs index e7bc1b74727a..ad7a807f7e7f 100644 --- a/docs/guides/parallel-scraping/modified-detail-route.mjs +++ b/docs/guides/parallel-scraping/modified-detail-route.mjs @@ -5,9 +5,7 @@ router.addHandler('DETAIL', async ({ request, page, log }) => { const manufacturer = urlPart[0].split('-')[0]; // 'sennheiser' const title = await page.locator('.product-meta h1').textContent(); - const sku = await page - .locator('span.product-meta__sku-number') - .textContent(); + const sku = await page.locator('span.product-meta__sku-number').textContent(); const priceElement = page .locator('span.price') diff --git a/docs/guides/parallel-scraping/parallel-scraper.mjs b/docs/guides/parallel-scraping/parallel-scraper.mjs index 1ea13da71d48..6bee4f4ff13a 100644 --- a/docs/guides/parallel-scraping/parallel-scraper.mjs +++ b/docs/guides/parallel-scraping/parallel-scraper.mjs @@ -50,12 +50,14 @@ if (!process.env.IN_WORKER_THREAD) { await Dataset.pushData(data); }); - promises.push(new Promise((resolve) => { - proc.once('exit', (code, signal) => { - log.info(`Process ${i} exited with code ${code} and signal ${signal}`); - resolve(); - }); - })); + promises.push( + new Promise((resolve) => { + proc.once('exit', (code, signal) => { + log.info(`Process ${i} exited with code ${code} and signal ${signal}`); + resolve(); + }); + }), + ); } await Promise.all(promises); @@ -86,22 +88,25 @@ if (!process.env.IN_WORKER_THREAD) { }); workerLogger.debug('Setting up crawler.'); - const crawler = new PlaywrightCrawler({ - log: workerLogger, - // Instead of the long requestHandler with - // if clauses we provide a router instance. - requestHandler: router, - // Enable the request locking experiment so that we can actually use the queue. - // highlight-start - experiments: { - requestLocking: true, + const crawler = new PlaywrightCrawler( + { + log: workerLogger, + // Instead of the long requestHandler with + // if clauses we provide a router instance. + requestHandler: router, + // Enable the request locking experiment so that we can actually use the queue. 
+ // highlight-start + experiments: { + requestLocking: true, + }, + // Provide the request queue we've pre-filled in previous steps + requestQueue, + // highlight-end + // Let's also limit the crawler's concurrency, we don't want to overload a single process 🐌 + maxConcurrency: 5, }, - // Provide the request queue we've pre-filled in previous steps - requestQueue, - // highlight-end - // Let's also limit the crawler's concurrency, we don't want to overload a single process 🐌 - maxConcurrency: 5, - }, config); + config, + ); await crawler.run(); } diff --git a/docs/guides/proxy_management_inspection_cheerio.ts b/docs/guides/proxy_management_inspection_cheerio.ts index 68bdd2125100..842be0e380a3 100644 --- a/docs/guides/proxy_management_inspection_cheerio.ts +++ b/docs/guides/proxy_management_inspection_cheerio.ts @@ -1,6 +1,8 @@ import { CheerioCrawler, ProxyConfiguration } from 'crawlee'; -const proxyConfiguration = new ProxyConfiguration({ /* opts */ }); +const proxyConfiguration = new ProxyConfiguration({ + /* opts */ +}); const crawler = new CheerioCrawler({ proxyConfiguration, diff --git a/docs/guides/proxy_management_inspection_http.ts b/docs/guides/proxy_management_inspection_http.ts index 076350877719..759da4f0baed 100644 --- a/docs/guides/proxy_management_inspection_http.ts +++ b/docs/guides/proxy_management_inspection_http.ts @@ -1,6 +1,8 @@ import { HttpCrawler, ProxyConfiguration } from 'crawlee'; -const proxyConfiguration = new ProxyConfiguration({ /* opts */ }); +const proxyConfiguration = new ProxyConfiguration({ + /* opts */ +}); const crawler = new HttpCrawler({ proxyConfiguration, diff --git a/docs/guides/proxy_management_inspection_jsdom.ts b/docs/guides/proxy_management_inspection_jsdom.ts index db7607143e70..3ab09968a585 100644 --- a/docs/guides/proxy_management_inspection_jsdom.ts +++ b/docs/guides/proxy_management_inspection_jsdom.ts @@ -1,6 +1,8 @@ import { JSDOMCrawler, ProxyConfiguration } from 'crawlee'; -const proxyConfiguration = new ProxyConfiguration({ /* opts */ }); +const proxyConfiguration = new ProxyConfiguration({ + /* opts */ +}); const crawler = new JSDOMCrawler({ proxyConfiguration, diff --git a/docs/guides/proxy_management_inspection_playwright.ts b/docs/guides/proxy_management_inspection_playwright.ts index db4b2d277e2d..9eaade17ce68 100644 --- a/docs/guides/proxy_management_inspection_playwright.ts +++ b/docs/guides/proxy_management_inspection_playwright.ts @@ -1,6 +1,8 @@ import { PlaywrightCrawler, ProxyConfiguration } from 'crawlee'; -const proxyConfiguration = new ProxyConfiguration({ /* opts */ }); +const proxyConfiguration = new ProxyConfiguration({ + /* opts */ +}); const crawler = new PlaywrightCrawler({ proxyConfiguration, diff --git a/docs/guides/proxy_management_inspection_puppeteer.ts b/docs/guides/proxy_management_inspection_puppeteer.ts index 154ce0068fca..e119571d8687 100644 --- a/docs/guides/proxy_management_inspection_puppeteer.ts +++ b/docs/guides/proxy_management_inspection_puppeteer.ts @@ -1,6 +1,8 @@ import { PuppeteerCrawler, ProxyConfiguration } from 'crawlee'; -const proxyConfiguration = new ProxyConfiguration({ /* opts */ }); +const proxyConfiguration = new ProxyConfiguration({ + /* opts */ +}); const crawler = new PuppeteerCrawler({ proxyConfiguration, diff --git a/docs/guides/proxy_management_integration_cheerio.ts b/docs/guides/proxy_management_integration_cheerio.ts index 74f9878c0952..642f27c42d90 100644 --- a/docs/guides/proxy_management_integration_cheerio.ts +++ b/docs/guides/proxy_management_integration_cheerio.ts 
@@ -1,10 +1,7 @@ import { CheerioCrawler, ProxyConfiguration } from 'crawlee'; const proxyConfiguration = new ProxyConfiguration({ - proxyUrls: [ - 'http://proxy-1.com', - 'http://proxy-2.com', - ], + proxyUrls: ['http://proxy-1.com', 'http://proxy-2.com'], }); const crawler = new CheerioCrawler({ diff --git a/docs/guides/proxy_management_integration_http.ts b/docs/guides/proxy_management_integration_http.ts index 0db0a9ab9221..920b976c63ad 100644 --- a/docs/guides/proxy_management_integration_http.ts +++ b/docs/guides/proxy_management_integration_http.ts @@ -1,10 +1,7 @@ import { HttpCrawler, ProxyConfiguration } from 'crawlee'; const proxyConfiguration = new ProxyConfiguration({ - proxyUrls: [ - 'http://proxy-1.com', - 'http://proxy-2.com', - ], + proxyUrls: ['http://proxy-1.com', 'http://proxy-2.com'], }); const crawler = new HttpCrawler({ diff --git a/docs/guides/proxy_management_integration_jsdom.ts b/docs/guides/proxy_management_integration_jsdom.ts index 4164ef8cff32..f8a60ddabf63 100644 --- a/docs/guides/proxy_management_integration_jsdom.ts +++ b/docs/guides/proxy_management_integration_jsdom.ts @@ -1,10 +1,7 @@ import { JSDOMCrawler, ProxyConfiguration } from 'crawlee'; const proxyConfiguration = new ProxyConfiguration({ - proxyUrls: [ - 'http://proxy-1.com', - 'http://proxy-2.com', - ], + proxyUrls: ['http://proxy-1.com', 'http://proxy-2.com'], }); const crawler = new JSDOMCrawler({ diff --git a/docs/guides/proxy_management_integration_playwright.ts b/docs/guides/proxy_management_integration_playwright.ts index ad41b377552a..81e4f80e615b 100644 --- a/docs/guides/proxy_management_integration_playwright.ts +++ b/docs/guides/proxy_management_integration_playwright.ts @@ -1,10 +1,7 @@ import { PlaywrightCrawler, ProxyConfiguration } from 'crawlee'; const proxyConfiguration = new ProxyConfiguration({ - proxyUrls: [ - 'http://proxy-1.com', - 'http://proxy-2.com', - ], + proxyUrls: ['http://proxy-1.com', 'http://proxy-2.com'], }); const crawler = new PlaywrightCrawler({ diff --git a/docs/guides/proxy_management_integration_puppeteer.ts b/docs/guides/proxy_management_integration_puppeteer.ts index f4dc43ac3939..74a552ba2931 100644 --- a/docs/guides/proxy_management_integration_puppeteer.ts +++ b/docs/guides/proxy_management_integration_puppeteer.ts @@ -1,10 +1,7 @@ import { PuppeteerCrawler, ProxyConfiguration } from 'crawlee'; const proxyConfiguration = new ProxyConfiguration({ - proxyUrls: [ - 'http://proxy-1.com', - 'http://proxy-2.com', - ], + proxyUrls: ['http://proxy-1.com', 'http://proxy-2.com'], }); const crawler = new PuppeteerCrawler({ diff --git a/docs/guides/proxy_management_session_cheerio.ts b/docs/guides/proxy_management_session_cheerio.ts index f8484e6ced38..bb19a5b88d35 100644 --- a/docs/guides/proxy_management_session_cheerio.ts +++ b/docs/guides/proxy_management_session_cheerio.ts @@ -1,6 +1,8 @@ import { CheerioCrawler, ProxyConfiguration } from 'crawlee'; -const proxyConfiguration = new ProxyConfiguration({ /* opts */ }); +const proxyConfiguration = new ProxyConfiguration({ + /* opts */ +}); const crawler = new CheerioCrawler({ useSessionPool: true, diff --git a/docs/guides/proxy_management_session_http.ts b/docs/guides/proxy_management_session_http.ts index 601e26812a21..c8c289de4877 100644 --- a/docs/guides/proxy_management_session_http.ts +++ b/docs/guides/proxy_management_session_http.ts @@ -1,6 +1,8 @@ import { HttpCrawler, ProxyConfiguration } from 'crawlee'; -const proxyConfiguration = new ProxyConfiguration({ /* opts */ }); +const proxyConfiguration = new 
ProxyConfiguration({ + /* opts */ +}); const crawler = new HttpCrawler({ useSessionPool: true, diff --git a/docs/guides/proxy_management_session_jsdom.ts b/docs/guides/proxy_management_session_jsdom.ts index 2321f97de433..98e71d904070 100644 --- a/docs/guides/proxy_management_session_jsdom.ts +++ b/docs/guides/proxy_management_session_jsdom.ts @@ -1,6 +1,8 @@ import { JSDOMCrawler, ProxyConfiguration } from 'crawlee'; -const proxyConfiguration = new ProxyConfiguration({ /* opts */ }); +const proxyConfiguration = new ProxyConfiguration({ + /* opts */ +}); const crawler = new JSDOMCrawler({ useSessionPool: true, diff --git a/docs/guides/proxy_management_session_playwright.ts b/docs/guides/proxy_management_session_playwright.ts index 2208684698e3..70edcb79a033 100644 --- a/docs/guides/proxy_management_session_playwright.ts +++ b/docs/guides/proxy_management_session_playwright.ts @@ -1,6 +1,8 @@ import { PlaywrightCrawler, ProxyConfiguration } from 'crawlee'; -const proxyConfiguration = new ProxyConfiguration({ /* opts */ }); +const proxyConfiguration = new ProxyConfiguration({ + /* opts */ +}); const crawler = new PlaywrightCrawler({ useSessionPool: true, diff --git a/docs/guides/proxy_management_session_puppeteer.ts b/docs/guides/proxy_management_session_puppeteer.ts index 5bcbb2806cab..fcd1e14427f2 100644 --- a/docs/guides/proxy_management_session_puppeteer.ts +++ b/docs/guides/proxy_management_session_puppeteer.ts @@ -1,6 +1,8 @@ import { PuppeteerCrawler, ProxyConfiguration } from 'crawlee'; -const proxyConfiguration = new ProxyConfiguration({ /* opts */ }); +const proxyConfiguration = new ProxyConfiguration({ + /* opts */ +}); const crawler = new PuppeteerCrawler({ useSessionPool: true, diff --git a/docs/guides/proxy_management_session_standalone.ts b/docs/guides/proxy_management_session_standalone.ts index 3723d5072aca..bc2010f79b18 100644 --- a/docs/guides/proxy_management_session_standalone.ts +++ b/docs/guides/proxy_management_session_standalone.ts @@ -1,8 +1,12 @@ import { ProxyConfiguration, SessionPool } from 'crawlee'; -const proxyConfiguration = new ProxyConfiguration({ /* opts */ }); +const proxyConfiguration = new ProxyConfiguration({ + /* opts */ +}); -const sessionPool = await SessionPool.open({ /* opts */ }); +const sessionPool = await SessionPool.open({ + /* opts */ +}); const session = await sessionPool.getSession(); diff --git a/docs/guides/session_management_basic.ts b/docs/guides/session_management_basic.ts index 8e7d41cca23c..4fecdea476f8 100644 --- a/docs/guides/session_management_basic.ts +++ b/docs/guides/session_management_basic.ts @@ -1,7 +1,9 @@ import { BasicCrawler, ProxyConfiguration } from 'crawlee'; import { gotScraping } from 'got-scraping'; -const proxyConfiguration = new ProxyConfiguration({ /* opts */ }); +const proxyConfiguration = new ProxyConfiguration({ + /* opts */ +}); const crawler = new BasicCrawler({ // Activates the Session pool (default is true). diff --git a/docs/guides/session_management_cheerio.ts b/docs/guides/session_management_cheerio.ts index 397256b95e40..15f50f18b910 100644 --- a/docs/guides/session_management_cheerio.ts +++ b/docs/guides/session_management_cheerio.ts @@ -1,6 +1,8 @@ import { CheerioCrawler, ProxyConfiguration } from 'crawlee'; -const proxyConfiguration = new ProxyConfiguration({ /* opts */ }); +const proxyConfiguration = new ProxyConfiguration({ + /* opts */ +}); const crawler = new CheerioCrawler({ // To use the proxy IP session rotation logic, you must turn the proxy usage on. 
diff --git a/docs/guides/session_management_http.ts b/docs/guides/session_management_http.ts index b6d40d3ecdb9..e57317c53f2d 100644 --- a/docs/guides/session_management_http.ts +++ b/docs/guides/session_management_http.ts @@ -1,6 +1,8 @@ import { HttpCrawler, ProxyConfiguration } from 'crawlee'; -const proxyConfiguration = new ProxyConfiguration({ /* opts */ }); +const proxyConfiguration = new ProxyConfiguration({ + /* opts */ +}); const crawler = new HttpCrawler({ // To use the proxy IP session rotation logic, you must turn the proxy usage on. diff --git a/docs/guides/session_management_jsdom.ts b/docs/guides/session_management_jsdom.ts index 6f57ca9bab37..f541354d2a90 100644 --- a/docs/guides/session_management_jsdom.ts +++ b/docs/guides/session_management_jsdom.ts @@ -1,6 +1,8 @@ import { JSDOMCrawler, ProxyConfiguration } from 'crawlee'; -const proxyConfiguration = new ProxyConfiguration({ /* opts */ }); +const proxyConfiguration = new ProxyConfiguration({ + /* opts */ +}); const crawler = new JSDOMCrawler({ // To use the proxy IP session rotation logic, you must turn the proxy usage on. diff --git a/docs/guides/session_management_playwright.ts b/docs/guides/session_management_playwright.ts index f4ea5651d4d3..aecf3b5c6dcf 100644 --- a/docs/guides/session_management_playwright.ts +++ b/docs/guides/session_management_playwright.ts @@ -1,6 +1,8 @@ import { PlaywrightCrawler, ProxyConfiguration } from 'crawlee'; -const proxyConfiguration = new ProxyConfiguration({ /* opts */ }); +const proxyConfiguration = new ProxyConfiguration({ + /* opts */ +}); const crawler = new PlaywrightCrawler({ // To use the proxy IP session rotation logic, you must turn the proxy usage on. diff --git a/docs/guides/session_management_puppeteer.ts b/docs/guides/session_management_puppeteer.ts index 8f7aad903949..57f4c37df004 100644 --- a/docs/guides/session_management_puppeteer.ts +++ b/docs/guides/session_management_puppeteer.ts @@ -1,6 +1,8 @@ import { PuppeteerCrawler, ProxyConfiguration } from 'crawlee'; -const proxyConfiguration = new ProxyConfiguration({ /* opts */ }); +const proxyConfiguration = new ProxyConfiguration({ + /* opts */ +}); const crawler = new PuppeteerCrawler({ // To use the proxy IP session rotation logic, you must turn the proxy usage on. diff --git a/docs/introduction/03-filter-without-el.ts b/docs/introduction/03-filter-without-el.ts index e50b05d8f321..3ddeb4efe5cb 100644 --- a/docs/introduction/03-filter-without-el.ts +++ b/docs/introduction/03-filter-without-el.ts @@ -14,9 +14,7 @@ const crawler = new CheerioCrawler({ // Besides resolving the URLs, we now also need to // grab their hostname for filtering. const { hostname } = new URL(request.loadedUrl); - const absoluteUrls = links.map( - (link) => new URL(link, request.loadedUrl), - ); + const absoluteUrls = links.map((link) => new URL(link, request.loadedUrl)); // We use the hostname to filter links that point // to a different domain, even subdomain. diff --git a/docs/introduction/03-find-without-el.ts b/docs/introduction/03-find-without-el.ts index 33561d0190f3..4c9ec20db061 100644 --- a/docs/introduction/03-find-without-el.ts +++ b/docs/introduction/03-find-without-el.ts @@ -17,9 +17,7 @@ const crawler = new CheerioCrawler({ // Then we need to resolve relative URLs, // otherwise they would be unusable for crawling. 
- const absoluteUrls = links.map( - (link) => new URL(link, request.loadedUrl).href, - ); + const absoluteUrls = links.map((link) => new URL(link, request.loadedUrl).href); // Finally, we have to add the URLs to the queue await crawler.addRequests(absoluteUrls); diff --git a/lerna.json b/lerna.json index 9aa2ffcdca6e..8a6fca6acd67 100644 --- a/lerna.json +++ b/lerna.json @@ -1,22 +1,17 @@ { - "packages": [ - "packages/*" - ], - "version": "3.8.1", - "command": { - "version": { - "conventionalCommits": true, - "createRelease": "github", - "message": "chore(release): %s" - }, - "publish": { - "assets": [] - } - }, - "npmClient": "yarn", - "useNx": false, - "ignoreChanges": [ - "**/test/**", - "**/*.md" - ] + "packages": ["packages/*"], + "version": "3.8.1", + "command": { + "version": { + "conventionalCommits": true, + "createRelease": "github", + "message": "chore(release): %s" + }, + "publish": { + "assets": [] + } + }, + "npmClient": "yarn", + "useNx": false, + "ignoreChanges": ["**/test/**", "**/*.md"] } diff --git a/packages/basic-crawler/src/internals/basic-crawler.ts b/packages/basic-crawler/src/internals/basic-crawler.ts index cccf38095800..7d170d812e3a 100644 --- a/packages/basic-crawler/src/internals/basic-crawler.ts +++ b/packages/basic-crawler/src/internals/basic-crawler.ts @@ -59,9 +59,8 @@ import ow, { ArgumentError } from 'ow'; import { getDomain } from 'tldts'; import type { SetRequired } from 'type-fest'; -export interface BasicCrawlingContext< - UserData extends Dictionary = Dictionary, -> extends CrawlingContext { +export interface BasicCrawlingContext + extends CrawlingContext { /** * This function automatically finds and enqueues links from the current page, adding them to the {@apilink RequestQueue} * currently used by the crawler. @@ -99,9 +98,14 @@ export interface BasicCrawlingContext< */ const SAFE_MIGRATION_WAIT_MILLIS = 20000; -export type RequestHandler = (inputs: Context) => Awaitable; +export type RequestHandler = ( + inputs: Context, +) => Awaitable; -export type ErrorHandler = (inputs: Context, error: Error) => Awaitable; +export type ErrorHandler = ( + inputs: Context, + error: Error, +) => Awaitable; export interface StatusMessageCallbackParams< Context extends CrawlingContext = BasicCrawlingContext, @@ -539,7 +543,10 @@ export class BasicCrawler = {}, readonly config = Configuration.getGlobalConfig()) { + constructor( + options: BasicCrawlerOptions = {}, + readonly config = Configuration.getGlobalConfig(), + ) { ow(options, 'BasicCrawlerOptions', ow.object.exactShape(BasicCrawler.optionsShape)); const { @@ -593,11 +600,13 @@ export class BasicCrawler (val == null ? null : +val); // allow at least 5min for internal timeouts - this.internalTimeoutMillis = tryEnv(process.env.CRAWLEE_INTERNAL_TIMEOUT) ?? Math.max(this.requestHandlerTimeoutMillis * 2, 300e3); + this.internalTimeoutMillis = + tryEnv(process.env.CRAWLEE_INTERNAL_TIMEOUT) ?? Math.max(this.requestHandlerTimeoutMillis * 2, 300e3); // override the default internal timeout of request queue to respect `requestHandlerTimeoutMillis` if (this.requestQueue) { @@ -662,7 +672,11 @@ export class BasicCrawler maxSignedInteger) { - log.warning(`requestHandlerTimeoutMillis ${this.requestHandlerTimeoutMillis}` - + ` does not fit a signed 32-bit integer. Limiting the value to ${maxSignedInteger}`); + log.warning( + `requestHandlerTimeoutMillis ${this.requestHandlerTimeoutMillis}` + + ` does not fit a signed 32-bit integer. 
Limiting the value to ${maxSignedInteger}`, + ); this.requestHandlerTimeoutMillis = maxSignedInteger; } @@ -705,8 +723,10 @@ export class BasicCrawler { if (isMaxPagesExceeded()) { if (shouldLogMaxPagesExceeded) { - log.info('Crawler reached the maxRequestsPerCrawl limit of ' - + `${maxRequestsPerCrawl} requests and will shut down soon. Requests that are in progress will be allowed to finish.`); + log.info( + 'Crawler reached the maxRequestsPerCrawl limit of ' + + `${maxRequestsPerCrawl} requests and will shut down soon. Requests that are in progress will be allowed to finish.`, + ); shouldLogMaxPagesExceeded = false; } return false; @@ -716,9 +736,11 @@ export class BasicCrawler { if (isMaxPagesExceeded()) { - log.info(`Earlier, the crawler reached the maxRequestsPerCrawl limit of ${maxRequestsPerCrawl} requests ` - + 'and all requests that were in progress at that time have now finished. ' - + `In total, the crawler processed ${this.handledRequestsCount} requests and will shut down.`); + log.info( + `Earlier, the crawler reached the maxRequestsPerCrawl limit of ${maxRequestsPerCrawl} requests ` + + 'and all requests that were in progress at that time have now finished. ' + + `In total, the crawler processed ${this.handledRequestsCount} requests and will shut down.`, + ); return true; } @@ -728,7 +750,7 @@ export class BasicCrawler { if (this.running) { - throw new Error('This crawler instance is already running, you can add more requests to it via `crawler.addRequests()`.'); + throw new Error( + 'This crawler instance is already running, you can add more requests to it via `crawler.addRequests()`.', + ); } const purgeRequestQueue = options?.purgeRequestQueue ?? true; @@ -861,7 +895,9 @@ export class BasicCrawler { - this.log.warning('Pausing... Press CTRL+C again to force exit. To resume, do: CRAWLEE_PURGE_ON_START=0 npm start'); + this.log.warning( + 'Pausing... Press CTRL+C again to force exit. To resume, do: CRAWLEE_PURGE_ON_START=0 npm start', + ); await this._pauseOnMigration(); await this.autoscaledPool!.abort(); }; @@ -917,7 +953,12 @@ export class BasicCrawler { + async addRequests( + requests: (string | Source)[], + options: CrawlerAddRequestsOptions = {}, + ): Promise { const requestQueue = await this.getRequestQueue(); return requestQueue.addRequestsBatched(requests, options); } @@ -988,7 +1034,9 @@ export class BasicCrawler Object.values(item)), - ]); + const value = stringify([Object.keys(items[0]), ...items.map((item) => Object.values(item))]); await ensureDir(dirname(path)); await writeFile(path, value); this.log.info(`Export to ${path} finished!`); @@ -1055,38 +1100,40 @@ export class BasicCrawler { - if (err.message.includes('running tasks did not finish')) { - this.log.error('The crawler was paused due to migration to another host, ' - + 'but some requests did not finish in time. Those requests\' results may be duplicated.'); - } else { - throw err; - } - }); + await this.autoscaledPool.pause(SAFE_MIGRATION_WAIT_MILLIS).catch((err) => { + if (err.message.includes('running tasks did not finish')) { + this.log.error( + 'The crawler was paused due to migration to another host, ' + + "but some requests did not finish in time. 
Those requests' results may be duplicated.", + ); + } else { + throw err; + } + }); } const requestListPersistPromise = (async () => { if (this.requestList) { if (await this.requestList.isFinished()) return; - await this.requestList.persistState() - .catch((err) => { - if (err.message.includes('Cannot persist state.')) { - this.log.error('The crawler attempted to persist its request list\'s state and failed due to missing or ' - + 'invalid config. Make sure to use either RequestList.open() or the "stateKeyPrefix" option of RequestList ' - + 'constructor to ensure your crawling state is persisted through host migrations and restarts.'); - } else { - this.log.exception(err, 'An unexpected error occurred when the crawler ' - + 'attempted to persist its request list\'s state.'); - } - }); + await this.requestList.persistState().catch((err) => { + if (err.message.includes('Cannot persist state.')) { + this.log.error( + "The crawler attempted to persist its request list's state and failed due to missing or " + + 'invalid config. Make sure to use either RequestList.open() or the "stateKeyPrefix" option of RequestList ' + + 'constructor to ensure your crawling state is persisted through host migrations and restarts.', + ); + } else { + this.log.exception( + err, + 'An unexpected error occurred when the crawler ' + + "attempted to persist its request list's state.", + ); + } + }); } })(); - await Promise.all([ - requestListPersistPromise, - this.stats.persistState(), - ]); + await Promise.all([requestListPersistPromise, this.stats.persistState()]); } /** @@ -1104,7 +1151,10 @@ export class BasicCrawler= this.sameDomainDelayMillis) { + if (!lastAccessTime || now - lastAccessTime >= this.sameDomainDelayMillis) { this.domainAccessedTime.set(domain, now); return false; } @@ -1143,7 +1191,9 @@ export class BasicCrawler { this.log.debug(`Adding request ${request.url} (${request.id}) back to the queue`); // eslint-disable-next-line dot-notation @@ -1159,7 +1209,7 @@ export class BasicCrawler { - const cookieJar = session ? { - getCookieString: async (url: string) => session!.getCookieString(url), - setCookie: async (rawCookie: string, url: string) => session!.setCookie(rawCookie, url), - ...overrideOptions?.cookieJar, - } : overrideOptions?.cookieJar; + const cookieJar = session + ? 
{ + getCookieString: async (url: string) => session!.getCookieString(url), + setCookie: async (rawCookie: string, url: string) => session!.setCookie(rawCookie, url), + ...overrideOptions?.cookieJar, + } + : overrideOptions?.cookieJar; return gotScraping({ url: request!.url, @@ -1254,7 +1306,9 @@ export class BasicCrawler source.markRequestHandled(request!), this.internalTimeoutMillis, - `Marking request ${request.url} (${request.id}) as handled timed out after ${this.internalTimeoutMillis / 1e3} seconds.`, + `Marking request ${request.url} (${request.id}) as handled timed out after ${ + this.internalTimeoutMillis / 1e3 + } seconds.`, ); this.stats.finishJob(statisticsId); @@ -1269,17 +1323,25 @@ export class BasicCrawler this._requestFunctionErrorHandler(err as Error, crawlingContext, source), this.internalTimeoutMillis, - `Handling request failure of ${request.url} (${request.id}) timed out after ${this.internalTimeoutMillis / 1e3} seconds.`, + `Handling request failure of ${request.url} (${request.id}) timed out after ${ + this.internalTimeoutMillis / 1e3 + } seconds.`, ); request.state = RequestState.DONE; } catch (secondaryError: any) { - if (!secondaryError.triggeredFromUserHandler + if ( + !secondaryError.triggeredFromUserHandler && // avoid reprinting the same critical error multiple times, as it will be printed by Nodejs at the end anyway - && !(secondaryError instanceof CriticalError)) { + !(secondaryError instanceof CriticalError) + ) { const apifySpecific = process.env.APIFY_IS_AT_HOME - ? `This may have happened due to an internal error of Apify's API or due to a misconfigured crawler.` : ''; - this.log.exception(secondaryError as Error, 'An exception occurred during handling of failed request. ' - + `This places the crawler and its underlying storages into an unknown state and crawling will be terminated. ${apifySpecific}`); + ? `This may have happened due to an internal error of Apify's API or due to a misconfigured crawler.` + : ''; + this.log.exception( + secondaryError as Error, + 'An exception occurred during handling of failed request. ' + + `This places the crawler and its underlying storages into an unknown state and crawling will be terminated. 
${apifySpecific}`, + ); } request.state = RequestState.ERROR; throw secondaryError; @@ -1297,11 +1359,18 @@ export class BasicCrawler Promise, timeout: number, error: Error | string, maxRetries = 3, retried = 1): Promise { + protected async _timeoutAndRetry( + handler: () => Promise, + timeout: number, + error: Error | string, + maxRetries = 3, + retried = 1, + ): Promise { try { await addTimeoutToPromise(handler, timeout, error); } catch (e) { - if (retried <= maxRetries) { // we retry on any error, not just timeout + if (retried <= maxRetries) { + // we retry on any error, not just timeout this.log.warning(`${(e as Error).message} (retrying ${retried}/${maxRetries})`); return this._timeoutAndRetry(handler, timeout, error, maxRetries, retried + 1); } @@ -1315,7 +1384,7 @@ export class BasicCrawler this.errorHandler?.(this._augmentContextWithDeprecatedError(crawlingContext, error), error)); + await this._tagUserHandlerError(() => + this.errorHandler?.(this._augmentContextWithDeprecatedError(crawlingContext, error), error), + ); } if (!request.noRetry) { @@ -1379,10 +1447,11 @@ export class BasicCrawler(cb: () => unknown): Promise { try { - return await cb() as T; + return (await cb()) as T; } catch (e: any) { Object.defineProperty(e, 'triggeredFromUserHandler', { value: true }); throw e; @@ -1415,13 +1484,12 @@ export class BasicCrawler this.failedRequestHandler?.(this._augmentContextWithDeprecatedError(crawlingContext, error), error)); + await this._tagUserHandlerError(() => + this.failedRequestHandler?.(this._augmentContextWithDeprecatedError(crawlingContext, error), error), + ); } } @@ -1444,16 +1512,17 @@ export class BasicCrawler { // eslint-disable-next-line max-len - this.log.deprecated("The 'error' property of the crawling context is deprecated, and it is now passed as the second parameter in 'errorHandler' and 'failedRequestHandler'. Please update your code, as this property will be removed in a future version."); + this.log.deprecated( + "The 'error' property of the crawling context is deprecated, and it is now passed as the second parameter in 'errorHandler' and 'failedRequestHandler'. 
Please update your code, as this property will be removed in a future version.", + ); return error; }, @@ -1498,7 +1569,10 @@ export class BasicCrawler Awaitable>(hooks: HookLike[], ...args: Parameters) { + protected async _executeHooks Awaitable>( + hooks: HookLike[], + ...args: Parameters + ) { if (Array.isArray(hooks) && hooks.length) { for (const hook of hooks) { await hook(...args); @@ -1533,19 +1607,23 @@ export class BasicCrawler) { if (newProperty && oldProperty) { - this.log.warning([ - `Both "${newName}" and "${oldName}" were provided in the crawler options.`, - `"${oldName}" has been renamed to "${newName}", and will be removed in a future version.`, - `As such, "${newName}" will be used instead.`, - ].join('\n')); + this.log.warning( + [ + `Both "${newName}" and "${oldName}" were provided in the crawler options.`, + `"${oldName}" has been renamed to "${newName}", and will be removed in a future version.`, + `As such, "${newName}" will be used instead.`, + ].join('\n'), + ); // @ts-expect-error Assigning to possibly readonly properties this[propertyKey] = newProperty; } else if (oldProperty) { - this.log.warning([ - `"${oldName}" has been renamed to "${newName}", and will be removed in a future version.`, - `The provided value will be used, but you should rename "${oldName}" to "${newName}" in your crawler options.`, - ].join('\n')); + this.log.warning( + [ + `"${oldName}" has been renamed to "${newName}", and will be removed in a future version.`, + `The provided value will be used, but you should rename "${oldName}" to "${newName}" in your crawler options.`, + ].join('\n'), + ); // @ts-expect-error Assigning to possibly readonly properties this[propertyKey] = oldProperty; @@ -1559,7 +1637,9 @@ export class BasicCrawler { handleRequestFunction: oldHandler, }); - expect(warningSpy).toHaveBeenCalledWith<[string]>([ - `Both "requestHandler" and "handleRequestFunction" were provided in the crawler options.`, - `"handleRequestFunction" has been renamed to "requestHandler", and will be removed in a future version.`, - `As such, "requestHandler" will be used instead.`, - ].join('\n')); + expect(warningSpy).toHaveBeenCalledWith<[string]>( + [ + `Both "requestHandler" and "handleRequestFunction" were provided in the crawler options.`, + `"handleRequestFunction" has been renamed to "requestHandler", and will be removed in a future version.`, + `As such, "requestHandler" will be used instead.`, + ].join('\n'), + ); // eslint-disable-next-line dot-notation -- accessing private property expect(crawler['requestHandler']).toBe(newHandler); @@ -56,10 +58,12 @@ describe('Moving from handleRequest* to requestHandler*', () => { handleRequestFunction: oldHandler, }); - expect(warningSpy).toHaveBeenCalledWith<[string]>([ - `"handleRequestFunction" has been renamed to "requestHandler", and will be removed in a future version.`, - `The provided value will be used, but you should rename "handleRequestFunction" to "requestHandler" in your crawler options.`, - ].join('\n')); + expect(warningSpy).toHaveBeenCalledWith<[string]>( + [ + `"handleRequestFunction" has been renamed to "requestHandler", and will be removed in a future version.`, + `The provided value will be used, but you should rename "handleRequestFunction" to "requestHandler" in your crawler options.`, + ].join('\n'), + ); // eslint-disable-next-line dot-notation -- accessing private property expect(crawler['requestHandler']).toBe(oldHandler); @@ -96,11 +100,13 @@ describe('Moving from handleRequest* to requestHandler*', () => { 
handleFailedRequestFunction: oldHandler, }); - expect(warningSpy).toHaveBeenCalledWith<[string]>([ - `Both "failedRequestHandler" and "handleFailedRequestFunction" were provided in the crawler options.`, - `"handleFailedRequestFunction" has been renamed to "failedRequestHandler", and will be removed in a future version.`, - `As such, "failedRequestHandler" will be used instead.`, - ].join('\n')); + expect(warningSpy).toHaveBeenCalledWith<[string]>( + [ + `Both "failedRequestHandler" and "handleFailedRequestFunction" were provided in the crawler options.`, + `"handleFailedRequestFunction" has been renamed to "failedRequestHandler", and will be removed in a future version.`, + `As such, "failedRequestHandler" will be used instead.`, + ].join('\n'), + ); // eslint-disable-next-line dot-notation -- accessing private property expect(crawler['failedRequestHandler']).toBe(newHandler); @@ -117,10 +123,12 @@ describe('Moving from handleRequest* to requestHandler*', () => { handleFailedRequestFunction: oldHandler, }); - expect(warningSpy).toHaveBeenCalledWith<[string]>([ - `"handleFailedRequestFunction" has been renamed to "failedRequestHandler", and will be removed in a future version.`, - `The provided value will be used, but you should rename "handleFailedRequestFunction" to "failedRequestHandler" in your crawler options.`, - ].join('\n')); + expect(warningSpy).toHaveBeenCalledWith<[string]>( + [ + `"handleFailedRequestFunction" has been renamed to "failedRequestHandler", and will be removed in a future version.`, + `The provided value will be used, but you should rename "handleFailedRequestFunction" to "failedRequestHandler" in your crawler options.`, + ].join('\n'), + ); // eslint-disable-next-line dot-notation -- accessing private property expect(crawler['failedRequestHandler']).toBe(oldHandler); @@ -156,11 +164,13 @@ describe('Moving from handleRequest* to requestHandler*', () => { handleRequestTimeoutSecs: 69, }); - expect(warningSpy).toHaveBeenCalledWith<[string]>([ - `Both "requestHandlerTimeoutSecs" and "handleRequestTimeoutSecs" were provided in the crawler options.`, - `"handleRequestTimeoutSecs" has been renamed to "requestHandlerTimeoutSecs", and will be removed in a future version.`, - `As such, "requestHandlerTimeoutSecs" will be used instead.`, - ].join('\n')); + expect(warningSpy).toHaveBeenCalledWith<[string]>( + [ + `Both "requestHandlerTimeoutSecs" and "handleRequestTimeoutSecs" were provided in the crawler options.`, + `"handleRequestTimeoutSecs" has been renamed to "requestHandlerTimeoutSecs", and will be removed in a future version.`, + `As such, "requestHandlerTimeoutSecs" will be used instead.`, + ].join('\n'), + ); // eslint-disable-next-line dot-notation -- accessing private property expect(crawler['requestHandlerTimeoutMillis']).toEqual(420_000); @@ -176,10 +186,12 @@ describe('Moving from handleRequest* to requestHandler*', () => { handleRequestTimeoutSecs: 69, }); - expect(warningSpy).toHaveBeenCalledWith<[string]>([ - `"handleRequestTimeoutSecs" has been renamed to "requestHandlerTimeoutSecs", and will be removed in a future version.`, - `The provided value will be used, but you should rename "handleRequestTimeoutSecs" to "requestHandlerTimeoutSecs" in your crawler options.`, - ].join('\n')); + expect(warningSpy).toHaveBeenCalledWith<[string]>( + [ + `"handleRequestTimeoutSecs" has been renamed to "requestHandlerTimeoutSecs", and will be removed in a future version.`, + `The provided value will be used, but you should rename "handleRequestTimeoutSecs" to 
"requestHandlerTimeoutSecs" in your crawler options.`, + ].join('\n'), + ); // eslint-disable-next-line dot-notation -- accessing private property expect(crawler['requestHandlerTimeoutMillis']).toEqual(69_000); diff --git a/packages/basic-crawler/tsconfig.build.json b/packages/basic-crawler/tsconfig.build.json index 856db0f2100a..2e04e9e9f921 100644 --- a/packages/basic-crawler/tsconfig.build.json +++ b/packages/basic-crawler/tsconfig.build.json @@ -1,7 +1,7 @@ { - "extends": "../../tsconfig.build.json", - "compilerOptions": { - "outDir": "./dist" - }, - "include": ["src/**/*"] + "extends": "../../tsconfig.build.json", + "compilerOptions": { + "outDir": "./dist" + }, + "include": ["src/**/*"] } diff --git a/packages/browser-crawler/src/internals/browser-crawler.ts b/packages/browser-crawler/src/internals/browser-crawler.ts index 3d07c0ac6742..9ec8debc000d 100644 --- a/packages/browser-crawler/src/internals/browser-crawler.ts +++ b/packages/browser-crawler/src/internals/browser-crawler.ts @@ -56,14 +56,16 @@ export interface BrowserCrawlingContext< response?: Response; } -export type BrowserRequestHandler = RequestHandler; +export type BrowserRequestHandler = + RequestHandler; -export type BrowserErrorHandler = ErrorHandler; +export type BrowserErrorHandler = + ErrorHandler; -export type BrowserHook< - Context = BrowserCrawlingContext, - GoToOptions extends Dictionary | undefined = Dictionary, -> = (crawlingContext: Context, gotoOptions: GoToOptions) => Awaitable; +export type BrowserHook = ( + crawlingContext: Context, + gotoOptions: GoToOptions, +) => Awaitable; export interface BrowserCrawlerOptions< Context extends BrowserCrawlingContext = BrowserCrawlingContext, @@ -72,16 +74,14 @@ export interface BrowserCrawlerOptions< __BrowserControllerReturn extends BrowserController = ReturnType<__BrowserPlugins[number]['createController']>, __LaunchContextReturn extends LaunchContext = ReturnType<__BrowserPlugins[number]['createLaunchContext']>, > extends Omit< - BasicCrawlerOptions, - // Overridden with browser context - | 'requestHandler' - | 'handleRequestFunction' - - | 'failedRequestHandler' - | 'handleFailedRequestFunction' - - | 'errorHandler' -> { + BasicCrawlerOptions, + // Overridden with browser context + | 'requestHandler' + | 'handleRequestFunction' + | 'failedRequestHandler' + | 'handleFailedRequestFunction' + | 'errorHandler' + > { launchContext?: BrowserLaunchContext; /** @@ -186,7 +186,8 @@ export interface BrowserCrawlerOptions< * Custom options passed to the underlying {@apilink BrowserPool} constructor. * We can tweak those to fine-tune browser management. */ - browserPoolOptions?: Partial & Partial>; + browserPoolOptions?: Partial & + Partial>; /** * If set, the crawler will be configured for all connections to use @@ -339,7 +340,10 @@ export abstract class BrowserCrawler< /** * All `BrowserCrawler` parameters are passed via an options object. 
*/ - protected constructor(options: BrowserCrawlerOptions = {}, override readonly config = Configuration.getGlobalConfig()) { + protected constructor( + options: BrowserCrawlerOptions = {}, + override readonly config = Configuration.getGlobalConfig(), + ) { ow(options, 'BrowserCrawlerOptions', ow.object.exactShape(BrowserCrawler.optionsShape)); const { navigationTimeoutSecs = 60, @@ -362,11 +366,15 @@ export abstract class BrowserCrawler< ...basicCrawlerOptions } = options; - super({ - ...basicCrawlerOptions, - requestHandler: async (...args) => this._runRequestHandler(...args), - requestHandlerTimeoutSecs: navigationTimeoutSecs + requestHandlerTimeoutSecs + BASIC_CRAWLER_TIMEOUT_BUFFER_SECS, - }, config); + super( + { + ...basicCrawlerOptions, + requestHandler: async (...args) => this._runRequestHandler(...args), + requestHandlerTimeoutSecs: + navigationTimeoutSecs + requestHandlerTimeoutSecs + BASIC_CRAWLER_TIMEOUT_BUFFER_SECS, + }, + config, + ); this._handlePropertyNameChange({ newName: 'requestHandler', @@ -414,22 +422,17 @@ export abstract class BrowserCrawler< } if (launchContext?.userAgent) { - if (browserPoolOptions.useFingerprints) this.log.info('Custom user agent provided, disabling automatic browser fingerprint injection!'); + if (browserPoolOptions.useFingerprints) + this.log.info('Custom user agent provided, disabling automatic browser fingerprint injection!'); browserPoolOptions.useFingerprints = false; } const { preLaunchHooks = [], postLaunchHooks = [], ...rest } = browserPoolOptions; this.browserPool = new BrowserPool({ - ...rest as any, - preLaunchHooks: [ - this._extendLaunchContext.bind(this), - ...preLaunchHooks, - ], - postLaunchHooks: [ - this._maybeAddSessionRetiredListener.bind(this), - ...postLaunchHooks, - ], + ...(rest as any), + preLaunchHooks: [this._extendLaunchContext.bind(this), ...preLaunchHooks], + postLaunchHooks: [this._maybeAddSessionRetiredListener.bind(this), ...postLaunchHooks], }); } @@ -443,9 +446,7 @@ export abstract class BrowserCrawler< } private async containsSelectors(page: CommonPage, selectors: string[]): Promise { - const foundSelectors = (await Promise.all( - selectors.map((selector) => (page as any).$(selector))) - ) + const foundSelectors = (await Promise.all(selectors.map((selector) => (page as any).$(selector)))) .map((x, i) => [x, selectors[i]] as [any, string]) .filter(([x]) => x !== null) .map(([, selector]) => selector); @@ -457,13 +458,14 @@ export abstract class BrowserCrawler< const { page, response } = crawlingContext; // eslint-disable-next-line dot-notation - const blockedStatusCodes = ((this.sessionPool?.['blockedStatusCodes'].length ?? 0) > 0) - // eslint-disable-next-line dot-notation - ? this.sessionPool!['blockedStatusCodes'] - : DEFAULT_BLOCKED_STATUS_CODES; + const blockedStatusCodes = + (this.sessionPool?.['blockedStatusCodes'].length ?? 0) > 0 + ? // eslint-disable-next-line dot-notation + this.sessionPool!['blockedStatusCodes'] + : DEFAULT_BLOCKED_STATUS_CODES; // Cloudflare specific heuristic - wait 5 seconds if we get a 403 for the JS challenge to load / resolve. - if (await this.containsSelectors(page, CLOUDFLARE_RETRY_CSS_SELECTORS) && response?.status() === 403) { + if ((await this.containsSelectors(page, CLOUDFLARE_RETRY_CSS_SELECTORS)) && response?.status() === 403) { await sleep(5000); // here we cannot test for response code, because we only have the original response, not the possible Cloudflare redirect on passed challenge. 
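The blocked-request heuristic reformatted above boils down to "HTTP 403 while a known challenge selector is still present on the page". A minimal standalone sketch of that check, assuming a Playwright-like `page.$()`; the selector value below is an illustrative stand-in, not the actual contents of `CLOUDFLARE_RETRY_CSS_SELECTORS`:

// Sketch: treat a 403 that still renders challenge markup as a blocked request.
const CHALLENGE_SELECTORS = ['#challenge-form']; // stand-in values, not from the source

async function looksBlocked(
    page: { $(selector: string): Promise<unknown | null> },
    status: number | undefined,
): Promise<boolean> {
    if (status !== 403) return false;
    // Any non-null handle means a challenge selector is still present on the page.
    const handles = await Promise.all(CHALLENGE_SELECTORS.map((s) => page.$(s)));
    return handles.some((h) => h !== null);
}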
@@ -512,7 +514,7 @@ export abstract class BrowserCrawler< } } - const page = await this.browserPool.newPage(newPageOptions) as CommonPage; + const page = (await this.browserPool.newPage(newPageOptions)) as CommonPage; tryCancel(); this._enhanceCrawlingContextWithPageInfo(crawlingContext, page, useIncognitoPages || experimentalContainers); @@ -573,14 +575,20 @@ export abstract class BrowserCrawler< if (session) session.markGood(); } - protected _enhanceCrawlingContextWithPageInfo(crawlingContext: Context, page: CommonPage, createNewSession?: boolean): void { + protected _enhanceCrawlingContextWithPageInfo( + crawlingContext: Context, + page: CommonPage, + createNewSession?: boolean, + ): void { crawlingContext.page = page; // This switch is because the crawlingContexts are created on a per-request basis. // However, we need to add the proxy info and session from browser, which is created based on the browser-pool configuration. // We would not have to do this switch if the proxy and configuration worked as in CheerioCrawler, // which configures proxy and session for every new request const browserControllerInstance = this.browserPool.getBrowserControllerByPage( page as any, ) as Context['browserController']; crawlingContext.browserController = browserControllerInstance; if (!createNewSession) { @@ -616,7 +624,7 @@ export abstract class BrowserCrawler< await this._applyCookies(crawlingContext, preNavigationHooksCookies, postNavigationHooksCookies); try { - crawlingContext.response = await this._navigationHandler(crawlingContext, gotoOptions) ?? undefined; + crawlingContext.response = (await this._navigationHandler(crawlingContext, gotoOptions)) ?? undefined; } catch (error) { await this._handleNavigationTimeout(crawlingContext, error as Error); @@ -631,18 +639,18 @@ export abstract class BrowserCrawler< await this._executeHooks(this.postNavigationHooks, crawlingContext, gotoOptions); } - protected async _applyCookies({ session, request, page, browserController }: Context, preHooksCookies: string, postHooksCookies: string) { + protected async _applyCookies( + { session, request, page, browserController }: Context, + preHooksCookies: string, + postHooksCookies: string, + ) { const sessionCookie = session?.getCookies(request.url) ?? []; const parsedPreHooksCookies = preHooksCookies.split(/ *; */).map((c) => cookieStringToToughCookie(c)); const parsedPostHooksCookies = postHooksCookies.split(/ *; */).map((c) => cookieStringToToughCookie(c)); await browserController.setCookies( page, - [ - ...sessionCookie, - ...parsedPreHooksCookies, - ...parsedPostHooksCookies, - ] + [...sessionCookie, ...parsedPreHooksCookies, ...parsedPostHooksCookies] .filter((c): c is CookieObject => typeof c !== 'undefined' && c !== null) .map((c) => ({ ...c, url: c.domain ? undefined : request.url })), ); @@ -670,7 +678,10 @@ export abstract class BrowserCrawler< } } - protected abstract _navigationHandler(crawlingContext: Context, gotoOptions: GoToOptions): Promise; + protected abstract _navigationHandler( + crawlingContext: Context, + gotoOptions: GoToOptions, + ): Promise; /** * Should be overridden in case of a different automation library that does not support this response API.
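Each concrete crawler overrides the abstract `_navigationHandler` above to perform the actual page navigation. A hypothetical Playwright-flavoured override could look like the following sketch (illustrative only, not the actual `PlaywrightCrawler` source):

import type { Page, Response } from 'playwright';

class ExampleCrawler {
    // Sketch: delegate navigation to the library; page.goto() resolves to Response | null.
    protected async _navigationHandler(
        crawlingContext: { page: Page; request: { url: string } },
        gotoOptions?: Parameters<Page['goto']>[1],
    ): Promise<Response | null> {
        return crawlingContext.page.goto(crawlingContext.request.url, gotoOptions);
    }
}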
@@ -726,7 +737,9 @@ export abstract class BrowserCrawler< const { launchContext } = browserController; if (session.id === (launchContext.session as Session).id) { this.browserPool.retireBrowserController( - browserController as Parameters['retireBrowserController']>[0], + browserController as Parameters< + BrowserPool['retireBrowserController'] + >[0], ); } }; @@ -772,7 +785,11 @@ export async function browserCrawlerEnqueueLinks({ userProvidedBaseUrl: options?.baseUrl, }); - const urls = await extractUrlsFromPage(page as any, options?.selector ?? 'a', options?.baseUrl ?? finalRequestUrl ?? originalRequestUrl); + const urls = await extractUrlsFromPage( + page as any, + options?.selector ?? 'a', + options?.baseUrl ?? finalRequestUrl ?? originalRequestUrl, + ); return enqueueLinks({ requestQueue, @@ -787,8 +804,15 @@ export async function browserCrawlerEnqueueLinks({ * @ignore */ // eslint-disable-next-line @typescript-eslint/ban-types -export async function extractUrlsFromPage(page: { $$eval: Function }, selector: string, baseUrl: string): Promise { - const urls = await page.$$eval(selector, (linkEls: HTMLLinkElement[]) => linkEls.map((link) => link.getAttribute('href')).filter((href) => !!href)) ?? []; +export async function extractUrlsFromPage( + page: { $$eval: Function }, + selector: string, + baseUrl: string, +): Promise { + const urls = + (await page.$$eval(selector, (linkEls: HTMLLinkElement[]) => + linkEls.map((link) => link.getAttribute('href')).filter((href) => !!href), + )) ?? []; const [base] = await page.$$eval('base', (els: HTMLLinkElement[]) => els.map((el) => el.getAttribute('href'))); const absoluteBaseUrl = base && tryAbsoluteURL(base, baseUrl); @@ -796,17 +820,18 @@ export async function extractUrlsFromPage(page: { $$eval: Function }, selector: baseUrl = absoluteBaseUrl; } - return urls.map((href: string) => { - // Throw a meaningful error when only a relative URL would be extracted instead of waiting for the Request to fail later. - const isHrefAbsolute = /^[a-z][a-z0-9+.-]*:/.test(href); // Grabbed this in 'is-absolute-url' package. - if (!isHrefAbsolute && !baseUrl) { - throw new Error(`An extracted URL: ${href} is relative and options.baseUrl is not set. ` - + 'Use options.baseUrl in enqueueLinks() to automatically resolve relative URLs.'); - } + return urls + .map((href: string) => { + // Throw a meaningful error when only a relative URL would be extracted instead of waiting for the Request to fail later. + const isHrefAbsolute = /^[a-z][a-z0-9+.-]*:/.test(href); // Grabbed this in 'is-absolute-url' package. + if (!isHrefAbsolute && !baseUrl) { + throw new Error( + `An extracted URL: ${href} is relative and options.baseUrl is not set. ` + + 'Use options.baseUrl in enqueueLinks() to automatically resolve relative URLs.', + ); + } - return baseUrl - ? tryAbsoluteURL(href, baseUrl) - : href; - }) + return baseUrl ? tryAbsoluteURL(href, baseUrl) : href; + }) .filter((href: string | undefined) => !!href); } diff --git a/packages/browser-crawler/src/internals/browser-launcher.ts b/packages/browser-crawler/src/internals/browser-launcher.ts index 3ec11a1c6454..1a629db556bf 100644 --- a/packages/browser-crawler/src/internals/browser-launcher.ts +++ b/packages/browser-crawler/src/internals/browser-launcher.ts @@ -32,10 +32,10 @@ export interface BrowserLaunchContext extends BrowserPluginO useChrome?: boolean; /** - * With this option selected, all pages will be opened in a new incognito browser context. 
- * This means they will not share cookies nor cache and their resources will not be throttled by one another. - * @default false - */ + * With this option selected, all pages will be opened in a new incognito browser context. + * This means they will not share cookies nor cache and their resources will not be throttled by one another. + * @default false + */ useIncognitoPages?: boolean; /** @@ -46,10 +46,10 @@ export interface BrowserLaunchContext extends BrowserPluginO experimentalContainers?: boolean; /** - * Sets the [User Data Directory](https://chromium.googlesource.com/chromium/src/+/master/docs/user_data_dir.md) path. - * The user data directory contains profile data such as history, bookmarks, and cookies, as well as other per-installation local state. - * If not specified, a temporary directory is used instead. - */ + * Sets the [User Data Directory](https://chromium.googlesource.com/chromium/src/+/master/docs/user_data_dir.md) path. + * The user data directory contains profile data such as history, bookmarks, and cookies, as well as other per-installation local state. + * If not specified, a temporary directory is used instead. + */ userDataDir?: string; /** @@ -110,8 +110,9 @@ export abstract class BrowserLauncher< } catch (err) { const e = err as Error & { code: string }; if (e.code === 'MODULE_NOT_FOUND') { - const msg = `Cannot find module '${launcher}'. Did you you install the '${launcher}' package?\n` - + `Make sure you have '${launcher}' in your package.json dependencies and in your package-lock.json, if you use it.`; + const msg = + `Cannot find module '${launcher}'. Did you install the '${launcher}' package?\n` + + `Make sure you have '${launcher}' in your package.json dependencies and in your package-lock.json, if you use it.`; if (process.env.APIFY_IS_AT_HOME) { e.message = `${msg}\nOn the Apify platform, '${launcher}' can only be used with the ${apifyImageName} Docker image.`; } @@ -124,7 +125,10 @@ export abstract class BrowserLauncher< /** * All `BrowserLauncher` parameters are passed via a launchContext object.
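 * For illustration, a hypothetical subclass call could look like
 * `new PlaywrightLauncher({ launcher: playwright.chromium })` (values assumed, not part of this patch).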
*/ - constructor(launchContext: BrowserLaunchContext, readonly config = Configuration.getGlobalConfig()) { + constructor( + launchContext: BrowserLaunchContext, + readonly config = Configuration.getGlobalConfig(), + ) { const { launcher, proxyUrl, diff --git a/packages/browser-crawler/test/migration.test.ts b/packages/browser-crawler/test/migration.test.ts index f8810d231af8..17ca066cd3a1 100644 --- a/packages/browser-crawler/test/migration.test.ts +++ b/packages/browser-crawler/test/migration.test.ts @@ -44,11 +44,13 @@ describe('Moving from handleRequest* to requestHandler*', () => { handlePageFunction: oldHandler, }); - expect(warningSpy).toHaveBeenCalledWith<[string]>([ - `Both "requestHandler" and "handlePageFunction" were provided in the crawler options.`, - `"handlePageFunction" has been renamed to "requestHandler", and will be removed in a future version.`, - `As such, "requestHandler" will be used instead.`, - ].join('\n')); + expect(warningSpy).toHaveBeenCalledWith<[string]>( + [ + `Both "requestHandler" and "handlePageFunction" were provided in the crawler options.`, + `"handlePageFunction" has been renamed to "requestHandler", and will be removed in a future version.`, + `As such, "requestHandler" will be used instead.`, + ].join('\n'), + ); // eslint-disable-next-line dot-notation -- accessing private property expect(crawler['userProvidedRequestHandler']).toBe(newHandler); @@ -70,10 +72,12 @@ describe('Moving from handleRequest* to requestHandler*', () => { handlePageFunction: oldHandler, }); - expect(warningSpy).toHaveBeenCalledWith<[string]>([ - `"handlePageFunction" has been renamed to "requestHandler", and will be removed in a future version.`, - `The provided value will be used, but you should rename "handlePageFunction" to "requestHandler" in your crawler options.`, - ].join('\n')); + expect(warningSpy).toHaveBeenCalledWith<[string]>( + [ + `"handlePageFunction" has been renamed to "requestHandler", and will be removed in a future version.`, + `The provided value will be used, but you should rename "handlePageFunction" to "requestHandler" in your crawler options.`, + ].join('\n'), + ); // eslint-disable-next-line dot-notation -- accessing private property expect(crawler['userProvidedRequestHandler']).toBe(oldHandler); @@ -122,11 +126,13 @@ describe('Moving from handleRequest* to requestHandler*', () => { handleFailedRequestFunction: oldHandler, }); - expect(warningSpy).toHaveBeenCalledWith<[string]>([ - `Both "failedRequestHandler" and "handleFailedRequestFunction" were provided in the crawler options.`, - `"handleFailedRequestFunction" has been renamed to "failedRequestHandler", and will be removed in a future version.`, - `As such, "failedRequestHandler" will be used instead.`, - ].join('\n')); + expect(warningSpy).toHaveBeenCalledWith<[string]>( + [ + `Both "failedRequestHandler" and "handleFailedRequestFunction" were provided in the crawler options.`, + `"handleFailedRequestFunction" has been renamed to "failedRequestHandler", and will be removed in a future version.`, + `As such, "failedRequestHandler" will be used instead.`, + ].join('\n'), + ); // eslint-disable-next-line dot-notation -- accessing private property expect(crawler['failedRequestHandler']).toBe(newHandler); @@ -149,10 +155,12 @@ describe('Moving from handleRequest* to requestHandler*', () => { handleFailedRequestFunction: oldHandler, }); - expect(warningSpy).toHaveBeenCalledWith<[string]>([ - `"handleFailedRequestFunction" has been renamed to "failedRequestHandler", and will be removed in a future 
version.`, - `The provided value will be used, but you should rename "handleFailedRequestFunction" to "failedRequestHandler" in your crawler options.`, - ].join('\n')); + expect(warningSpy).toHaveBeenCalledWith<[string]>( + [ + `"handleFailedRequestFunction" has been renamed to "failedRequestHandler", and will be removed in a future version.`, + `The provided value will be used, but you should rename "handleFailedRequestFunction" to "failedRequestHandler" in your crawler options.`, + ].join('\n'), + ); // eslint-disable-next-line dot-notation -- accessing private property expect(crawler['failedRequestHandler']).toBe(oldHandler); diff --git a/packages/browser-crawler/tsconfig.build.json b/packages/browser-crawler/tsconfig.build.json index 856db0f2100a..2e04e9e9f921 100644 --- a/packages/browser-crawler/tsconfig.build.json +++ b/packages/browser-crawler/tsconfig.build.json @@ -1,7 +1,7 @@ { - "extends": "../../tsconfig.build.json", - "compilerOptions": { - "outDir": "./dist" - }, - "include": ["src/**/*"] + "extends": "../../tsconfig.build.json", + "compilerOptions": { + "outDir": "./dist" + }, + "include": ["src/**/*"] } diff --git a/packages/browser-pool/src/anonymize-proxy.ts b/packages/browser-pool/src/anonymize-proxy.ts index 1fc53728dc32..8c98d3a01a78 100644 --- a/packages/browser-pool/src/anonymize-proxy.ts +++ b/packages/browser-pool/src/anonymize-proxy.ts @@ -26,14 +26,8 @@ export const anonymizeProxySugar = async ( ]; } - return [ - undefined, - async () => {}, - ]; + return [undefined, async () => {}]; } - return [ - undefined, - async () => {}, - ]; + return [undefined, async () => {}]; }; diff --git a/packages/browser-pool/src/browser-pool.ts b/packages/browser-pool/src/browser-pool.ts index 1d2ed50a7cc1..4ed51853a211 100644 --- a/packages/browser-pool/src/browser-pool.ts +++ b/packages/browser-pool/src/browser-pool.ts @@ -11,7 +11,11 @@ import { TypedEmitter } from 'tiny-typed-emitter'; import type { BrowserController } from './abstract-classes/browser-controller'; import type { BrowserPlugin } from './abstract-classes/browser-plugin'; import { BROWSER_POOL_EVENTS } from './events'; -import { createFingerprintPreLaunchHook, createPrePageCreateHook, createPostPageCreateHook } from './fingerprinting/hooks'; +import { + createFingerprintPreLaunchHook, + createPrePageCreateHook, + createPostPageCreateHook, +} from './fingerprinting/hooks'; import type { FingerprintGeneratorOptions } from './fingerprinting/types'; import type { LaunchContext } from './launch-context'; import { log } from './logger'; @@ -44,11 +48,11 @@ export interface FingerprintOptions { */ useFingerprintCache?: boolean; /** - * The maximum number of fingerprints that can be stored in the cache. - * - * Only relevant if `useFingerprintCache` is set to `true`. - * @default 10000 - */ + * The maximum number of fingerprints that can be stored in the cache. + * + * Only relevant if `useFingerprintCache` is set to `true`. + * @default 10000 + */ fingerprintCacheSize?: number; } @@ -117,7 +121,10 @@ export type PreLaunchHook = (pageId: string, launchCon * hooks complete. If you attempt to call `await browserController.close()` from * a post-launch hook, it will deadlock the process. This API is subject to change. */ -export type PostLaunchHook = (pageId: string, browserController: BC) => void | Promise; +export type PostLaunchHook = ( + pageId: string, + browserController: BC, +) => void | Promise; /** * Pre-page-create hooks are executed just before a new page is created. 
They @@ -128,10 +135,11 @@ export type PostLaunchHook = (pageId: string, brow * So far, new page options are only supported by `PlaywrightController` in incognito contexts. * If the page options are not supported by `BrowserController` the `pageOptions` argument is `undefined`. */ -export type PrePageCreateHook< - BC extends BrowserController, - PO = Parameters[0], -> = (pageId: string, browserController: BC, pageOptions?: PO) => void | Promise; +export type PrePageCreateHook[0]> = ( + pageId: string, + browserController: BC, + pageOptions?: PO, +) => void | Promise; /** * Post-page-create hooks are called right after a new page is created @@ -141,10 +149,10 @@ export type PrePageCreateHook< * The hooks are called with two arguments: * `page`: `Page` and `browserController`: {@apilink BrowserController} */ -export type PostPageCreateHook< - BC extends BrowserController, - Page = UnwrapPromise>, -> = (page: Page, browserController: BC) => void | Promise; +export type PostPageCreateHook>> = ( + page: Page, + browserController: BC, +) => void | Promise; /** * Pre-page-close hooks give you the opportunity to make last second changes @@ -153,17 +161,20 @@ export type PostPageCreateHook< * The hooks are called with two arguments: * `page`: `Page` and `browserController`: {@apilink BrowserController} */ -export type PrePageCloseHook< - BC extends BrowserController, - Page = UnwrapPromise>, -> = (page: Page, browserController: BC) => void | Promise; +export type PrePageCloseHook>> = ( + page: Page, + browserController: BC, +) => void | Promise; /** * Post-page-close hooks allow you to do page related clean up. * The hooks are called with two arguments: * `pageId`: `string` and `browserController`: {@apilink BrowserController} */ -export type PostPageCloseHook = (pageId: string, browserController: BC) => void | Promise; +export type PostPageCloseHook = ( + pageId: string, + browserController: BC, +) => void | Promise; export interface BrowserPoolHooks< BC extends BrowserController, @@ -275,7 +286,9 @@ export class BrowserPool< BrowserControllerReturn extends BrowserController = ReturnType, LaunchContextReturn extends LaunchContext = ReturnType, PageOptions = Parameters[0], - PageReturn extends UnwrapPromise> = UnwrapPromise>, + PageReturn extends UnwrapPromise> = UnwrapPromise< + ReturnType + >, > extends TypedEmitter> { browserPlugins: BrowserPlugins; maxOpenPagesPerBrowser: number; @@ -312,21 +325,24 @@ export class BrowserPool< this.browserKillerInterval!.unref(); - ow(options, ow.object.exactShape({ - browserPlugins: ow.array.minLength(1), - maxOpenPagesPerBrowser: ow.optional.number, - retireBrowserAfterPageCount: ow.optional.number, - operationTimeoutSecs: ow.optional.number, - closeInactiveBrowserAfterSecs: ow.optional.number, - preLaunchHooks: ow.optional.array, - postLaunchHooks: ow.optional.array, - prePageCreateHooks: ow.optional.array, - postPageCreateHooks: ow.optional.array, - prePageCloseHooks: ow.optional.array, - postPageCloseHooks: ow.optional.array, - useFingerprints: ow.optional.boolean, - fingerprintOptions: ow.optional.object, - })); + ow( + options, + ow.object.exactShape({ + browserPlugins: ow.array.minLength(1), + maxOpenPagesPerBrowser: ow.optional.number, + retireBrowserAfterPageCount: ow.optional.number, + operationTimeoutSecs: ow.optional.number, + closeInactiveBrowserAfterSecs: ow.optional.number, + preLaunchHooks: ow.optional.array, + postLaunchHooks: ow.optional.array, + prePageCreateHooks: ow.optional.array, + postPageCreateHooks: ow.optional.array, + 
prePageCloseHooks: ow.optional.array, + postPageCloseHooks: ow.optional.array, + useFingerprints: ow.optional.boolean, + fingerprintOptions: ow.optional.object, + }), + ); const { browserPlugins, @@ -354,7 +370,9 @@ export class BrowserPool< const providedPluginName = (providedPlugin as BrowserPlugin).constructor.name; // eslint-disable-next-line max-len - throw new Error(`Browser plugin at index ${i} (${providedPluginName}) is not an instance of the same plugin as the first plugin provided (${firstPluginName}).`); + throw new Error( + `Browser plugin at index ${i} (${providedPluginName}) is not an instance of the same plugin as the first plugin provided (${firstPluginName}).`, + ); } } @@ -386,12 +404,7 @@ export class BrowserPool< * or their page limits have been exceeded. */ async newPage(options: BrowserPoolNewPageOptions = {}): Promise { - const { - id = nanoid(), - pageOptions, - browserPlugin = this._pickBrowserPlugin(), - proxyUrl, - } = options; + const { id = nanoid(), pageOptions, browserPlugin = this._pickBrowserPlugin(), proxyUrl } = options; if (this.pages.has(id)) { throw new Error(`Page with ID: ${id} already exists.`); @@ -416,13 +429,10 @@ export class BrowserPool< * browser to open the page in. Use the `launchOptions` option to * configure the new browser. */ - async newPageInNewBrowser(options: BrowserPoolNewPageInNewBrowserOptions = {}): Promise { - const { - id = nanoid(), - pageOptions, - launchOptions, - browserPlugin = this._pickBrowserPlugin(), - } = options; + async newPageInNewBrowser( + options: BrowserPoolNewPageInNewBrowserOptions = {}, + ): Promise { + const { id = nanoid(), pageOptions, launchOptions, browserPlugin = this._pickBrowserPlugin() } = options; if (this.pages.has(id)) { throw new Error(`Page with ID: ${id} already exists.`); @@ -515,9 +525,10 @@ export class BrowserPool< await browserController['isActivePromise']; tryCancel(); - const finalPageOptions = (browserController.launchContext.useIncognitoPages || browserController.launchContext.experimentalContainers) - ? pageOptions - : undefined; + const finalPageOptions = + browserController.launchContext.useIncognitoPages || browserController.launchContext.experimentalContainers + ? 
pageOptions + : undefined; if (finalPageOptions) { Object.assign(finalPageOptions, browserController.normalizeProxyOptions(proxyUrl, pageOptions)); @@ -529,11 +540,11 @@ export class BrowserPool< let page: PageReturn; try { - page = await addTimeoutToPromise( + page = (await addTimeoutToPromise( async () => browserController.newPage(finalPageOptions), this.operationTimeoutMillis, 'browserController.newPage() timed out.', - ) as PageReturn; + )) as PageReturn; tryCancel(); this.pages.set(pageId, page); @@ -548,7 +559,9 @@ export class BrowserPool< this._overridePageClose(page); } catch (err) { this.retireBrowserController(browserController); - throw new Error(`browserController.newPage() failed: ${browserController.id}\nCause:${(err as Error).message}.`); + throw new Error( + `browserController.newPage() failed: ${browserController.id}\nCause:${(err as Error).message}.`, + ); } await this._executeHooks(this.postPageCreateHooks, page, browserController); @@ -629,10 +642,7 @@ export class BrowserPool< } private async _launchBrowser(pageId: string, options: InternalLaunchBrowserOptions) { - const { - browserPlugin, - launchOptions, - } = options; + const { browserPlugin, launchOptions } = options; const browserController = browserPlugin.createController() as BrowserControllerReturn; this.activeBrowserControllers.add(browserController); @@ -664,10 +674,9 @@ export class BrowserPool< } catch (err) { this.activeBrowserControllers.delete(browserController); browserController.close().catch((closeErr) => { - log.error( - `Could not close browser whose post-launch hooks failed.\nCause:${closeErr.message}`, - { id: browserController.id }, - ); + log.error(`Could not close browser whose post-launch hooks failed.\nCause:${closeErr.message}`, { + id: browserController.id, + }); }); throw err; } @@ -734,10 +743,9 @@ export class BrowserPool< page.close = async (...args: unknown[]) => { await this._executeHooks(this.prePageCloseHooks, page, browserController); - await originalPageClose.apply(page, args) - .catch((err: Error) => { - log.debug(`Could not close page.\nCause:${err.message}`, { id: browserController.id }); - }); + await originalPageClose.apply(page, args).catch((err: Error) => { + log.debug(`Could not close page.\nCause:${err.message}`, { id: browserController.id }); + }); await this._executeHooks(this.postPageCloseHooks, pageId, browserController); @@ -786,14 +794,8 @@ export class BrowserPool< // It is usual to generate proxy per browser and we want to know the proxyUrl for the caching. createFingerprintPreLaunchHook(this), ]; - this.prePageCreateHooks = [ - createPrePageCreateHook(), - ...this.prePageCreateHooks, - ]; - this.postPageCreateHooks = [ - createPostPageCreateHook(this.fingerprintInjector!), - ...this.postPageCreateHooks, - ]; + this.prePageCreateHooks = [createPrePageCreateHook(), ...this.prePageCreateHooks]; + this.postPageCreateHooks = [createPostPageCreateHook(this.fingerprintInjector!), ...this.postPageCreateHooks]; } } diff --git a/packages/browser-pool/src/fingerprinting/hooks.ts b/packages/browser-pool/src/fingerprinting/hooks.ts index e7c74711d070..feb94b3a9d45 100644 --- a/packages/browser-pool/src/fingerprinting/hooks.ts +++ b/packages/browser-pool/src/fingerprinting/hooks.ts @@ -15,9 +15,7 @@ export function createFingerprintPreLaunchHook(browserPool: BrowserPool { @@ -26,8 +24,9 @@ export function createFingerprintPreLaunchHook(browserPool: BrowserPool Note: If `browserListQuery` is passed, the `browsers` array is ignored. 
- */ + * Browser generation query based on the real world data. + * For more info see the [query docs](https://github.com/browserslist/browserslist#full-list). + * + * > Note: If `browserListQuery` is passed, the `browsers` array is ignored. + */ browserListQuery?: string; /** - * List of operating systems to generate the headers for. - */ + * List of operating systems to generate the headers for. + */ operatingSystems?: OperatingSystemsName[]; /** - * List of device types to generate the fingerprints for. - */ + * List of device types to generate the fingerprints for. + */ devices?: DeviceCategory[]; /** - * List of at most 10 languages to include in the - * [Accept-Language](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept-Language) request header - * in the language format accepted by that header, for example `en`, `en-US` or `de`. - */ + * List of at most 10 languages to include in the + * [Accept-Language](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Accept-Language) request header + * in the language format accepted by that header, for example `en`, `en-US` or `de`. + */ locales?: string[]; /** - * Http version to be used to generate headers (the headers differ depending on the version). - * - * Can be either 1 or 2. Default value is 2. - */ + * Http version to be used to generate headers (the headers differ depending on the version). + * + * Can be either 1 or 2. Default value is 2. + */ httpVersion?: HttpVersion; /** * Defines the screen dimensions of the generated fingerprint. @@ -59,7 +59,7 @@ const SUPPORTED_HTTP_VERSIONS = ['1', '2'] as const; /** * String specifying the HTTP version to use. */ -type HttpVersion = typeof SUPPORTED_HTTP_VERSIONS[number]; +type HttpVersion = (typeof SUPPORTED_HTTP_VERSIONS)[number]; export enum BrowserName { chrome = 'chrome', @@ -70,20 +70,20 @@ export enum BrowserName { export interface BrowserSpecification { /** - * String representing the browser name. - */ + * String representing the browser name. + */ name: BrowserName; /** - * Minimum version of browser used. - */ + * Minimum version of browser used. + */ minVersion?: number; /** - * Maximum version of browser used. - */ + * Maximum version of browser used. + */ maxVersion?: number; /** - * HTTP version to be used for header generation (the headers differ depending on the version). - */ + * HTTP version to be used for header generation (the headers differ depending on the version). 
+ */ httpVersion?: HttpVersion; } diff --git a/packages/browser-pool/src/fingerprinting/utils.ts b/packages/browser-pool/src/fingerprinting/utils.ts index 2244c5f09e41..a0cadb1f35de 100644 --- a/packages/browser-pool/src/fingerprinting/utils.ts +++ b/packages/browser-pool/src/fingerprinting/utils.ts @@ -24,7 +24,8 @@ const getBrowserName = (browserPlugin: BrowserPlugin, launchOptions: any): Brows if (browserPlugin instanceof PlaywrightPlugin) { browserName = library.name!(); - } if (browserPlugin instanceof PuppeteerPlugin) { + } + if (browserPlugin instanceof PuppeteerPlugin) { browserName = launchOptions.product || library.product; } diff --git a/packages/browser-pool/src/playwright/playwright-controller.ts b/packages/browser-pool/src/playwright/playwright-controller.ts index bca1978f395c..8df42f7feb6b 100644 --- a/packages/browser-pool/src/playwright/playwright-controller.ts +++ b/packages/browser-pool/src/playwright/playwright-controller.ts @@ -10,7 +10,11 @@ import type { SafeParameters } from '../utils'; const tabIds = new WeakMap(); const keyFromTabId = (tabId: string | number) => `.${tabId}.`; -export class PlaywrightController extends BrowserController[0], Browser> { +export class PlaywrightController extends BrowserController< + BrowserType, + SafeParameters[0], + Browser +> { normalizeProxyOptions(proxyUrl: string | undefined, pageOptions: any): Record { if (!proxyUrl) { return {}; @@ -31,8 +35,14 @@ export class PlaywrightController extends BrowserController[0]): Promise { - if (contextOptions !== undefined && !this.launchContext.useIncognitoPages && !this.launchContext.experimentalContainers) { - throw new Error('A new page can be created with provided context only when using incognito pages or experimental containers.'); + if ( + contextOptions !== undefined && + !this.launchContext.useIncognitoPages && + !this.launchContext.experimentalContainers + ) { + throw new Error( + 'A new page can be created with provided context only when using incognito pages or experimental containers.', + ); } let close = async () => {}; @@ -66,7 +76,9 @@ export class PlaywrightController extends BrowserController { return new Promise((resolve, reject) => { - const server = net.createServer().once('error', reject).listen(() => { - resolve((server.address() as net.AddressInfo).port); - server.close(); - }); + const server = net + .createServer() + .once('error', reject) + .listen(() => { + resolve((server.address() as net.AddressInfo).port); + server.close(); + }); }); }; @@ -30,20 +33,18 @@ const getFreePort = async () => { // taacPath = browser-pool/dist/tab-as-a-container const taacPath = path.join(__dirname, '..', 'tab-as-a-container'); -export class PlaywrightPlugin extends BrowserPlugin[0], PlaywrightBrowser> { +export class PlaywrightPlugin extends BrowserPlugin< + BrowserType, + SafeParameters[0], + PlaywrightBrowser +> { private _browserVersion?: string; _containerProxyServer?: Awaited>; protected async _launch(launchContext: LaunchContext): Promise { - const { - launchOptions, - useIncognitoPages, - proxyUrl, - } = launchContext; + const { launchOptions, useIncognitoPages, proxyUrl } = launchContext; - let { - userDataDir, - } = launchContext; + let { userDataDir } = launchContext; let browser: PlaywrightBrowser; @@ -82,9 +83,7 @@ export class PlaywrightPlugin extends BrowserPlugin { - return this._throwOnFailedLaunch(launchContext, error); - }); + const browserContext = await this.library + .launchPersistentContext(userDataDir, launchOptions) + .catch((error) => { + return 
this._throwOnFailedLaunch(launchContext, error); + }); browserContext.once('close', () => { if (userDataDir.includes('apify-playwright-firefox-taac-')) { @@ -192,7 +196,11 @@ export class PlaywrightPlugin extends BrowserPlugin[0], PlaywrightBrowser> { + protected _createController(): BrowserController< + BrowserType, + SafeParameters[0], + PlaywrightBrowser + > { return new PlaywrightController(this); } diff --git a/packages/browser-pool/src/puppeteer/puppeteer-controller.ts b/packages/browser-pool/src/puppeteer/puppeteer-controller.ts index cd0a1c535fa2..dd7621800599 100644 --- a/packages/browser-pool/src/puppeteer/puppeteer-controller.ts +++ b/packages/browser-pool/src/puppeteer/puppeteer-controller.ts @@ -40,7 +40,9 @@ export class PuppeteerController extends BrowserController< protected async _newPage(contextOptions?: PuppeteerNewPageOptions): Promise { if (contextOptions !== undefined) { if (!this.launchContext.useIncognitoPages) { - throw new Error('A new page can be created with provided context only when using incognito pages or experimental containers.'); + throw new Error( + 'A new page can be created with provided context only when using incognito pages or experimental containers.', + ); } let close = async () => {}; @@ -65,7 +67,7 @@ export class PuppeteerController extends BrowserController< const { CdpBrowser } = await import('puppeteer'); const oldPuppeteerVersion = !CdpBrowser || 'createIncognitoBrowserContext' in CdpBrowser.prototype; const method = oldPuppeteerVersion ? 'createIncognitoBrowserContext' : 'createBrowserContext'; - const context = await (this.browser as any)[method](contextOptions) as PuppeteerTypes.BrowserContext; + const context = (await (this.browser as any)[method](contextOptions)) as PuppeteerTypes.BrowserContext; tryCancel(); const page = await context.newPage(); tryCancel(); diff --git a/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts b/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts index 07b63400e39a..bf7e248aff21 100644 --- a/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts +++ b/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts @@ -20,18 +20,17 @@ export class PuppeteerPlugin extends BrowserPlugin< PuppeteerNewPageOptions > { protected async _launch( - launchContext: LaunchContext, + launchContext: LaunchContext< + typeof Puppeteer, + PuppeteerTypes.PuppeteerLaunchOptions, + PuppeteerTypes.Browser, + PuppeteerNewPageOptions + >, ): Promise { // @ts-expect-error not exposed on type level const { CdpBrowser } = await import('puppeteer'); const oldPuppeteerVersion = !CdpBrowser || 'createIncognitoBrowserContext' in CdpBrowser.prototype; - const { - launchOptions, - userDataDir, - useIncognitoPages, - experimentalContainers, - proxyUrl, - } = launchContext; + const { launchOptions, userDataDir, useIncognitoPages, experimentalContainers, proxyUrl } = launchContext; if (experimentalContainers) { throw new Error('Experimental containers are only available with Playwright'); @@ -101,26 +100,36 @@ export class PuppeteerPlugin extends BrowserPlugin< } }); - const boundMethods = (['newPage', 'close', 'userAgent', 'createIncognitoBrowserContext', 'createBrowserContext', 'version', 'on', 'process'] as const) - .reduce((map, method) => { - map[method] = browser[method as 'close']?.bind(browser); - return map; - }, {} as Dictionary); + const boundMethods = ( + [ + 'newPage', + 'close', + 'userAgent', + 'createIncognitoBrowserContext', + 'createBrowserContext', + 'version', + 'on', + 'process', + ] as const + ).reduce((map, method) => 
{ + map[method] = browser[method as 'close']?.bind(browser); + return map; + }, {} as Dictionary); const method = oldPuppeteerVersion ? 'createIncognitoBrowserContext' : 'createBrowserContext'; browser = new Proxy(browser, { get: (target, property: keyof typeof browser, receiver) => { if (property === 'newPage') { - return (async (...args: Parameters) => { + return async (...args: Parameters) => { let page: PuppeteerTypes.Page; if (useIncognitoPages) { const [anonymizedProxyUrl, close] = await anonymizeProxySugar(proxyUrl); try { - const context = await (browser as any)[method]({ + const context = (await (browser as any)[method]({ proxyServer: anonymizedProxyUrl ?? proxyUrl, - }) as PuppeteerTypes.BrowserContext; + })) as PuppeteerTypes.BrowserContext; page = await context.newPage(...args); @@ -156,7 +165,7 @@ export class PuppeteerPlugin extends BrowserPlugin< */ return page; - }); + }; } if (property in boundMethods) { @@ -170,12 +179,22 @@ export class PuppeteerPlugin extends BrowserPlugin< return browser; } - protected _createController(): BrowserController { + protected _createController(): BrowserController< + typeof Puppeteer, + PuppeteerTypes.PuppeteerLaunchOptions, + PuppeteerTypes.Browser, + PuppeteerNewPageOptions + > { return new PuppeteerController(this); } protected async _addProxyToLaunchOptions( - _launchContext: LaunchContext, + _launchContext: LaunchContext< + typeof Puppeteer, + PuppeteerTypes.PuppeteerLaunchOptions, + PuppeteerTypes.Browser, + PuppeteerNewPageOptions + >, ): Promise { /* // DO NOT USE YET! DOING SO DISABLES CACHE WHICH IS 50% PERFORMANCE HIT! @@ -205,7 +224,12 @@ export class PuppeteerPlugin extends BrowserPlugin< } protected _isChromiumBasedBrowser( - _launchContext: LaunchContext, + _launchContext: LaunchContext< + typeof Puppeteer, + PuppeteerTypes.PuppeteerLaunchOptions, + PuppeteerTypes.Browser, + PuppeteerNewPageOptions + >, ): boolean { return true; } diff --git a/packages/browser-pool/src/utils.ts b/packages/browser-pool/src/utils.ts index 1130302bbaf2..45d699590dfc 100644 --- a/packages/browser-pool/src/utils.ts +++ b/packages/browser-pool/src/utils.ts @@ -17,30 +17,28 @@ export type InferBrowserPluginArray< Input extends readonly unknown[], // The results of this type Result extends BrowserPlugin[] = [], -> = - // If the input is a tuple or a readonly array (`[] as const`), get the first and the rest of the values - Input extends readonly [infer FirstValue, ...infer Rest] | [infer FirstValue, ...infer Rest] - // If the first value is a PlaywrightPlugin - ? FirstValue extends PlaywrightPlugin - // Add it to the result, and continue parsing - ? InferBrowserPluginArray - // Else if the first value is a PuppeteerPlugin - : FirstValue extends PuppeteerPlugin - // Add it to the result, and continue parsing - ? InferBrowserPluginArray - // Return never as it isn't a valid type - : never - // If there's no more inputs to parse - : Input extends [] - // Return the results - ? Result - // If the input is a general array of elements (not a tuple), infer it's values type - : Input extends readonly (infer U)[] - // If the values are a union of the plugins - ? [U] extends [PuppeteerPlugin | PlaywrightPlugin] - // Return an array of the union - ? U[] - // Return never as it isn't a valid type - : never - // Return the result - : Result; +> = Input extends readonly [infer FirstValue, ...infer Rest] | [infer FirstValue, ...infer Rest] // If the input is a tuple or a readonly array (`[] as const`), get the first and the rest of the values + ? 
// If the first value is a PlaywrightPlugin + FirstValue extends PlaywrightPlugin + ? // Add it to the result, and continue parsing + InferBrowserPluginArray + : // Else if the first value is a PuppeteerPlugin + FirstValue extends PuppeteerPlugin + ? // Add it to the result, and continue parsing + InferBrowserPluginArray + : // Return never as it isn't a valid type + never + : // If there are no more inputs to parse + Input extends [] + ? // Return the results + Result + : // If the input is a general array of elements (not a tuple), infer the type of its values + Input extends readonly (infer U)[] + ? // If the values are a union of the plugins + [U] extends [PuppeteerPlugin | PlaywrightPlugin] + ? // Return an array of the union + U[] + : // Return never as it isn't a valid type + never + : // Return the result + Result; diff --git a/packages/browser-pool/tab-as-a-container/background.js b/packages/browser-pool/tab-as-a-container/background.js index 42e25f06c4c8..d95f20b28591 100644 --- a/packages/browser-pool/tab-as-a-container/background.js +++ b/packages/browser-pool/tab-as-a-container/background.js @@ -95,14 +95,18 @@ chrome.webRequest.onBeforeSendHeaders.addListener( if (header.name.toLowerCase() === 'cookie') { const id = keyFromTabId(getOpenerId(details.tabId)); - const fixedCookies = header.value.split('; ').filter((x) => x.startsWith(id)).map((x) => x.slice(id.length)).join('; '); + const fixedCookies = header.value + .split('; ') + .filter((x) => x.startsWith(id)) + .map((x) => x.slice(id.length)) + .join('; '); header.value = fixedCookies; } // Sometimes Chrome makes a request on a ghost tab. // We don't want these in order to prevent cluttering cookies. // Yes, `webNavigation.onCommitted` is emitted and `webNavigation.onCreatedNavigationTarget` is not.
- if (header.name.toLowerCase() === 'purpose' && header.value === 'prefetch' && !(counter.has(details.tabId))) { + if (header.name.toLowerCase() === 'purpose' && header.value === 'prefetch' && !counter.has(details.tabId)) { // eslint-disable-next-line no-console console.log(details); return { @@ -111,7 +115,7 @@ chrome.webRequest.onBeforeSendHeaders.addListener( } // This one is for Firefox - if (header.name.toLowerCase() === 'x-moz' && header.value === 'prefetch' && !(counter.has(details.tabId))) { + if (header.name.toLowerCase() === 'x-moz' && header.value === 'prefetch' && !counter.has(details.tabId)) { // eslint-disable-next-line no-console console.log(details); return { @@ -134,7 +138,9 @@ chrome.webRequest.onBeforeSendHeaders.addListener( } return { - requestHeaders: details.requestHeaders.filter((header) => header.name.toLowerCase() !== 'cookie' || header.value !== ''), + requestHeaders: details.requestHeaders.filter( + (header) => header.name.toLowerCase() !== 'cookie' || header.value !== '', + ), }; }, { urls: [''] }, @@ -152,13 +158,15 @@ chrome.webRequest.onHeadersReceived.addListener( const openerId = getOpenerId(details.tabId); - header.value = parts.map((part) => { - const equalsIndex = part.indexOf('='); - if (equalsIndex === -1) { - return `${keyFromTabId(openerId)}=${part.trimStart()}`; - } - return keyFromTabId(openerId) + part.trimStart(); - }).join('\n'); + header.value = parts + .map((part) => { + const equalsIndex = part.indexOf('='); + if (equalsIndex === -1) { + return `${keyFromTabId(openerId)}=${part.trimStart()}`; + } + return keyFromTabId(openerId) + part.trimStart(); + }) + .join('\n'); } } @@ -187,13 +195,17 @@ chrome.tabs.onRemoved.addListener(async (tabId) => { const id = keyFromTabId(opener); chrome.cookies.getAll({}, async (cookies) => { - await Promise.allSettled(cookies.filter((cookie) => cookie.name.startsWith(id)).map((cookie) => { - return chrome.cookies.remove({ - name: cookie.name, - url: getCookieURL(cookie), - storeId: cookie.storeId, - }); - })); + await Promise.allSettled( + cookies + .filter((cookie) => cookie.name.startsWith(id)) + .map((cookie) => { + return chrome.cookies.remove({ + name: cookie.name, + url: getCookieURL(cookie), + storeId: cookie.storeId, + }); + }), + ); }); }); @@ -246,7 +258,7 @@ const getNextLocalhostIp = (openerId) => { } // [127.0.0.1 - 127.255.255.254] = 1 * 255 * 255 * 254 = 16 516 350 - while (localhostIpCache.length >= (1 * 255 * 255 * 254)) { + while (localhostIpCache.length >= 1 * 255 * 255 * 254) { localhostIpCache.delete(localhostIpCache.keys().next().value); } @@ -341,7 +353,9 @@ const onCompleted = async (details) => { // Different protocols are required, otherwise `onCompleted` won't be emitted. 
const result = await routes[route](details, body); if (result !== undefined) { - await chrome.tabs.update(details.tabId, { url: `about:blank#${encodeURIComponent(JSON.stringify(result))}` }); + await chrome.tabs.update(details.tabId, { + url: `about:blank#${encodeURIComponent(JSON.stringify(result))}`, + }); } } } catch { @@ -400,7 +414,9 @@ chrome.webNavigation.onCompleted.addListener(onCompleted); window.totallyRandomString = true; - const code = "'use strict'; const tabId = '${getOpenerId(details.tabId)}'; (() => {\\n" + ${JSON.stringify(contentText)} + "\\n})();\\n"; + const code = "'use strict'; const tabId = '${getOpenerId( + details.tabId, + )}'; (() => {\\n" + ${JSON.stringify(contentText)} + "\\n})();\\n"; ${executeCodeInPageContext} })(); `, diff --git a/packages/browser-pool/tab-as-a-container/content.js b/packages/browser-pool/tab-as-a-container/content.js index 4635bf27daa0..0ed1ef809d59 100644 --- a/packages/browser-pool/tab-as-a-container/content.js +++ b/packages/browser-pool/tab-as-a-container/content.js @@ -177,7 +177,9 @@ const FakeStorage = class Storage { } if (arguments.length === 0) { - throw fixStack(new TypeError(`Failed to execute 'key' on 'Storage': 1 argument required, but only 0 present.`)); + throw fixStack( + new TypeError(`Failed to execute 'key' on 'Storage': 1 argument required, but only 0 present.`), + ); } index = NumberIsFinite(index) ? index : 0; @@ -208,7 +210,9 @@ const FakeStorage = class Storage { } if (arguments.length === 0) { - throw fixStack(new TypeError(`Failed to execute 'getItem' on 'Storage': 1 argument required, but only 0 present.`)); + throw fixStack( + new TypeError(`Failed to execute 'getItem' on 'Storage': 1 argument required, but only 0 present.`), + ); } return StoragePrototype.getItem.call(priv.storage, priv.prefix + key); @@ -221,7 +225,9 @@ const FakeStorage = class Storage { } if (arguments.length === 0) { - throw fixStack(new TypeError(`Failed to execute 'removeItem' on 'Storage': 1 argument required, but only 0 present.`)); + throw fixStack( + new TypeError(`Failed to execute 'removeItem' on 'Storage': 1 argument required, but only 0 present.`), + ); } StoragePrototype.removeItem.call(priv.storage, priv.prefix + key); @@ -234,7 +240,11 @@ const FakeStorage = class Storage { } if (arguments.length === 0 || arguments.length === 1) { - throw fixStack(new TypeError(`Failed to execute 'setItem' on 'Storage': 2 arguments required, but only ${arguments.length} present.`)); + throw fixStack( + new TypeError( + `Failed to execute 'setItem' on 'Storage': 2 arguments required, but only ${arguments.length} present.`, + ), + ); } StoragePrototype.setItem.call(priv.storage, priv.prefix + key, value); @@ -257,7 +267,9 @@ const createStorage = ({ storage, prefix }) => { // getPrototypeOf: (target) => {}, defineProperty: (target, key, descriptor) => { if ('set' in descriptor || 'get' in descriptor) { - throw fixStack(new TypeError(`Failed to set a named property on 'Storage': Accessor properties are not allowed.`)); + throw fixStack( + new TypeError(`Failed to set a named property on 'Storage': Accessor properties are not allowed.`), + ); } FakeStoragePrototype.setItem.call(target, key, descriptor.value); @@ -363,17 +375,18 @@ const createStorage = ({ storage, prefix }) => { const toHide = new WeakMap(); for (const Type of [Function, Object, Array]) { - const create = (fallback) => function () { - if (this instanceof FakeStorage) { - return '[object Storage]'; - } + const create = (fallback) => + function () { + if (this instanceof 
FakeStorage) { + return '[object Storage]'; + } - if (WeakMapPrototype.has.call(toHide, this)) { - return `function ${WeakMapPrototype.get.call(toHide, this)}() { [native code] }`; - } + if (WeakMapPrototype.has.call(toHide, this)) { + return `function ${WeakMapPrototype.get.call(toHide, this)}() { [native code] }`; + } - return fallback.call(this); - }; + return fallback.call(this); + }; const toString = create(Type.prototype.toString); const toLocaleString = create(Type.prototype.toLocaleString); @@ -400,8 +413,12 @@ try { const fakeLocalStorage = createStorage({ storage: sessionStorage, prefix: 'l.' }); const fakeSessionStorage = createStorage({ storage: sessionStorage, prefix: 's.' }); - const getLocalStorage = function localStorage() { return fakeLocalStorage; }; - const getSessionStorage = function sessionStorage() { return fakeSessionStorage; }; + const getLocalStorage = function localStorage() { + return fakeLocalStorage; + }; + const getSessionStorage = function sessionStorage() { + return fakeSessionStorage; + }; WeakMapPrototype.set.call(toHide, FakeStorage, 'Storage'); WeakMapPrototype.set.call(toHide, FakeStoragePrototype.key, 'key'); @@ -450,7 +467,9 @@ try { const getCookie = function cookie() { try { const cookies = StringSplitSafe(realGetCookie.call(this), '; '); - const filtered = ArrayPrototype.filter.call(cookies, (cookie) => StringPrototype.startsWith.call(cookie, tabPrefix)); + const filtered = ArrayPrototype.filter.call(cookies, (cookie) => + StringPrototype.startsWith.call(cookie, tabPrefix), + ); const mapped = ArrayPrototype.map.call(filtered, (cookie) => { const result = StringPrototype.slice.call(cookie, tabPrefix.length); @@ -472,7 +491,7 @@ try { const delimiterIndex = StringPrototype.indexOf.call(cookieString, ';'); const equalsIndex = StringPrototype.indexOf.call(cookieString, '='); - if ((equalsIndex === -1) || ((delimiterIndex !== -1) && (equalsIndex > delimiterIndex))) { + if (equalsIndex === -1 || (delimiterIndex !== -1 && equalsIndex > delimiterIndex)) { cookieString = `=${cookieString}`; } diff --git a/packages/browser-pool/tab-as-a-container/manifest.json b/packages/browser-pool/tab-as-a-container/manifest.json index b7a38b8ec7b2..cc77a982f9a9 100644 --- a/packages/browser-pool/tab-as-a-container/manifest.json +++ b/packages/browser-pool/tab-as-a-container/manifest.json @@ -3,9 +3,7 @@ "name": "Tab as a Container", "version": "1.0.0", "background": { - "scripts": [ - "background.js" - ], + "scripts": ["background.js"], "persistent": true }, "permissions": [ @@ -18,8 +16,6 @@ "proxy", "" ], - "web_accessible_resources": [ - "content.js" - ], + "web_accessible_resources": ["content.js"], "incognito": "not_allowed" } diff --git a/packages/browser-pool/test/changing-page-options.test.ts b/packages/browser-pool/test/changing-page-options.test.ts index 6aa4ccc18c7d..76aed3042a09 100644 --- a/packages/browser-pool/test/changing-page-options.test.ts +++ b/packages/browser-pool/test/changing-page-options.test.ts @@ -12,17 +12,20 @@ import { createProxyServer } from '../../../test/browser-pool/browser-plugins/cr describe.each([ ['Puppeteer', new PuppeteerPlugin(puppeteer, { useIncognitoPages: true })], - ['Playwright', new PlaywrightPlugin(playwright.chromium, { - useIncognitoPages: true, - launchOptions: { - args: [ - // Exclude loopback interface from proxy bypass list, - // so the request to localhost goes through proxy. - // This way there's no need for a 3rd party server. 
- '--proxy-bypass-list=<-loopback>', - ], - }, - })], // Chromium is faster than firefox and webkit + [ + 'Playwright', + new PlaywrightPlugin(playwright.chromium, { + useIncognitoPages: true, + launchOptions: { + args: [ + // Exclude loopback interface from proxy bypass list, + // so the request to localhost goes through proxy. + // This way there's no need for a 3rd party server. + '--proxy-bypass-list=<-loopback>', + ], + }, + }), + ], // Chromium is faster than firefox and webkit ])('BrowserPool - %s - prePageCreateHooks > should allow changing pageOptions', (_, plugin) => { let target: http.Server; let protectedProxy: ProxyChainServer; @@ -44,7 +47,11 @@ describe.each([ }); test('should allow changing pageOptions', async () => { - const hook: PrePageCreateHook = (_pageId, _controller, pageOptions) => { + const hook: PrePageCreateHook = ( + _pageId, + _controller, + pageOptions, + ) => { if (!pageOptions) { expect(false).toBe(true); return; diff --git a/packages/browser-pool/test/multiple-plugins.test.ts b/packages/browser-pool/test/multiple-plugins.test.ts index 6390c0a390e3..cfdcbc0d7963 100644 --- a/packages/browser-pool/test/multiple-plugins.test.ts +++ b/packages/browser-pool/test/multiple-plugins.test.ts @@ -2,17 +2,17 @@ import { BrowserPool, PlaywrightPlugin } from '@crawlee/browser-pool'; import playwright from 'playwright'; describe('BrowserPool - Using multiple plugins', () => { - let browserPool: BrowserPool<{ browserPlugins: [PlaywrightPlugin, PlaywrightPlugin]; closeInactiveBrowserAfterSecs: 2 }>; + let browserPool: BrowserPool<{ + browserPlugins: [PlaywrightPlugin, PlaywrightPlugin]; + closeInactiveBrowserAfterSecs: 2; + }>; const chromePlugin = new PlaywrightPlugin(playwright.chromium); const firefoxPlugin = new PlaywrightPlugin(playwright.firefox); beforeEach(async () => { vitest.clearAllMocks(); browserPool = new BrowserPool({ - browserPlugins: [ - chromePlugin, - firefoxPlugin, - ], + browserPlugins: [chromePlugin, firefoxPlugin], closeInactiveBrowserAfterSecs: 2, }); }); @@ -31,10 +31,7 @@ describe('BrowserPool - Using multiple plugins', () => { }); test('should loop through plugins round-robin', async () => { - const correctPluginOrder = [ - chromePlugin, - firefoxPlugin, - ]; + const correctPluginOrder = [chromePlugin, firefoxPlugin]; const pagePromises = correctPluginOrder.map(async () => browserPool.newPage()); diff --git a/packages/browser-pool/test/no-hybrid-plugins.test.ts b/packages/browser-pool/test/no-hybrid-plugins.test.ts index 503d47270c64..fcccc52dbc10 100644 --- a/packages/browser-pool/test/no-hybrid-plugins.test.ts +++ b/packages/browser-pool/test/no-hybrid-plugins.test.ts @@ -4,15 +4,23 @@ import puppeteer from 'puppeteer'; describe('Hybrid BrowserPool plugins should not be allowed', () => { test('mixing Puppeteer with Playwright should throw an error', () => { - expect(() => new BrowserPool({ - browserPlugins: [new PuppeteerPlugin(puppeteer), new PlaywrightPlugin(playwright.chromium)], - }), + expect( + () => + new BrowserPool({ + browserPlugins: [new PuppeteerPlugin(puppeteer), new PlaywrightPlugin(playwright.chromium)], + }), ).toThrowError(); }); test('providing multiple different Playwright plugins should not throw an error', () => { - expect(() => new BrowserPool({ - browserPlugins: [new PlaywrightPlugin(playwright.chromium), new PlaywrightPlugin(playwright.firefox)], - })).not.toThrowError(); + expect( + () => + new BrowserPool({ + browserPlugins: [ + new PlaywrightPlugin(playwright.chromium), + new PlaywrightPlugin(playwright.firefox), + ], 
+ }), + ).not.toThrowError(); }); }); diff --git a/packages/browser-pool/tsconfig.build.json b/packages/browser-pool/tsconfig.build.json index 856db0f2100a..2e04e9e9f921 100644 --- a/packages/browser-pool/tsconfig.build.json +++ b/packages/browser-pool/tsconfig.build.json @@ -1,7 +1,7 @@ { - "extends": "../../tsconfig.build.json", - "compilerOptions": { - "outDir": "./dist" - }, - "include": ["src/**/*"] + "extends": "../../tsconfig.build.json", + "compilerOptions": { + "outDir": "./dist" + }, + "include": ["src/**/*"] } diff --git a/packages/cheerio-crawler/src/internals/cheerio-crawler.ts b/packages/cheerio-crawler/src/internals/cheerio-crawler.ts index 205aa904d350..157fb9c67c3e 100644 --- a/packages/cheerio-crawler/src/internals/cheerio-crawler.ts +++ b/packages/cheerio-crawler/src/internals/cheerio-crawler.ts @@ -12,12 +12,7 @@ import type { Configuration, RequestProvider, } from '@crawlee/http'; -import { - HttpCrawler, - enqueueLinks, - Router, - resolveBaseUrlForEnqueueLinksFiltering, -} from '@crawlee/http'; +import { HttpCrawler, enqueueLinks, Router, resolveBaseUrlForEnqueueLinksFiltering } from '@crawlee/http'; import type { Dictionary } from '@crawlee/types'; import { extractUrlsFromCheerio } from '@crawlee/utils'; import type { CheerioOptions } from 'cheerio'; @@ -28,22 +23,22 @@ import { WritableStream } from 'htmlparser2/lib/WritableStream'; export type CheerioErrorHandler< UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler JSONData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler - > = ErrorHandler>; +> = ErrorHandler>; export interface CheerioCrawlerOptions< UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler JSONData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler - > extends HttpCrawlerOptions> {} +> extends HttpCrawlerOptions> {} export type CheerioHook< UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler JSONData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler - > = InternalHttpHook>; +> = InternalHttpHook>; export interface CheerioCrawlingContext< UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler JSONData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler - > extends InternalHttpCrawlingContext { +> extends InternalHttpCrawlingContext { /** * The [Cheerio](https://cheerio.js.org/) object with parsed HTML. * Cheerio is available only for HTML and XML content types. 
@@ -69,7 +64,7 @@ export interface CheerioCrawlingContext< export type CheerioRequestHandler< UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler JSONData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler - > = RequestHandler>; +> = RequestHandler>; /** * Provides a framework for the parallel crawling of web pages using plain HTTP requests and @@ -157,16 +152,23 @@ export class CheerioCrawler extends HttpCrawler { super(options, config); } - protected override async _parseHTML(response: IncomingMessage, isXml: boolean, crawlingContext: CheerioCrawlingContext) { + protected override async _parseHTML( + response: IncomingMessage, + isXml: boolean, + crawlingContext: CheerioCrawlingContext, + ) { const dom = await this._parseHtmlToDom(response, isXml); - const $ = cheerio.load(dom as string, { - xmlMode: isXml, - // Recent versions of cheerio use parse5 as the HTML parser/serializer. It's more strict than htmlparser2 - // and not good for scraping. It also does not have a great streaming interface. - // Here we tell cheerio to use htmlparser2 for serialization, otherwise the conflict produces weird errors. - _useHtmlParser2: true, - } as CheerioOptions); + const $ = cheerio.load( + dom as string, + { + xmlMode: isXml, + // Recent versions of cheerio use parse5 as the HTML parser/serializer. It's more strict than htmlparser2 + // and not good for scraping. It also does not have a great streaming interface. + // Here we tell cheerio to use htmlparser2 for serialization, otherwise the conflict produces weird errors. + _useHtmlParser2: true, + } as CheerioOptions, + ); return { dom, @@ -188,15 +190,16 @@ export class CheerioCrawler extends HttpCrawler { protected async _parseHtmlToDom(response: IncomingMessage, isXml: boolean) { return new Promise((resolve, reject) => { - const domHandler = new DomHandler((err, dom) => { - if (err) reject(err); - else resolve(dom); - }, { xmlMode: isXml }); + const domHandler = new DomHandler( + (err, dom) => { + if (err) reject(err); + else resolve(dom); + }, + { xmlMode: isXml }, + ); const parser = new WritableStream(domHandler, { decodeEntities: true, xmlMode: isXml }); parser.on('error', reject); - response - .on('error', reject) - .pipe(parser); + response.on('error', reject).pipe(parser); }); } @@ -215,7 +218,13 @@ interface EnqueueLinksInternalOptions { } /** @internal */ -export async function cheerioCrawlerEnqueueLinks({ options, $, requestQueue, originalRequestUrl, finalRequestUrl }: EnqueueLinksInternalOptions) { +export async function cheerioCrawlerEnqueueLinks({ + options, + $, + requestQueue, + originalRequestUrl, + finalRequestUrl, +}: EnqueueLinksInternalOptions) { if (!$) { throw new Error('Cannot enqueue links because the DOM is not available.'); } @@ -227,7 +236,11 @@ export async function cheerioCrawlerEnqueueLinks({ options, $, requestQueue, ori userProvidedBaseUrl: options?.baseUrl, }); - const urls = extractUrlsFromCheerio($, options?.selector ?? 'a', options?.baseUrl ?? finalRequestUrl ?? originalRequestUrl); + const urls = extractUrlsFromCheerio( + $, + options?.selector ?? 'a', + options?.baseUrl ?? finalRequestUrl ?? 
originalRequestUrl, + ); return enqueueLinks({ requestQueue, diff --git a/packages/cheerio-crawler/test/migration.test.ts b/packages/cheerio-crawler/test/migration.test.ts index 215b1e619603..ce0698a82f62 100644 --- a/packages/cheerio-crawler/test/migration.test.ts +++ b/packages/cheerio-crawler/test/migration.test.ts @@ -36,11 +36,13 @@ describe('Moving from handleRequest* to requestHandler*', () => { handlePageFunction: oldHandler, }); - expect(warningSpy).toHaveBeenCalledWith<[string]>([ - `Both "requestHandler" and "handlePageFunction" were provided in the crawler options.`, - `"handlePageFunction" has been renamed to "requestHandler", and will be removed in a future version.`, - `As such, "requestHandler" will be used instead.`, - ].join('\n')); + expect(warningSpy).toHaveBeenCalledWith<[string]>( + [ + `Both "requestHandler" and "handlePageFunction" were provided in the crawler options.`, + `"handlePageFunction" has been renamed to "requestHandler", and will be removed in a future version.`, + `As such, "requestHandler" will be used instead.`, + ].join('\n'), + ); // eslint-disable-next-line dot-notation -- accessing private property expect(crawler['requestHandler']).toBe(newHandler); @@ -56,10 +58,12 @@ describe('Moving from handleRequest* to requestHandler*', () => { handlePageFunction: oldHandler, }); - expect(warningSpy).toHaveBeenCalledWith<[string]>([ - `"handlePageFunction" has been renamed to "requestHandler", and will be removed in a future version.`, - `The provided value will be used, but you should rename "handlePageFunction" to "requestHandler" in your crawler options.`, - ].join('\n')); + expect(warningSpy).toHaveBeenCalledWith<[string]>( + [ + `"handlePageFunction" has been renamed to "requestHandler", and will be removed in a future version.`, + `The provided value will be used, but you should rename "handlePageFunction" to "requestHandler" in your crawler options.`, + ].join('\n'), + ); // eslint-disable-next-line dot-notation -- accessing private property expect(crawler['requestHandler']).toBe(oldHandler); @@ -96,11 +100,13 @@ describe('Moving from handleRequest* to requestHandler*', () => { handleFailedRequestFunction: oldHandler, }); - expect(warningSpy).toHaveBeenCalledWith<[string]>([ - `Both "failedRequestHandler" and "handleFailedRequestFunction" were provided in the crawler options.`, - `"handleFailedRequestFunction" has been renamed to "failedRequestHandler", and will be removed in a future version.`, - `As such, "failedRequestHandler" will be used instead.`, - ].join('\n')); + expect(warningSpy).toHaveBeenCalledWith<[string]>( + [ + `Both "failedRequestHandler" and "handleFailedRequestFunction" were provided in the crawler options.`, + `"handleFailedRequestFunction" has been renamed to "failedRequestHandler", and will be removed in a future version.`, + `As such, "failedRequestHandler" will be used instead.`, + ].join('\n'), + ); // eslint-disable-next-line dot-notation -- accessing private property expect(crawler['failedRequestHandler']).toBe(newHandler); @@ -117,10 +123,12 @@ describe('Moving from handleRequest* to requestHandler*', () => { handleFailedRequestFunction: oldHandler, }); - expect(warningSpy).toHaveBeenCalledWith<[string]>([ - `"handleFailedRequestFunction" has been renamed to "failedRequestHandler", and will be removed in a future version.`, - `The provided value will be used, but you should rename "handleFailedRequestFunction" to "failedRequestHandler" in your crawler options.`, - ].join('\n')); + 
expect(warningSpy).toHaveBeenCalledWith<[string]>(
+            [
+                `"handleFailedRequestFunction" has been renamed to "failedRequestHandler", and will be removed in a future version.`,
+                `The provided value will be used, but you should rename "handleFailedRequestFunction" to "failedRequestHandler" in your crawler options.`,
+            ].join('\n'),
+        );

         // eslint-disable-next-line dot-notation -- accessing private property
         expect(crawler['failedRequestHandler']).toBe(oldHandler);
diff --git a/packages/cheerio-crawler/tsconfig.build.json b/packages/cheerio-crawler/tsconfig.build.json
index 856db0f2100a..2e04e9e9f921 100644
--- a/packages/cheerio-crawler/tsconfig.build.json
+++ b/packages/cheerio-crawler/tsconfig.build.json
@@ -1,7 +1,7 @@
 {
-    "extends": "../../tsconfig.build.json",
-    "compilerOptions": {
-        "outDir": "./dist"
-    },
-    "include": ["src/**/*"]
+	"extends": "../../tsconfig.build.json",
+	"compilerOptions": {
+		"outDir": "./dist"
+	},
+	"include": ["src/**/*"]
 }
diff --git a/packages/cli/src/commands/CreateProjectCommand.ts b/packages/cli/src/commands/CreateProjectCommand.ts
index cac793bd47b1..76b363b4af11 100644
--- a/packages/cli/src/commands/CreateProjectCommand.ts
+++ b/packages/cli/src/commands/CreateProjectCommand.ts
@@ -35,42 +35,54 @@ async function rewrite(path: string, replacer: (from: string) => string) {
     }
 }

-async function withRetries<F extends () => unknown>(func: F, retries: number, label: string): Promise<Awaited<ReturnType<F>>> {
+async function withRetries<F extends () => unknown>(
+    func: F,
+    retries: number,
+    label: string,
+): Promise<Awaited<ReturnType<F>>> {
     let attempt = 0;
     let lastError: any;

     while (attempt < retries) {
         try {
-            return await func() as Awaited<ReturnType<F>>;
+            return (await func()) as Awaited<ReturnType<F>>;
         } catch (error: any) {
             attempt++;
             lastError = error;

             if (attempt < retries) {
-                console.warn(`${colors.yellow(`[${label}]`)}: Attempt ${attempt + 1} of ${retries} failed, and will be retried`, error.message || error);
+                console.warn(
+                    `${colors.yellow(`[${label}]`)}: Attempt ${attempt + 1} of ${retries} failed, and will be retried`,
+                    error.message || error,
+                );
             }

             // Wait 2500ms + (2500 * retries) before giving up to give it some time between retries
-            await setTimeout(2500 + (2500 * attempt));
+            await setTimeout(2500 + 2500 * attempt);
         }
     }

-    throw new Error(`${colors.red(`[${label}]`)}: All ${retries} attempts failed, and will not be retried\n\n${lastError.stack || lastError}`);
+    throw new Error(
+        `${colors.red(`[${label}]`)}: All ${retries} attempts failed, and will not be retried\n\n${
+            lastError.stack || lastError
+        }`,
+    );
 }

 async function downloadTemplateFilesToDisk(template: Template, destinationDirectory: string) {
     const promises: Promise<void>[] = [];

     for (const file of template.files) {
-        const promise = async () => downloadFile(file.url).then(async (buffer) => {
-            // Make sure the folder for the file exists
-            const fileDirName = dirname(file.path);
-            const fileFolder = resolve(destinationDirectory, fileDirName);
-            await ensureDir(fileFolder);
-
-            // Write the actual file
-            await writeFile(resolve(destinationDirectory, file.path), buffer);
-        });
+        const promise = async () =>
+            downloadFile(file.url).then(async (buffer) => {
+                // Make sure the folder for the file exists
+                const fileDirName = dirname(file.path);
+                const fileFolder = resolve(destinationDirectory, fileDirName);
+                await ensureDir(fileFolder);
+
+                // Write the actual file
+                await writeFile(resolve(destinationDirectory, file.path), buffer);
+            });

         promises.push(withRetries(promise, 3, `Template: ${template.name}, file: ${file.path}`));
     }
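The `withRetries` helper reformatted above is a plain retry loop with a linearly growing delay between attempts. For readers who want the pattern outside this diff, here is a minimal standalone sketch; it assumes Node's promisified `setTimeout` from `node:timers/promises`, and the names `retry`, `maxAttempts` and `baseDelayMs` are illustrative, not part of the Crawlee source:

import { setTimeout } from 'node:timers/promises';

// Minimal sketch of a linear-backoff retry wrapper, modelled on the
// withRetries() helper above. On each failure the delay grows linearly
// with the attempt number (2.5s, 5s, 7.5s, ...).
async function retry<T>(func: () => Promise<T>, maxAttempts: number, baseDelayMs = 2500): Promise<T> {
    let lastError: unknown;

    for (let attempt = 1; attempt <= maxAttempts; attempt++) {
        try {
            return await func();
        } catch (error) {
            lastError = error;
            if (attempt < maxAttempts) {
                await setTimeout(baseDelayMs * attempt);
            }
        }
    }

    // All attempts failed; rethrow the last error for the caller to handle.
    throw lastError;
}

@@ -127,19 +139,21 @@ export class 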
CreateProjectCommand implements CommandModule { - try { - validateProjectName(promptText); - } catch (err: any) { - return err.message; - } - return true; + const projectNamePrompt = await prompt([ + { + name: 'projectName', + message: 'Name of the new project folder:', + type: 'input', + validate: (promptText) => { + try { + validateProjectName(promptText); + } catch (err: any) { + return err.message; + } + return true; + }, }, - }]); + ]); ({ projectName } = projectNamePrompt); } else { validateProjectName(projectName); @@ -152,13 +166,15 @@ export class CreateProjectCommand implements CommandModule implements CommandModule item.name === template)!; await downloadTemplateFilesToDisk(templateData, projectDir); - await rewrite(resolve(projectDir, 'package.json'), (pkg) => pkg.replace(/"name": "[\w-]+"/, `"name": "${projectName}"`)); + await rewrite(resolve(projectDir, 'package.json'), (pkg) => + pkg.replace(/"name": "[\w-]+"/, `"name": "${projectName}"`), + ); // Run npm install in project dir. const npm = /^win/.test(process.platform) ? 'npm.cmd' : 'npm'; execSync(`${npm} install`, { cwd: projectDir, stdio: 'inherit' }); - console.log(colors.green(`Project ${projectName} was created. To run it, run "cd ${projectName}" and "npm start".`)); + console.log( + colors.green(`Project ${projectName} was created. To run it, run "cd ${projectName}" and "npm start".`), + ); } } diff --git a/packages/cli/src/commands/InstallPlaywrightBrowsersCommand.ts b/packages/cli/src/commands/InstallPlaywrightBrowsersCommand.ts index 5014afef1f20..3c6c9f7ed314 100644 --- a/packages/cli/src/commands/InstallPlaywrightBrowsersCommand.ts +++ b/packages/cli/src/commands/InstallPlaywrightBrowsersCommand.ts @@ -20,7 +20,8 @@ export class InstallPlaywrightBrowsersCommand implements CommandModule; @@ -33,7 +34,11 @@ export class InstallPlaywrightBrowsersCommand implements CommandModule [options]') - .example('$0 run --no-purge', 'Runs the project in current working directory and disables automatic purging of default storages') + .example( + '$0 run --no-purge', + 'Runs the project in current working directory and disables automatic purging of default storages', + ) .alias('v', 'version') .alias('h', 'help') .command(new CreateProjectCommand()) @@ -42,7 +42,7 @@ const cli = yargs.scriptName('crawlee') .strict(); void (async () => { - const args = await cli.parse(process.argv.slice(2)) as { _: string[] }; + const args = (await cli.parse(process.argv.slice(2))) as { _: string[] }; if (args._.length === 0) { yargs.showHelp(); diff --git a/packages/cli/tsconfig.build.json b/packages/cli/tsconfig.build.json index 856db0f2100a..2e04e9e9f921 100644 --- a/packages/cli/tsconfig.build.json +++ b/packages/cli/tsconfig.build.json @@ -1,7 +1,7 @@ { - "extends": "../../tsconfig.build.json", - "compilerOptions": { - "outDir": "./dist" - }, - "include": ["src/**/*"] + "extends": "../../tsconfig.build.json", + "compilerOptions": { + "outDir": "./dist" + }, + "include": ["src/**/*"] } diff --git a/packages/core/src/autoscaling/autoscaled_pool.ts b/packages/core/src/autoscaling/autoscaled_pool.ts index 77702e3e7393..602cd772fb62 100644 --- a/packages/core/src/autoscaling/autoscaled_pool.ts +++ b/packages/core/src/autoscaling/autoscaled_pool.ts @@ -214,25 +214,28 @@ export class AutoscaledPool { options: AutoscaledPoolOptions, private readonly config = Configuration.getGlobalConfig(), ) { - ow(options, ow.object.exactShape({ - runTaskFunction: ow.function, - isFinishedFunction: ow.function, - isTaskReadyFunction: ow.function, - 
maxConcurrency: ow.optional.number.integer.greaterThanOrEqual(1), - minConcurrency: ow.optional.number.integer.greaterThanOrEqual(1), - desiredConcurrency: ow.optional.number.integer.greaterThanOrEqual(1), - desiredConcurrencyRatio: ow.optional.number.greaterThan(0).lessThan(1), - scaleUpStepRatio: ow.optional.number.greaterThan(0).lessThan(1), - scaleDownStepRatio: ow.optional.number.greaterThan(0).lessThan(1), - maybeRunIntervalSecs: ow.optional.number.greaterThan(0), - loggingIntervalSecs: ow.any(ow.number.greaterThan(0), ow.nullOrUndefined), - autoscaleIntervalSecs: ow.optional.number.greaterThan(0), - taskTimeoutSecs: ow.optional.number.greaterThanOrEqual(0), - systemStatusOptions: ow.optional.object, - snapshotterOptions: ow.optional.object, - log: ow.optional.object, - maxTasksPerMinute: ow.optional.number.integerOrInfinite.greaterThanOrEqual(1), - })); + ow( + options, + ow.object.exactShape({ + runTaskFunction: ow.function, + isFinishedFunction: ow.function, + isTaskReadyFunction: ow.function, + maxConcurrency: ow.optional.number.integer.greaterThanOrEqual(1), + minConcurrency: ow.optional.number.integer.greaterThanOrEqual(1), + desiredConcurrency: ow.optional.number.integer.greaterThanOrEqual(1), + desiredConcurrencyRatio: ow.optional.number.greaterThan(0).lessThan(1), + scaleUpStepRatio: ow.optional.number.greaterThan(0).lessThan(1), + scaleDownStepRatio: ow.optional.number.greaterThan(0).lessThan(1), + maybeRunIntervalSecs: ow.optional.number.greaterThan(0), + loggingIntervalSecs: ow.any(ow.number.greaterThan(0), ow.nullOrUndefined), + autoscaleIntervalSecs: ow.optional.number.greaterThan(0), + taskTimeoutSecs: ow.optional.number.greaterThanOrEqual(0), + systemStatusOptions: ow.optional.object, + snapshotterOptions: ow.optional.object, + log: ow.optional.object, + maxTasksPerMinute: ow.optional.number.integerOrInfinite.greaterThanOrEqual(1), + }), + ); const { runTaskFunction, @@ -241,7 +244,7 @@ export class AutoscaledPool { maxConcurrency = 200, minConcurrency = 1, desiredConcurrency, - desiredConcurrencyRatio = 0.90, + desiredConcurrencyRatio = 0.9, scaleUpStepRatio = 0.05, scaleDownStepRatio = 0.05, maybeRunIntervalSecs = 0.5, @@ -421,8 +424,10 @@ export class AutoscaledPool { let timeout: NodeJS.Timeout; if (timeoutSecs) { timeout = setTimeout(() => { - const err = new Error('The pool\'s running tasks did not finish' - + `in ${timeoutSecs} secs after pool.pause() invocation.`); + const err = new Error( + "The pool's running tasks did not finish" + + `in ${timeoutSecs} secs after pool.pause() invocation.`, + ); reject(err); }, timeoutSecs); } diff --git a/packages/core/src/autoscaling/snapshotter.ts b/packages/core/src/autoscaling/snapshotter.ts index 13a47c613365..c8ebea0b4ca3 100644 --- a/packages/core/src/autoscaling/snapshotter.ts +++ b/packages/core/src/autoscaling/snapshotter.ts @@ -67,10 +67,27 @@ export interface SnapshotterOptions { config?: Configuration; } -interface MemorySnapshot { createdAt: Date; isOverloaded: boolean; usedBytes?: number } -interface CpuSnapshot { createdAt: Date; isOverloaded: boolean; usedRatio: number; ticks?: { idle: number; total: number } } -interface EventLoopSnapshot { createdAt: Date; isOverloaded: boolean; exceededMillis: number } -interface ClientSnapshot { createdAt: Date; isOverloaded: boolean; rateLimitErrorCount: number } +interface MemorySnapshot { + createdAt: Date; + isOverloaded: boolean; + usedBytes?: number; +} +interface CpuSnapshot { + createdAt: Date; + isOverloaded: boolean; + usedRatio: number; + ticks?: { idle: 
number; total: number }; +} +interface EventLoopSnapshot { + createdAt: Date; + isOverloaded: boolean; + exceededMillis: number; +} +interface ClientSnapshot { + createdAt: Date; + isOverloaded: boolean; + rateLimitErrorCount: number; +} /** * Creates snapshots of system resources at given intervals and marks the resource @@ -124,17 +141,20 @@ export class Snapshotter { * @param [options] All `Snapshotter` configuration options. */ constructor(options: SnapshotterOptions = {}) { - ow(options, ow.object.exactShape({ - eventLoopSnapshotIntervalSecs: ow.optional.number, - clientSnapshotIntervalSecs: ow.optional.number, - snapshotHistorySecs: ow.optional.number, - maxBlockedMillis: ow.optional.number, - maxUsedMemoryRatio: ow.optional.number, - maxClientErrors: ow.optional.number, - log: ow.optional.object, - client: ow.optional.object, - config: ow.optional.object, - })); + ow( + options, + ow.object.exactShape({ + eventLoopSnapshotIntervalSecs: ow.optional.number, + clientSnapshotIntervalSecs: ow.optional.number, + snapshotHistorySecs: ow.optional.number, + maxBlockedMillis: ow.optional.number, + maxUsedMemoryRatio: ow.optional.number, + maxClientErrors: ow.optional.number, + log: ow.optional.object, + client: ow.optional.object, + config: ow.optional.object, + }), + ); const { eventLoopSnapshotIntervalSecs = 0.5, @@ -176,12 +196,17 @@ export class Snapshotter { } else { const { totalBytes } = await this._getMemoryInfo(); this.maxMemoryBytes = Math.ceil(totalBytes * this.config.get('availableMemoryRatio')!); - this.log.debug(`Setting max memory of this run to ${Math.round(this.maxMemoryBytes / 1024 / 1024)} MB. ` - + 'Use the CRAWLEE_MEMORY_MBYTES or CRAWLEE_AVAILABLE_MEMORY_RATIO environment variable to override it.'); + this.log.debug( + `Setting max memory of this run to ${Math.round(this.maxMemoryBytes / 1024 / 1024)} MB. ` + + 'Use the CRAWLEE_MEMORY_MBYTES or CRAWLEE_AVAILABLE_MEMORY_RATIO environment variable to override it.', + ); } // Start snapshotting. - this.eventLoopInterval = betterSetInterval(this._snapshotEventLoop.bind(this), this.eventLoopSnapshotIntervalMillis); + this.eventLoopInterval = betterSetInterval( + this._snapshotEventLoop.bind(this), + this.eventLoopSnapshotIntervalMillis, + ); this.clientInterval = betterSetInterval(this._snapshotClient.bind(this), this.clientSnapshotIntervalMillis); this.events.on(EventType.SYSTEM_INFO, this._snapshotCpu); this.events.on(EventType.SYSTEM_INFO, this._snapshotMemory); @@ -278,7 +303,11 @@ export class Snapshotter { protected _memoryOverloadWarning(systemInfo: SystemInfo) { const { memCurrentBytes } = systemInfo; const createdAt = systemInfo.createdAt ? new Date(systemInfo.createdAt) : new Date(); - if (this.lastLoggedCriticalMemoryOverloadAt && +createdAt < +this.lastLoggedCriticalMemoryOverloadAt + CRITICAL_OVERLOAD_RATE_LIMIT_MILLIS) return; + if ( + this.lastLoggedCriticalMemoryOverloadAt && + +createdAt < +this.lastLoggedCriticalMemoryOverloadAt + CRITICAL_OVERLOAD_RATE_LIMIT_MILLIS + ) + return; const maxDesiredMemoryBytes = this.maxUsedMemoryRatio * this.maxMemoryBytes!; const reserveMemory = this.maxMemoryBytes! * (1 - this.maxUsedMemoryRatio) * RESERVE_MEMORY_RATIO; @@ -287,9 +316,13 @@ export class Snapshotter { if (isCriticalOverload) { const usedPercentage = Math.round((memCurrentBytes! / this.maxMemoryBytes!) * 100); - const toMb = (bytes: number) => Math.round(bytes / (1024 ** 2)); - this.log.warning('Memory is critically overloaded. 
' - + `Using ${toMb(memCurrentBytes!)} MB of ${toMb(this.maxMemoryBytes!)} MB (${usedPercentage}%). Consider increasing available memory.`); + const toMb = (bytes: number) => Math.round(bytes / 1024 ** 2); + this.log.warning( + 'Memory is critically overloaded. ' + + `Using ${toMb(memCurrentBytes!)} MB of ${toMb( + this.maxMemoryBytes!, + )} MB (${usedPercentage}%). Consider increasing available memory.`, + ); this.lastLoggedCriticalMemoryOverloadAt = createdAt; } } @@ -371,7 +404,10 @@ export class Snapshotter { * Removes snapshots that are older than the snapshotHistorySecs option * from the array (destructively - in place). */ - protected _pruneSnapshots(snapshots: MemorySnapshot[] | CpuSnapshot[] | EventLoopSnapshot[] | ClientSnapshot[], now: Date) { + protected _pruneSnapshots( + snapshots: MemorySnapshot[] | CpuSnapshot[] | EventLoopSnapshot[] | ClientSnapshot[], + now: Date, + ) { let oldCount = 0; for (let i = 0; i < snapshots.length; i++) { const { createdAt } = snapshots[i]; diff --git a/packages/core/src/autoscaling/system_status.ts b/packages/core/src/autoscaling/system_status.ts index bff2a1252cfd..911e5a5909ee 100644 --- a/packages/core/src/autoscaling/system_status.ts +++ b/packages/core/src/autoscaling/system_status.ts @@ -126,15 +126,18 @@ export class SystemStatus { private readonly snapshotter: Snapshotter; constructor(options: SystemStatusOptions = {}) { - ow(options, ow.object.exactShape({ - currentHistorySecs: ow.optional.number, - maxMemoryOverloadedRatio: ow.optional.number, - maxEventLoopOverloadedRatio: ow.optional.number, - maxCpuOverloadedRatio: ow.optional.number, - maxClientOverloadedRatio: ow.optional.number, - snapshotter: ow.optional.object, - config: ow.optional.object, - })); + ow( + options, + ow.object.exactShape({ + currentHistorySecs: ow.optional.number, + maxMemoryOverloadedRatio: ow.optional.number, + maxEventLoopOverloadedRatio: ow.optional.number, + maxCpuOverloadedRatio: ow.optional.number, + maxClientOverloadedRatio: ow.optional.number, + snapshotter: ow.optional.object, + config: ow.optional.object, + }), + ); const { currentHistorySecs = 5, @@ -203,7 +206,11 @@ export class SystemStatus { const cpuInfo = this._isCpuOverloaded(sampleDurationMillis); const clientInfo = this._isClientOverloaded(sampleDurationMillis); return { - isSystemIdle: !memInfo.isOverloaded && !eventLoopInfo.isOverloaded && !cpuInfo.isOverloaded && !clientInfo.isOverloaded, + isSystemIdle: + !memInfo.isOverloaded && + !eventLoopInfo.isOverloaded && + !cpuInfo.isOverloaded && + !clientInfo.isOverloaded, memInfo, eventLoopInfo, cpuInfo, @@ -251,7 +258,10 @@ export class SystemStatus { * Returns an object with sample information and an isOverloaded property * set to true if at least the ratio of snapshots in the sample are overloaded. */ - protected _isSampleOverloaded(sample: T[], ratio: number): ClientInfo { + protected _isSampleOverloaded( + sample: T[], + ratio: number, + ): ClientInfo { if (sample.length === 0) { return { isOverloaded: false, diff --git a/packages/core/src/configuration.ts b/packages/core/src/configuration.ts index 8493cbd148de..79778003c9d7 100644 --- a/packages/core/src/configuration.ts +++ b/packages/core/src/configuration.ts @@ -295,7 +295,9 @@ export class Configuration { const logLevel = this.get('logLevel'); if (logLevel) { - const level = Number.isFinite(+logLevel) ? +logLevel : LogLevel[String(logLevel).toUpperCase() as unknown as LogLevel]; + const level = Number.isFinite(+logLevel) + ? 
+logLevel + : LogLevel[String(logLevel).toUpperCase() as unknown as LogLevel]; log.setLevel(level as LogLevel); } } diff --git a/packages/core/src/cookie_utils.ts b/packages/core/src/cookie_utils.ts index 74c8b7afc0bc..758e8aa3a3f2 100644 --- a/packages/core/src/cookie_utils.ts +++ b/packages/core/src/cookie_utils.ts @@ -9,7 +9,9 @@ import { CookieParseError } from './session_pool/errors'; /** * @internal */ -export function getCookiesFromResponse(response: IncomingMessage | BrowserLikeResponse | { headers: Dictionary }): Cookie[] { +export function getCookiesFromResponse( + response: IncomingMessage | BrowserLikeResponse | { headers: Dictionary }, +): Cookie[] { const headers = typeof response.headers === 'function' ? response.headers() : response.headers; const cookieHeader = headers['set-cookie'] || ''; @@ -29,7 +31,7 @@ export function getCookiesFromResponse(response: IncomingMessage | BrowserLikeRe * @internal */ export function getDefaultCookieExpirationDate(maxAgeSecs: number) { - return new Date(Date.now() + (maxAgeSecs * 1000)); + return new Date(Date.now() + maxAgeSecs * 1000); } /** @@ -59,10 +61,13 @@ export function toughCookieToBrowserPoolCookie(toughCookie: Cookie): CookieObjec */ export function browserPoolCookieToToughCookie(cookieObject: CookieObject, maxAgeSecs: number) { const isExpiresValid = cookieObject.expires && typeof cookieObject.expires === 'number' && cookieObject.expires > 0; - const expires = isExpiresValid ? new Date(cookieObject.expires! * 1000) : getDefaultCookieExpirationDate(maxAgeSecs); - const domain = typeof cookieObject.domain === 'string' && cookieObject.domain.startsWith('.') - ? cookieObject.domain.slice(1) - : cookieObject.domain; + const expires = isExpiresValid + ? new Date(cookieObject.expires! * 1000) + : getDefaultCookieExpirationDate(maxAgeSecs); + const domain = + typeof cookieObject.domain === 'string' && cookieObject.domain.startsWith('.') + ? cookieObject.domain.slice(1) + : cookieObject.domain; return new Cookie({ key: cookieObject.name, @@ -115,7 +120,9 @@ export function mergeCookies(url: string, sourceCookies: string[]): string { }); if (similarKeyCookie) { - log.deprecated(`Found cookies with similar name during cookie merging: '${cookie.key}' and '${similarKeyCookie.key}'`); + log.deprecated( + `Found cookies with similar name during cookie merging: '${cookie.key}' and '${similarKeyCookie.key}'`, + ); } jar.setCookieSync(cookie, url); diff --git a/packages/core/src/crawlers/crawler_commons.ts b/packages/core/src/crawlers/crawler_commons.ts index deec11441c3e..38597c2f525b 100644 --- a/packages/core/src/crawlers/crawler_commons.ts +++ b/packages/core/src/crawlers/crawler_commons.ts @@ -14,7 +14,8 @@ import { KeyValueStore } from '../storages'; // we need `Record` here, otherwise `Omit` is resolved badly // eslint-disable-next-line -export interface RestrictedCrawlingContext extends Record { +export interface RestrictedCrawlingContext + extends Record { /** * The original {@apilink Request} object. */ @@ -74,7 +75,9 @@ export interface RestrictedCrawlingContext Promise>; + getKeyValueStore: ( + idOrName?: string, + ) => Promise>; /** * A preconfigured logger for the request handler. 
@@ -82,7 +85,8 @@ export interface RestrictedCrawlingContext extends RestrictedCrawlingContext { +export interface CrawlingContext + extends RestrictedCrawlingContext { id: string; session?: Session; @@ -120,7 +124,7 @@ export interface CrawlingContext> & Pick + options?: ReadonlyDeep> & Pick, ): Promise; /** @@ -154,12 +158,16 @@ export interface CrawlingContext> = {}; + private _keyValueStoreChanges: Record> = + {}; private pushDataCalls: Parameters[] = []; private addRequestsCalls: Parameters[] = []; private enqueueLinksCalls: Parameters[] = []; - constructor(private config: Configuration, private crawleeStateKey: string) {} + constructor( + private config: Configuration, + private crawleeStateKey: string, + ) {} /** * A record of calls to {@apilink RestrictedCrawlingContext.pushData}, {@apilink RestrictedCrawlingContext.addRequests}, {@apilink RestrictedCrawlingContext.enqueueLinks} made by a request handler. @@ -169,13 +177,19 @@ export class RequestHandlerResult { addRequests: Parameters[]; enqueueLinks: Parameters[]; }> { - return { pushData: this.pushDataCalls, addRequests: this.addRequestsCalls, enqueueLinks: this.enqueueLinksCalls }; + return { + pushData: this.pushDataCalls, + addRequests: this.addRequestsCalls, + enqueueLinks: this.enqueueLinksCalls, + }; } /** * A record of changes made to key-value stores by a request handler. */ - get keyValueStoreChanges(): ReadonlyDeep>> { + get keyValueStoreChanges(): ReadonlyDeep< + Record> + > { return this._keyValueStoreChanges; } @@ -183,14 +197,16 @@ export class RequestHandlerResult { * Items added to datasets by a request handler. */ get datasetItems(): ReadonlyDeep<{ item: Dictionary; datasetIdOrName?: string }[]> { - return this.pushDataCalls.flatMap(([data, datasetIdOrName]) => (Array.isArray(data) ? data : [data]).map((item) => ({ item, datasetIdOrName }))); + return this.pushDataCalls.flatMap(([data, datasetIdOrName]) => + (Array.isArray(data) ? data : [data]).map((item) => ({ item, datasetIdOrName })), + ); } /** * URLs enqueued to the request queue by a request handler, either via {@apilink RestrictedCrawlingContext.addRequests} or {@apilink RestrictedCrawlingContext.enqueueLinks} */ get enqueuedUrls(): ReadonlyDeep<{ url: string; label?: string }[]> { - const result: {url: string; label? : string}[] = []; + const result: { url: string; label?: string }[] = []; for (const [options] of this.enqueueLinksCalls) { result.push(...(options?.urls?.map((url) => ({ url, label: options?.label })) ?? [])); @@ -198,7 +214,11 @@ export class RequestHandlerResult { for (const [requests] of this.addRequestsCalls) { for (const request of requests) { - if (typeof request === 'object' && (!('requestsFromUrl' in request) || request.requestsFromUrl !== undefined) && request.url !== undefined) { + if ( + typeof request === 'object' && + (!('requestsFromUrl' in request) || request.requestsFromUrl !== undefined) && + request.url !== undefined + ) { result.push({ url: request.url, label: request.label }); } else if (typeof request === 'string') { result.push({ url: request }); @@ -212,12 +232,16 @@ export class RequestHandlerResult { /** * URL lists enqueued to the request queue by a request handler via {@apilink RestrictedCrawlingContext.addRequests} using the `requestsFromUrl` option. */ - get enqueuedUrlLists(): ReadonlyDeep<{ listUrl: string; label? : string }[]> { - const result: {listUrl: string; label? 
: string}[] = []; + get enqueuedUrlLists(): ReadonlyDeep<{ listUrl: string; label?: string }[]> { + const result: { listUrl: string; label?: string }[] = []; for (const [requests] of this.addRequestsCalls) { for (const request of requests) { - if (typeof request === 'object' && 'requestsFromUrl' in request && request.requestsFromUrl !== undefined) { + if ( + typeof request === 'object' && + 'requestsFromUrl' in request && + request.requestsFromUrl !== undefined + ) { result.push({ listUrl: request.requestsFromUrl, label: request.label }); } } @@ -249,11 +273,11 @@ export class RequestHandlerResult { return { id: this.idOrDefault(idOrName), name: idOrName, - getValue: async (key) => this.getKeyValueStoreChangedValue(idOrName, key) ?? await store.getValue(key), + getValue: async (key) => this.getKeyValueStoreChangedValue(idOrName, key) ?? (await store.getValue(key)), getAutoSavedValue: async (key: string, defaultValue: T = {} as T) => { let value = this.getKeyValueStoreChangedValue(idOrName, key); if (value === null) { - value = await store.getValue(key) ?? defaultValue; + value = (await store.getValue(key)) ?? defaultValue; this.setKeyValueStoreChangedValue(idOrName, key, value); } @@ -273,7 +297,12 @@ export class RequestHandlerResult { return this.keyValueStoreChanges[id][key]?.changedValue ?? null; }; - private setKeyValueStoreChangedValue = (idOrName: string | undefined, key: string, changedValue: unknown, options?: RecordOptions) => { + private setKeyValueStoreChangedValue = ( + idOrName: string | undefined, + key: string, + changedValue: unknown, + options?: RecordOptions, + ) => { const id = this.idOrDefault(idOrName); this._keyValueStoreChanges[id] ??= {}; this._keyValueStoreChanges[id][key] = { changedValue, options }; diff --git a/packages/core/src/crawlers/statistics.ts b/packages/core/src/crawlers/statistics.ts index 345b2859dc12..2542fdf82c3a 100644 --- a/packages/core/src/crawlers/statistics.ts +++ b/packages/core/src/crawlers/statistics.ts @@ -109,13 +109,16 @@ export class Statistics { * @internal */ constructor(options: StatisticsOptions = {}) { - ow(options, ow.object.exactShape({ - logIntervalSecs: ow.optional.number, - logMessage: ow.optional.string, - keyValueStore: ow.optional.object, - config: ow.optional.object, - persistenceOptions: ow.optional.object, - })); + ow( + options, + ow.object.exactShape({ + logIntervalSecs: ow.optional.number, + logMessage: ow.optional.string, + keyValueStore: ow.optional.object, + config: ow.optional.object, + persistenceOptions: ow.optional.object, + }), + ); const { logIntervalSecs = 60, @@ -222,8 +225,10 @@ export class Statistics { this.state.requestsFinished++; this.state.requestTotalFinishedDurationMillis += jobDurationMillis; this._saveRetryCountForJob(job); - if (jobDurationMillis < this.state.requestMinDurationMillis) this.state.requestMinDurationMillis = jobDurationMillis; - if (jobDurationMillis > this.state.requestMaxDurationMillis) this.state.requestMaxDurationMillis = jobDurationMillis; + if (jobDurationMillis < this.state.requestMinDurationMillis) + this.state.requestMinDurationMillis = jobDurationMillis; + if (jobDurationMillis > this.state.requestMaxDurationMillis) + this.state.requestMaxDurationMillis = jobDurationMillis; this.requestsInProgress.delete(id); } @@ -255,7 +260,8 @@ export class Statistics { return { requestAvgFailedDurationMillis: Math.round(requestTotalFailedDurationMillis / requestsFailed) || Infinity, - requestAvgFinishedDurationMillis: Math.round(requestTotalFinishedDurationMillis / requestsFinished) 
|| Infinity,
+            requestAvgFinishedDurationMillis:
+                Math.round(requestTotalFinishedDurationMillis / requestsFinished) || Infinity,
             requestsFinishedPerMinute: Math.round(requestsFinished / totalMinutes) || 0,
             requestsFailedPerMinute: Math.floor(requestsFailed / totalMinutes) || 0,
             requestTotalDurationMillis: requestTotalFinishedDurationMillis + requestTotalFailedDurationMillis,
@@ -392,7 +398,9 @@ export class Statistics {
         const result = {
             ...this.state,
             crawlerLastStartTimestamp: this.instanceStart,
-            crawlerFinishedAt: this.state.crawlerFinishedAt ? new Date(this.state.crawlerFinishedAt).toISOString() : null,
+            crawlerFinishedAt: this.state.crawlerFinishedAt
+                ? new Date(this.state.crawlerFinishedAt).toISOString()
+                : null,
             crawlerStartedAt: this.state.crawlerStartedAt ? new Date(this.state.crawlerStartedAt).toISOString() : null,
             requestRetryHistogram: this.requestRetryHistogram,
             statsId: this.id,
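The next file diff reformats the signature of `enqueueLinks()`, the helper behind the `enqueueLinks` callback that Crawlee passes to request handlers. For orientation, a typical call based on Crawlee's documented API looks like this; the crawler class choice, the glob and the label are illustrative assumptions:

import { CheerioCrawler } from 'crawlee';

// Typical enqueueLinks() usage inside a request handler. The crawler wires
// the current page and the request queue into the call, so only filtering
// options need to be supplied here.
const crawler = new CheerioCrawler({
    async requestHandler({ request, enqueueLinks }) {
        console.log(`Processing ${request.url}`);
        await enqueueLinks({
            globs: ['https://crawlee.dev/docs/**'],
            label: 'DETAIL',
        });
    },
});

diff --git a/packages/core/src/enqueue_links/enqueue_links.ts b/packages/core/src/enqueue_links/enqueue_links.ts
index 7ed778f53a56..74eba06b8c11 100644
--- a/packages/core/src/enqueue_links/enqueue_links.ts
+++ b/packages/core/src/enqueue_links/enqueue_links.ts
@@ -232,58 +232,44 @@ export enum EnqueueStrategy {
  * @param options All `enqueueLinks()` parameters are passed via an options object.
  * @returns Promise that resolves to {@apilink BatchAddRequestsResult} object.
  */
-export async function enqueueLinks(options: SetRequired<EnqueueLinksOptions, 'requestQueue' | 'urls'>): Promise<BatchAddRequestsResult> {
+export async function enqueueLinks(
+    options: SetRequired<EnqueueLinksOptions, 'requestQueue' | 'urls'>,
+): Promise<BatchAddRequestsResult> {
     if (!options || Object.keys(options).length === 0) {
-        throw new RangeError([
-            // eslint-disable-next-line max-len
-            'enqueueLinks() was called without the required options. 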
You can only do that when you use the `crawlingContext.enqueueLinks()` method in request handlers.', + 'Check out our guide on how to use enqueueLinks() here: https://crawlee.dev/docs/examples/crawl-relative-links', + ].join('\n'), + ); } - ow(options, ow.object.exactShape({ - urls: ow.array.ofType(ow.string), - requestQueue: ow.object.hasKeys('fetchNextRequest', 'addRequest'), - forefront: ow.optional.boolean, - skipNavigation: ow.optional.boolean, - limit: ow.optional.number, - selector: ow.optional.string, - baseUrl: ow.optional.string, - userData: ow.optional.object, - label: ow.optional.string, - pseudoUrls: ow.optional.array.ofType(ow.any( - ow.string, - ow.object.hasKeys('purl'), - )), - globs: ow.optional.array.ofType(ow.any( - ow.string, - ow.object.hasKeys('glob'), - )), - exclude: ow.optional.array.ofType(ow.any( - ow.string, - ow.regExp, - ow.object.hasKeys('glob'), - ow.object.hasKeys('regexp'), - )), - regexps: ow.optional.array.ofType(ow.any( - ow.regExp, - ow.object.hasKeys('regexp'), - )), - transformRequestFunction: ow.optional.function, - strategy: ow.optional.string.oneOf(Object.values(EnqueueStrategy)), - })); - - const { - requestQueue, - limit, - urls, - pseudoUrls, - exclude, - globs, - regexps, - transformRequestFunction, - forefront, - } = options; + ow( + options, + ow.object.exactShape({ + urls: ow.array.ofType(ow.string), + requestQueue: ow.object.hasKeys('fetchNextRequest', 'addRequest'), + forefront: ow.optional.boolean, + skipNavigation: ow.optional.boolean, + limit: ow.optional.number, + selector: ow.optional.string, + baseUrl: ow.optional.string, + userData: ow.optional.object, + label: ow.optional.string, + pseudoUrls: ow.optional.array.ofType(ow.any(ow.string, ow.object.hasKeys('purl'))), + globs: ow.optional.array.ofType(ow.any(ow.string, ow.object.hasKeys('glob'))), + exclude: ow.optional.array.ofType( + ow.any(ow.string, ow.regExp, ow.object.hasKeys('glob'), ow.object.hasKeys('regexp')), + ), + regexps: ow.optional.array.ofType(ow.any(ow.regExp, ow.object.hasKeys('regexp'))), + transformRequestFunction: ow.optional.function, + strategy: ow.optional.string.oneOf(Object.values(EnqueueStrategy)), + }), + ); + + const { requestQueue, limit, urls, pseudoUrls, exclude, globs, regexps, transformRequestFunction, forefront } = + options; const urlExcludePatternObjects: UrlPatternObject[] = []; const urlPatternObjects: UrlPatternObject[] = []; @@ -360,7 +346,9 @@ export async function enqueueLinks(options: SetRequired transformRequestFunction(request)).filter((r) => !!r) as RequestOptions[]; + requestOptions = requestOptions + .map((request) => transformRequestFunction(request)) + .filter((r) => !!r) as RequestOptions[]; } function createFilteredRequests() { @@ -370,7 +358,12 @@ export async function enqueueLinks(options: SetRequired; -export type PseudoUrlObject = { purl: string } & Pick; +export type PseudoUrlObject = { purl: string } & Pick< + RequestOptions, + 'method' | 'payload' | 'label' | 'userData' | 'headers' +>; export type PseudoUrlInput = string | PseudoUrlObject; -export type GlobObject = { glob: string } & Pick; +export type GlobObject = { glob: string } & Pick< + RequestOptions, + 'method' | 'payload' | 'label' | 'userData' | 'headers' +>; export type GlobInput = string | GlobObject; -export type RegExpObject = { regexp: RegExp } & Pick; +export type RegExpObject = { regexp: RegExp } & Pick< + RequestOptions, + 'method' | 'payload' | 'label' | 'userData' | 'headers' +>; export type RegExpInput = RegExp | RegExpObject; /** * @ignore */ -export 
function updateEnqueueLinksPatternCache(item: GlobInput | RegExpInput | PseudoUrlInput, pattern: RegExpObject | GlobObject): void { +export function updateEnqueueLinksPatternCache( + item: GlobInput | RegExpInput | PseudoUrlInput, + pattern: RegExpObject | GlobObject, +): void { enqueueLinksPatternCache.set(item, pattern); if (enqueueLinksPatternCache.size > MAX_ENQUEUE_LINKS_CACHE_SIZE) { const key = enqueueLinksPatternCache.keys().next().value; @@ -95,7 +107,7 @@ export function constructGlobObjectsFromGlobs(globs: Readonly): Glo return false; }) .map((item) => { - // Get glob object from cache. + // Get glob object from cache. let globObject = enqueueLinksPatternCache.get(item); if (globObject) return globObject; @@ -117,7 +129,8 @@ export function constructGlobObjectsFromGlobs(globs: Readonly): Glo */ export function validateGlobPattern(glob: string): string { const globTrimmed = glob.trim(); - if (globTrimmed.length === 0) throw new Error(`Cannot parse Glob pattern '${globTrimmed}': it must be an non-empty string`); + if (globTrimmed.length === 0) + throw new Error(`Cannot parse Glob pattern '${globTrimmed}': it must be an non-empty string`); return globTrimmed; } @@ -159,8 +172,8 @@ export function createRequests( return !excludePatternObjects.some((excludePatternObject) => { const { regexp, glob } = excludePatternObject; return ( - (regexp && url.match(regexp)) || // eslint-disable-line - (glob && minimatch(url, glob, { nocase: true })) + (regexp && url.match(regexp)) || // eslint-disable-line + (glob && minimatch(url, glob, { nocase: true })) ); }); }) @@ -175,9 +188,10 @@ export function createRequests( (regexp && url.match(regexp)) || // eslint-disable-line (glob && minimatch(url, glob, { nocase: true })) ) { - const request = typeof opts === 'string' - ? { url: opts, ...requestRegExpOptions, enqueueStrategy: strategy } - : { ...opts, ...requestRegExpOptions, enqueueStrategy: strategy }; + const request = + typeof opts === 'string' + ? { url: opts, ...requestRegExpOptions, enqueueStrategy: strategy } + : { ...opts, ...requestRegExpOptions, enqueueStrategy: strategy }; return new Request(request); } @@ -222,10 +236,10 @@ export function createRequestOptions( options: Pick = {}, ): RequestOptions[] { return sources - .map( - (src) => ( - typeof src === 'string' ? { url: src, enqueueStrategy: options.strategy } : { ...src, enqueueStrategy: options.strategy } as RequestOptions - ), + .map((src) => + typeof src === 'string' + ? 
{ url: src, enqueueStrategy: options.strategy } + : ({ ...src, enqueueStrategy: options.strategy } as RequestOptions), ) .filter(({ url }) => { try { diff --git a/packages/core/src/events/local_event_manager.ts b/packages/core/src/events/local_event_manager.ts index a131d8abe8a6..da7b5ed0233f 100644 --- a/packages/core/src/events/local_event_manager.ts +++ b/packages/core/src/events/local_event_manager.ts @@ -51,13 +51,16 @@ export class LocalEventManager extends EventManager { private getCurrentCpuTicks() { const cpus = os.cpus(); - return cpus.reduce((acc, cpu) => { - const cpuTimes = Object.values(cpu.times); - return { - idle: acc.idle + cpu.times.idle, - total: acc.total + cpuTimes.reduce((sum, num) => sum + num), - }; - }, { idle: 0, total: 0 }); + return cpus.reduce( + (acc, cpu) => { + const cpuTimes = Object.values(cpu.times); + return { + idle: acc.idle + cpu.times.idle, + total: acc.total + cpuTimes.reduce((sum, num) => sum + num), + }; + }, + { idle: 0, total: 0 }, + ); } /** @@ -67,7 +70,7 @@ export class LocalEventManager extends EventManager { return { createdAt: new Date(), ...this.createCpuInfo(options), - ...await this.createMemoryInfo(), + ...(await this.createMemoryInfo()), } as SystemInfo; } @@ -75,7 +78,7 @@ export class LocalEventManager extends EventManager { const ticks = this.getCurrentCpuTicks(); const idleTicksDelta = ticks.idle - this.previousTicks!.idle; const totalTicksDelta = ticks.total - this.previousTicks!.total; - const usedCpuRatio = totalTicksDelta ? 1 - (idleTicksDelta / totalTicksDelta) : 0; + const usedCpuRatio = totalTicksDelta ? 1 - idleTicksDelta / totalTicksDelta : 0; Object.assign(this.previousTicks, ticks); return { diff --git a/packages/core/src/log.ts b/packages/core/src/log.ts index 2ea4ea794815..d06336beaeef 100644 --- a/packages/core/src/log.ts +++ b/packages/core/src/log.ts @@ -1,11 +1,3 @@ import log, { Log, LoggerOptions, LogLevel, Logger, LoggerJson, LoggerText } from '@apify/log'; -export { - log, - Log, - LoggerOptions, - LogLevel, - Logger, - LoggerJson, - LoggerText, -}; +export { log, Log, LoggerOptions, LogLevel, Logger, LoggerJson, LoggerText }; diff --git a/packages/core/src/proxy_configuration.ts b/packages/core/src/proxy_configuration.ts index 8bc03d6e58a0..f5c56c6073b2 100644 --- a/packages/core/src/proxy_configuration.ts +++ b/packages/core/src/proxy_configuration.ts @@ -142,10 +142,13 @@ export class ProxyConfiguration { */ constructor(options: ProxyConfigurationOptions = {}) { const { validateRequired, ...rest } = options as Dictionary; - ow(rest, ow.object.exactShape({ - proxyUrls: ow.optional.array.nonEmpty.ofType(ow.string.url), - newUrlFunction: ow.optional.function, - })); + ow( + rest, + ow.object.exactShape({ + proxyUrls: ow.optional.array.nonEmpty.ofType(ow.string.url), + newUrlFunction: ow.optional.function, + }), + ); const { proxyUrls, newUrlFunction } = options; @@ -247,15 +250,17 @@ export class ProxyConfiguration { } } - protected _throwNewUrlFunctionInvalid(err: Error) : never { + protected _throwNewUrlFunctionInvalid(err: Error): never { throw new Error(`The provided newUrlFunction did not return a valid URL.\nCause: ${err.message}`); } - protected _throwCannotCombineCustomMethods() : never { - throw new Error('Cannot combine custom proxies "options.proxyUrls" with custom generating function "options.newUrlFunction".'); + protected _throwCannotCombineCustomMethods(): never { + throw new Error( + 'Cannot combine custom proxies "options.proxyUrls" with custom generating function "options.newUrlFunction".', 
+ ); } - protected _throwNoOptionsProvided() : never { + protected _throwNoOptionsProvided(): never { throw new Error('One of "options.proxyUrls" or "options.newUrlFunction" needs to be provided.'); } } diff --git a/packages/core/src/request.ts b/packages/core/src/request.ts index 6718c6baea29..b6e994e41978 100644 --- a/packages/core/src/request.ts +++ b/packages/core/src/request.ts @@ -182,9 +182,7 @@ export class Request { handledAt?: string | Date; }; - let { - method = 'GET', - } = options; + let { method = 'GET' } = options; method = method.toUpperCase() as AllowedHttpMethods; @@ -193,7 +191,8 @@ export class Request { this.id = id; this.url = url; this.loadedUrl = loadedUrl; - this.uniqueKey = uniqueKey || this._computeUniqueKey({ url, method, payload, keepUrlFragment, useExtendedUniqueKey }); + this.uniqueKey = + uniqueKey || this._computeUniqueKey({ url, method, payload, keepUrlFragment, useExtendedUniqueKey }); this.method = method; this.payload = payload; this.noRetry = noRetry; @@ -201,7 +200,7 @@ export class Request { this.sessionRotationCount = sessionRotationCount; this.errorMessages = [...errorMessages]; this.headers = { ...headers }; - this.handledAt = handledAt as unknown instanceof Date ? (handledAt as Date).toISOString() : handledAt!; + this.handledAt = (handledAt as unknown) instanceof Date ? (handledAt as Date).toISOString() : handledAt!; if (label) { userData.label = label; @@ -225,10 +224,10 @@ export class Request { toJSON: { value: () => { if (Object.keys(this._userData.__crawlee).length > 0) { - return ({ + return { ...this._userData, __crawlee: this._userData.__crawlee, - }); + }; } return this._userData; @@ -356,8 +355,8 @@ export class Request { } else if (errorOrMessage instanceof Error) { message = omitStack ? errorOrMessage.message - // .stack includes the message - : errorOrMessage.stack; + : // .stack includes the message + errorOrMessage.stack; } else if (Reflect.has(Object(errorOrMessage), 'message')) { message = Reflect.get(Object(errorOrMessage), 'message'); } else if ((errorOrMessage as string).toString() !== '[object Object]') { @@ -378,15 +377,23 @@ export class Request { this.errorMessages.push(message); } - protected _computeUniqueKey({ url, method, payload, keepUrlFragment, useExtendedUniqueKey }: ComputeUniqueKeyOptions) { + protected _computeUniqueKey({ + url, + method, + payload, + keepUrlFragment, + useExtendedUniqueKey, + }: ComputeUniqueKeyOptions) { const normalizedMethod = method.toUpperCase(); const normalizedUrl = normalizeUrl(url, keepUrlFragment) || url; // It returns null when url is invalid, causing weird errors. if (!useExtendedUniqueKey) { if (normalizedMethod !== 'GET' && payload) { // Using log.deprecated to log only once. We should add log.once or some such. - log.deprecated(`We've encountered a ${normalizedMethod} Request with a payload. ` - + 'This is fine. Just letting you know that if your requests point to the same URL ' - + 'and differ only in method and payload, you should see the "useExtendedUniqueKey" option of Request constructor.'); + log.deprecated( + `We've encountered a ${normalizedMethod} Request with a payload. ` + + 'This is fine. 
Just letting you know that if your requests point to the same URL ' + + 'and differ only in method and payload, you should see the "useExtendedUniqueKey" option of Request constructor.', + ); } return normalizedUrl; } @@ -395,12 +402,7 @@ export class Request { } protected _hashPayload(payload: BinaryLike): string { - return crypto - .createHash('sha256') - .update(payload) - .digest('base64') - .replace(/[+/=]/g, '') - .substring(0, 8); + return crypto.createHash('sha256').update(payload).digest('base64').replace(/[+/=]/g, '').substring(0, 8); } } @@ -408,7 +410,6 @@ export class Request { * Specifies required and optional fields for constructing a {@apilink Request}. */ export interface RequestOptions { - /** URL of the web page to crawl. It must be a non-empty string. */ url: string; diff --git a/packages/core/src/router.ts b/packages/core/src/router.ts index 6bcab6f00865..67aae370e87d 100644 --- a/packages/core/src/router.ts +++ b/packages/core/src/router.ts @@ -15,7 +15,7 @@ export type GetUserDataFromRequest = T extends Request ? Y : never; export type RouterRoutes = { [label in string | symbol]: (ctx: Omit & { request: Request }) => Awaitable; -} +}; /** * Simple router that works based on request labels. This instance can then serve as a `requestHandler` of your crawler. @@ -134,9 +134,9 @@ export class Router { } throw new MissingRouteError( - `Route not found for label '${String(label)}'.` - + ' You must set up a route for this label or a default route.' - + ' Use `requestHandler`, `router.addHandler` or `router.addDefaultHandler`.', + `Route not found for label '${String(label)}'.` + + ' You must set up a route for this label or a default route.' + + ' Use `requestHandler`, `router.addHandler` or `router.addDefaultHandler`.', ); } @@ -145,9 +145,10 @@ export class Router { */ private validate(label: string | symbol) { if (this.routes.has(label)) { - const message = label === defaultRoute - ? `Default route is already defined!` - : `Route for label '${String(label)}' is already defined!`; + const message = + label === defaultRoute + ? 
`Default route is already defined!` + : `Route for label '${String(label)}' is already defined!`; throw new Error(message); } } diff --git a/packages/core/src/serialization.ts b/packages/core/src/serialization.ts index 65821b3438b5..fac7535a67b8 100644 --- a/packages/core/src/serialization.ts +++ b/packages/core/src/serialization.ts @@ -18,7 +18,10 @@ class ArrayToJson extends Readable { private offset = 0; private readonly batchSize: number; - constructor(private data: T[], options: { batchSize?: number } = {}) { + constructor( + private data: T[], + options: { batchSize?: number } = {}, + ) { super({ ...options, autoDestroy: true, @@ -62,11 +65,7 @@ class ArrayToJson extends Readable { export async function serializeArray(data: T[]): Promise { ow(data, ow.array); const { chunks, collector } = createChunkCollector(); - await pipeline( - new ArrayToJson(data), - zlib.createGzip(), - collector, - ); + await pipeline(new ArrayToJson(data), zlib.createGzip(), collector); return Buffer.concat(chunks as Buffer[]); } @@ -83,12 +82,7 @@ export async function serializeArray(data: T[]): Promise { export async function deserializeArray(compressedData: Buffer): Promise { ow(compressedData, ow.buffer); const { chunks, collector } = createChunkCollector({ fromValuesStream: true }); - await pipeline( - Readable.from([compressedData]), - zlib.createGunzip(), - StreamArray.withParser(), - collector, - ); + await pipeline(Readable.from([compressedData]), zlib.createGunzip(), StreamArray.withParser(), collector); return chunks as T[]; } @@ -118,7 +112,9 @@ export function createDeserialize(compressedData: Buffer): Readable { return destination; } -function createChunkCollector(options: { fromValuesStream?: boolean } = {}): { chunks: T[]; collector: Writable } { +function createChunkCollector( + options: { fromValuesStream?: boolean } = {}, +): { chunks: T[]; collector: Writable } { const { fromValuesStream = false } = options; const chunks: T[] = []; const collector = new Writable({ diff --git a/packages/core/src/session_pool/session.ts b/packages/core/src/session_pool/session.ts index e242d0269ea6..e0226814c296 100644 --- a/packages/core/src/session_pool/session.ts +++ b/packages/core/src/session_pool/session.ts @@ -9,7 +9,12 @@ import type { Cookie } from 'tough-cookie'; import { CookieJar } from 'tough-cookie'; import { EVENT_SESSION_RETIRED } from './events'; -import { browserPoolCookieToToughCookie, getCookiesFromResponse, getDefaultCookieExpirationDate, toughCookieToBrowserPoolCookie } from '../cookie_utils'; +import { + browserPoolCookieToToughCookie, + getCookiesFromResponse, + getDefaultCookieExpirationDate, + toughCookieToBrowserPoolCookie, +} from '../cookie_utils'; import { log as defaultLog } from '../log'; /** @@ -29,7 +34,6 @@ export interface SessionState { } export interface SessionOptions { - /** Id of session used for generating fingerprints. It is used as proxy session name. */ id?: string; @@ -84,7 +88,6 @@ export interface SessionOptions { log?: Log; errorScore?: number; cookieJar?: CookieJar; - } /** @@ -112,21 +115,24 @@ export class Session { * Session configuration. 
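// ---- Editor's sketch (illustrative only, not part of the patch) ----
// The serializeArray()/deserializeArray() helpers reformatted above stream a
// JSON array through gzip. A simplified synchronous equivalent, skipping the
// batching and stream-json machinery the real code uses, could look like:
import * as zlib from 'node:zlib';

function serializeArraySync<T>(data: T[]): Buffer {
    // gzip the JSON text of the whole array in one shot
    return zlib.gzipSync(Buffer.from(JSON.stringify(data)));
}

function deserializeArraySync<T>(compressed: Buffer): T[] {
    // reverse: gunzip, then parse the JSON text back into objects
    return JSON.parse(zlib.gunzipSync(compressed).toString('utf-8')) as T[];
}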
*/ constructor(options: SessionOptions) { - ow(options, ow.object.exactShape({ - sessionPool: ow.object.instanceOf(EventEmitter), - id: ow.optional.string, - cookieJar: ow.optional.object, - maxAgeSecs: ow.optional.number, - userData: ow.optional.object, - maxErrorScore: ow.optional.number, - errorScoreDecrement: ow.optional.number, - createdAt: ow.optional.date, - expiresAt: ow.optional.date, - usageCount: ow.optional.number, - errorScore: ow.optional.number, - maxUsageCount: ow.optional.number, - log: ow.optional.object, - })); + ow( + options, + ow.object.exactShape({ + sessionPool: ow.object.instanceOf(EventEmitter), + id: ow.optional.string, + cookieJar: ow.optional.object, + maxAgeSecs: ow.optional.number, + userData: ow.optional.object, + maxErrorScore: ow.optional.number, + errorScoreDecrement: ow.optional.number, + createdAt: ow.optional.date, + expiresAt: ow.optional.date, + usageCount: ow.optional.number, + errorScore: ow.optional.number, + maxUsageCount: ow.optional.number, + log: ow.optional.object, + }), + ); const { sessionPool, @@ -147,7 +153,7 @@ export class Session { this.log = log.child({ prefix: 'Session' }); - this.cookieJar = cookieJar.setCookie as unknown ? cookieJar : CookieJar.fromJSON(JSON.stringify(cookieJar)); + this.cookieJar = (cookieJar.setCookie as unknown) ? cookieJar : CookieJar.fromJSON(JSON.stringify(cookieJar)); this.id = id; this.maxAgeSecs = maxAgeSecs; this.userData = userData; @@ -283,7 +289,9 @@ export class Session { retireOnBlockedStatusCodes(statusCode: number, additionalBlockedStatusCodes: number[] = []): boolean { // eslint-disable-next-line dot-notation -- accessing private property - const isBlocked = this.sessionPool['blockedStatusCodes'].concat(additionalBlockedStatusCodes).includes(statusCode); + const isBlocked = this.sessionPool['blockedStatusCodes'] + .concat(additionalBlockedStatusCodes) + .includes(statusCode); if (isBlocked) { this.retire(); } @@ -297,7 +305,9 @@ export class Session { * * It then parses and saves the cookies from the `set-cookie` header, if available. */ - setCookiesFromResponse(response: IncomingMessage | BrowserLikeResponse | { headers: Dictionary; url: string }) { + setCookiesFromResponse( + response: IncomingMessage | BrowserLikeResponse | { headers: Dictionary; url: string }, + ) { try { const cookies = getCookiesFromResponse(response).filter((c) => c); this._setCookies(cookies, typeof response.url === 'function' ? response.url() : response.url!); diff --git a/packages/core/src/session_pool/session_pool.ts b/packages/core/src/session_pool/session_pool.ts index 52e860f4a88f..2bc048c53c4e 100644 --- a/packages/core/src/session_pool/session_pool.ts +++ b/packages/core/src/session_pool/session_pool.ts @@ -65,7 +65,7 @@ export interface SessionPoolOptions { /** * Control how and when to persist the state of the session pool. 
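// ---- Editor's sketch (illustrative only, not part of the patch) ----
// The ow(options, ow.object.exactShape({ ... })) calls re-wrapped throughout
// this patch validate an options bag at runtime; exactShape also rejects
// unknown keys, so misspelled option names fail fast. Hypothetical example:
import ow from 'ow';

function configure(options: { retries?: number; label?: string } = {}) {
    ow(
        options,
        ow.object.exactShape({
            retries: ow.optional.number,
            label: ow.optional.string,
        }),
    );
    // configure({ retires: 3 } as any) would throw an ArgumentError here.
}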
*/ - persistenceOptions?: PersistenceOptions; + persistenceOptions?: PersistenceOptions; } /** @@ -154,19 +154,25 @@ export class SessionPool extends EventEmitter { /** * @internal */ - constructor(options: SessionPoolOptions = {}, readonly config = Configuration.getGlobalConfig()) { + constructor( + options: SessionPoolOptions = {}, + readonly config = Configuration.getGlobalConfig(), + ) { super(); - ow(options, ow.object.exactShape({ - maxPoolSize: ow.optional.number, - persistStateKeyValueStoreId: ow.optional.string, - persistStateKey: ow.optional.string, - createSessionFunction: ow.optional.function, - sessionOptions: ow.optional.object, - blockedStatusCodes: ow.optional.array.ofType(ow.number), - log: ow.optional.object, - persistenceOptions: ow.optional.object, - })); + ow( + options, + ow.object.exactShape({ + maxPoolSize: ow.optional.number, + persistStateKeyValueStoreId: ow.optional.string, + persistStateKey: ow.optional.string, + createSessionFunction: ow.optional.function, + sessionOptions: ow.optional.object, + blockedStatusCodes: ow.optional.array.ofType(ow.number), + log: ow.optional.object, + persistenceOptions: ow.optional.object, + }), + ); const { maxPoolSize = MAX_POOL_SIZE, @@ -235,7 +241,9 @@ export class SessionPool extends EventEmitter { if (!this.persistStateKeyValueStoreId) { // eslint-disable-next-line max-len - this.log.debug(`No 'persistStateKeyValueStoreId' options specified, this session pool's data has been saved in the KeyValueStore with the id: ${this.keyValueStore.id}`); + this.log.debug( + `No 'persistStateKeyValueStoreId' options specified, this session pool's data has been saved in the KeyValueStore with the id: ${this.keyValueStore.id}`, + ); } // in case of migration happened and SessionPool state should be restored from the keyValueStore. @@ -267,9 +275,8 @@ export class SessionPool extends EventEmitter { this._removeRetiredSessions(); } - const newSession = options instanceof Session - ? options - : await this.createSessionFunction(this, { sessionOptions: options }); + const newSession = + options instanceof Session ? options : await this.createSessionFunction(this, { sessionOptions: options }); this.log.debug(`Adding new Session - ${newSession.id}`); this._addSession(newSession); @@ -416,7 +423,10 @@ export class SessionPool extends EventEmitter { * @param [options.sessionOptions] The configuration options for the session being created. * @returns New session. */ - protected _defaultCreateSessionFunction(sessionPool: SessionPool, options: { sessionOptions?: SessionOptions } = {}): Session { + protected _defaultCreateSessionFunction( + sessionPool: SessionPool, + options: { sessionOptions?: SessionOptions } = {}, + ): Session { ow(options, ow.object.exactShape({ sessionOptions: ow.optional.object })); const { sessionOptions = {} } = options; return new Session({ diff --git a/packages/core/src/storages/access_checking.ts b/packages/core/src/storages/access_checking.ts index cb123d60ee6b..eddb4b6618c9 100644 --- a/packages/core/src/storages/access_checking.ts +++ b/packages/core/src/storages/access_checking.ts @@ -2,7 +2,7 @@ import { AsyncLocalStorage } from 'async_hooks'; import type { Awaitable } from '../typedefs'; -const storage = new AsyncLocalStorage<{ checkFunction:() => void }>(); +const storage = new AsyncLocalStorage<{ checkFunction: () => void }>(); /** * Invoke a storage access checker function defined using {@link withCheckedStorageAccess} higher up in the call stack. 
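// ---- Editor's sketch (illustrative only, not part of the patch) ----
// Usage of the two helpers in this file: withCheckedStorageAccess() (defined
// just below) stores the check function in AsyncLocalStorage, so
// checkStorageAccess() can veto storage calls anywhere down the async call
// stack without threading an argument through.
import { Dataset } from 'crawlee';

await withCheckedStorageAccess(
    () => {
        throw new Error('Storage access is not allowed in this context');
    },
    async () => {
        // Dataset.open() calls checkStorageAccess() first, so it throws here.
        await Dataset.open();
    },
);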
@@ -15,4 +15,5 @@ export const checkStorageAccess = () => storage.getStore()?.checkFunction(); * @param checkFunction The check function that should be invoked by {@link checkStorageAccess} calls * @param callback The code that should be invoked with the `checkFunction` setting */ -export const withCheckedStorageAccess = async (checkFunction: () => void, callback: () => Awaitable) => storage.run({ checkFunction }, callback); +export const withCheckedStorageAccess = async (checkFunction: () => void, callback: () => Awaitable) => + storage.run({ checkFunction }, callback); diff --git a/packages/core/src/storages/dataset.ts b/packages/core/src/storages/dataset.ts index 689a55cf09e9..6f2e4d00a241 100644 --- a/packages/core/src/storages/dataset.ts +++ b/packages/core/src/storages/dataset.ts @@ -67,7 +67,7 @@ export function chunkBySize(items: string[], limitBytes: number): string[] { for (const payload of items) { const bytes = Buffer.byteLength(payload); - if (bytes <= limitBytes && (bytes + 2) > limitBytes) { + if (bytes <= limitBytes && bytes + 2 > limitBytes) { // Handle cases where wrapping with [] would fail, but solo object is fine. chunks.push(payload); lastChunkBytes = bytes; @@ -142,7 +142,8 @@ export interface DatasetDataOptions { export interface DatasetExportOptions extends Omit {} -export interface DatasetIteratorOptions extends Omit { +export interface DatasetIteratorOptions + extends Omit { /** @internal */ offset?: number; @@ -230,7 +231,10 @@ export class Dataset { /** * @internal */ - constructor(options: DatasetOptions, readonly config = Configuration.getGlobalConfig()) { + constructor( + options: DatasetOptions, + readonly config = Configuration.getGlobalConfig(), + ) { this.id = options.id; this.name = options.name; this.client = options.client.dataset(this.id) as DatasetClient; @@ -294,7 +298,9 @@ export class Dataset { } catch (e) { const error = e as Error; if (error.message.includes('Cannot create a string longer than')) { - throw new Error('dataset.getData(): The response is too large for parsing. You can fix this by lowering the "limit" option.'); + throw new Error( + 'dataset.getData(): The response is too large for parsing. You can fix this by lowering the "limit" option.', + ); } throw e; } @@ -341,10 +347,7 @@ export class Dataset { const items = await this.export(options); if (contentType === 'text/csv') { - const value = stringify([ - Object.keys(items[0]), - ...items.map((item) => Object.values(item)), - ]); + const value = stringify([Object.keys(items[0]), ...items.map((item) => Object.values(item))]); await kvStore.setValue(key, value, { contentType }); return items; } @@ -457,7 +460,8 @@ export class Dataset { checkStorageAccess(); if (!options.offset) options.offset = 0; - if (options.format && options.format !== 'json') throw new Error('Dataset.forEach/map/reduce() support only a "json" format.'); + if (options.format && options.format !== 'json') + throw new Error('Dataset.forEach/map/reduce() support only a "json" format.'); if (!options.limit) options.limit = DATASET_ITERATORS_DEFAULT_LIMIT; const { items, total, limit, offset } = await this.getData(options); @@ -516,12 +520,9 @@ export class Dataset { let currentMemo: T = memo; const wrappedFunc: DatasetConsumer = async (item, index) => { - return Promise - .resolve() + return Promise.resolve() .then(() => { - return !index && currentMemo === undefined - ? item - : iteratee(currentMemo, item, index); + return !index && currentMemo === undefined ? 
item : iteratee(currentMemo, item, index); }) .then((newMemo) => { currentMemo = newMemo as T; @@ -558,14 +559,20 @@ export class Dataset { * the function returns the default dataset associated with the crawler run. * @param [options] Storage manager options. */ - static async open(datasetIdOrName?: string | null, options: StorageManagerOptions = {}): Promise> { + static async open( + datasetIdOrName?: string | null, + options: StorageManagerOptions = {}, + ): Promise> { checkStorageAccess(); ow(datasetIdOrName, ow.optional.string); - ow(options, ow.object.exactShape({ - config: ow.optional.object.instanceOf(Configuration), - storageClient: ow.optional.object, - })); + ow( + options, + ow.object.exactShape({ + config: ow.optional.object.instanceOf(Configuration), + storageClient: ow.optional.object, + }), + ); options.config ??= Configuration.getGlobalConfig(); options.storageClient ??= options.config.getStorageClient(); @@ -609,7 +616,9 @@ export class Dataset { /** * Returns {@apilink DatasetContent} object holding the items in the dataset based on the provided parameters. */ - static async getData(options: DatasetDataOptions = {}): Promise> { + static async getData( + options: DatasetDataOptions = {}, + ): Promise> { const dataset = await this.open(); return dataset.getData(options); } @@ -619,41 +628,35 @@ export class Dataset { * User-function used in the `Dataset.forEach()` API. */ export interface DatasetConsumer { - /** * @param item Current {@apilink Dataset} entry being processed. * @param index Position of current {@apilink Dataset} entry. */ (item: Data, index: number): Awaitable; - } /** * User-function used in the `Dataset.map()` API. */ export interface DatasetMapper { - /** * User-function used in the `Dataset.map()` API. * @param item Current {@apilink Dataset} entry being processed. * @param index Position of current {@apilink Dataset} entry. */ (item: Data, index: number): Awaitable; - } /** * User-function used in the `Dataset.reduce()` API. */ export interface DatasetReducer { - /** * @param memo Previous state of the reduction. * @param item Current {@apilink Dataset} entry being processed. * @param index Position of current {@apilink Dataset} entry. */ (memo: T, item: Data, index: number): Awaitable; - } export interface DatasetOptions { diff --git a/packages/core/src/storages/key_value_store.ts b/packages/core/src/storages/key_value_store.ts index 5c3077cffb44..a4ed1229983d 100644 --- a/packages/core/src/storages/key_value_store.ts +++ b/packages/core/src/storages/key_value_store.ts @@ -37,8 +37,10 @@ export const maybeStringify = (value: T, options: { contentType?: string }) = } if (value === undefined) { - throw new Error('The "value" parameter was stringified to JSON and returned undefined. ' - + 'Make sure you\'re not trying to stringify an undefined value.'); + throw new Error( + 'The "value" parameter was stringified to JSON and returned undefined. ' + + "Make sure you're not trying to stringify an undefined value.", + ); } } @@ -113,7 +115,10 @@ export class KeyValueStore { /** * @internal */ - constructor(options: KeyValueStoreOptions, readonly config = Configuration.getGlobalConfig()) { + constructor( + options: KeyValueStoreOptions, + readonly config = Configuration.getGlobalConfig(), + ) { this.id = options.id; this.name = options.name; this.client = options.client.keyValueStore(this.id); @@ -149,7 +154,7 @@ export class KeyValueStore { * or [`Buffer`](https://nodejs.org/api/buffer.html), depending * on the MIME content type of the record. 
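// ---- Editor's sketch (illustrative only, not part of the patch) ----
// Usage of the iterator APIs above, assuming a dataset of
// { url, statusCode } items:
import { Dataset } from 'crawlee';

const dataset = await Dataset.open<{ url: string; statusCode: number }>();
const failures = await dataset.reduce((memo, item) => {
    // memo starts at 0 (the second argument) and accumulates error responses
    return item.statusCode >= 400 ? memo + 1 : memo;
}, 0);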
*/ - async getValue(key: string): Promise + async getValue(key: string): Promise; /** * Gets a value from the key-value store. * @@ -182,7 +187,7 @@ export class KeyValueStore { * or [`Buffer`](https://nodejs.org/api/buffer.html), depending * on the MIME content type of the record, or the default value if the key is missing from the store. */ - async getValue(key: string, defaultValue: T): Promise + async getValue(key: string, defaultValue: T): Promise; /** * Gets a value from the key-value store. * @@ -221,7 +226,7 @@ export class KeyValueStore { ow(key, ow.string.nonEmpty); const record = await this.client.getRecord(key); - return record?.value as T ?? defaultValue ?? null; + return (record?.value as T) ?? defaultValue ?? null; } /** @@ -321,17 +326,32 @@ export class KeyValueStore { checkStorageAccess(); ow(key, 'key', ow.string.nonEmpty); - ow(key, ow.string.validate((k) => ({ - validator: ow.isValid(k, ow.string.matches(KEY_VALUE_STORE_KEY_REGEX)), - message: 'The "key" argument must be at most 256 characters long and only contain the following characters: a-zA-Z0-9!-_.\'()', - }))); - if (options.contentType - && !(ow.isValid(value, ow.any(ow.string, ow.buffer)) || (ow.isValid(value, ow.object) && typeof (value as Dictionary).pipe === 'function'))) { - throw new ArgumentError('The "value" parameter must be a String, Buffer or Stream when "options.contentType" is specified.', this.setValue); + ow( + key, + ow.string.validate((k) => ({ + validator: ow.isValid(k, ow.string.matches(KEY_VALUE_STORE_KEY_REGEX)), + message: + 'The "key" argument must be at most 256 characters long and only contain the following characters: a-zA-Z0-9!-_.\'()', + })), + ); + if ( + options.contentType && + !( + ow.isValid(value, ow.any(ow.string, ow.buffer)) || + (ow.isValid(value, ow.object) && typeof (value as Dictionary).pipe === 'function') + ) + ) { + throw new ArgumentError( + 'The "value" parameter must be a String, Buffer or Stream when "options.contentType" is specified.', + this.setValue, + ); } - ow(options, ow.object.exactShape({ - contentType: ow.optional.string.nonEmpty, - })); + ow( + options, + ow.object.exactShape({ + contentType: ow.optional.string.nonEmpty, + }), + ); // Make copy of options, don't update what user passed. 
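// ---- Editor's sketch (illustrative only, not part of the patch) ----
// Usage of getValue()/setValue() above: plain objects are stringified to
// JSON, while an explicit contentType requires a string, Buffer, or stream.
import { KeyValueStore } from 'crawlee';

const store = await KeyValueStore.open();
await store.setValue('OUTPUT', { crawled: 42 }); // stored as JSON
await store.setValue('page.html', '<html></html>', { contentType: 'text/html' });
const output = await store.getValue<{ crawled: number }>('OUTPUT'); // null if missing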
const optionsCopy = { ...options }; @@ -411,12 +431,19 @@ export class KeyValueStore { return this._forEachKey(iteratee, options); } - private async _forEachKey(iteratee: KeyConsumer, options: KeyValueStoreIteratorOptions = {}, index = 0): Promise { + private async _forEachKey( + iteratee: KeyConsumer, + options: KeyValueStoreIteratorOptions = {}, + index = 0, + ): Promise { const { exclusiveStartKey } = options; ow(iteratee, ow.function); - ow(options, ow.object.exactShape({ - exclusiveStartKey: ow.optional.string, - })); + ow( + options, + ow.object.exactShape({ + exclusiveStartKey: ow.optional.string, + }), + ); const response = await this.client.listKeys({ exclusiveStartKey }); const { nextExclusiveStartKey, isTruncated, items } = response; @@ -446,10 +473,13 @@ export class KeyValueStore { checkStorageAccess(); ow(storeIdOrName, ow.optional.any(ow.string, ow.null)); - ow(options, ow.object.exactShape({ - config: ow.optional.object.instanceOf(Configuration), - storageClient: ow.optional.object, - })); + ow( + options, + ow.object.exactShape({ + config: ow.optional.object.instanceOf(Configuration), + storageClient: ow.optional.object, + }), + ); options.config ??= Configuration.getGlobalConfig(); options.storageClient ??= options.config.getStorageClient(); @@ -489,7 +519,7 @@ export class KeyValueStore { * if the record is missing. * @ignore */ - static async getValue(key: string): Promise + static async getValue(key: string): Promise; /** * Gets a value from the default {@apilink KeyValueStore} associated with the current crawler run. * @@ -518,7 +548,7 @@ export class KeyValueStore { * on the MIME content type of the record, or the provided default value. * @ignore */ - static async getValue(key: string, defaultValue: T): Promise + static async getValue(key: string, defaultValue: T): Promise; /** * Gets a value from the default {@apilink KeyValueStore} associated with the current crawler run. * diff --git a/packages/core/src/storages/request_list.ts b/packages/core/src/storages/request_list.ts index 9eeb4f80e189..42b036967a8c 100644 --- a/packages/core/src/storages/request_list.ts +++ b/packages/core/src/storages/request_list.ts @@ -98,10 +98,10 @@ export interface RequestListOptions { sourcesFunction?: RequestListSourcesFunction; /** - * Used to pass the proxy configuration for the `requestsFromUrl` objects. - * Takes advantage of the internal address rotation and authentication process. - * If undefined, the `requestsFromUrl` requests will be made without proxy. - */ + * Used to pass the proxy configuration for the `requestsFromUrl` objects. + * Takes advantage of the internal address rotation and authentication process. + * If undefined, the `requestsFromUrl` requests will be made without proxy. 
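// ---- Editor's sketch (illustrative only, not part of the patch) ----
// Usage of the key iteration above: _forEachKey() pages through listKeys()
// with exclusiveStartKey until the listing is no longer truncated.
import { KeyValueStore } from 'crawlee';

const kvStore = await KeyValueStore.open();
await kvStore.forEachKey(async (key, index, info) => {
    console.log(`#${index}: ${key} (${info.size} bytes)`);
});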
+ */ proxyConfiguration?: ProxyConfiguration; /** @@ -300,21 +300,27 @@ export class RequestList { } = options; if (!(sources || sourcesFunction)) { - throw new ArgumentError('At least one of "sources" or "sourcesFunction" must be provided.', this.constructor); + throw new ArgumentError( + 'At least one of "sources" or "sourcesFunction" must be provided.', + this.constructor, + ); } - ow(options, ow.object.exactShape({ - sources: ow.optional.array, // check only for array and not subtypes to avoid iteration over the whole thing - sourcesFunction: ow.optional.function, - persistStateKey: ow.optional.string, - persistRequestsKey: ow.optional.string, - state: ow.optional.object.exactShape({ - nextIndex: ow.number, - nextUniqueKey: ow.string, - inProgress: ow.object, + ow( + options, + ow.object.exactShape({ + sources: ow.optional.array, // check only for array and not subtypes to avoid iteration over the whole thing + sourcesFunction: ow.optional.function, + persistStateKey: ow.optional.string, + persistRequestsKey: ow.optional.string, + state: ow.optional.object.exactShape({ + nextIndex: ow.number, + nextUniqueKey: ow.string, + inProgress: ow.object, + }), + keepDuplicateUrls: ow.optional.boolean, + proxyConfiguration: ow.optional.object, }), - keepDuplicateUrls: ow.optional.boolean, - proxyConfiguration: ow.optional.object, - })); + ); this.persistStateKey = persistStateKey ? `SDK_${persistStateKey}` : persistStateKey; this.persistRequestsKey = persistRequestsKey ? `SDK_${persistRequestsKey}` : persistRequestsKey; @@ -472,16 +478,22 @@ export class RequestList { if (state.nextIndex > this.requests.length) { throw new Error('The state object is not consistent with RequestList, too few requests loaded.'); } - if (state.nextIndex < this.requests.length - && this.requests[state.nextIndex].uniqueKey !== state.nextUniqueKey) { - throw new Error('The state object is not consistent with RequestList the order of URLs seems to have changed.'); + if ( + state.nextIndex < this.requests.length && + this.requests[state.nextIndex].uniqueKey !== state.nextUniqueKey + ) { + throw new Error( + 'The state object is not consistent with RequestList the order of URLs seems to have changed.', + ); } const deleteFromInProgress: string[] = []; state.inProgress.forEach((uniqueKey) => { const index = this.uniqueKeyToIndex[uniqueKey]; if (typeof index !== 'number') { - throw new Error('The state object is not consistent with RequestList. Unknown uniqueKey is present in the state.'); + throw new Error( + 'The state object is not consistent with RequestList. Unknown uniqueKey is present in the state.', + ); } if (index >= state.nextIndex) { deleteFromInProgress.push(uniqueKey); @@ -505,9 +517,12 @@ export class RequestList { // As a workaround, we just remove all inProgress requests whose index >= nextIndex, // since they will be crawled again. if (deleteFromInProgress.length) { - this.log.warning('RequestList\'s in-progress field is not consistent, skipping invalid in-progress entries', { - deleteFromInProgress, - }); + this.log.warning( + "RequestList's in-progress field is not consistent, skipping invalid in-progress entries", + { + deleteFromInProgress, + }, + ); for (const uniqueKey of deleteFromInProgress) { this.inProgress.delete(uniqueKey); } @@ -550,9 +565,7 @@ export class RequestList { return { nextIndex: this.nextIndex, - nextUniqueKey: this.nextIndex < this.requests.length - ? this.requests[this.nextIndex].uniqueKey - : null, + nextUniqueKey: this.nextIndex < this.requests.length ? 
this.requests[this.nextIndex].uniqueKey : null, inProgress: [...this.inProgress], }; } @@ -674,7 +687,11 @@ export class RequestList { // Download remote resource and parse URLs. let urlsArr; try { - urlsArr = await this._downloadListOfUrls({ url: requestsFromUrl, urlRegExp: regex, proxyUrl: await this.proxyConfiguration?.newUrl() }); + urlsArr = await this._downloadListOfUrls({ + url: requestsFromUrl, + urlRegExp: regex, + proxyUrl: await this.proxyConfiguration?.newUrl(), + }); } catch (err) { throw new Error(`Cannot fetch a request list from ${requestsFromUrl}: ${err}`); } @@ -721,7 +738,9 @@ export class RequestList { this.uniqueKeyToIndex[uniqueKey] = this.requests.length; this.requests.push(request); } else if (this.keepDuplicateUrls) { - this.log.warning(`Duplicate uniqueKey: ${uniqueKey} found while the keepDuplicateUrls option was set. Check your sources' unique keys.`); + this.log.warning( + `Duplicate uniqueKey: ${uniqueKey} found while the keepDuplicateUrls option was set. Check your sources' unique keys.`, + ); } } @@ -731,7 +750,7 @@ export class RequestList { */ protected _ensureUniqueKeyValid(uniqueKey: string): void { if (typeof uniqueKey !== 'string' || !uniqueKey) { - throw new Error('Request object\'s uniqueKey must be a non-empty string'); + throw new Error("Request object's uniqueKey must be a non-empty string"); } } @@ -752,7 +771,9 @@ export class RequestList { */ protected _ensureIsInitialized(): void { if (!this.isInitialized) { - throw new Error('RequestList is not initialized; you must call "await requestList.initialize()" before using it!'); + throw new Error( + 'RequestList is not initialized; you must call "await requestList.initialize()" before using it!', + ); } } @@ -844,7 +865,10 @@ export class RequestList { ow(listName, ow.optional.any(ow.string, ow.null)); ow(sources, ow.array); - ow(options, ow.object.is((v) => !Array.isArray(v))); + ow( + options, + ow.object.is((v) => !Array.isArray(v)), + ); const rl = new RequestList({ ...options, @@ -860,7 +884,9 @@ export class RequestList { /** * @internal wraps public utility for mocking purposes */ - private async _downloadListOfUrls(options: { url: string; urlRegExp?: RegExp; proxyUrl?: string }): Promise { + private async _downloadListOfUrls(options: { url: string; urlRegExp?: RegExp; proxyUrl?: string }): Promise< + string[] + > { return downloadListOfUrls(options); } } @@ -882,7 +908,6 @@ export class RequestList { * ``` */ export interface RequestListState { - /** Position of the next request to be processed. */ nextIndex: number; @@ -891,7 +916,6 @@ export interface RequestListState { /** Array of request keys representing those that being processed at the moment. 
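// ---- Editor's sketch (illustrative only, not part of the patch) ----
// Typical use of the static helper above: RequestList.open() constructs the
// list, restores/persists its state under the given name, and initializes it.
import { RequestList } from 'crawlee';

const requestList = await RequestList.open('start-urls', [
    'https://example.com',
    // a remote source, fetched via _downloadListOfUrls(), optionally through
    // the proxyConfiguration documented earlier in this file
    { requestsFromUrl: 'https://example.com/urls.txt' },
]);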
*/ inProgress: string[]; - } type RequestListSource = string | Source; diff --git a/packages/core/src/storages/request_provider.ts b/packages/core/src/storages/request_provider.ts index 59efeb8a1f2d..99503884ede5 100644 --- a/packages/core/src/storages/request_provider.ts +++ b/packages/core/src/storages/request_provider.ts @@ -52,7 +52,10 @@ export abstract class RequestProvider implements IStorage { protected queuePausedForMigration = false; - constructor(options: InternalRequestProviderOptions, readonly config = Configuration.getGlobalConfig()) { + constructor( + options: InternalRequestProviderOptions, + readonly config = Configuration.getGlobalConfig(), + ) { this.id = options.id; this.name = options.name; this.client = options.client.requestQueue(this.id, { @@ -103,13 +106,19 @@ export abstract class RequestProvider implements IStorage { * Note that the function sets the `uniqueKey` and `id` fields to the passed Request. * @param [options] Request queue operation options. */ - async addRequest(requestLike: Source, options: RequestQueueOperationOptions = {}): Promise { + async addRequest( + requestLike: Source, + options: RequestQueueOperationOptions = {}, + ): Promise { checkStorageAccess(); ow(requestLike, ow.object); - ow(options, ow.object.exactShape({ - forefront: ow.optional.boolean, - })); + ow( + options, + ow.object.exactShape({ + forefront: ow.optional.boolean, + }), + ); const { forefront = false } = options; @@ -120,14 +129,15 @@ export abstract class RequestProvider implements IStorage { return processedRequests[0]; } - ow(requestLike, ow.object.partialShape({ - url: ow.string, - id: ow.undefined, - })); + ow( + requestLike, + ow.object.partialShape({ + url: ow.string, + id: ow.undefined, + }), + ); - const request = requestLike instanceof Request - ? requestLike - : new Request(requestLike); + const request = requestLike instanceof Request ? 
requestLike : new Request(requestLike); const cacheKey = getRequestId(request.uniqueKey); const cachedInfo = this.requestCache.get(cacheKey); @@ -144,13 +154,17 @@ export abstract class RequestProvider implements IStorage { }; } - const queueOperationInfo = await this.client.addRequest(request, { forefront }) as RequestQueueOperationInfo; + const queueOperationInfo = (await this.client.addRequest(request, { forefront })) as RequestQueueOperationInfo; queueOperationInfo.uniqueKey = request.uniqueKey; const { requestId, wasAlreadyPresent } = queueOperationInfo; this._cacheRequest(cacheKey, queueOperationInfo); - if (!wasAlreadyPresent && !this.inProgress.has(requestId) && !this.recentlyHandledRequestsCache.get(requestId)) { + if ( + !wasAlreadyPresent && + !this.inProgress.has(requestId) && + !this.recentlyHandledRequestsCache.get(requestId) + ) { this.assumedTotalCount++; // Performance optimization: add request straight to head if possible @@ -178,9 +192,12 @@ export abstract class RequestProvider implements IStorage { checkStorageAccess(); ow(requestsLike, ow.array); - ow(options, ow.object.exactShape({ - forefront: ow.optional.boolean, - })); + ow( + options, + ow.object.exactShape({ + forefront: ow.optional.boolean, + }), + ); const { forefront = false } = options; @@ -255,7 +272,11 @@ export abstract class RequestProvider implements IStorage { const { requestId, wasAlreadyPresent } = newRequest; this._cacheRequest(cacheKey, newRequest); - if (!wasAlreadyPresent && !this.inProgress.has(requestId) && !this.recentlyHandledRequestsCache.get(requestId)) { + if ( + !wasAlreadyPresent && + !this.inProgress.has(requestId) && + !this.recentlyHandledRequestsCache.get(requestId) + ) { this.assumedTotalCount++; // Performance optimization: add request straight to head if possible @@ -275,32 +296,40 @@ export abstract class RequestProvider implements IStorage { * @param requests The requests to add * @param options Options for the request queue */ - async addRequestsBatched(requests: (string | Source)[], options: AddRequestsBatchedOptions = {}): Promise { + async addRequestsBatched( + requests: (string | Source)[], + options: AddRequestsBatchedOptions = {}, + ): Promise { checkStorageAccess(); - ow(requests, ow.array.ofType(ow.any( - ow.string, - ow.object.partialShape({ url: ow.string, id: ow.undefined }), - ow.object.partialShape({ requestsFromUrl: ow.string, regex: ow.optional.regExp }), - ))); - ow(options, ow.object.exactShape({ - forefront: ow.optional.boolean, - waitForAllRequestsToBeAdded: ow.optional.boolean, - batchSize: ow.optional.number, - waitBetweenBatchesMillis: ow.optional.number, - })); - - const { - batchSize = 1000, - waitBetweenBatchesMillis = 1000, - } = options; + ow( + requests, + ow.array.ofType( + ow.any( + ow.string, + ow.object.partialShape({ url: ow.string, id: ow.undefined }), + ow.object.partialShape({ requestsFromUrl: ow.string, regex: ow.optional.regExp }), + ), + ), + ); + ow( + options, + ow.object.exactShape({ + forefront: ow.optional.boolean, + waitForAllRequestsToBeAdded: ow.optional.boolean, + batchSize: ow.optional.number, + waitBetweenBatchesMillis: ow.optional.number, + }), + ); + + const { batchSize = 1000, waitBetweenBatchesMillis = 1000 } = options; const builtRequests: Request[] = []; for (const opts of requests) { if (opts && typeof opts === 'object' && 'requestsFromUrl' in opts) { await this.addRequest(opts, { forefront: options.forefront }); } else { - builtRequests.push(new Request(typeof opts === 'string' ? 
{ url: opts } : opts as RequestOptions)); + builtRequests.push(new Request(typeof opts === 'string' ? { url: opts } : (opts as RequestOptions))); } } @@ -312,9 +341,13 @@ export abstract class RequestProvider implements IStorage { if (apiResult.unprocessedRequests.length) { await sleep(waitBetweenBatchesMillis); - resultsToReturn.push(...await attemptToAddToQueueAndAddAnyUnprocessed( - providedRequests.filter((r) => !apiResult.processedRequests.some((pr) => pr.uniqueKey === r.uniqueKey)), - )); + resultsToReturn.push( + ...(await attemptToAddToQueueAndAddAnyUnprocessed( + providedRequests.filter( + (r) => !apiResult.processedRequests.some((pr) => pr.uniqueKey === r.uniqueKey), + ), + )), + ); } return resultsToReturn; @@ -339,7 +372,7 @@ export abstract class RequestProvider implements IStorage { const finalAddedRequests: ProcessedRequest[] = []; for (const requestChunk of chunks) { - finalAddedRequests.push(...await attemptToAddToQueueAndAddAnyUnprocessed(requestChunk)); + finalAddedRequests.push(...(await attemptToAddToQueueAndAddAnyUnprocessed(requestChunk))); await sleep(waitBetweenBatchesMillis); } @@ -349,7 +382,7 @@ export abstract class RequestProvider implements IStorage { // If the user wants to wait for all the requests to be added, we wait for the promise to resolve for them if (options.waitForAllRequestsToBeAdded) { - addedRequests.push(...await promise); + addedRequests.push(...(await promise)); } return { @@ -386,19 +419,27 @@ export abstract class RequestProvider implements IStorage { async markRequestHandled(request: Request): Promise { checkStorageAccess(); - ow(request, ow.object.partialShape({ - id: ow.string, - uniqueKey: ow.string, - handledAt: ow.optional.string, - })); + ow( + request, + ow.object.partialShape({ + id: ow.string, + uniqueKey: ow.string, + handledAt: ow.optional.string, + }), + ); if (!this.inProgress.has(request.id)) { - this.log.debug(`Cannot mark request ${request.id} as handled, because it is not in progress!`, { requestId: request.id }); + this.log.debug(`Cannot mark request ${request.id} as handled, because it is not in progress!`, { + requestId: request.id, + }); return null; } const handledAt = request.handledAt ?? new Date().toISOString(); - const queueOperationInfo = await this.client.updateRequest({ ...request, handledAt }) as RequestQueueOperationInfo; + const queueOperationInfo = (await this.client.updateRequest({ + ...request, + handledAt, + })) as RequestQueueOperationInfo; request.handledAt = handledAt; queueOperationInfo.uniqueKey = request.uniqueKey; @@ -420,27 +461,40 @@ export abstract class RequestProvider implements IStorage { * The request record in the queue is updated using the provided `request` parameter. * For example, this lets you store the number of retries or error messages for the request. 
*/ - async reclaimRequest(request: Request, options: RequestQueueOperationOptions = {}): Promise { + async reclaimRequest( + request: Request, + options: RequestQueueOperationOptions = {}, + ): Promise { checkStorageAccess(); - ow(request, ow.object.partialShape({ - id: ow.string, - uniqueKey: ow.string, - })); - ow(options, ow.object.exactShape({ - forefront: ow.optional.boolean, - })); + ow( + request, + ow.object.partialShape({ + id: ow.string, + uniqueKey: ow.string, + }), + ); + ow( + options, + ow.object.exactShape({ + forefront: ow.optional.boolean, + }), + ); const { forefront = false } = options; if (!this.inProgress.has(request.id)) { - this.log.debug(`Cannot reclaim request ${request.id}, because it is not in progress!`, { requestId: request.id }); + this.log.debug(`Cannot reclaim request ${request.id}, because it is not in progress!`, { + requestId: request.id, + }); return null; } // TODO: If request hasn't been changed since the last getRequest(), // we don't need to call updateRequest() and thus improve performance. - const queueOperationInfo = await this.client.updateRequest(request, { forefront }) as RequestQueueOperationInfo; + const queueOperationInfo = (await this.client.updateRequest(request, { + forefront, + })) as RequestQueueOperationInfo; queueOperationInfo.uniqueKey = request.uniqueKey; this._cacheRequest(getRequestId(request.uniqueKey), queueOperationInfo); @@ -448,7 +502,9 @@ export abstract class RequestProvider implements IStorage { // This is to compensate for the limitation of DynamoDB, where writes might not be immediately visible to subsequent reads. setTimeout(() => { if (!this.inProgress.has(request.id)) { - this.log.debug('The request is no longer marked as in progress in the queue?!', { requestId: request.id }); + this.log.debug('The request is no longer marked as in progress in the queue?!', { + requestId: request.id, + }); return; } @@ -543,7 +599,7 @@ export abstract class RequestProvider implements IStorage { */ async handledCount(): Promise { // NOTE: We keep this function for compatibility with RequestList.handledCount() - const { handledRequestCount } = await this.getInfo() ?? {}; + const { handledRequestCount } = (await this.getInfo()) ?? {}; return handledRequestCount ?? 0; } @@ -586,7 +642,11 @@ export abstract class RequestProvider implements IStorage { // Download remote resource and parse URLs. let urlsArr; try { - urlsArr = await this._downloadListOfUrls({ url: requestsFromUrl, urlRegExp: regex, proxyUrl: await this.proxyConfiguration?.newUrl() }); + urlsArr = await this._downloadListOfUrls({ + url: requestsFromUrl, + urlRegExp: regex, + proxyUrl: await this.proxyConfiguration?.newUrl(), + }); } catch (err) { throw new Error(`Cannot fetch a request list from ${requestsFromUrl}: ${err}`); } @@ -603,7 +663,11 @@ export abstract class RequestProvider implements IStorage { /** * Adds all fetched requests from a URL from a remote resource. 
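// ---- Editor's sketch (illustrative only, not part of the patch) ----
// The manual consumption loop that markRequestHandled()/reclaimRequest()
// above support; crawlers normally drive this for you:
import { RequestQueue } from 'crawlee';

const rq = await RequestQueue.open();
const request = await rq.fetchNextRequest();
if (request) {
    try {
        // ... do the actual work with `request` here ...
        await rq.markRequestHandled(request); // removes it from in-progress
    } catch {
        await rq.reclaimRequest(request, { forefront: true }); // retried soon
    }
}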
*/ - protected async _addFetchedRequests(source: InternalSource, fetchedRequests: RequestOptions[], options: RequestQueueOperationOptions) { + protected async _addFetchedRequests( + source: InternalSource, + fetchedRequests: RequestOptions[], + options: RequestQueueOperationOptions, + ) { const { requestsFromUrl, regex } = source; const { addedRequests } = await this.addRequestsBatched(fetchedRequests, options); @@ -622,7 +686,9 @@ export abstract class RequestProvider implements IStorage { /** * @internal wraps public utility for mocking purposes */ - private async _downloadListOfUrls(options: { url: string; urlRegExp?: RegExp; proxyUrl?: string }): Promise { + private async _downloadListOfUrls(options: { url: string; urlRegExp?: RegExp; proxyUrl?: string }): Promise< + string[] + > { return downloadListOfUrls(options); } @@ -646,11 +712,14 @@ export abstract class RequestProvider implements IStorage { checkStorageAccess(); ow(queueIdOrName, ow.optional.any(ow.string, ow.null)); - ow(options, ow.object.exactShape({ - config: ow.optional.object.instanceOf(Configuration), - storageClient: ow.optional.object, - proxyConfiguration: ow.optional.object, - })); + ow( + options, + ow.object.exactShape({ + config: ow.optional.object.instanceOf(Configuration), + storageClient: ow.optional.object, + proxyConfiguration: ow.optional.object, + }), + ); options.config ??= Configuration.getGlobalConfig(); options.storageClient ??= options.config.getStorageClient(); @@ -669,7 +738,9 @@ export abstract class RequestProvider implements IStorage { } declare class BuiltRequestProvider extends RequestProvider { - override fetchNextRequest(options?: RequestOptions | undefined): Promise | null>; + override fetchNextRequest( + options?: RequestOptions | undefined, + ): Promise | null>; protected override ensureHeadIsNonEmpty(): Promise; } diff --git a/packages/core/src/storages/request_queue.ts b/packages/core/src/storages/request_queue.ts index d4e6aa35b391..2334b187b3a8 100644 --- a/packages/core/src/storages/request_queue.ts +++ b/packages/core/src/storages/request_queue.ts @@ -1,8 +1,7 @@ import { setTimeout as sleep } from 'node:timers/promises'; import { REQUEST_QUEUE_HEAD_MAX_LIMIT } from '@apify/consts'; -import type { Dictionary, -} from '@crawlee/types'; +import type { Dictionary } from '@crawlee/types'; import { checkStorageAccess } from './access_checking'; import type { RequestProviderOptions } from './request_provider'; @@ -88,12 +87,15 @@ export class RequestQueue extends RequestProvider { * @internal */ constructor(options: RequestProviderOptions, config = Configuration.getGlobalConfig()) { - super({ - ...options, - logPrefix: 'RequestQueue', - recentlyHandledRequestsMaxSize: RECENTLY_HANDLED_CACHE_SIZE, - requestCacheMaxSize: MAX_CACHED_REQUESTS, - }, config); + super( + { + ...options, + logPrefix: 'RequestQueue', + recentlyHandledRequestsMaxSize: RECENTLY_HANDLED_CACHE_SIZE, + requestCacheMaxSize: MAX_CACHED_REQUESTS, + }, + config, + ); } /** @@ -153,7 +155,9 @@ export class RequestQueue extends RequestProvider { // into the queueHeadDict straight again. After the interval expires, fetchNextRequest() // will try to fetch this request again, until it eventually appears in the main table. 
if (!request) { - this.log.debug('Cannot find a request from the beginning of queue, will be retried later', { nextRequestId }); + this.log.debug('Cannot find a request from the beginning of queue, will be retried later', { + nextRequestId, + }); setTimeout(() => { this.inProgress.delete(nextRequestId); }, STORAGE_CONSISTENCY_DELAY_MILLIS); @@ -212,7 +216,13 @@ export class RequestQueue extends RequestProvider { .then(({ items, queueModifiedAt, hadMultipleClients }) => { items.forEach(({ id: requestId, uniqueKey }) => { // Queue head index might be behind the main table, so ensure we don't recycle requests - if (!requestId || !uniqueKey || this.inProgress.has(requestId) || this.recentlyHandledRequestsCache.get(requestId!)) return; + if ( + !requestId || + !uniqueKey || + this.inProgress.has(requestId) || + this.recentlyHandledRequestsCache.get(requestId!) + ) + return; this.queueHeadIds.add(requestId, requestId, false); this._cacheRequest(getRequestId(uniqueKey), { @@ -236,7 +246,8 @@ export class RequestQueue extends RequestProvider { }); } - const { queueModifiedAt, wasLimitReached, prevLimit, queryStartedAt, hadMultipleClients } = await this.queryQueueHeadPromise; + const { queueModifiedAt, wasLimitReached, prevLimit, queryStartedAt, hadMultipleClients } = + await this.queryQueueHeadPromise; // TODO: I feel this code below can be greatly simplified... @@ -250,9 +261,8 @@ export class RequestQueue extends RequestProvider { if (prevLimit >= REQUEST_QUEUE_HEAD_MAX_LIMIT) { this.log.warning(`Reached the maximum number of requests in progress: ${REQUEST_QUEUE_HEAD_MAX_LIMIT}.`); } - const shouldRepeatWithHigherLimit = this.queueHeadIds.length() === 0 - && wasLimitReached - && prevLimit < REQUEST_QUEUE_HEAD_MAX_LIMIT; + const shouldRepeatWithHigherLimit = + this.queueHeadIds.length() === 0 && wasLimitReached && prevLimit < REQUEST_QUEUE_HEAD_MAX_LIMIT; // If ensureConsistency=true then we must ensure that either: // - queueModifiedAt is older than queryStartedAt by at least API_PROCESSED_REQUESTS_DELAY_MILLIS @@ -269,14 +279,14 @@ export class RequestQueue extends RequestProvider { // If this is reached then we return false so that empty() and finished() returns possibly false negative. if (!shouldRepeatWithHigherLimit && iteration > MAX_QUERIES_FOR_CONSISTENCY) return false; - const nextLimit = shouldRepeatWithHigherLimit - ? Math.round(prevLimit * 1.5) - : prevLimit; + const nextLimit = shouldRepeatWithHigherLimit ? Math.round(prevLimit * 1.5) : prevLimit; // If we are repeating for consistency then wait required time. 
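// ---- Editor's sketch (illustrative only, not part of the patch) ----
// The retry policy encoded above, in isolation: an empty but truncated head
// listing is re-queried with a 1.5x larger limit, capped by the constant
// from @apify/consts (the value shown here is a placeholder):
const REQUEST_QUEUE_HEAD_MAX_LIMIT = 10_000;

function nextHeadLimit(prevLimit: number, headIsEmpty: boolean, wasLimitReached: boolean): number {
    const shouldRepeatWithHigherLimit = headIsEmpty && wasLimitReached && prevLimit < REQUEST_QUEUE_HEAD_MAX_LIMIT;
    return shouldRepeatWithHigherLimit ? Math.round(prevLimit * 1.5) : prevLimit;
}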
if (shouldRepeatForConsistency) { const delayMillis = API_PROCESSED_REQUESTS_DELAY_MILLIS - (Date.now() - +queueModifiedAt); - this.log.info(`Waiting for ${delayMillis}ms before considering the queue as finished to ensure that the data is consistent.`); + this.log.info( + `Waiting for ${delayMillis}ms before considering the queue as finished to ensure that the data is consistent.`, + ); await sleep(delayMillis); } @@ -287,8 +297,10 @@ export class RequestQueue extends RequestProvider { override async isFinished(): Promise { checkStorageAccess(); - if ((Date.now() - +this.lastActivity) > this.internalTimeoutMillis) { - const message = `The request queue seems to be stuck for ${this.internalTimeoutMillis / 1e3}s, resetting internal state.`; + if (Date.now() - +this.lastActivity > this.internalTimeoutMillis) { + const message = `The request queue seems to be stuck for ${ + this.internalTimeoutMillis / 1e3 + }s, resetting internal state.`; this.log.warning(message, { inProgress: [...this.inProgress] }); this._reset(); } diff --git a/packages/core/src/storages/request_queue_v2.ts b/packages/core/src/storages/request_queue_v2.ts index 0808bd4a91dc..976b01b2bd96 100644 --- a/packages/core/src/storages/request_queue_v2.ts +++ b/packages/core/src/storages/request_queue_v2.ts @@ -3,10 +3,7 @@ import type { Dictionary } from '@crawlee/types'; import { checkStorageAccess } from './access_checking'; import type { RequestQueueOperationInfo, RequestProviderOptions } from './request_provider'; import { RequestProvider } from './request_provider'; -import { - STORAGE_CONSISTENCY_DELAY_MILLIS, - getRequestId, -} from './utils'; +import { STORAGE_CONSISTENCY_DELAY_MILLIS, getRequestId } from './utils'; import { Configuration } from '../configuration'; import { EventType } from '../events'; import type { Request } from '../request'; @@ -25,12 +22,15 @@ class RequestQueue extends RequestProvider { private _listHeadAndLockPromise: Promise | null = null; constructor(options: RequestProviderOptions, config = Configuration.getGlobalConfig()) { - super({ - ...options, - logPrefix: 'RequestQueue2', - recentlyHandledRequestsMaxSize: RECENTLY_HANDLED_CACHE_SIZE, - requestCacheMaxSize: MAX_CACHED_REQUESTS, - }, config); + super( + { + ...options, + logPrefix: 'RequestQueue2', + recentlyHandledRequestsMaxSize: RECENTLY_HANDLED_CACHE_SIZE, + requestCacheMaxSize: MAX_CACHED_REQUESTS, + }, + config, + ); const eventManager = config.getEventManager(); @@ -119,7 +119,9 @@ class RequestQueue extends RequestProvider { // into the queueHeadDict straight again. After the interval expires, fetchNextRequest() // will try to fetch this request again, until it eventually appears in the main table. 
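// ---- Editor's sketch (illustrative only, not part of the patch) ----
// Opening the lock-based queue implemented in this file; crawlee re-exports
// it as RequestQueueV2 (see the test imports later in this patch):
import { RequestQueueV2 } from 'crawlee';

const queueV2 = await RequestQueueV2.open('my-queue');
await queueV2.addRequests([{ url: 'https://example.com' }]);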
if (!request) { - this.log.debug('Cannot find a request from the beginning of queue or lost lock, will be retried later', { nextRequestId }); + this.log.debug('Cannot find a request from the beginning of queue or lost lock, will be retried later', { + nextRequestId, + }); setTimeout(() => { this.inProgress.delete(nextRequestId); @@ -141,7 +143,9 @@ class RequestQueue extends RequestProvider { return request; } - override async reclaimRequest(...args: Parameters): ReturnType { + override async reclaimRequest( + ...args: Parameters + ): ReturnType { checkStorageAccess(); const res = await super.reclaimRequest(...args); @@ -201,7 +205,9 @@ class RequestQueue extends RequestProvider { } } - private async getOrHydrateRequest(requestId: string): Promise | null> { + private async getOrHydrateRequest( + requestId: string, + ): Promise | null> { checkStorageAccess(); const cachedEntry = this.requestCache.get(requestId); @@ -293,9 +299,12 @@ class RequestQueue extends RequestProvider { return res.lockExpiresAt; } catch (err: any) { // Most likely we do not own the lock anymore - this.log.warning(`Failed to prolong lock for cached request ${requestId}, either lost the lock or the request was already handled\n`, { - err, - }); + this.log.warning( + `Failed to prolong lock for cached request ${requestId}, either lost the lock or the request was already handled\n`, + { + err, + }, + ); return null; } diff --git a/packages/core/src/storages/storage_manager.ts b/packages/core/src/storages/storage_manager.ts index 2e369654d790..86eacb7dc6ae 100644 --- a/packages/core/src/storages/storage_manager.ts +++ b/packages/core/src/storages/storage_manager.ts @@ -108,11 +108,15 @@ export class StorageManager { /** * Helper function that first requests storage by ID and if storage doesn't exist then gets it by name. */ - protected async _getOrCreateStorage(storageIdOrName: string, storageConstructorName: string, apiClient: StorageClient) { - const { - createStorageClient, - createStorageCollectionClient, - } = this._getStorageClientFactories(apiClient, storageConstructorName); + protected async _getOrCreateStorage( + storageIdOrName: string, + storageConstructorName: string, + apiClient: StorageClient, + ) { + const { createStorageClient, createStorageCollectionClient } = this._getStorageClientFactories( + apiClient, + storageConstructorName, + ); const storageClient = createStorageClient(storageIdOrName); const existingStorage = await storageClient.get(); @@ -124,7 +128,7 @@ export class StorageManager { protected _getStorageClientFactories(client: StorageClient, storageConstructorName: string) { // Dataset => dataset - const clientName = storageConstructorName[0].toLowerCase() + storageConstructorName.slice(1) as ClientNames; + const clientName = (storageConstructorName[0].toLowerCase() + storageConstructorName.slice(1)) as ClientNames; // dataset => datasets const collectionClientName = `${clientName}s` as ClientCollectionNames; diff --git a/packages/core/src/storages/utils.ts b/packages/core/src/storages/utils.ts index 72b08b53211c..87ef3c5d8d38 100644 --- a/packages/core/src/storages/utils.ts +++ b/packages/core/src/storages/utils.ts @@ -47,14 +47,14 @@ export async function purgeDefaultStorages( configOrOptions?: Configuration | PurgeDefaultStorageOptions, client?: StorageClient, ) { - const options: PurgeDefaultStorageOptions = configOrOptions instanceof Configuration ? { - client, - config: configOrOptions, - } : configOrOptions ?? 
{}; - const { - config = Configuration.getGlobalConfig(), - onlyPurgeOnce = false, - } = options; + const options: PurgeDefaultStorageOptions = + configOrOptions instanceof Configuration + ? { + client, + config: configOrOptions, + } + : configOrOptions ?? {}; + const { config = Configuration.getGlobalConfig(), onlyPurgeOnce = false } = options; ({ client = config.getStorageClient() } = options); const casted = client as StorageClient & { __purged?: boolean }; @@ -89,7 +89,9 @@ export async function useState( defaultValue = {} as State, options?: UseStateOptions, ) { - const kvStore = await KeyValueStore.open(options?.keyValueStoreName, { config: options?.config || Configuration.getGlobalConfig() }); + const kvStore = await KeyValueStore.open(options?.keyValueStoreName, { + config: options?.config || Configuration.getGlobalConfig(), + }); return kvStore.getAutoSavedValue(name || 'CRAWLEE_GLOBAL_STATE', defaultValue); } @@ -103,11 +105,7 @@ export async function useState( * @internal */ export function getRequestId(uniqueKey: string) { - const str = crypto - .createHash('sha256') - .update(uniqueKey) - .digest('base64') - .replace(/[+/=]/g, ''); + const str = crypto.createHash('sha256').update(uniqueKey).digest('base64').replace(/[+/=]/g, ''); return str.slice(0, 15); } diff --git a/packages/core/src/typedefs.ts b/packages/core/src/typedefs.ts index 66fc714ecd73..6f2dcaf39a5a 100644 --- a/packages/core/src/typedefs.ts +++ b/packages/core/src/typedefs.ts @@ -16,4 +16,13 @@ export function keys(obj: T) { return Object.keys(obj) as (keyof T)[]; } -export declare type AllowedHttpMethods = 'GET' | 'HEAD' | 'POST' | 'PUT' | 'DELETE' | 'TRACE' | 'OPTIONS' | 'CONNECT' | 'PATCH'; +export declare type AllowedHttpMethods = + | 'GET' + | 'HEAD' + | 'POST' + | 'PUT' + | 'DELETE' + | 'TRACE' + | 'OPTIONS' + | 'CONNECT' + | 'PATCH'; diff --git a/packages/core/test/enqueue_links/userData.test.ts b/packages/core/test/enqueue_links/userData.test.ts index 722f4fbf6d6d..298f27ba4ab7 100644 --- a/packages/core/test/enqueue_links/userData.test.ts +++ b/packages/core/test/enqueue_links/userData.test.ts @@ -33,7 +33,7 @@ function createRequestQueueMock() { return { enqueued, requestQueue }; } -describe('enqueueLinks() - userData shouldn\'t be changed and outer label must take priority', () => { +describe("enqueueLinks() - userData shouldn't be changed and outer label must take priority", () => { let ll: number; beforeAll(() => { ll = log.getLevel(); @@ -83,7 +83,7 @@ describe('enqueueLinks() - userData shouldn\'t be changed and outer label must t expect(enqueued[1].userData.label).toBe('second'); }); - test('JSON string of userData shouldn\'t change, but enqueued label should be different', async () => { + test("JSON string of userData shouldn't change, but enqueued label should be different", async () => { const { enqueued, requestQueue } = createRequestQueueMock(); const userData = { foo: 'bar', label: 'bogus' }; diff --git a/packages/core/test/request-queue/adding-the-same-request-should-not-call-the-api.test.ts b/packages/core/test/request-queue/adding-the-same-request-should-not-call-the-api.test.ts index 5e8e0ebd8713..086c22dcc7a3 100644 --- a/packages/core/test/request-queue/adding-the-same-request-should-not-call-the-api.test.ts +++ b/packages/core/test/request-queue/adding-the-same-request-should-not-call-the-api.test.ts @@ -12,7 +12,9 @@ afterAll(() => { let requestQueueInfo: RequestQueueInfo; beforeAll(async () => { - requestQueueInfo = await 
Configuration.getStorageClient().requestQueues().getOrCreate('test-request-queue-not-called-on-cached-request');
+    requestQueueInfo = await Configuration.getStorageClient()
+        .requestQueues()
+        .getOrCreate('test-request-queue-not-called-on-cached-request');
 });

 describe('RequestQueue#addRequest should not call the API if the request is already in the queue', () => {
@@ -43,7 +45,10 @@ describe('RequestQueue#addRequests should not call the API if the request is alr

         expect(clientSpy).toHaveBeenCalledTimes(1);

-        await requestQueue.markRequestHandled({ id: requestData.processedRequests[0].requestId, uniqueKey: requestData.processedRequests[0].uniqueKey } as any);
+        await requestQueue.markRequestHandled({
+            id: requestData.processedRequests[0].requestId,
+            uniqueKey: requestData.processedRequests[0].uniqueKey,
+        } as any);

         await requestQueue.addRequests([{ url: 'https://example2.com' }]);

diff --git a/packages/core/test/request-queue/request-queue-v2.test.ts b/packages/core/test/request-queue/request-queue-v2.test.ts
index db138122af84..a9c85ddd46de 100644
--- a/packages/core/test/request-queue/request-queue-v2.test.ts
+++ b/packages/core/test/request-queue/request-queue-v2.test.ts
@@ -1,6 +1,13 @@
 /* eslint-disable dot-notation */
 import { MemoryStorage } from '@crawlee/memory-storage';
-import type { ListAndLockHeadResult, ListAndLockOptions, ListOptions, ProlongRequestLockOptions, ProlongRequestLockResult, QueueHead } from '@crawlee/types';
+import type {
+    ListAndLockHeadResult,
+    ListAndLockOptions,
+    ListOptions,
+    ProlongRequestLockOptions,
+    ProlongRequestLockResult,
+    QueueHead,
+} from '@crawlee/types';
 import { RequestQueueV2 } from 'crawlee';
 import type { SpyInstance } from 'vitest';

@@ -13,10 +20,7 @@ async function makeQueue(name: string, numOfRequestsToAdd = 0) {

     if (numOfRequestsToAdd) {
         await queue.addRequests(
-            Array.from(
-                { length: numOfRequestsToAdd },
-                (_, i) => ({ url: 'https://example.com', uniqueKey: `${i}` }),
-            ),
+            Array.from({ length: numOfRequestsToAdd }, (_, i) => ({ url: 'https://example.com', uniqueKey: `${i}` })),
         );
     }

@@ -92,7 +96,10 @@ describe('RequestQueueV2#fetchNextRequest should use locking API', () => {
     let queue: RequestQueueV2;
     let clientListHeadSpy: SpyInstance<[options?: ListOptions | undefined], Promise<QueueHead>>;
     let clientListAndLockHeadSpy: SpyInstance<[options: ListAndLockOptions], Promise<ListAndLockHeadResult>>;
-    let clientProlongLockSpy: SpyInstance<[id: string, options: ProlongRequestLockOptions], Promise<ProlongRequestLockResult>>;
+    let clientProlongLockSpy: SpyInstance<
+        [id: string, options: ProlongRequestLockOptions],
+        Promise<ProlongRequestLockResult>
+    >;
     let listAndLockHeadCallCount = 0;

     beforeAll(async () => {
diff --git a/packages/core/tsconfig.build.json b/packages/core/tsconfig.build.json
index 856db0f2100a..2e04e9e9f921 100644
--- a/packages/core/tsconfig.build.json
+++ b/packages/core/tsconfig.build.json
@@ -1,7 +1,7 @@
 {
-  "extends": "../../tsconfig.build.json",
-  "compilerOptions": {
-    "outDir": "./dist"
-  },
-  "include": ["src/**/*"]
+    "extends": "../../tsconfig.build.json",
+    "compilerOptions": {
+        "outDir": "./dist"
+    },
+    "include": ["src/**/*"]
 }
diff --git a/packages/crawlee/tsconfig.build.json b/packages/crawlee/tsconfig.build.json
index 856db0f2100a..2e04e9e9f921 100644
--- a/packages/crawlee/tsconfig.build.json
+++ b/packages/crawlee/tsconfig.build.json
@@ -1,7 +1,7 @@
 {
-  "extends": "../../tsconfig.build.json",
-  "compilerOptions": {
-    "outDir": "./dist"
-  },
-  "include": 
["src/**/*"] } diff --git a/packages/http-crawler/src/internals/http-crawler.ts b/packages/http-crawler/src/internals/http-crawler.ts index 5549d09257bf..d019cbbb60ab 100644 --- a/packages/http-crawler/src/internals/http-crawler.ts +++ b/packages/http-crawler/src/internals/http-crawler.ts @@ -33,7 +33,14 @@ import * as cheerio from 'cheerio'; import type { RequestLike, ResponseLike } from 'content-type'; import contentTypeParser from 'content-type'; // @ts-expect-error This throws a compilation error due to got-scraping being ESM only but we only import types, so its alllll gooooood -import type { OptionsInit, Method, Request as GotRequest, Options, PlainResponse, TimeoutError as TimeoutErrorClass } from 'got-scraping'; +import type { + OptionsInit, + Method, + Request as GotRequest, + Options, + PlainResponse, + TimeoutError as TimeoutErrorClass, +} from 'got-scraping'; import iconv from 'iconv-lite'; import mime from 'mime-types'; import ow from 'ow'; @@ -60,9 +67,10 @@ const HTTP_OPTIMIZED_AUTOSCALED_POOL_OPTIONS: AutoscaledPoolOptions = { export type HttpErrorHandler< UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler JSONData extends JsonValue = any, // with default to Dictionary we cant use a typed router in untyped crawler - > = ErrorHandler>; +> = ErrorHandler>; -export interface HttpCrawlerOptions extends BasicCrawlerOptions { +export interface HttpCrawlerOptions + extends BasicCrawlerOptions { /** * An alias for {@apilink HttpCrawlerOptions.requestHandler} * Soon to be removed, use `requestHandler` instead. @@ -173,10 +181,7 @@ export interface HttpCrawlerOptions = ( - crawlingContext: Context, - gotOptions: OptionsInit, -) => Awaitable; +export type InternalHttpHook = (crawlingContext: Context, gotOptions: OptionsInit) => Awaitable; export type HttpHook< UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler @@ -190,14 +195,14 @@ export interface InternalHttpCrawlingContext< UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler JSONData extends JsonValue = any, // with default to Dictionary we cant use a typed router in untyped crawler Crawler = HttpCrawler, - > extends CrawlingContext { +> extends CrawlingContext { /** * The request body of the web page. * The type depends on the `Content-Type` header of the web page: * - String for `text/html`, `application/xhtml+xml`, `application/xml` MIME content types * - Buffer for others MIME content types */ - body: (string | Buffer); + body: string | Buffer; /** * The parsed object from JSON string if the response contains the content type application/json. @@ -219,7 +224,7 @@ export interface HttpCrawlingContext = RequestHandler>; +> = RequestHandler>; /** * Provides a framework for the parallel crawling of web pages using plain HTTP requests. @@ -288,7 +293,9 @@ export type HttpRequestHandler< * ``` * @category Crawlers */ -export class HttpCrawler>> extends BasicCrawler { +export class HttpCrawler< + Context extends InternalHttpCrawlingContext>, +> extends BasicCrawler { /** * A reference to the underlying {@apilink ProxyConfiguration} class that manages the crawler's proxies. * Only available if used by the crawler. 
@@ -329,7 +336,10 @@ export class HttpCrawler = {}, override readonly config = Configuration.getGlobalConfig()) { + constructor( + options: HttpCrawlerOptions = {}, + override readonly config = Configuration.getGlobalConfig(), + ) { ow(options, 'HttpCrawlerOptions', ow.object.exactShape(HttpCrawler.optionsShape)); const { @@ -357,14 +367,18 @@ export class HttpCrawler { + protected async _requestFunction({ + request, + session, + proxyUrl, + gotOptions, + }: RequestFunctionOptions): Promise { if (!TimeoutError) { ({ TimeoutError } = await import('got-scraping')); } @@ -689,11 +715,20 @@ export class HttpCrawler Promise.resolve({ processedRequests: [], unprocessedRequests: [] }) }; + return { + body, + response, + contentType, + enqueueLinks: async () => Promise.resolve({ processedRequests: [], unprocessedRequests: [] }), + }; } } - protected async _parseHTML(response: IncomingMessage, _isXml: boolean, _crawlingContext: Context): Promise> { + protected async _parseHTML( + response: IncomingMessage, + _isXml: boolean, + _crawlingContext: Context, + ): Promise> { return { body: await concatStreamToBuffer(response), } as Partial; @@ -737,7 +772,11 @@ export class HttpCrawler = ErrorHandler>; +> = ErrorHandler>; export interface JSDOMCrawlerOptions< UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler JSONData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler - > extends HttpCrawlerOptions> { +> extends HttpCrawlerOptions> { /** * Download and run scripts. */ @@ -49,12 +49,12 @@ export interface JSDOMCrawlerOptions< export type JSDOMHook< UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler JSONData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler - > = InternalHttpHook>; +> = InternalHttpHook>; export interface JSDOMCrawlingContext< UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler JSONData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler - > extends InternalHttpCrawlingContext { +> extends InternalHttpCrawlingContext { window: DOMWindow; document: Document; @@ -75,7 +75,7 @@ export interface JSDOMCrawlingContext< export type JSDOMRequestHandler< UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler JSONData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler - > = RequestHandler>; +> = RequestHandler>; /** * Provides a framework for the parallel crawling of web pages using plain HTTP requests and @@ -153,7 +153,8 @@ export type JSDOMRequestHandler< const resources = new ResourceLoader({ // Copy from /packages/browser-pool/src/abstract-classes/browser-plugin.ts:17 // in order not to include the entire package here - userAgent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36', + userAgent: + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36', }); export class JSDOMCrawler extends HttpCrawler { @@ -168,11 +169,7 @@ export class JSDOMCrawler extends HttpCrawler { protected virtualConsole: VirtualConsole | null = null; constructor(options: JSDOMCrawlerOptions = {}, config?: Configuration) { - const { - runScripts = false, - 
hideInternalConsole = false, - ...httpOptions - } = options; + const { runScripts = false, hideInternalConsole = false, ...httpOptions } = options; super(httpOptions, config); @@ -217,7 +214,11 @@ export class JSDOMCrawler extends HttpCrawler { context.window?.close(); } - protected override async _parseHTML(response: IncomingMessage, isXml: boolean, crawlingContext: JSDOMCrawlingContext) { + protected override async _parseHTML( + response: IncomingMessage, + isXml: boolean, + crawlingContext: JSDOMCrawlingContext, + ) { const body = await concatStreamToBuffer(response); const { window } = new JSDOM(body, { @@ -245,20 +246,28 @@ export class JSDOMCrawler extends HttpCrawler { }); window.document.createRange = () => { const range = new window.Range(); - range.getBoundingClientRect = () => ({} as any); + range.getBoundingClientRect = () => ({}) as any; range.getClientRects = () => ({ item: () => null as any, length: 0 }) as any; return range; }; if (this.runScripts) { try { - await addTimeoutToPromise(async () => { - return new Promise((resolve) => { - window.addEventListener('load', () => { - resolve(); - }, false); - }).catch(); - }, 10_000, 'Window.load event not fired after 10 seconds.').catch(); + await addTimeoutToPromise( + async () => { + return new Promise((resolve) => { + window.addEventListener( + 'load', + () => { + resolve(); + }, + false, + ); + }).catch(); + }, + 10_000, + 'Window.load event not fired after 10 seconds.', + ).catch(); } catch (e) { this.log.debug((e as Error).message); } @@ -299,7 +308,13 @@ interface EnqueueLinksInternalOptions { } /** @internal */ -export async function domCrawlerEnqueueLinks({ options, window, requestQueue, originalRequestUrl, finalRequestUrl }: EnqueueLinksInternalOptions) { +export async function domCrawlerEnqueueLinks({ + options, + window, + requestQueue, + originalRequestUrl, + finalRequestUrl, +}: EnqueueLinksInternalOptions) { if (!window) { throw new Error('Cannot enqueue links because the JSDOM is not available.'); } @@ -311,7 +326,11 @@ export async function domCrawlerEnqueueLinks({ options, window, requestQueue, or userProvidedBaseUrl: options?.baseUrl, }); - const urls = extractUrlsFromWindow(window, options?.selector ?? 'a', options?.baseUrl ?? finalRequestUrl ?? originalRequestUrl); + const urls = extractUrlsFromWindow( + window, + options?.selector ?? 'a', + options?.baseUrl ?? finalRequestUrl ?? 
originalRequestUrl,
+    );

     return enqueueLinks({
         requestQueue,
diff --git a/packages/jsdom-crawler/tsconfig.build.json b/packages/jsdom-crawler/tsconfig.build.json
index 856db0f2100a..2e04e9e9f921 100644
--- a/packages/jsdom-crawler/tsconfig.build.json
+++ b/packages/jsdom-crawler/tsconfig.build.json
@@ -1,7 +1,7 @@
 {
-    "extends": "../../tsconfig.build.json",
-    "compilerOptions": {
-        "outDir": "./dist"
-    },
-    "include": ["src/**/*"]
+    "extends": "../../tsconfig.build.json",
+    "compilerOptions": {
+        "outDir": "./dist"
+    },
+    "include": ["src/**/*"]
 }
diff --git a/packages/linkedom-crawler/src/internals/linkedom-crawler.ts b/packages/linkedom-crawler/src/internals/linkedom-crawler.ts
index b59931ca443f..8833f259573e 100644
--- a/packages/linkedom-crawler/src/internals/linkedom-crawler.ts
+++ b/packages/linkedom-crawler/src/internals/linkedom-crawler.ts
@@ -12,7 +12,13 @@ import type {
     RouterRoutes,
     RequestProvider,
 } from '@crawlee/http';
-import { HttpCrawler, enqueueLinks, Router, resolveBaseUrlForEnqueueLinksFiltering, tryAbsoluteURL } from '@crawlee/http';
+import {
+    HttpCrawler,
+    enqueueLinks,
+    Router,
+    resolveBaseUrlForEnqueueLinksFiltering,
+    tryAbsoluteURL,
+} from '@crawlee/http';
 import type { Dictionary } from '@crawlee/types';
 import type * as cheerio from 'cheerio';
 // @ts-expect-error This throws a compilation error due to TypeScript not inferring the module has CJS versions too
@@ -21,24 +27,24 @@ import { DOMParser } from 'linkedom/cached';
 export type LinkeDOMErrorHandler<
     UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler
     JSONData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler
-    > = ErrorHandler<LinkeDOMCrawlingContext<UserData, JSONData>>;
+> = ErrorHandler<LinkeDOMCrawlingContext<UserData, JSONData>>;

 export interface LinkeDOMCrawlerOptions<
     UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler
     JSONData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler
-    > extends HttpCrawlerOptions<LinkeDOMCrawlingContext<UserData, JSONData>> {}
+> extends HttpCrawlerOptions<LinkeDOMCrawlingContext<UserData, JSONData>> {}

 export interface LinkeDOMCrawlerEnqueueLinksOptions extends Omit<EnqueueLinksOptions, 'requestQueue'> {}

 export type LinkeDOMHook<
     UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler
     JSONData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler
-    > = InternalHttpHook<LinkeDOMCrawlingContext<UserData, JSONData>>;
+> = InternalHttpHook<LinkeDOMCrawlingContext<UserData, JSONData>>;

 export interface LinkeDOMCrawlingContext<
     UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler
     JSONData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler
-    > extends InternalHttpCrawlingContext<UserData, JSONData, LinkeDOMCrawler> {
+> extends InternalHttpCrawlingContext<UserData, JSONData, LinkeDOMCrawler> {
     window: Window;
     // Technically the document is not of type Document but of type either HTMLDocument or XMLDocument
     // from linkedom/types/{html/xml}/document, depending on the content type of the response
@@ -64,7 +70,7 @@ export interface LinkeDOMCrawlingContext<
 export type LinkeDOMRequestHandler<
     UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler
     JSONData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler
-    > = RequestHandler<LinkeDOMCrawlingContext<UserData, JSONData>>;
+> = RequestHandler<LinkeDOMCrawlingContext<UserData, JSONData>>;

 /**
  * Provides a framework for the parallel crawling of web pages using plain HTTP requests and
@@ -140,7 +146,11 @@ export type LinkeDOMRequestHandler<
 export 
class LinkeDOMCrawler extends HttpCrawler { private static parser = new DOMParser(); - protected override async _parseHTML(response: IncomingMessage, isXml: boolean, crawlingContext: LinkeDOMCrawlingContext) { + protected override async _parseHTML( + response: IncomingMessage, + isXml: boolean, + crawlingContext: LinkeDOMCrawlingContext, + ) { const body = await concatStreamToBuffer(response); const document = LinkeDOMCrawler.parser.parseFromString(body.toString(), isXml ? 'text/xml' : 'text/html'); @@ -176,7 +186,13 @@ interface EnqueueLinksInternalOptions { } /** @internal */ -export async function linkedomCrawlerEnqueueLinks({ options, window, requestQueue, originalRequestUrl, finalRequestUrl }: EnqueueLinksInternalOptions) { +export async function linkedomCrawlerEnqueueLinks({ + options, + window, + requestQueue, + originalRequestUrl, + finalRequestUrl, +}: EnqueueLinksInternalOptions) { if (!window) { throw new Error('Cannot enqueue links because the DOM is not available.'); } @@ -188,7 +204,11 @@ export async function linkedomCrawlerEnqueueLinks({ options, window, requestQueu userProvidedBaseUrl: options?.baseUrl, }); - const urls = extractUrlsFromWindow(window, options?.selector ?? 'a', options?.baseUrl ?? finalRequestUrl ?? originalRequestUrl); + const urls = extractUrlsFromWindow( + window, + options?.selector ?? 'a', + options?.baseUrl ?? finalRequestUrl ?? originalRequestUrl, + ); return enqueueLinks({ requestQueue, diff --git a/packages/linkedom-crawler/tsconfig.build.json b/packages/linkedom-crawler/tsconfig.build.json index 856db0f2100a..2e04e9e9f921 100644 --- a/packages/linkedom-crawler/tsconfig.build.json +++ b/packages/linkedom-crawler/tsconfig.build.json @@ -1,7 +1,7 @@ { - "extends": "../../tsconfig.build.json", - "compilerOptions": { - "outDir": "./dist" - }, - "include": ["src/**/*"] + "extends": "../../tsconfig.build.json", + "compilerOptions": { + "outDir": "./dist" + }, + "include": ["src/**/*"] } diff --git a/packages/memory-storage/src/background-handler/fs-utils.ts b/packages/memory-storage/src/background-handler/fs-utils.ts index 91368b6365e1..ea1419b42fc2 100644 --- a/packages/memory-storage/src/background-handler/fs-utils.ts +++ b/packages/memory-storage/src/background-handler/fs-utils.ts @@ -19,7 +19,9 @@ export async function handleMessage(message: BackgroundHandlerReceivedMessage) { default: // We're keeping this to make eslint happy + in the event we add a new action without adding checks for it // we should be aware of them - backgroundHandlerLog.warning(`Unknown background handler message action ${(message as BackgroundHandlerReceivedMessage).action}`); + backgroundHandlerLog.warning( + `Unknown background handler message action ${(message as BackgroundHandlerReceivedMessage).action}`, + ); } } @@ -38,18 +40,29 @@ async function updateMetadata(message: BackgroundHandlerUpdateMetadataMessage) { await writeFileP(filePath, JSON.stringify(message.data, null, '\t')); } -export async function lockAndWrite(filePath: string, data: unknown, stringify = true, retry = 10, timeout = 10): Promise { - await lockAndCallback(filePath, async () => { - await new Promise((pResolve, reject) => { - writeFile(filePath, stringify ? 
JSON.stringify(data, null, '\t') : data as Buffer, (err) => {
-                if (err) {
-                    reject(err);
-                } else {
-                    pResolve();
-                }
+export async function lockAndWrite(
+    filePath: string,
+    data: unknown,
+    stringify = true,
+    retry = 10,
+    timeout = 10,
+): Promise<void> {
+    await lockAndCallback(
+        filePath,
+        async () => {
+            await new Promise<void>((pResolve, reject) => {
+                writeFile(filePath, stringify ? JSON.stringify(data, null, '\t') : (data as Buffer), (err) => {
+                    if (err) {
+                        reject(err);
+                    } else {
+                        pResolve();
+                    }
+                });
+            });
-        });
-    }, retry, timeout);
+        },
+        retry,
+        timeout,
+    );
 }

 export async function lockAndCallback<Callback extends (...args: any[]) => Promise<any>>(
diff --git a/packages/memory-storage/src/background-handler/index.ts b/packages/memory-storage/src/background-handler/index.ts
index 94734d02e77c..4a9f8eb3b9fe 100644
--- a/packages/memory-storage/src/background-handler/index.ts
+++ b/packages/memory-storage/src/background-handler/index.ts
@@ -8,10 +8,13 @@ import type { BackgroundHandlerReceivedMessage } from '../utils';
  * This is used in MemoryStorage#teardown to wait for all tasks to finish executing before exiting the process.
  * @internal
  */
-export const promiseMap: Map<string, {
-    promise: Promise<void>;
-    resolve: () => void;
-}> = new Map();
+export const promiseMap: Map<
+    string,
+    {
+        promise: Promise<void>;
+        resolve: () => void;
+    }
+> = new Map();

 export function scheduleBackgroundTask(message: BackgroundHandlerReceivedMessage) {
     const id = randomUUID();
diff --git a/packages/memory-storage/src/body-parser.ts b/packages/memory-storage/src/body-parser.ts
index 1667a79f7aa3..92c6ce1726c4 100644
--- a/packages/memory-storage/src/body-parser.ts
+++ b/packages/memory-storage/src/body-parser.ts
@@ -2,11 +2,7 @@ import contentTypeParser from 'content-type';
 import JSON5 from 'json5';

 const CONTENT_TYPE_JSON = 'application/json';
-const STRINGIFIABLE_CONTENT_TYPE_RXS = [
-    new RegExp(`^${CONTENT_TYPE_JSON}$`, 'i'),
-    /^application\/.*xml$/i,
-    /^text\//i,
-];
+const STRINGIFIABLE_CONTENT_TYPE_RXS = [new RegExp(`^${CONTENT_TYPE_JSON}$`, 'i'), /^application\/.*xml$/i, /^text\//i];

 /**
  * Parses a Buffer or ArrayBuffer using the provided content type header.
@@ -18,7 +14,10 @@ const STRINGIFIABLE_CONTENT_TYPE_RXS = [
 * If the header includes a charset, the body will be stringified only
 * if the charset represents a known encoding to Node.js or Browser.
 */
-export function maybeParseBody(body: Buffer | ArrayBuffer, contentTypeHeader: string): string | Buffer | ArrayBuffer | Record<string, unknown> {
+export function maybeParseBody(
+    body: Buffer | ArrayBuffer,
+    contentTypeHeader: string,
+): string | Buffer | ArrayBuffer | Record<string, unknown> {
     let contentType: string;
     let charset: BufferEncoding;
     try {
@@ -35,9 +34,7 @@ export function maybeParseBody(body: Buffer | ArrayBuffer, contentTypeHeader: st
     if (!areDataStringifiable(contentType, charset)) return body;

     const dataString = isomorphicBufferToString(body, charset);
-    return contentType === CONTENT_TYPE_JSON
-        ? JSON5.parse(dataString)
-        : dataString;
+    return contentType === CONTENT_TYPE_JSON ? 
JSON5.parse(dataString) : dataString; } function isomorphicBufferToString(buffer: Buffer | ArrayBuffer, encoding: BufferEncoding): string { diff --git a/packages/memory-storage/src/cache-helpers.ts b/packages/memory-storage/src/cache-helpers.ts index e441c5416f4d..75adcf69ab73 100644 --- a/packages/memory-storage/src/cache-helpers.ts +++ b/packages/memory-storage/src/cache-helpers.ts @@ -15,7 +15,9 @@ const uuidRegex = /[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/ export async function findOrCacheDatasetByPossibleId(client: MemoryStorage, entryNameOrId: string) { // First check memory cache - const found = client.datasetClientsHandled.find((store) => store.id === entryNameOrId || store.name?.toLowerCase() === entryNameOrId.toLowerCase()); + const found = client.datasetClientsHandled.find( + (store) => store.id === entryNameOrId || store.name?.toLowerCase() === entryNameOrId.toLowerCase(), + ); if (found) { return found; @@ -99,7 +101,11 @@ export async function findOrCacheDatasetByPossibleId(client: MemoryStorage, entr for (const entryId of entries.values()) { // We create a file system entry instead of possibly making an in-memory one to allow the pre-included data to be used on demand - const entry = new DatasetFileSystemEntry({ storeDirectory: datasetDir, entityId: entryId, persistStorage: true }); + const entry = new DatasetFileSystemEntry({ + storeDirectory: datasetDir, + entityId: entryId, + persistStorage: true, + }); // eslint-disable-next-line dot-notation newClient['datasetEntries'].set(entryId, entry); @@ -112,7 +118,9 @@ export async function findOrCacheDatasetByPossibleId(client: MemoryStorage, entr export async function findOrCacheKeyValueStoreByPossibleId(client: MemoryStorage, entryNameOrId: string) { // First check memory cache - const found = client.keyValueStoresHandled.find((store) => store.id === entryNameOrId || store.name?.toLowerCase() === entryNameOrId.toLowerCase()); + const found = client.keyValueStoresHandled.find( + (store) => store.id === entryNameOrId || store.name?.toLowerCase() === entryNameOrId.toLowerCase(), + ); if (found) { return found; @@ -239,7 +247,11 @@ export async function findOrCacheKeyValueStoreByPossibleId(client: MemoryStorage for (const [key, record] of internalRecords) { // We create a file system entry instead of possibly making an in-memory one to allow the pre-included data to be used on demand - const entry = new KeyValueFileSystemEntry({ persistStorage: true, storeDirectory: keyValueStoreDir, writeMetadata: hasSeenMetadataForEntry }); + const entry = new KeyValueFileSystemEntry({ + persistStorage: true, + storeDirectory: keyValueStoreDir, + writeMetadata: hasSeenMetadataForEntry, + }); // eslint-disable-next-line dot-notation entry['rawRecord'] = { ...record }; @@ -259,7 +271,9 @@ export async function findOrCacheKeyValueStoreByPossibleId(client: MemoryStorage export async function findRequestQueueByPossibleId(client: MemoryStorage, entryNameOrId: string) { // First check memory cache - const found = client.requestQueuesHandled.find((store) => store.id === entryNameOrId || store.name?.toLowerCase() === entryNameOrId.toLowerCase()); + const found = client.requestQueuesHandled.find( + (store) => store.id === entryNameOrId || store.name?.toLowerCase() === entryNameOrId.toLowerCase(), + ); if (found) { return found; diff --git a/packages/memory-storage/src/fs/dataset/index.ts b/packages/memory-storage/src/fs/dataset/index.ts index 636a80545cc0..87a39885b774 100644 --- a/packages/memory-storage/src/fs/dataset/index.ts +++ 
b/packages/memory-storage/src/fs/dataset/index.ts @@ -4,7 +4,9 @@ import { DatasetFileSystemEntry } from './fs'; import { DatasetMemoryEntry } from './memory'; import type { StorageImplementation } from '../common'; -export function createDatasetStorageImplementation(options: CreateStorageImplementationOptions): StorageImplementation { +export function createDatasetStorageImplementation( + options: CreateStorageImplementationOptions, +): StorageImplementation { if (options.persistStorage) { return new DatasetFileSystemEntry(options); } diff --git a/packages/memory-storage/src/fs/key-value-store/fs.ts b/packages/memory-storage/src/fs/key-value-store/fs.ts index 3a9443a347bb..25630a27ca8a 100644 --- a/packages/memory-storage/src/fs/key-value-store/fs.ts +++ b/packages/memory-storage/src/fs/key-value-store/fs.ts @@ -36,10 +36,14 @@ export class KeyValueFileSystemEntry implements StorageImplementation { +export function createKeyValueStorageImplementation( + options: CreateStorageImplementationOptions, +): StorageImplementation { if (options.persistStorage) { return new KeyValueFileSystemEntry(options); } diff --git a/packages/memory-storage/src/fs/request-queue/index.ts b/packages/memory-storage/src/fs/request-queue/index.ts index f15b6bd892d9..7304afe6a232 100644 --- a/packages/memory-storage/src/fs/request-queue/index.ts +++ b/packages/memory-storage/src/fs/request-queue/index.ts @@ -3,7 +3,9 @@ import { RequestQueueMemoryEntry } from './memory'; import type { InternalRequest } from '../../resource-clients/request-queue'; import type { StorageImplementation } from '../common'; -export function createRequestQueueStorageImplementation(options: CreateStorageImplementationOptions): StorageImplementation { +export function createRequestQueueStorageImplementation( + options: CreateStorageImplementationOptions, +): StorageImplementation { if (options.persistStorage) { return new RequestQueueFileSystemEntry(options); } diff --git a/packages/memory-storage/src/memory-storage.ts b/packages/memory-storage/src/memory-storage.ts index 2388e5c8c07e..60c9b35693fd 100644 --- a/packages/memory-storage/src/memory-storage.ts +++ b/packages/memory-storage/src/memory-storage.ts @@ -71,9 +71,16 @@ export class MemoryStorage implements storage.StorageClient { this.datasetsDirectory = resolve(this.localDataDirectory, 'datasets'); this.keyValueStoresDirectory = resolve(this.localDataDirectory, 'key_value_stores'); this.requestQueuesDirectory = resolve(this.localDataDirectory, 'request_queues'); - this.writeMetadata = options.writeMetadata ?? process.env.DEBUG?.includes('*') ?? process.env.DEBUG?.includes('crawlee:memory-storage') ?? false; - this.persistStorage = options.persistStorage - ?? (process.env.CRAWLEE_PERSIST_STORAGE ? !['false', '0', ''].includes(process.env.CRAWLEE_PERSIST_STORAGE!) : true); + this.writeMetadata = + options.writeMetadata ?? + process.env.DEBUG?.includes('*') ?? + process.env.DEBUG?.includes('crawlee:memory-storage') ?? + false; + this.persistStorage = + options.persistStorage ?? + (process.env.CRAWLEE_PERSIST_STORAGE + ? !['false', '0', ''].includes(process.env.CRAWLEE_PERSIST_STORAGE!) 
+ : true); } datasets(): storage.DatasetCollectionClient { @@ -116,7 +123,12 @@ export class MemoryStorage implements storage.StorageClient { timeoutSecs: s.number.optional, }).parse(options); - return new RequestQueueClient({ id, baseStorageDirectory: this.requestQueuesDirectory, client: this, ...options }); + return new RequestQueueClient({ + id, + baseStorageDirectory: this.requestQueuesDirectory, + client: this, + ...options, + }); } async setStatusMessage(message: string, options: storage.SetStatusMessageOptions = {}): Promise { @@ -141,9 +153,13 @@ export class MemoryStorage implements storage.StorageClient { for (const keyValueStoreFolder of keyValueStores) { if (keyValueStoreFolder.startsWith('__CRAWLEE_TEMPORARY') || keyValueStoreFolder.startsWith('__OLD')) { - keyValueStorePromises.push((await this.batchRemoveFiles(resolve(this.keyValueStoresDirectory, keyValueStoreFolder)))()); + keyValueStorePromises.push( + (await this.batchRemoveFiles(resolve(this.keyValueStoresDirectory, keyValueStoreFolder)))(), + ); } else if (keyValueStoreFolder === 'default') { - keyValueStorePromises.push(this.handleDefaultKeyValueStore(resolve(this.keyValueStoresDirectory, keyValueStoreFolder))()); + keyValueStorePromises.push( + this.handleDefaultKeyValueStore(resolve(this.keyValueStoresDirectory, keyValueStoreFolder))(), + ); } } @@ -167,7 +183,9 @@ export class MemoryStorage implements storage.StorageClient { for (const requestQueueFolder of requestQueues) { if (requestQueueFolder === 'default' || requestQueueFolder.startsWith('__CRAWLEE_TEMPORARY')) { - requestQueuePromises.push((await this.batchRemoveFiles(resolve(this.requestQueuesDirectory, requestQueueFolder)))()); + requestQueuePromises.push( + (await this.batchRemoveFiles(resolve(this.requestQueuesDirectory, requestQueueFolder)))(), + ); } } @@ -188,12 +206,7 @@ export class MemoryStorage implements storage.StorageClient { const temporaryPath = resolve(folder, '../__CRAWLEE_MIGRATING_KEY_VALUE_STORE__'); // For optimization, we want to only attempt to copy a few files from the default key-value store - const possibleInputKeys = [ - 'INPUT', - 'INPUT.json', - 'INPUT.bin', - 'INPUT.txt', - ]; + const possibleInputKeys = ['INPUT', 'INPUT.json', 'INPUT.bin', 'INPUT.txt']; if (storagePathExists) { // Create temporary folder to save important files in diff --git a/packages/memory-storage/src/resource-clients/dataset-collection.ts b/packages/memory-storage/src/resource-clients/dataset-collection.ts index 5edeaf0e40ce..606f0462c083 100644 --- a/packages/memory-storage/src/resource-clients/dataset-collection.ts +++ b/packages/memory-storage/src/resource-clients/dataset-collection.ts @@ -29,8 +29,8 @@ export class DatasetCollectionClient implements storage.DatasetCollectionClient offset: 0, limit: this.client.datasetClientsHandled.length, desc: false, - items: this.client.datasetClientsHandled.map( - (store) => store.toDatasetInfo()) + items: this.client.datasetClientsHandled + .map((store) => store.toDatasetInfo()) .sort((a, b) => a.createdAt.getTime() - b.createdAt.getTime()), }; } diff --git a/packages/memory-storage/src/resource-clients/dataset.ts b/packages/memory-storage/src/resource-clients/dataset.ts index 9911aa5d3294..20496ccec4db 100644 --- a/packages/memory-storage/src/resource-clients/dataset.ts +++ b/packages/memory-storage/src/resource-clients/dataset.ts @@ -35,7 +35,10 @@ export interface DatasetClientOptions { client: MemoryStorage; } -export class DatasetClient extends BaseClient implements storage.DatasetClient { +export class 
DatasetClient + extends BaseClient + implements storage.DatasetClient +{ name?: string; createdAt = new Date(); accessedAt = new Date(); @@ -65,9 +68,11 @@ export class DatasetClient extends BaseCli } async update(newFields: storage.DatasetClientUpdateOptions = {}): Promise { - const parsed = s.object({ - name: s.string.lengthGreaterThan(0).optional, - }).parse(newFields); + const parsed = s + .object({ + name: s.string.lengthGreaterThan(0).optional, + }) + .parse(newFields); // Check by id const existingStoreById = await findOrCacheDatasetByPossibleId(this.client, this.name ?? this.id); @@ -82,7 +87,9 @@ export class DatasetClient extends BaseCli } // Check that name is not in use already - const existingStoreByName = this.client.datasetClientsHandled.find((store) => store.name?.toLowerCase() === parsed.name!.toLowerCase()); + const existingStoreByName = this.client.datasetClientsHandled.find( + (store) => store.name?.toLowerCase() === parsed.name!.toLowerCase(), + ); if (existingStoreByName) { this.throwOnDuplicateEntry(StorageTypes.Dataset, 'name', parsed.name); @@ -92,7 +99,10 @@ export class DatasetClient extends BaseCli const previousDir = existingStoreById.datasetDirectory; - existingStoreById.datasetDirectory = resolve(this.client.datasetsDirectory, parsed.name ?? existingStoreById.name ?? existingStoreById.id); + existingStoreById.datasetDirectory = resolve( + this.client.datasetsDirectory, + parsed.name ?? existingStoreById.name ?? existingStoreById.id, + ); await move(previousDir, existingStoreById.datasetDirectory, { overwrite: true }); @@ -123,11 +133,13 @@ export class DatasetClient extends BaseCli limit = LIST_ITEMS_LIMIT, offset = 0, desc, - } = s.object({ - desc: s.boolean.optional, - limit: s.number.int.optional, - offset: s.number.int.optional, - }).parse(options); + } = s + .object({ + desc: s.boolean.optional, + limit: s.number.int.optional, + offset: s.number.int.optional, + }) + .parse(options); // Check by id const existingStoreById = await findOrCacheDatasetByPossibleId(this.client, this.name ?? this.id); @@ -161,11 +173,13 @@ export class DatasetClient extends BaseCli } async pushItems(items: string | Data | string[] | Data[]): Promise { - const rawItems = s.union( - s.string, - s.object({} as Data).passthrough, - s.array(s.union(s.string, s.object({} as Data).passthrough)), - ).parse(items) as Data[]; + const rawItems = s + .union( + s.string, + s.object({} as Data).passthrough, + s.array(s.union(s.string, s.object({} as Data).passthrough)), + ) + .parse(items) as Data[]; // Check by id const existingStoreById = await findOrCacheDatasetByPossibleId(this.client, this.name ?? this.id); @@ -228,9 +242,7 @@ export class DatasetClient extends BaseCli items = JSON.parse(items); } - return Array.isArray(items) - ? items.map((item) => this.normalizeItem(item)) - : [this.normalizeItem(items)]; + return Array.isArray(items) ? items.map((item) => this.normalizeItem(item)) : [this.normalizeItem(items)]; } private normalizeItem(item: string | Data): Data { @@ -239,7 +251,9 @@ export class DatasetClient extends BaseCli } if (Array.isArray(item)) { - throw new Error(`Each dataset item can only be a single JSON object, not an array. Received: [${item.join(',\n')}]`); + throw new Error( + `Each dataset item can only be a single JSON object, not an array. 
Received: [${item.join(',\n')}]`, + ); } if (typeof item !== 'object' || item === null) { diff --git a/packages/memory-storage/src/resource-clients/key-value-store-collection.ts b/packages/memory-storage/src/resource-clients/key-value-store-collection.ts index edfaea63b41a..95086c5e97b4 100644 --- a/packages/memory-storage/src/resource-clients/key-value-store-collection.ts +++ b/packages/memory-storage/src/resource-clients/key-value-store-collection.ts @@ -29,8 +29,8 @@ export class KeyValueStoreCollectionClient implements storage.KeyValueStoreColle offset: 0, limit: this.client.keyValueStoresHandled.length, desc: false, - items: this.client.keyValueStoresHandled.map( - (store) => store.toKeyValueStoreInfo()) + items: this.client.keyValueStoresHandled + .map((store) => store.toKeyValueStoreInfo()) .sort((a, b) => a.createdAt.getTime() - b.createdAt.getTime()), }; } @@ -46,7 +46,11 @@ export class KeyValueStoreCollectionClient implements storage.KeyValueStoreColle } } - const newStore = new KeyValueStoreClient({ name, baseStorageDirectory: this.keyValueStoresDirectory, client: this.client }); + const newStore = new KeyValueStoreClient({ + name, + baseStorageDirectory: this.keyValueStoresDirectory, + client: this.client, + }); this.client.keyValueStoresHandled.push(newStore); // Schedule the worker to write to the disk diff --git a/packages/memory-storage/src/resource-clients/key-value-store.ts b/packages/memory-storage/src/resource-clients/key-value-store.ts index ef34b1c019e4..2daa3cbd53b2 100644 --- a/packages/memory-storage/src/resource-clients/key-value-store.ts +++ b/packages/memory-storage/src/resource-clients/key-value-store.ts @@ -63,9 +63,11 @@ export class KeyValueStoreClient extends BaseClient { } async update(newFields: storage.KeyValueStoreClientUpdateOptions = {}): Promise { - const parsed = s.object({ - name: s.string.lengthGreaterThan(0).optional, - }).parse(newFields); + const parsed = s + .object({ + name: s.string.lengthGreaterThan(0).optional, + }) + .parse(newFields); // Check by id const existingStoreById = await findOrCacheKeyValueStoreByPossibleId(this.client, this.name ?? this.id); @@ -80,7 +82,9 @@ export class KeyValueStoreClient extends BaseClient { } // Check that name is not in use already - const existingStoreByName = this.client.keyValueStoresHandled.find((store) => store.name?.toLowerCase() === parsed.name!.toLowerCase()); + const existingStoreByName = this.client.keyValueStoresHandled.find( + (store) => store.name?.toLowerCase() === parsed.name!.toLowerCase(), + ); if (existingStoreByName) { this.throwOnDuplicateEntry(StorageTypes.KeyValueStore, 'name', parsed.name); @@ -90,7 +94,10 @@ export class KeyValueStoreClient extends BaseClient { const previousDir = existingStoreById.keyValueStoreDirectory; - existingStoreById.keyValueStoreDirectory = resolve(this.client.keyValueStoresDirectory, parsed.name ?? existingStoreById.name ?? existingStoreById.id); + existingStoreById.keyValueStoreDirectory = resolve( + this.client.keyValueStoresDirectory, + parsed.name ?? existingStoreById.name ?? 
existingStoreById.id, + ); await move(previousDir, existingStoreById.keyValueStoreDirectory, { overwrite: true }); @@ -112,13 +119,12 @@ export class KeyValueStoreClient extends BaseClient { } async listKeys(options: storage.KeyValueStoreClientListOptions = {}): Promise { - const { - limit = DEFAULT_API_PARAM_LIMIT, - exclusiveStartKey, - } = s.object({ - limit: s.number.greaterThan(0).optional, - exclusiveStartKey: s.string.optional, - }).parse(options); + const { limit = DEFAULT_API_PARAM_LIMIT, exclusiveStartKey } = s + .object({ + limit: s.number.greaterThan(0).optional, + exclusiveStartKey: s.string.optional, + }) + .parse(options); // Check by id const existingStoreById = await findOrCacheKeyValueStoreByPossibleId(this.client, this.name ?? this.id); @@ -156,9 +162,7 @@ export class KeyValueStoreClient extends BaseClient { const lastItemInStore = items[items.length - 1]; const lastSelectedItem = limitedItems[limitedItems.length - 1]; const isLastSelectedItemAbsolutelyLast = lastItemInStore === lastSelectedItem; - const nextExclusiveStartKey = isLastSelectedItemAbsolutelyLast - ? undefined - : lastSelectedItem.key; + const nextExclusiveStartKey = isLastSelectedItemAbsolutelyLast ? undefined : lastSelectedItem.key; existingStoreById.updateTimestamps(false); @@ -191,7 +195,10 @@ export class KeyValueStoreClient extends BaseClient { return existingStoreById.keyValueEntries.has(key); } - async getRecord(key: string, options: storage.KeyValueStoreClientGetRecordOptions = {}): Promise { + async getRecord( + key: string, + options: storage.KeyValueStoreClientGetRecordOptions = {}, + ): Promise { s.string.parse(key); s.object({ buffer: s.boolean.optional, @@ -219,7 +226,7 @@ export class KeyValueStoreClient extends BaseClient { const record: storage.KeyValueStoreRecord = { key: entry.key, value: entry.value, - contentType: entry.contentType ?? mime.contentType(entry.extension) as string, + contentType: entry.contentType ?? 
(mime.contentType(entry.extension) as string), }; if (options.stream) { @@ -246,7 +253,9 @@ export class KeyValueStoreClient extends BaseClient { s.instance(ArrayBuffer), s.typedArray(), // disabling validation will make shapeshift only check the object given is an actual object, not null, nor array - s.object({}).setValidationEnabled(false), + s + .object({}) + .setValidationEnabled(false), ), contentType: s.string.lengthGreaterThan(0).optional, }).parse(record); diff --git a/packages/memory-storage/src/resource-clients/request-queue-collection.ts b/packages/memory-storage/src/resource-clients/request-queue-collection.ts index 71df99c1d1ef..290867bd3811 100644 --- a/packages/memory-storage/src/resource-clients/request-queue-collection.ts +++ b/packages/memory-storage/src/resource-clients/request-queue-collection.ts @@ -29,8 +29,8 @@ export class RequestQueueCollectionClient implements storage.RequestQueueCollect offset: 0, limit: this.client.requestQueuesHandled.length, desc: false, - items: this.client.requestQueuesHandled.map( - (store) => store.toRequestQueueInfo()) + items: this.client.requestQueuesHandled + .map((store) => store.toRequestQueueInfo()) .sort((a, b) => a.createdAt.getTime() - b.createdAt.getTime()), }; } @@ -46,7 +46,11 @@ export class RequestQueueCollectionClient implements storage.RequestQueueCollect } } - const newStore = new RequestQueueClient({ name, baseStorageDirectory: this.requestQueuesDirectory, client: this.client }); + const newStore = new RequestQueueClient({ + name, + baseStorageDirectory: this.requestQueuesDirectory, + client: this.client, + }); this.client.requestQueuesHandled.push(newStore); // Schedule the worker to write to the disk diff --git a/packages/memory-storage/src/resource-clients/request-queue.ts b/packages/memory-storage/src/resource-clients/request-queue.ts index 412ec4441290..2191316c7602 100644 --- a/packages/memory-storage/src/resource-clients/request-queue.ts +++ b/packages/memory-storage/src/resource-clients/request-queue.ts @@ -70,7 +70,7 @@ export class RequestQueueClient extends BaseClient implements storage.RequestQue this.client = options.client; } - private async getQueue() : Promise { + private async getQueue(): Promise { const existingQueueById = await findRequestQueueByPossibleId(this.client, this.name ?? this.id); if (!existingQueueById) { @@ -96,9 +96,11 @@ export class RequestQueueClient extends BaseClient implements storage.RequestQue async update(newFields: { name?: string | undefined }): Promise { // The validation is intentionally loose to prevent issues // when swapping to a remote queue in production. - const parsed = s.object({ - name: s.string.lengthGreaterThan(0).optional, - }).passthrough.parse(newFields); + const parsed = s + .object({ + name: s.string.lengthGreaterThan(0).optional, + }) + .passthrough.parse(newFields); const existingQueueById = await findRequestQueueByPossibleId(this.client, this.name ?? 
this.id); @@ -112,7 +114,9 @@ export class RequestQueueClient extends BaseClient implements storage.RequestQue } // Check that name is not in use already - const existingQueueByName = this.client.requestQueuesHandled.find((queue) => queue.name?.toLowerCase() === parsed.name!.toLowerCase()); + const existingQueueByName = this.client.requestQueuesHandled.find( + (queue) => queue.name?.toLowerCase() === parsed.name!.toLowerCase(), + ); if (existingQueueByName) { this.throwOnDuplicateEntry(StorageTypes.RequestQueue, 'name', parsed.name); @@ -122,7 +126,10 @@ export class RequestQueueClient extends BaseClient implements storage.RequestQue const previousDir = existingQueueById.requestQueueDirectory; - existingQueueById.requestQueueDirectory = resolve(this.client.requestQueuesDirectory, parsed.name ?? existingQueueById.name ?? existingQueueById.id); + existingQueueById.requestQueueDirectory = resolve( + this.client.requestQueuesDirectory, + parsed.name ?? existingQueueById.name ?? existingQueueById.id, + ); await move(previousDir, existingQueueById.requestQueueDirectory, { overwrite: true }); @@ -145,9 +152,11 @@ export class RequestQueueClient extends BaseClient implements storage.RequestQue } async listHead(options: storage.ListOptions = {}): Promise { - const { limit } = s.object({ - limit: s.number.optional.default(100), - }).parse(options); + const { limit } = s + .object({ + limit: s.number.optional.default(100), + }) + .parse(options); const existingQueueById = await findRequestQueueByPossibleId(this.client, this.name ?? this.id); @@ -180,15 +189,18 @@ export class RequestQueueClient extends BaseClient implements storage.RequestQue } async listAndLockHead(options: storage.ListAndLockOptions): Promise { - const { limit, lockSecs } = s.object({ - limit: s.number.lessThanOrEqual(25).optional.default(25), - lockSecs: s.number, - }).parse(options); + const { limit, lockSecs } = s + .object({ + limit: s.number.lessThanOrEqual(25).optional.default(25), + lockSecs: s.number, + }) + .parse(options); const queue = await this.getQueue(); const start = Date.now(); - const isLocked = (request: InternalRequest) => !request.orderNo || request.orderNo > start || request.orderNo < -start; + const isLocked = (request: InternalRequest) => + !request.orderNo || request.orderNo > start || request.orderNo < -start; const items = []; @@ -225,12 +237,17 @@ export class RequestQueueClient extends BaseClient implements storage.RequestQue } } - async prolongRequestLock(id: string, options: storage.ProlongRequestLockOptions) : Promise { + async prolongRequestLock( + id: string, + options: storage.ProlongRequestLockOptions, + ): Promise { s.string.parse(id); - const { lockSecs, forefront } = s.object({ - lockSecs: s.number, - forefront: s.boolean.optional.default(false), - }).parse(options); + const { lockSecs, forefront } = s + .object({ + lockSecs: s.number, + forefront: s.boolean.optional.default(false), + }) + .parse(options); const queue = await this.getQueue(); const request = queue.requests.get(id); @@ -257,11 +274,13 @@ export class RequestQueueClient extends BaseClient implements storage.RequestQue }; } - async deleteRequestLock(id: string, options: storage.DeleteRequestLockOptions = {}) : Promise { + async deleteRequestLock(id: string, options: storage.DeleteRequestLockOptions = {}): Promise { s.string.parse(id); - const { forefront } = s.object({ - forefront: s.boolean.optional.default(false), - }).parse(options); + const { forefront } = s + .object({ + forefront: s.boolean.optional.default(false), + }) + 
.parse(options); const queue = await this.getQueue(); const request = queue.requests.get(id); @@ -284,7 +303,10 @@ export class RequestQueueClient extends BaseClient implements storage.RequestQue await request?.update(internalRequest); } - async addRequest(request: storage.RequestSchema, options: storage.RequestOptions = {}): Promise { + async addRequest( + request: storage.RequestSchema, + options: storage.RequestOptions = {}, + ): Promise { requestShapeWithoutId.parse(request); requestOptionsShape.parse(options); @@ -336,7 +358,10 @@ export class RequestQueueClient extends BaseClient implements storage.RequestQue }; } - async batchAddRequests(requests: storage.RequestSchema[], options: storage.RequestOptions = {}): Promise { + async batchAddRequests( + requests: storage.RequestSchema[], + options: storage.RequestOptions = {}, + ): Promise { batchRequestShapeWithoutId.parse(requests); requestOptionsShape.parse(options); @@ -407,7 +432,10 @@ export class RequestQueueClient extends BaseClient implements storage.RequestQue return this._jsonToRequest(json); } - async updateRequest(request: storage.UpdateRequestSchema, options: storage.RequestOptions = {}): Promise { + async updateRequest( + request: storage.UpdateRequestSchema, + options: storage.RequestOptions = {}, + ): Promise { requestShape.parse(request); requestOptionsShape.parse(options); diff --git a/packages/memory-storage/src/utils.ts b/packages/memory-storage/src/utils.ts index ddbc54df94a8..d372aa2e450d 100644 --- a/packages/memory-storage/src/utils.ts +++ b/packages/memory-storage/src/utils.ts @@ -34,11 +34,7 @@ export function uniqueKeyToRequestId(uniqueKey: string): string { export function isBuffer(value: unknown): boolean { try { - s.union( - s.instance(Buffer), - s.instance(ArrayBuffer), - s.typedArray(), - ).parse(value); + s.union(s.instance(Buffer), s.instance(ArrayBuffer), s.typedArray()).parse(value); return true; } catch { @@ -47,7 +43,11 @@ export function isBuffer(value: unknown): boolean { } export function isStream(value: any): boolean { - return typeof value === 'object' && value && ['on', 'pipe'].every((key) => key in value && typeof value[key] === 'function'); + return ( + typeof value === 'object' && + value && + ['on', 'pipe'].every((key) => key in value && typeof value[key] === 'function') + ); } export const memoryStorageLog = defaultLog.child({ prefix: 'MemoryStorage' }); diff --git a/packages/memory-storage/test/fs-fallback.test.ts b/packages/memory-storage/test/fs-fallback.test.ts index f1f982b6c9b8..1f014d936bc1 100644 --- a/packages/memory-storage/test/fs-fallback.test.ts +++ b/packages/memory-storage/test/fs-fallback.test.ts @@ -17,20 +17,32 @@ describe('fallback to fs for reading', () => { beforeAll(async () => { // Create "default" key-value store and give it an entry await ensureDir(resolve(storage.keyValueStoresDirectory, 'default')); - await writeFile(resolve(storage.keyValueStoresDirectory, 'default/__metadata__.json'), JSON.stringify({ - id: randomUUID(), - name: 'default', - createdAt: expectedFsDate, - accessedAt: expectedFsDate, - modifiedAt: expectedFsDate, - })); - await writeFile(resolve(storage.keyValueStoresDirectory, 'default/INPUT.json'), JSON.stringify({ foo: 'bar but from fs' })); + await writeFile( + resolve(storage.keyValueStoresDirectory, 'default/__metadata__.json'), + JSON.stringify({ + id: randomUUID(), + name: 'default', + createdAt: expectedFsDate, + accessedAt: expectedFsDate, + modifiedAt: expectedFsDate, + }), + ); + await writeFile( + 
resolve(storage.keyValueStoresDirectory, 'default/INPUT.json'), + JSON.stringify({ foo: 'bar but from fs' }), + ); await ensureDir(resolve(storage.keyValueStoresDirectory, 'other')); - await writeFile(resolve(storage.keyValueStoresDirectory, 'other/INPUT.json'), JSON.stringify({ foo: 'bar but from fs' })); + await writeFile( + resolve(storage.keyValueStoresDirectory, 'other/INPUT.json'), + JSON.stringify({ foo: 'bar but from fs' }), + ); await ensureDir(resolve(storage.keyValueStoresDirectory, 'no-ext')); - await writeFile(resolve(storage.keyValueStoresDirectory, 'no-ext/INPUT'), JSON.stringify({ foo: 'bar but from fs' })); + await writeFile( + resolve(storage.keyValueStoresDirectory, 'no-ext/INPUT'), + JSON.stringify({ foo: 'bar but from fs' }), + ); await ensureDir(resolve(storage.keyValueStoresDirectory, 'invalid-json')); await writeFile(resolve(storage.keyValueStoresDirectory, 'invalid-json/INPUT.json'), '{'); @@ -57,19 +69,16 @@ describe('fallback to fs for reading', () => { }); }); - test( - 'attempting to read "other" key value store with no "__metadata__" present should read from fs, even if accessed without generating id first', - async () => { - const otherStore = storage.keyValueStore('other'); - - const input = await otherStore.getRecord('INPUT'); - expect(input).toStrictEqual({ - key: 'INPUT', - value: { foo: 'bar but from fs' }, - contentType: 'application/json; charset=utf-8', - }); - }, - ); + test('attempting to read "other" key value store with no "__metadata__" present should read from fs, even if accessed without generating id first', async () => { + const otherStore = storage.keyValueStore('other'); + + const input = await otherStore.getRecord('INPUT'); + expect(input).toStrictEqual({ + key: 'INPUT', + value: { foo: 'bar but from fs' }, + contentType: 'application/json; charset=utf-8', + }); + }); test('attempting to read non-existent "default_2" key value store should return undefined', async () => { await expect(storage.keyValueStore('default_2').get()).resolves.toBeUndefined(); diff --git a/packages/memory-storage/test/no-crash-on-big-buffers.test.ts b/packages/memory-storage/test/no-crash-on-big-buffers.test.ts index fe5a896b2a7b..67ae80fc2676 100644 --- a/packages/memory-storage/test/no-crash-on-big-buffers.test.ts +++ b/packages/memory-storage/test/no-crash-on-big-buffers.test.ts @@ -30,7 +30,7 @@ describe('MemoryStorage should not crash when saving a big buffer', () => { let zip: Buffer; if (process.env.CRAWLEE_DIFFICULT_TESTS) { - const numbers = Array.from(([...Array(18_100_000).keys()]).map((i) => i * 3_000_000)); + const numbers = Array.from([...Array(18_100_000).keys()].map((i) => i * 3_000_000)); zip = Buffer.from([...numbers]); } else { diff --git a/packages/memory-storage/test/no-writing-to-disk.test.ts b/packages/memory-storage/test/no-writing-to-disk.test.ts index 272aea1b4920..e39fb7c9c46a 100644 --- a/packages/memory-storage/test/no-writing-to-disk.test.ts +++ b/packages/memory-storage/test/no-writing-to-disk.test.ts @@ -51,9 +51,7 @@ describe('persistStorage option', () => { const directoryFiles = await readdir(storePath); expect(directoryFiles).toHaveLength(1); - expect(directoryFiles).toEqual([ - '__metadata__.json', - ]); + expect(directoryFiles).toEqual(['__metadata__.json']); }); }); }); diff --git a/packages/memory-storage/test/request-queue/handledRequestCount-should-update.test.ts b/packages/memory-storage/test/request-queue/handledRequestCount-should-update.test.ts index 2cdce4e852c6..e777cbca3665 100644 --- 
a/packages/memory-storage/test/request-queue/handledRequestCount-should-update.test.ts +++ b/packages/memory-storage/test/request-queue/handledRequestCount-should-update.test.ts @@ -28,14 +28,22 @@ describe('RequestQueue handledRequestCount should update', () => { }); test('adding an already handled request should increment the handledRequestCount', async () => { - await requestQueue.addRequest({ url: 'http://example.com/2', uniqueKey: '2', handledAt: new Date().toISOString() }); + await requestQueue.addRequest({ + url: 'http://example.com/2', + uniqueKey: '2', + handledAt: new Date().toISOString(), + }); const updatedStatistics = await requestQueue.get(); expect(updatedStatistics?.handledRequestCount).toEqual(2); }); test('deleting a request should decrement the handledRequestCount', async () => { - const { requestId } = await requestQueue.addRequest({ url: 'http://example.com/3', uniqueKey: '3', handledAt: new Date().toISOString() }); + const { requestId } = await requestQueue.addRequest({ + url: 'http://example.com/3', + uniqueKey: '3', + handledAt: new Date().toISOString(), + }); await requestQueue.deleteRequest(requestId); diff --git a/packages/memory-storage/test/request-queue/ignore-non-json-files.test.ts b/packages/memory-storage/test/request-queue/ignore-non-json-files.test.ts index 1cef2fba6772..a6ed41736da2 100644 --- a/packages/memory-storage/test/request-queue/ignore-non-json-files.test.ts +++ b/packages/memory-storage/test/request-queue/ignore-non-json-files.test.ts @@ -16,27 +16,33 @@ describe('when falling back to fs, Request queue should ignore non-JSON files', beforeAll(async () => { // Create "default" request queue and give it faulty entries await ensureDir(resolve(storage.requestQueuesDirectory, 'default')); - await writeFile(resolve(storage.requestQueuesDirectory, 'default/__metadata__.json'), JSON.stringify({ - id: randomUUID(), - name: 'default', - createdAt: new Date(2022, 0, 1), - accessedAt: new Date(2022, 0, 1), - modifiedAt: new Date(2022, 0, 1), - })); - - await writeFile(resolve(storage.requestQueuesDirectory, 'default/123.json'), JSON.stringify({ - id: '123', - orderNo: 1, - url: 'http://example.com', - uniqueKey: 'owo', - method: 'GET', - retryCount: 0, - json: JSON.stringify({ - uniqueKey: 'owo', - url: 'http://example.com', + await writeFile( + resolve(storage.requestQueuesDirectory, 'default/__metadata__.json'), + JSON.stringify({ + id: randomUUID(), + name: 'default', + createdAt: new Date(2022, 0, 1), + accessedAt: new Date(2022, 0, 1), + modifiedAt: new Date(2022, 0, 1), + }), + ); + + await writeFile( + resolve(storage.requestQueuesDirectory, 'default/123.json'), + JSON.stringify({ id: '123', - } satisfies RequestSchema), - } satisfies InternalRequest)); + orderNo: 1, + url: 'http://example.com', + uniqueKey: 'owo', + method: 'GET', + retryCount: 0, + json: JSON.stringify({ + uniqueKey: 'owo', + url: 'http://example.com', + id: '123', + } satisfies RequestSchema), + } satisfies InternalRequest), + ); await writeFile(resolve(storage.requestQueuesDirectory, 'default/.DS_Store'), 'owo'); await writeFile(resolve(storage.requestQueuesDirectory, 'default/invalid.txt'), 'owo'); diff --git a/packages/memory-storage/test/write-metadata.test.ts b/packages/memory-storage/test/write-metadata.test.ts index bd70f1691f80..eb36325950e9 100644 --- a/packages/memory-storage/test/write-metadata.test.ts +++ b/packages/memory-storage/test/write-metadata.test.ts @@ -66,7 +66,10 @@ describe('writeMetadata option', () => { await keyValueStore.setRecord({ key: 'foo', value: 
'test' }); const expectedFilePath = resolve(storage.keyValueStoresDirectory, `${keyValueStoreInfo.id}/foo.txt`); - const expectedMetadataPath = resolve(storage.keyValueStoresDirectory, `${keyValueStoreInfo.id}/foo.__metadata__.json`); + const expectedMetadataPath = resolve( + storage.keyValueStoresDirectory, + `${keyValueStoreInfo.id}/foo.__metadata__.json`, + ); await Promise.all([waitTillWrittenToDisk(expectedFilePath), waitTillWrittenToDisk(expectedMetadataPath)]); const directoryFiles = await readdir(resolve(storage.keyValueStoresDirectory, `${keyValueStoreInfo.id}`)); diff --git a/packages/memory-storage/tsconfig.build.json b/packages/memory-storage/tsconfig.build.json index 856db0f2100a..2e04e9e9f921 100644 --- a/packages/memory-storage/tsconfig.build.json +++ b/packages/memory-storage/tsconfig.build.json @@ -1,7 +1,7 @@ { - "extends": "../../tsconfig.build.json", - "compilerOptions": { - "outDir": "./dist" - }, - "include": ["src/**/*"] + "extends": "../../tsconfig.build.json", + "compilerOptions": { + "outDir": "./dist" + }, + "include": ["src/**/*"] } diff --git a/packages/playwright-crawler/src/internals/adaptive-playwright-crawler.ts b/packages/playwright-crawler/src/internals/adaptive-playwright-crawler.ts index 1fcc41617080..833b6cec25d2 100644 --- a/packages/playwright-crawler/src/internals/adaptive-playwright-crawler.ts +++ b/packages/playwright-crawler/src/internals/adaptive-playwright-crawler.ts @@ -11,7 +11,7 @@ import type { PlaywrightCrawlerOptions, PlaywrightCrawlingContext } from './play import { PlaywrightCrawler } from './playwright-crawler'; import { RenderingTypePredictor, type RenderingType } from './utils/rendering-type-prediction'; -type Result = { result: TResult; ok: true } | { error: unknown; ok: false } +type Result = { result: TResult; ok: true } | { error: unknown; ok: false }; interface AdaptivePlaywrightCrawlerStatisticState extends StatisticState { httpOnlyRequestHandlerRuns?: number; @@ -37,7 +37,9 @@ class AdaptivePlaywrightCrawlerStatistics extends Statistics { protected override async _maybeLoadStatistics(): Promise { await super._maybeLoadStatistics(); - const savedState = await this.keyValueStore?.getValue(this.persistStateKey); + const savedState = await this.keyValueStore?.getValue( + this.persistStateKey, + ); if (!savedState) { return; @@ -71,7 +73,8 @@ interface AdaptivePlaywrightCrawlerContext extends RestrictedCrawlingContext { querySelector: (selector: string, timeoutMs?: number) => Awaitable>; } -export interface AdaptivePlaywrightCrawlerOptions extends Omit { +export interface AdaptivePlaywrightCrawlerOptions + extends Omit { /** * Function that is called to process each request. * @@ -161,7 +164,8 @@ export class AdaptivePlaywrightCrawler extends PlaywrightCrawler { ) { super(options, config); this.adaptiveRequestHandler = requestHandler; - this.renderingTypePredictor = renderingTypePredictor ?? new RenderingTypePredictor({ detectionRatio: renderingTypeDetectionRatio }); + this.renderingTypePredictor = + renderingTypePredictor ?? new RenderingTypePredictor({ detectionRatio: renderingTypeDetectionRatio }); this.resultChecker = resultChecker ?? 
(() => true); if (resultComparator !== undefined) { @@ -170,11 +174,13 @@ export class AdaptivePlaywrightCrawler extends PlaywrightCrawler { this.resultComparator = (resultA, resultB) => this.resultChecker(resultA) && this.resultChecker(resultB); } else { this.resultComparator = (resultA, resultB) => { - return resultA.datasetItems.length === resultB.datasetItems.length - && resultA.datasetItems.every((itemA, i) => { + return ( + resultA.datasetItems.length === resultB.datasetItems.length && + resultA.datasetItems.every((itemA, i) => { const itemB = resultB.datasetItems[i]; return isEqual(itemA, itemB); - }); + }) + ); }; } @@ -192,7 +198,9 @@ export class AdaptivePlaywrightCrawler extends PlaywrightCrawler { const shouldDetectRenderingType = Math.random() < renderingTypePrediction.detectionProbabilityRecommendation; if (!shouldDetectRenderingType) { - crawlingContext.log.info(`Predicted rendering type ${renderingTypePrediction.renderingType} for ${crawlingContext.request.url}`); + crawlingContext.log.info( + `Predicted rendering type ${renderingTypePrediction.renderingType} for ${crawlingContext.request.url}`, + ); } if (renderingTypePrediction.renderingType === 'static' && !shouldDetectRenderingType) { @@ -205,10 +213,16 @@ export class AdaptivePlaywrightCrawler extends PlaywrightCrawler { crawlingContext.log.info(`HTTP-only request handler succeeded for ${crawlingContext.request.url}`); await this.commitResult(crawlingContext, plainHTTPRun.result); return; - } if (!plainHTTPRun.ok) { - crawlingContext.log.exception(plainHTTPRun.error as Error, `HTTP-only request handler failed for ${crawlingContext.request.url}`); + } + if (!plainHTTPRun.ok) { + crawlingContext.log.exception( + plainHTTPRun.error as Error, + `HTTP-only request handler failed for ${crawlingContext.request.url}`, + ); } else { - crawlingContext.log.warning(`HTTP-only request handler returned a suspicious result for ${crawlingContext.request.url}`); + crawlingContext.log.warning( + `HTTP-only request handler returned a suspicious result for ${crawlingContext.request.url}`, + ); this.stats.trackRenderingTypeMisprediction(); } } @@ -255,20 +269,27 @@ export class AdaptivePlaywrightCrawler extends PlaywrightCrawler { ...Object.entries(keyValueStoreChanges).map(async ([storeIdOrName, changes]) => { const store = await crawlingContext.getKeyValueStore(storeIdOrName); await Promise.all( - Object.entries(changes).map(async ([key, { changedValue, options }]) => store.setValue(key, changedValue, options)), + Object.entries(changes).map(async ([key, { changedValue, options }]) => + store.setValue(key, changedValue, options), + ), ); }), ]); } - protected allowStorageAccess(func: (...args: TArgs) => Promise): ((...args: TArgs) => Promise) { - return async (...args: TArgs) => withCheckedStorageAccess( - () => { }, - async () => func(...args), - ); + protected allowStorageAccess( + func: (...args: TArgs) => Promise, + ): (...args: TArgs) => Promise { + return async (...args: TArgs) => + withCheckedStorageAccess( + () => {}, + async () => func(...args), + ); } - protected async runRequestHandlerInBrowser(crawlingContext: PlaywrightCrawlingContext): Promise> { + protected async runRequestHandlerInBrowser( + crawlingContext: PlaywrightCrawlingContext, + ): Promise> { const result = new RequestHandlerResult(this.config, AdaptivePlaywrightCrawler.CRAWLEE_STATE_KEY); try { @@ -276,36 +297,44 @@ export class AdaptivePlaywrightCrawler extends PlaywrightCrawler { new Proxy(this, { get: (target, propertyName, receiver) => { if (propertyName === 
'userProvidedRequestHandler') { - return (async (playwrightContext: PlaywrightCrawlingContext) => withCheckedStorageAccess( - () => { - throw new Error('Directly accessing storage in a request handler is not allowed in AdaptivePlaywrightCrawler'); - }, - () => this.adaptiveRequestHandler({ - request: crawlingContext.request, - log: crawlingContext.log, - querySelector: async (selector, timeoutMs) => { - const locator = playwrightContext.page.locator(selector).first(); - await locator.waitFor({ timeout: timeoutMs }); - return (await playwrightContext.parseWithCheerio())(selector) as Cheerio; - }, - enqueueLinks: async (options = {}) => { - const selector = options.selector ?? 'a'; - const locator = playwrightContext.page.locator(selector).first(); - await locator.waitFor(); - - const urls = await extractUrlsFromPage( - playwrightContext.page, - selector, - options.baseUrl ?? playwrightContext.request.loadedUrl ?? playwrightContext.request.url, + return async (playwrightContext: PlaywrightCrawlingContext) => + withCheckedStorageAccess( + () => { + throw new Error( + 'Directly accessing storage in a request handler is not allowed in AdaptivePlaywrightCrawler', ); - await result.enqueueLinks({ ...options, urls }); }, - addRequests: result.addRequests, - pushData: result.pushData, - useState: this.allowStorageAccess(result.useState), - getKeyValueStore: this.allowStorageAccess(result.getKeyValueStore), - }), - )); + () => + this.adaptiveRequestHandler({ + request: crawlingContext.request, + log: crawlingContext.log, + querySelector: async (selector, timeoutMs) => { + const locator = playwrightContext.page.locator(selector).first(); + await locator.waitFor({ timeout: timeoutMs }); + return (await playwrightContext.parseWithCheerio())( + selector, + ) as Cheerio; + }, + enqueueLinks: async (options = {}) => { + const selector = options.selector ?? 'a'; + const locator = playwrightContext.page.locator(selector).first(); + await locator.waitFor(); + + const urls = await extractUrlsFromPage( + playwrightContext.page, + selector, + options.baseUrl ?? + playwrightContext.request.loadedUrl ?? 
+ playwrightContext.request.url, + ); + await result.enqueueLinks({ ...options, urls }); + }, + addRequests: result.addRequests, + pushData: result.pushData, + useState: this.allowStorageAccess(result.useState), + getKeyValueStore: this.allowStorageAccess(result.getKeyValueStore), + }), + ); } return Reflect.get(target, propertyName, receiver); }, @@ -318,7 +347,9 @@ export class AdaptivePlaywrightCrawler extends PlaywrightCrawler { } } - protected async runRequestHandlerWithPlainHTTP(crawlingContext: PlaywrightCrawlingContext): Promise> { + protected async runRequestHandlerWithPlainHTTP( + crawlingContext: PlaywrightCrawlingContext, + ): Promise> { const result = new RequestHandlerResult(this.config, AdaptivePlaywrightCrawler.CRAWLEE_STATE_KEY); const response = await crawlingContext.sendRequest({}); @@ -329,25 +360,35 @@ export class AdaptivePlaywrightCrawler extends PlaywrightCrawler { try { await withCheckedStorageAccess( () => { - throw new Error('Directly accessing storage in a request handler is not allowed in AdaptivePlaywrightCrawler'); + throw new Error( + 'Directly accessing storage in a request handler is not allowed in AdaptivePlaywrightCrawler', + ); }, - async () => addTimeoutToPromise( - async () => this.adaptiveRequestHandler({ - request: crawlingContext.request, - log: crawlingContext.log, - querySelector: (selector) => $(selector) as Cheerio, - enqueueLinks: async (options: Parameters[0] = {}) => { - const urls = extractUrlsFromCheerio($, options.selector, options.baseUrl ?? loadedUrl); - await result.enqueueLinks({ ...options, urls }); - }, - addRequests: result.addRequests, - pushData: result.pushData, - useState: this.allowStorageAccess(result.useState), - getKeyValueStore: this.allowStorageAccess(result.getKeyValueStore), - }), - this.requestHandlerTimeoutInnerMillis, - 'Request handler timed out', - ), + async () => + addTimeoutToPromise( + async () => + this.adaptiveRequestHandler({ + request: crawlingContext.request, + log: crawlingContext.log, + querySelector: (selector) => $(selector) as Cheerio, + enqueueLinks: async ( + options: Parameters[0] = {}, + ) => { + const urls = extractUrlsFromCheerio( + $, + options.selector, + options.baseUrl ?? loadedUrl, + ); + await result.enqueueLinks({ ...options, urls }); + }, + addRequests: result.addRequests, + pushData: result.pushData, + useState: this.allowStorageAccess(result.useState), + getKeyValueStore: this.allowStorageAccess(result.getKeyValueStore), + }), + this.requestHandlerTimeoutInnerMillis, + 'Request handler timed out', + ), ); return { result, ok: true }; diff --git a/packages/playwright-crawler/src/internals/enqueue-links/click-elements.ts b/packages/playwright-crawler/src/internals/enqueue-links/click-elements.ts index 3b4a472475ac..897e7da974c6 100644 --- a/packages/playwright-crawler/src/internals/enqueue-links/click-elements.ts +++ b/packages/playwright-crawler/src/internals/enqueue-links/click-elements.ts @@ -19,12 +19,7 @@ import { } from '@crawlee/browser'; import type { Dictionary, BatchAddRequestsResult } from '@crawlee/types'; import ow from 'ow'; -import type { - Page, - Frame, - Request, - Route, -} from 'playwright'; +import type { Page, Frame, Request, Route } from 'playwright'; const STARTING_Z_INDEX = 2147400000; const log = log_.child({ prefix: 'Playwright Click Elements' }); @@ -215,32 +210,28 @@ export interface EnqueueLinksByClickingElementsOptions { * * @returns Promise that resolves to {@apilink BatchAddRequestsResult} object. 
*/ -export async function enqueueLinksByClickingElements(options: EnqueueLinksByClickingElementsOptions): Promise { - ow(options, ow.object.exactShape({ - page: ow.object.hasKeys('goto', 'evaluate'), - requestQueue: ow.object.hasKeys('fetchNextRequest', 'addRequest'), - selector: ow.string, - userData: ow.optional.object, - clickOptions: ow.optional.object.hasKeys('clickCount', 'delay'), - pseudoUrls: ow.optional.array.ofType(ow.any( - ow.string, - ow.object.hasKeys('purl'), - )), - globs: ow.optional.array.ofType(ow.any( - ow.string, - ow.object.hasKeys('glob'), - )), - regexps: ow.optional.array.ofType(ow.any( - ow.regExp, - ow.object.hasKeys('regexp'), - )), - transformRequestFunction: ow.optional.function, - waitForPageIdleSecs: ow.optional.number, - maxWaitForPageIdleSecs: ow.optional.number, - label: ow.optional.string, - forefront: ow.optional.boolean, - skipNavigation: ow.optional.boolean, - })); +export async function enqueueLinksByClickingElements( + options: EnqueueLinksByClickingElementsOptions, +): Promise { + ow( + options, + ow.object.exactShape({ + page: ow.object.hasKeys('goto', 'evaluate'), + requestQueue: ow.object.hasKeys('fetchNextRequest', 'addRequest'), + selector: ow.string, + userData: ow.optional.object, + clickOptions: ow.optional.object.hasKeys('clickCount', 'delay'), + pseudoUrls: ow.optional.array.ofType(ow.any(ow.string, ow.object.hasKeys('purl'))), + globs: ow.optional.array.ofType(ow.any(ow.string, ow.object.hasKeys('glob'))), + regexps: ow.optional.array.ofType(ow.any(ow.regExp, ow.object.hasKeys('regexp'))), + transformRequestFunction: ow.optional.function, + waitForPageIdleSecs: ow.optional.number, + maxWaitForPageIdleSecs: ow.optional.number, + label: ow.optional.string, + forefront: ow.optional.boolean, + skipNavigation: ow.optional.boolean, + }), + ); const { page, @@ -308,14 +299,10 @@ interface ClickElementsAndInterceptNavigationRequestsOptions extends WaitForPage * Returns a list of all target URLs. * @ignore */ -export async function clickElementsAndInterceptNavigationRequests(options: ClickElementsAndInterceptNavigationRequestsOptions): Promise { - const { - page, - selector, - waitForPageIdleMillis, - maxWaitForPageIdleMillis, - clickOptions, - } = options; +export async function clickElementsAndInterceptNavigationRequests( + options: ClickElementsAndInterceptNavigationRequestsOptions, +): Promise { + const { page, selector, waitForPageIdleMillis, maxWaitForPageIdleMillis, clickOptions } = options; const uniqueRequests = new Set(); const context = page.context(); @@ -347,15 +334,20 @@ export async function clickElementsAndInterceptNavigationRequests(options: Click /** * @ignore */ -function createInterceptRequestHandler(page: Page, requests: Set): (route: Route, request: Request) => Promise { +function createInterceptRequestHandler( + page: Page, + requests: Set, +): (route: Route, request: Request) => Promise { return async function onInterceptedRequest(route, request) { if (!isTopFrameNavigationRequest(page, request)) return route.continue(); - requests.add(JSON.stringify({ - url: request.url(), - headers: request.headers(), - method: request.method(), - payload: request.postData() ?? undefined, - })); + requests.add( + JSON.stringify({ + url: request.url(), + headers: request.headers(), + method: request.method(), + payload: request.postData() ?? 
undefined, + }), + ); if (request.redirectedFrom()) { return route.fulfill({ body: '' }); // Prevents 301/302 redirect @@ -386,8 +378,7 @@ function createTargetCreatedHandler(requests: Set): (popup: Page) => Pro * @ignore */ function isTopFrameNavigationRequest(page: Page, req: Request): boolean { - return req.isNavigationRequest() - && req.frame() === page.mainFrame(); + return req.isNavigationRequest() && req.frame() === page.mainFrame(); } /** @@ -476,15 +467,19 @@ export async function clickElements(page: Page, selector: string, clickOptions?: } catch (err) { const e = err as Error; if (shouldLogWarning && e.stack!.includes('is detached from document')) { - log.warning(`An element with selector ${selector} that you're trying to click has been removed from the page. ` - + 'This was probably caused by an earlier click which triggered some JavaScript on the page that caused it to change. ' - + 'If you\'re trying to enqueue pagination links, we suggest using the "next" button, if available and going one by one.'); + log.warning( + `An element with selector ${selector} that you're trying to click has been removed from the page. ` + + 'This was probably caused by an earlier click which triggered some JavaScript on the page that caused it to change. ' + + 'If you\'re trying to enqueue pagination links, we suggest using the "next" button, if available and going one by one.', + ); shouldLogWarning = false; } log.debug('enqueueLinksByClickingElements: Click failed.', { stack: e.stack }); } } - log.debug(`enqueueLinksByClickingElements: Successfully clicked ${clickedElementsCount} elements out of ${elementHandles.length}`); + log.debug( + `enqueueLinksByClickingElements: Successfully clicked ${clickedElementsCount} elements out of ${elementHandles.length}`, + ); } /** @@ -501,7 +496,11 @@ export async function clickElements(page: Page, selector: string, clickOptions?: * when there's only a single element to click. * @ignore */ -async function waitForPageIdle({ page, waitForPageIdleMillis, maxWaitForPageIdleMillis }: WaitForPageIdleOptions): Promise { +async function waitForPageIdle({ + page, + waitForPageIdleMillis, + maxWaitForPageIdleMillis, +}: WaitForPageIdleOptions): Promise { return new Promise((resolve) => { let timeout: NodeJS.Timeout; let maxTimeout: NodeJS.Timeout; @@ -517,15 +516,15 @@ async function waitForPageIdle({ page, waitForPageIdleMillis, maxWaitForPageIdle } function maxTimeoutHandler() { - log.debug(`enqueueLinksByClickingElements: Page still showed activity after ${maxWaitForPageIdleMillis}ms. ` - + 'This is probably due to the website itself dispatching requests, but some links may also have been missed.'); + log.debug( + `enqueueLinksByClickingElements: Page still showed activity after ${maxWaitForPageIdleMillis}ms. 
` + + 'This is probably due to the website itself dispatching requests, but some links may also have been missed.', + ); finish(); } function finish() { - page.off('request', activityHandler) - .off('framenavigated', activityHandler) - .off('popup', activityHandler); + page.off('request', activityHandler).off('framenavigated', activityHandler).off('popup', activityHandler); resolve(); } diff --git a/packages/playwright-crawler/src/internals/playwright-crawler.ts b/packages/playwright-crawler/src/internals/playwright-crawler.ts index d87f5f51293b..cb152c940c7c 100644 --- a/packages/playwright-crawler/src/internals/playwright-crawler.ts +++ b/packages/playwright-crawler/src/internals/playwright-crawler.ts @@ -17,16 +17,15 @@ import { PlaywrightLauncher } from './playwright-launcher'; import type { DirectNavigationOptions, PlaywrightContextUtils } from './utils/playwright-utils'; import { gotoExtended, registerUtilsToContext } from './utils/playwright-utils'; -export interface PlaywrightCrawlingContext extends - BrowserCrawlingContext, PlaywrightContextUtils {} +export interface PlaywrightCrawlingContext + extends BrowserCrawlingContext, + PlaywrightContextUtils {} export interface PlaywrightHook extends BrowserHook {} export interface PlaywrightRequestHandler extends BrowserRequestHandler {} export type PlaywrightGotoOptions = Parameters[1]; -export interface PlaywrightCrawlerOptions extends BrowserCrawlerOptions< - PlaywrightCrawlingContext, - { browserPlugins: [PlaywrightPlugin] } -> { +export interface PlaywrightCrawlerOptions + extends BrowserCrawlerOptions { /** * The same options as used by {@apilink launchPlaywright}. */ @@ -187,7 +186,11 @@ export interface PlaywrightCrawlerOptions extends BrowserCrawlerOptions< * ``` * @category Crawlers */ -export class PlaywrightCrawler extends BrowserCrawler<{ browserPlugins: [PlaywrightPlugin] }, LaunchOptions, PlaywrightCrawlingContext> { +export class PlaywrightCrawler extends BrowserCrawler< + { browserPlugins: [PlaywrightPlugin] }, + LaunchOptions, + PlaywrightCrawlingContext +> { protected static override optionsShape = { ...BrowserCrawler.optionsShape, browserPoolOptions: ow.optional.object, @@ -197,22 +200,23 @@ export class PlaywrightCrawler extends BrowserCrawler<{ browserPlugins: [Playwri /** * All `PlaywrightCrawler` parameters are passed via an options object. */ - constructor(options: PlaywrightCrawlerOptions = {}, override readonly config = Configuration.getGlobalConfig()) { + constructor( + options: PlaywrightCrawlerOptions = {}, + override readonly config = Configuration.getGlobalConfig(), + ) { ow(options, 'PlaywrightCrawlerOptions', ow.object.exactShape(PlaywrightCrawler.optionsShape)); - const { - launchContext = {}, - headless, - ...browserCrawlerOptions - } = options; + const { launchContext = {}, headless, ...browserCrawlerOptions } = options; const browserPoolOptions = { ...options.browserPoolOptions, } as BrowserPoolOptions; if (launchContext.proxyUrl) { - throw new Error('PlaywrightCrawlerOptions.launchContext.proxyUrl is not allowed in PlaywrightCrawler.' - + 'Use PlaywrightCrawlerOptions.proxyConfiguration'); + throw new Error( + 'PlaywrightCrawlerOptions.launchContext.proxyUrl is not allowed in PlaywrightCrawler.' 
+ + 'Use PlaywrightCrawlerOptions.proxyConfiguration', + ); } // `browserPlugins` is working when it's not overriden by `launchContext`, @@ -228,9 +232,7 @@ export class PlaywrightCrawler extends BrowserCrawler<{ browserPlugins: [Playwri const playwrightLauncher = new PlaywrightLauncher(launchContext, config); - browserPoolOptions.browserPlugins = [ - playwrightLauncher.createBrowserPlugin(), - ]; + browserPoolOptions.browserPlugins = [playwrightLauncher.createBrowserPlugin()]; super({ ...browserCrawlerOptions, launchContext, browserPoolOptions }, config); } @@ -240,7 +242,10 @@ export class PlaywrightCrawler extends BrowserCrawler<{ browserPlugins: [Playwri await super._runRequestHandler(context); } - protected override async _navigationHandler(crawlingContext: PlaywrightCrawlingContext, gotoOptions: DirectNavigationOptions) { + protected override async _navigationHandler( + crawlingContext: PlaywrightCrawlingContext, + gotoOptions: DirectNavigationOptions, + ) { return gotoExtended(crawlingContext.page, crawlingContext.request, gotoOptions); } } diff --git a/packages/playwright-crawler/src/internals/playwright-launcher.ts b/packages/playwright-crawler/src/internals/playwright-launcher.ts index 43c355c155ae..f4fac6cf8fd4 100644 --- a/packages/playwright-crawler/src/internals/playwright-launcher.ts +++ b/packages/playwright-crawler/src/internals/playwright-launcher.ts @@ -49,17 +49,17 @@ export interface PlaywrightLaunchContext extends BrowserLaunchContext { ow(launchContext, 'PlaywrightLauncherOptions', ow.object.exactShape(PlaywrightLauncher.optionsShape)); const { - launcher = BrowserLauncher.requireLauncherOrThrow('playwright', 'apify/actor-node-playwright-*').chromium, + launcher = BrowserLauncher.requireLauncherOrThrow( + 'playwright', + 'apify/actor-node-playwright-*', + ).chromium, } = launchContext; const { launchOptions = {}, ...rest } = launchContext; - super({ - ...rest, - launchOptions: { - ...launchOptions, - executablePath: getDefaultExecutablePath(launchContext, config), + super( + { + ...rest, + launchOptions: { + ...launchOptions, + executablePath: getDefaultExecutablePath(launchContext, config), + }, + launcher, }, - launcher, - }, config); + config, + ); this.Plugin = PlaywrightPlugin; } @@ -171,7 +177,10 @@ function getDefaultExecutablePath(launchContext: PlaywrightLaunchContext, config * @returns * Promise that resolves to Playwright's `Browser` instance. 
*/ -export async function launchPlaywright(launchContext?: PlaywrightLaunchContext, config = Configuration.getGlobalConfig()): Promise { +export async function launchPlaywright( + launchContext?: PlaywrightLaunchContext, + config = Configuration.getGlobalConfig(), +): Promise { const playwrightLauncher = new PlaywrightLauncher(launchContext, config); return playwrightLauncher.launch(); diff --git a/packages/playwright-crawler/src/internals/utils/playwright-utils.ts b/packages/playwright-crawler/src/internals/utils/playwright-utils.ts index 10677d3d0e8e..91057f564990 100644 --- a/packages/playwright-crawler/src/internals/utils/playwright-utils.ts +++ b/packages/playwright-crawler/src/internals/utils/playwright-utils.ts @@ -87,9 +87,12 @@ const injectedFilesCache = new LruCache({ maxLength: MAX_INJECT_FILE_CACHE_SIZE export async function injectFile(page: Page, filePath: string, options: InjectFileOptions = {}): Promise { ow(page, ow.object.validate(validators.browserPage)); ow(filePath, ow.string); - ow(options, ow.object.exactShape({ - surviveNavigations: ow.optional.boolean, - })); + ow( + options, + ow.object.exactShape({ + surviveNavigations: ow.optional.boolean, + }), + ); let contents = injectedFilesCache.get(filePath); if (!contents) { @@ -99,9 +102,11 @@ export async function injectFile(page: Page, filePath: string, options: InjectFi const evalP = page.evaluate(contents); if (options.surviveNavigations) { - page.on('framenavigated', - async () => page.evaluate(contents) - .catch((error) => log.warning('An error occurred during the script injection!', { error }))); + page.on('framenavigated', async () => + page + .evaluate(contents) + .catch((error) => log.warning('An error occurred during the script injection!', { error })), + ); } return evalP; @@ -173,14 +178,21 @@ export interface DirectNavigationOptions { * @param request * @param [gotoOptions] Custom options for `page.goto()`. */ -export async function gotoExtended(page: Page, request: Request, gotoOptions: DirectNavigationOptions = {}): Promise { +export async function gotoExtended( + page: Page, + request: Request, + gotoOptions: DirectNavigationOptions = {}, +): Promise { ow(page, ow.object.validate(validators.browserPage)); - ow(request, ow.object.partialShape({ - url: ow.string.url, - method: ow.optional.string, - headers: ow.optional.object, - payload: ow.optional.any(ow.string, ow.buffer), - })); + ow( + request, + ow.object.partialShape({ + url: ow.string.url, + method: ow.optional.string, + headers: ow.optional.object, + payload: ow.optional.any(ow.string, ow.buffer), + }), + ); ow(gotoOptions, ow.object); const { url, method, headers, payload } = request; @@ -188,8 +200,10 @@ export async function gotoExtended(page: Page, request: Request, gotoOptions: Di if (method !== 'GET' || payload || !isEmpty(headers)) { // This is not deprecated, we use it to log only once. - log.deprecated('Using other request methods than GET, rewriting headers and adding payloads has a high impact on performance ' - + 'in recent versions of Playwright. Use only when necessary.'); + log.deprecated( + 'Using other request methods than GET, rewriting headers and adding payloads has a high impact on performance ' + + 'in recent versions of Playwright. 
Use only when necessary.', + ); let wasCalled = false; const interceptRequestHandler = async (route: Route) => { try { @@ -266,15 +280,15 @@ export async function gotoExtended(page: Page, request: Request, gotoOptions: Di */ export async function blockRequests(page: Page, options: BlockRequestsOptions = {}): Promise { ow(page, ow.object.validate(validators.browserPage)); - ow(options, ow.object.exactShape({ - urlPatterns: ow.optional.array.ofType(ow.string), - extraUrlPatterns: ow.optional.array.ofType(ow.string), - })); + ow( + options, + ow.object.exactShape({ + urlPatterns: ow.optional.array.ofType(ow.string), + extraUrlPatterns: ow.optional.array.ofType(ow.string), + }), + ); - const { - urlPatterns = DEFAULT_BLOCK_REQUEST_URL_PATTERNS, - extraUrlPatterns = [], - } = options; + const { urlPatterns = DEFAULT_BLOCK_REQUEST_URL_PATTERNS, extraUrlPatterns = [] } = options; const patternsToBlock = [...urlPatterns, ...extraUrlPatterns]; @@ -381,16 +395,26 @@ export interface InfiniteScrollOptions { */ export async function infiniteScroll(page: Page, options: InfiniteScrollOptions = {}): Promise { ow(page, ow.object.validate(validators.browserPage)); - ow(options, ow.object.exactShape({ - timeoutSecs: ow.optional.number, - maxScrollHeight: ow.optional.number, - waitForSecs: ow.optional.number, - scrollDownAndUp: ow.optional.boolean, - buttonSelector: ow.optional.string, - stopScrollCallback: ow.optional.function, - })); + ow( + options, + ow.object.exactShape({ + timeoutSecs: ow.optional.number, + maxScrollHeight: ow.optional.number, + waitForSecs: ow.optional.number, + scrollDownAndUp: ow.optional.boolean, + buttonSelector: ow.optional.string, + stopScrollCallback: ow.optional.function, + }), + ); - const { timeoutSecs = 0, maxScrollHeight = 0, waitForSecs = 4, scrollDownAndUp = false, buttonSelector, stopScrollCallback } = options; + const { + timeoutSecs = 0, + maxScrollHeight = 0, + waitForSecs = 4, + scrollDownAndUp = false, + buttonSelector, + stopScrollCallback, + } = options; let finished; const startTime = Date.now(); @@ -446,7 +470,7 @@ export async function infiniteScroll(page: Page, options: InfiniteScrollOptions const maybeClickButton = async () => { const button = await page.$(buttonSelector!); // Box model returns null if the button is not visible - if (button && await button.boundingBox()) { + if (button && (await button.boundingBox())) { await button.click({ delay: 10 }); } }; @@ -514,14 +538,17 @@ export interface SaveSnapshotOptions { */ export async function saveSnapshot(page: Page, options: SaveSnapshotOptions = {}): Promise { ow(page, ow.object.validate(validators.browserPage)); - ow(options, ow.object.exactShape({ - key: ow.optional.string.nonEmpty, - screenshotQuality: ow.optional.number, - saveScreenshot: ow.optional.boolean, - saveHtml: ow.optional.boolean, - keyValueStoreName: ow.optional.string, - config: ow.optional.object, - })); + ow( + options, + ow.object.exactShape({ + key: ow.optional.string.nonEmpty, + screenshotQuality: ow.optional.number, + saveScreenshot: ow.optional.boolean, + saveHtml: ow.optional.boolean, + keyValueStoreName: ow.optional.string, + config: ow.optional.object, + }), + ); const { key = 'SNAPSHOT', @@ -533,11 +560,18 @@ export async function saveSnapshot(page: Page, options: SaveSnapshotOptions = {} } = options; try { - const store = await KeyValueStore.open(keyValueStoreName, { config: config ?? Configuration.getGlobalConfig() }); + const store = await KeyValueStore.open(keyValueStoreName, { + config: config ?? 
Configuration.getGlobalConfig(), + }); if (saveScreenshot) { const screenshotName = `${key}.jpg`; - const screenshotBuffer = await page.screenshot({ fullPage: true, quality: screenshotQuality, type: 'jpeg', animations: 'disabled' }); + const screenshotBuffer = await page.screenshot({ + fullPage: true, + quality: screenshotQuality, + type: 'jpeg', + animations: 'disabled', + }); await store.setValue(screenshotName, screenshotBuffer, { contentType: 'image/jpeg' }); } @@ -716,34 +750,36 @@ export interface PlaywrightContextUtils { * * @returns Promise that resolves to {@apilink BatchAddRequestsResult} object. */ - enqueueLinksByClickingElements(options: Omit): Promise; + enqueueLinksByClickingElements( + options: Omit, + ): Promise; /** - * Compiles a Playwright script into an async function that may be executed at any time - * by providing it with the following object: - * ``` - * { - * page: Page, - * request: Request, - * } - * ``` - * Where `page` is a Playwright [`Page`](https://playwright.dev/docs/api/class-page) - * and `request` is a {@apilink Request}. - * - * The function is compiled by using the `scriptString` parameter as the function's body, - * so any limitations to function bodies apply. Return value of the compiled function - * is the return value of the function body = the `scriptString` parameter. - * - * As a security measure, no globals such as `process` or `require` are accessible - * from within the function body. Note that the function does not provide a safe - * sandbox and even though globals are not easily accessible, malicious code may - * still execute in the main process via prototype manipulation. Therefore you - * should only use this function to execute sanitized or safe code. - * - * Custom context may also be provided using the `context` parameter. To improve security, - * make sure to only pass the really necessary objects to the context. Preferably making - * secured copies beforehand. - */ + * Compiles a Playwright script into an async function that may be executed at any time + * by providing it with the following object: + * ``` + * { + * page: Page, + * request: Request, + * } + * ``` + * Where `page` is a Playwright [`Page`](https://playwright.dev/docs/api/class-page) + * and `request` is a {@apilink Request}. + * + * The function is compiled by using the `scriptString` parameter as the function's body, + * so any limitations to function bodies apply. Return value of the compiled function + * is the return value of the function body = the `scriptString` parameter. + * + * As a security measure, no globals such as `process` or `require` are accessible + * from within the function body. Note that the function does not provide a safe + * sandbox and even though globals are not easily accessible, malicious code may + * still execute in the main process via prototype manipulation. Therefore you + * should only use this function to execute sanitized or safe code. + * + * Custom context may also be provided using the `context` parameter. To improve security, + * make sure to only pass the really necessary objects to the context. Preferably making + * secured copies beforehand. 
+ */ compileScript(scriptString: string, ctx?: Dictionary): CompiledScriptFunction; /** @@ -753,25 +789,32 @@ export interface PlaywrightContextUtils { } export function registerUtilsToContext(context: PlaywrightCrawlingContext): void { - context.injectFile = async (filePath: string, options?: InjectFileOptions) => injectFile(context.page, filePath, options); - context.injectJQuery = (async () => { + context.injectFile = async (filePath: string, options?: InjectFileOptions) => + injectFile(context.page, filePath, options); + context.injectJQuery = async () => { if (context.request.state === RequestState.BEFORE_NAV) { - log.warning('Using injectJQuery() in preNavigationHooks leads to unstable results. Use it in a postNavigationHook or a requestHandler instead.'); + log.warning( + 'Using injectJQuery() in preNavigationHooks leads to unstable results. Use it in a postNavigationHook or a requestHandler instead.', + ); await injectJQuery(context.page); return; } await injectJQuery(context.page, { surviveNavigations: false }); - }); + }; context.blockRequests = async (options?: BlockRequestsOptions) => blockRequests(context.page, options); context.parseWithCheerio = async () => parseWithCheerio(context.page); context.infiniteScroll = async (options?: InfiniteScrollOptions) => infiniteScroll(context.page, options); - context.saveSnapshot = async (options?: SaveSnapshotOptions) => saveSnapshot(context.page, { ...options, config: context.crawler.config }); + context.saveSnapshot = async (options?: SaveSnapshotOptions) => + saveSnapshot(context.page, { ...options, config: context.crawler.config }); // eslint-disable-next-line max-len - context.enqueueLinksByClickingElements = async (options: Omit) => enqueueLinksByClickingElements({ - ...options, - page: context.page, - requestQueue: context.crawler.requestQueue!, - }); + context.enqueueLinksByClickingElements = async ( + options: Omit, + ) => + enqueueLinksByClickingElements({ + ...options, + page: context.page, + requestQueue: context.crawler.requestQueue!, + }); context.compileScript = (scriptString: string, ctx?: Dictionary) => compileScript(scriptString, ctx); context.closeCookieModals = async () => closeCookieModals(context.page); } diff --git a/packages/playwright-crawler/src/internals/utils/rendering-type-prediction.ts b/packages/playwright-crawler/src/internals/utils/rendering-type-prediction.ts index 9938a61c2cfa..ee837341e325 100644 --- a/packages/playwright-crawler/src/internals/utils/rendering-type-prediction.ts +++ b/packages/playwright-crawler/src/internals/utils/rendering-type-prediction.ts @@ -2,7 +2,7 @@ import LogisticRegression from 'ml-logistic-regression'; import { Matrix } from 'ml-matrix'; import stringComparison from 'string-comparison'; -export type RenderingType = 'clientOnly' | 'static' +export type RenderingType = 'clientOnly' | 'static'; type URLComponents = string[]; @@ -52,18 +52,27 @@ export class RenderingTypePredictor { /** * Predict the rendering type for a given URL and request label. 
*/ - public predict(url: URL, label: string | undefined): { renderingType: RenderingType; detectionProbabilityRecommendation: number } { + public predict( + url: URL, + label: string | undefined, + ): { renderingType: RenderingType; detectionProbabilityRecommendation: number } { if (this.logreg.classifiers.length === 0) { return { renderingType: 'clientOnly', detectionProbabilityRecommendation: 1 }; } const urlFeature = new Matrix([this.calculateFeatureVector(urlComponents(url), label)]); const [prediction] = this.logreg.predict(urlFeature); - const scores = [this.logreg.classifiers[0].testScores(urlFeature), this.logreg.classifiers[1].testScores(urlFeature)]; + const scores = [ + this.logreg.classifiers[0].testScores(urlFeature), + this.logreg.classifiers[1].testScores(urlFeature), + ]; return { renderingType: prediction === 1 ? 'static' : 'clientOnly', - detectionProbabilityRecommendation: Math.abs(scores[0] - scores[1]) < 0.1 ? 1 : this.detectionRatio * Math.max(1, 5 - this.resultCount(label)), + detectionProbabilityRecommendation: + Math.abs(scores[0] - scores[1]) < 0.1 + ? 1 + : this.detectionRatio * Math.max(1, 5 - this.resultCount(label)), }; } @@ -91,8 +100,16 @@ export class RenderingTypePredictor { protected calculateFeatureVector(url: URLComponents, label: string | undefined): FeatureVector { return [ - mean((this.renderingTypeDetectionResults.get('static')?.get(label) ?? []).map((otherUrl) => calculateUrlSimilarity(url, otherUrl) ?? 0)) ?? 0, - mean((this.renderingTypeDetectionResults.get('clientOnly')?.get(label) ?? []).map((otherUrl) => calculateUrlSimilarity(url, otherUrl) ?? 0)) ?? 0, + mean( + (this.renderingTypeDetectionResults.get('static')?.get(label) ?? []).map( + (otherUrl) => calculateUrlSimilarity(url, otherUrl) ?? 0, + ), + ) ?? 0, + mean( + (this.renderingTypeDetectionResults.get('clientOnly')?.get(label) ?? []).map( + (otherUrl) => calculateUrlSimilarity(url, otherUrl) ?? 0, + ), + ) ?? 
0, ]; } diff --git a/packages/playwright-crawler/src/logistic-regression.d.ts b/packages/playwright-crawler/src/logistic-regression.d.ts index c4b501a85756..b9ac288799b5 100644 --- a/packages/playwright-crawler/src/logistic-regression.d.ts +++ b/packages/playwright-crawler/src/logistic-regression.d.ts @@ -1,22 +1,22 @@ declare module 'ml-logistic-regression' { - import Matrix from 'ml-matrix'; + import Matrix from 'ml-matrix'; - class LogisticRegressionTwoClasses { - testScores(Xtest: Matrix): number; - } + class LogisticRegressionTwoClasses { + testScores(Xtest: Matrix): number; + } - export default class LogisticRegression { - classifiers: LogisticRegressionTwoClasses[]; + export default class LogisticRegression { + classifiers: LogisticRegressionTwoClasses[]; - constructor( - options: Partial<{ - numSteps: number; - learningRate: number; - }>, - ); + constructor( + options: Partial<{ + numSteps: number; + learningRate: number; + }>, + ); - train(X: Matrix, Y: Matrix): void; + train(X: Matrix, Y: Matrix): void; - predict(Xtest: Matrix): number[]; - } + predict(Xtest: Matrix): number[]; + } } diff --git a/packages/playwright-crawler/tsconfig.build.json b/packages/playwright-crawler/tsconfig.build.json index 856db0f2100a..2e04e9e9f921 100644 --- a/packages/playwright-crawler/tsconfig.build.json +++ b/packages/playwright-crawler/tsconfig.build.json @@ -1,7 +1,7 @@ { - "extends": "../../tsconfig.build.json", - "compilerOptions": { - "outDir": "./dist" - }, - "include": ["src/**/*"] + "extends": "../../tsconfig.build.json", + "compilerOptions": { + "outDir": "./dist" + }, + "include": ["src/**/*"] } diff --git a/packages/puppeteer-crawler/src/internals/enqueue-links/click-elements.ts b/packages/puppeteer-crawler/src/internals/enqueue-links/click-elements.ts index b659f16ac906..e34ffc9e9048 100644 --- a/packages/puppeteer-crawler/src/internals/enqueue-links/click-elements.ts +++ b/packages/puppeteer-crawler/src/internals/enqueue-links/click-elements.ts @@ -19,13 +19,7 @@ import { } from '@crawlee/browser'; import type { Dictionary, BatchAddRequestsResult } from '@crawlee/types'; import ow from 'ow'; -import type { - ClickOptions, - Frame, - HTTPRequest as PuppeteerRequest, - Page, - Target, -} from 'puppeteer'; +import type { ClickOptions, Frame, HTTPRequest as PuppeteerRequest, Page, Target } from 'puppeteer'; import { addInterceptRequestHandler, removeInterceptRequestHandler } from '../utils/puppeteer_request_interception'; @@ -216,32 +210,28 @@ export interface EnqueueLinksByClickingElementsOptions { * * @returns Promise that resolves to {@apilink BatchAddRequestsResult} object. 
*/ -export async function enqueueLinksByClickingElements(options: EnqueueLinksByClickingElementsOptions): Promise { - ow(options, ow.object.exactShape({ - page: ow.object.hasKeys('goto', 'evaluate'), - requestQueue: ow.object.hasKeys('fetchNextRequest', 'addRequest'), - selector: ow.string, - userData: ow.optional.object, - clickOptions: ow.optional.object.hasKeys('clickCount', 'delay'), - pseudoUrls: ow.optional.array.ofType(ow.any( - ow.string, - ow.object.hasKeys('purl'), - )), - globs: ow.optional.array.ofType(ow.any( - ow.string, - ow.object.hasKeys('glob'), - )), - regexps: ow.optional.array.ofType(ow.any( - ow.regExp, - ow.object.hasKeys('regexp'), - )), - transformRequestFunction: ow.optional.function, - waitForPageIdleSecs: ow.optional.number, - maxWaitForPageIdleSecs: ow.optional.number, - label: ow.optional.string, - forefront: ow.optional.boolean, - skipNavigation: ow.optional.boolean, - })); +export async function enqueueLinksByClickingElements( + options: EnqueueLinksByClickingElementsOptions, +): Promise { + ow( + options, + ow.object.exactShape({ + page: ow.object.hasKeys('goto', 'evaluate'), + requestQueue: ow.object.hasKeys('fetchNextRequest', 'addRequest'), + selector: ow.string, + userData: ow.optional.object, + clickOptions: ow.optional.object.hasKeys('clickCount', 'delay'), + pseudoUrls: ow.optional.array.ofType(ow.any(ow.string, ow.object.hasKeys('purl'))), + globs: ow.optional.array.ofType(ow.any(ow.string, ow.object.hasKeys('glob'))), + regexps: ow.optional.array.ofType(ow.any(ow.regExp, ow.object.hasKeys('regexp'))), + transformRequestFunction: ow.optional.function, + waitForPageIdleSecs: ow.optional.number, + maxWaitForPageIdleSecs: ow.optional.number, + label: ow.optional.string, + forefront: ow.optional.boolean, + skipNavigation: ow.optional.boolean, + }), + ); const { page, @@ -309,14 +299,10 @@ interface ClickElementsAndInterceptNavigationRequestsOptions extends WaitForPage * Returns a list of all target URLs. 
* @ignore */ -export async function clickElementsAndInterceptNavigationRequests(options: ClickElementsAndInterceptNavigationRequestsOptions): Promise { - const { - page, - selector, - waitForPageIdleMillis, - maxWaitForPageIdleMillis, - clickOptions, - } = options; +export async function clickElementsAndInterceptNavigationRequests( + options: ClickElementsAndInterceptNavigationRequestsOptions, +): Promise { + const { page, selector, waitForPageIdleMillis, maxWaitForPageIdleMillis, clickOptions } = options; const uniqueRequests = new Set(); const browser = page.browser(); @@ -351,12 +337,14 @@ function createInterceptRequestHandler(page: Page, requests: Set): (req: return async function onInterceptedRequest(req) { if (!isTopFrameNavigationRequest(page, req)) return req.continue(); const url = req.url(); - requests.add(JSON.stringify({ - url, - headers: req.headers(), - method: req.method(), - payload: req.postData(), - })); + requests.add( + JSON.stringify({ + url, + headers: req.headers(), + method: req.method(), + payload: req.postData(), + }), + ); if (req.redirectChain().length) { await req.respond({ body: '' }); // Prevents 301/302 redirect @@ -370,8 +358,7 @@ function createInterceptRequestHandler(page: Page, requests: Set): (req: * @ignore */ function isTopFrameNavigationRequest(page: Page, req: PuppeteerRequest): boolean { - return req.isNavigationRequest() - && req.frame() === page.mainFrame(); + return req.isNavigationRequest() && req.frame() === page.mainFrame(); } /** @@ -399,8 +386,7 @@ function createTargetCreatedHandler(page: Page, requests: Set): (target: * There will generally be a lot of other targets being created in the browser. */ export function isTargetRelevant(page: Page, target: Target): boolean { - return target.type() === 'page' - && page.target() === target.opener(); + return target.type() === 'page' && page.target() === target.opener(); } /** @@ -472,15 +458,19 @@ export async function clickElements(page: Page, selector: string, clickOptions?: } catch (err) { const e = err as Error; if (shouldLogWarning && e.stack!.includes('is detached from document')) { - log.warning(`An element with selector ${selector} that you're trying to click has been removed from the page. ` - + 'This was probably caused by an earlier click which triggered some JavaScript on the page that caused it to change. ' - + 'If you\'re trying to enqueue pagination links, we suggest using the "next" button, if available and going one by one.'); + log.warning( + `An element with selector ${selector} that you're trying to click has been removed from the page. ` + + 'This was probably caused by an earlier click which triggered some JavaScript on the page that caused it to change. ' + + 'If you\'re trying to enqueue pagination links, we suggest using the "next" button, if available and going one by one.', + ); shouldLogWarning = false; } log.debug('enqueueLinksByClickingElements: Click failed.', { stack: e.stack }); } } - log.debug(`enqueueLinksByClickingElements: Successfully clicked ${clickedElementsCount} elements out of ${elementHandles.length}`); + log.debug( + `enqueueLinksByClickingElements: Successfully clicked ${clickedElementsCount} elements out of ${elementHandles.length}`, + ); } /* istanbul ignore next */ @@ -514,7 +504,11 @@ function updateElementCssToEnableMouseClick(el: Element, zIndex: number): void { * when there's only a single element to click. 
* @ignore */ -async function waitForPageIdle({ page, waitForPageIdleMillis, maxWaitForPageIdleMillis }: WaitForPageIdleOptions): Promise { +async function waitForPageIdle({ + page, + waitForPageIdleMillis, + maxWaitForPageIdleMillis, +}: WaitForPageIdleOptions): Promise { return new Promise((resolve) => { let timeout: NodeJS.Timeout; let maxTimeout: NodeJS.Timeout; @@ -533,8 +527,10 @@ async function waitForPageIdle({ page, waitForPageIdleMillis, maxWaitForPageIdle } function maxTimeoutHandler() { - log.debug(`enqueueLinksByClickingElements: Page still showed activity after ${maxWaitForPageIdleMillis}ms. ` - + 'This is probably due to the website itself dispatching requests, but some links may also have been missed.'); + log.debug( + `enqueueLinksByClickingElements: Page still showed activity after ${maxWaitForPageIdleMillis}ms. ` + + 'This is probably due to the website itself dispatching requests, but some links may also have been missed.', + ); finish(); } diff --git a/packages/puppeteer-crawler/src/internals/puppeteer-crawler.ts b/packages/puppeteer-crawler/src/internals/puppeteer-crawler.ts index 2682e654e3a0..3366f2b19f50 100644 --- a/packages/puppeteer-crawler/src/internals/puppeteer-crawler.ts +++ b/packages/puppeteer-crawler/src/internals/puppeteer-crawler.ts @@ -17,16 +17,15 @@ import { PuppeteerLauncher } from './puppeteer-launcher'; import type { DirectNavigationOptions, PuppeteerContextUtils } from './utils/puppeteer_utils'; import { gotoExtended, registerUtilsToContext } from './utils/puppeteer_utils'; -export interface PuppeteerCrawlingContext extends - BrowserCrawlingContext, PuppeteerContextUtils {} +export interface PuppeteerCrawlingContext + extends BrowserCrawlingContext, + PuppeteerContextUtils {} export interface PuppeteerHook extends BrowserHook {} export interface PuppeteerRequestHandler extends BrowserRequestHandler {} export type PuppeteerGoToOptions = Parameters[1]; -export interface PuppeteerCrawlerOptions extends BrowserCrawlerOptions< - PuppeteerCrawlingContext, - { browserPlugins: [PuppeteerPlugin] } -> { +export interface PuppeteerCrawlerOptions + extends BrowserCrawlerOptions { /** * Options used by {@apilink launchPuppeteer} to start new Puppeteer instances. */ @@ -132,7 +131,11 @@ export interface PuppeteerCrawlerOptions extends BrowserCrawlerOptions< * ``` * @category Crawlers */ -export class PuppeteerCrawler extends BrowserCrawler<{ browserPlugins: [PuppeteerPlugin] }, LaunchOptions, PuppeteerCrawlingContext> { +export class PuppeteerCrawler extends BrowserCrawler< + { browserPlugins: [PuppeteerPlugin] }, + LaunchOptions, + PuppeteerCrawlingContext +> { protected static override optionsShape = { ...BrowserCrawler.optionsShape, browserPoolOptions: ow.optional.object, @@ -141,23 +144,23 @@ export class PuppeteerCrawler extends BrowserCrawler<{ browserPlugins: [Puppetee /** * All `PuppeteerCrawler` parameters are passed via an options object. 
*/ - constructor(options: PuppeteerCrawlerOptions = {}, override readonly config = Configuration.getGlobalConfig()) { + constructor( + options: PuppeteerCrawlerOptions = {}, + override readonly config = Configuration.getGlobalConfig(), + ) { ow(options, 'PuppeteerCrawlerOptions', ow.object.exactShape(PuppeteerCrawler.optionsShape)); - const { - launchContext = {}, - headless, - proxyConfiguration, - ...browserCrawlerOptions - } = options; + const { launchContext = {}, headless, proxyConfiguration, ...browserCrawlerOptions } = options; const browserPoolOptions = { ...options.browserPoolOptions, } as BrowserPoolOptions; if (launchContext.proxyUrl) { - throw new Error('PuppeteerCrawlerOptions.launchContext.proxyUrl is not allowed in PuppeteerCrawler.' - + 'Use PuppeteerCrawlerOptions.proxyConfiguration'); + throw new Error( + 'PuppeteerCrawlerOptions.launchContext.proxyUrl is not allowed in PuppeteerCrawler.' + + 'Use PuppeteerCrawlerOptions.proxyConfiguration', + ); } // `browserPlugins` is working when it's not overridden by `launchContext`, @@ -173,9 +176,7 @@ export class PuppeteerCrawler extends BrowserCrawler<{ browserPlugins: [Puppetee const puppeteerLauncher = new PuppeteerLauncher(launchContext, config); - browserPoolOptions.browserPlugins = [ - puppeteerLauncher.createBrowserPlugin(), - ]; + browserPoolOptions.browserPlugins = [puppeteerLauncher.createBrowserPlugin()]; super({ ...browserCrawlerOptions, launchContext, proxyConfiguration, browserPoolOptions }, config); } @@ -185,7 +186,10 @@ export class PuppeteerCrawler extends BrowserCrawler<{ browserPlugins: [Puppetee await super._runRequestHandler(context); } - protected override async _navigationHandler(crawlingContext: PuppeteerCrawlingContext, gotoOptions: DirectNavigationOptions) { + protected override async _navigationHandler( + crawlingContext: PuppeteerCrawlingContext, + gotoOptions: DirectNavigationOptions, + ) { return gotoExtended(crawlingContext.page, crawlingContext.request, gotoOptions); } } diff --git a/packages/puppeteer-crawler/src/internals/puppeteer-launcher.ts b/packages/puppeteer-crawler/src/internals/puppeteer-launcher.ts index 4f5f369a693b..fc6b13ee8769 100644 --- a/packages/puppeteer-crawler/src/internals/puppeteer-launcher.ts +++ b/packages/puppeteer-crawler/src/internals/puppeteer-launcher.ts @@ -91,17 +91,20 @@ export class PuppeteerLauncher extends BrowserLauncher ...browserLauncherOptions } = launchContext; - super({ - ...browserLauncherOptions, - launcher, - }, config); + super( + { + ...browserLauncherOptions, + launcher, + }, + config, + ); this.Plugin = PuppeteerPlugin; } protected override _getDefaultHeadlessOption(): boolean { const headless = super._getDefaultHeadlessOption(); - return headless ? 'new' as any : headless; + return headless ? ('new' as any) : headless; } } @@ -137,7 +140,10 @@ export class PuppeteerLauncher extends BrowserLauncher * @returns * Promise that resolves to Puppeteer's `Browser` instance. 
*/ -export async function launchPuppeteer(launchContext?: PuppeteerLaunchContext, config = Configuration.getGlobalConfig()): Promise { +export async function launchPuppeteer( + launchContext?: PuppeteerLaunchContext, + config = Configuration.getGlobalConfig(), +): Promise { const puppeteerLauncher = new PuppeteerLauncher(launchContext, config); return puppeteerLauncher.launch(); diff --git a/packages/puppeteer-crawler/src/internals/utils/puppeteer_request_interception.ts b/packages/puppeteer-crawler/src/internals/utils/puppeteer_request_interception.ts index 7f5fc391a838..36c6f2db5330 100644 --- a/packages/puppeteer-crawler/src/internals/utils/puppeteer_request_interception.ts +++ b/packages/puppeteer-crawler/src/internals/utils/puppeteer_request_interception.ts @@ -43,7 +43,8 @@ function browserifyHeaders(headers: Record): Record = {}; // eslint-disable-next-line prefer-const for (let [key, value] of Object.entries(headers)) { - key = key.toLowerCase() + key = key + .toLowerCase() .split('-') .map((str) => str.charAt(0).toUpperCase() + str.slice(1)) .join('-'); @@ -202,9 +203,7 @@ export async function removeInterceptRequestHandler(page: Page, handler: Interce ow(page, ow.object.hasKeys('goto', 'evaluate')); ow(handler, ow.function); - const handlersArray = pageInterceptRequestHandlersMap - .get(page)! - .filter((item) => item !== handler); + const handlersArray = pageInterceptRequestHandlersMap.get(page)!.filter((item) => item !== handler); pageInterceptRequestHandlersMap.set(page, handlersArray); diff --git a/packages/puppeteer-crawler/src/internals/utils/puppeteer_utils.ts b/packages/puppeteer-crawler/src/internals/utils/puppeteer_utils.ts index 2202fa76430e..cf0603d35e9f 100644 --- a/packages/puppeteer-crawler/src/internals/utils/puppeteer_utils.ts +++ b/packages/puppeteer-crawler/src/internals/utils/puppeteer_utils.ts @@ -119,9 +119,12 @@ const injectedFilesCache = new LruCache({ maxLength: MAX_INJECT_FILE_CACHE_SIZE export async function injectFile(page: Page, filePath: string, options: InjectFileOptions = {}): Promise { ow(page, ow.object.validate(validators.browserPage)); ow(filePath, ow.string); - ow(options, ow.object.exactShape({ - surviveNavigations: ow.optional.boolean, - })); + ow( + options, + ow.object.exactShape({ + surviveNavigations: ow.optional.boolean, + }), + ); let contents = injectedFilesCache.get(filePath); if (!contents) { @@ -130,9 +133,11 @@ export async function injectFile(page: Page, filePath: string, options: InjectFi } const evalP = page.evaluate(contents); if (options.surviveNavigations) { - page.on('framenavigated', - async () => page.evaluate(contents) - .catch((error) => log.warning('An error occurred during the script injection!', { error }))); + page.on('framenavigated', async () => + page + .evaluate(contents) + .catch((error) => log.warning('An error occurred during the script injection!', { error })), + ); } return evalP; @@ -230,15 +235,15 @@ export async function parseWithCheerio(page: Page): Promise { */ export async function blockRequests(page: Page, options: BlockRequestsOptions = {}): Promise { ow(page, ow.object.validate(validators.browserPage)); - ow(options, ow.object.exactShape({ - urlPatterns: ow.optional.array.ofType(ow.string), - extraUrlPatterns: ow.optional.array.ofType(ow.string), - })); + ow( + options, + ow.object.exactShape({ + urlPatterns: ow.optional.array.ofType(ow.string), + extraUrlPatterns: ow.optional.array.ofType(ow.string), + }), + ); - const { - urlPatterns = DEFAULT_BLOCK_REQUEST_URL_PATTERNS, - extraUrlPatterns = [], 
- } = options; + const { urlPatterns = DEFAULT_BLOCK_REQUEST_URL_PATTERNS, extraUrlPatterns = [] } = options; const patternsToBlock = [...urlPatterns, ...extraUrlPatterns]; @@ -271,7 +276,9 @@ export async function sendCDPCommand( const parsed = JSON.parse(await readFile(jsonPath, 'utf-8')); // eslint-disable-next-line max-len - throw new Error(`Cannot detect CDP client for Puppeteer ${parsed.version}. You should report this to Crawlee, mentioning the puppeteer version you are using.`); + throw new Error( + `Cannot detect CDP client for Puppeteer ${parsed.version}. You should report this to Crawlee, mentioning the puppeteer version you are using.`, + ); } /** @@ -280,8 +287,10 @@ export async function sendCDPCommand( * @deprecated */ export const blockResources = async (page: Page, resourceTypes = ['stylesheet', 'font', 'image', 'media']) => { - log.deprecated('utils.puppeteer.blockResources() has a high impact on performance in recent versions of Puppeteer. ' - + 'Until this resolves, please use utils.puppeteer.blockRequests()'); + log.deprecated( + 'utils.puppeteer.blockResources() has a high impact on performance in recent versions of Puppeteer. ' + + 'Until this resolves, please use utils.puppeteer.blockRequests()', + ); await addInterceptRequestHandler(page, async (request) => { const type = request.resourceType(); if (resourceTypes.includes(type)) await request.abort(); @@ -305,13 +314,19 @@ export const blockResources = async (page: Page, resourceTypes = ['stylesheet', * String rules are compared as page.url().includes(rule) while RegExp rules are evaluated as rule.test(page.url()). * @deprecated */ -export async function cacheResponses(page: Page, cache: Dictionary>, responseUrlRules: (string | RegExp)[]): Promise { +export async function cacheResponses( + page: Page, + cache: Dictionary>, + responseUrlRules: (string | RegExp)[], +): Promise { ow(page, ow.object.validate(validators.browserPage)); ow(cache, ow.object); ow(responseUrlRules, ow.array.ofType(ow.any(ow.string, ow.regExp))); - log.deprecated('utils.puppeteer.cacheResponses() has a high impact on performance ' - + 'in recent versions of Puppeteer so it\'s use is discouraged until this issue resolves.'); + log.deprecated( + 'utils.puppeteer.cacheResponses() has a high impact on performance ' + + "in recent versions of Puppeteer so it's use is discouraged until this issue resolves.", + ); await addInterceptRequestHandler(page, async (request) => { const url = request.url(); @@ -405,14 +420,21 @@ export function compileScript(scriptString: string, context: Dictionary = Object * @param request * @param [gotoOptions] Custom options for `page.goto()`. 
*/ -export async function gotoExtended(page: Page, request: Request, gotoOptions: DirectNavigationOptions = {}): Promise { +export async function gotoExtended( + page: Page, + request: Request, + gotoOptions: DirectNavigationOptions = {}, +): Promise { ow(page, ow.object.validate(validators.browserPage)); - ow(request, ow.object.partialShape({ - url: ow.string.url, - method: ow.optional.string, - headers: ow.optional.object, - payload: ow.optional.any(ow.string, ow.buffer), - })); + ow( + request, + ow.object.partialShape({ + url: ow.string.url, + method: ow.optional.string, + headers: ow.optional.object, + payload: ow.optional.any(ow.string, ow.buffer), + }), + ); ow(gotoOptions, ow.object); const { url, method, headers, payload } = request; @@ -420,8 +442,10 @@ export async function gotoExtended(page: Page, request: Request, gotoOptions: Di if (method !== 'GET' || payload || !isEmpty(headers)) { // This is not deprecated, we use it to log only once. - log.deprecated('Using other request methods than GET, rewriting headers and adding payloads has a high impact on performance ' - + 'in recent versions of Puppeteer. Use only when necessary.'); + log.deprecated( + 'Using other request methods than GET, rewriting headers and adding payloads has a high impact on performance ' + + 'in recent versions of Puppeteer. Use only when necessary.', + ); let wasCalled = false; const interceptRequestHandler = async (interceptedRequest: PuppeteerRequest) => { // We want to ensure that this won't get executed again in a case that there is a subsequent request @@ -490,16 +514,26 @@ export interface InfiniteScrollOptions { */ export async function infiniteScroll(page: Page, options: InfiniteScrollOptions = {}): Promise { ow(page, ow.object.validate(validators.browserPage)); - ow(options, ow.object.exactShape({ - timeoutSecs: ow.optional.number, - maxScrollHeight: ow.optional.number, - waitForSecs: ow.optional.number, - scrollDownAndUp: ow.optional.boolean, - buttonSelector: ow.optional.string, - stopScrollCallback: ow.optional.function, - })); + ow( + options, + ow.object.exactShape({ + timeoutSecs: ow.optional.number, + maxScrollHeight: ow.optional.number, + waitForSecs: ow.optional.number, + scrollDownAndUp: ow.optional.boolean, + buttonSelector: ow.optional.string, + stopScrollCallback: ow.optional.function, + }), + ); - const { timeoutSecs = 0, maxScrollHeight = 0, waitForSecs = 4, scrollDownAndUp = false, buttonSelector, stopScrollCallback } = options; + const { + timeoutSecs = 0, + maxScrollHeight = 0, + waitForSecs = 4, + scrollDownAndUp = false, + buttonSelector, + stopScrollCallback, + } = options; let finished; const startTime = Date.now(); @@ -577,7 +611,7 @@ export async function infiniteScroll(page: Page, options: InfiniteScrollOptions const maybeClickButton = async () => { const button = await page.$(buttonSelector!); // Box model returns null if the button is not visible - if (button && await button.boxModel()) { + if (button && (await button.boxModel())) { await button.click({ delay: 10 }); } }; @@ -645,14 +679,17 @@ export interface SaveSnapshotOptions { */ export async function saveSnapshot(page: Page, options: SaveSnapshotOptions = {}): Promise { ow(page, ow.object.validate(validators.browserPage)); - ow(options, ow.object.exactShape({ - key: ow.optional.string.nonEmpty, - screenshotQuality: ow.optional.number, - saveScreenshot: ow.optional.boolean, - saveHtml: ow.optional.boolean, - keyValueStoreName: ow.optional.string, - config: ow.optional.object, - })); + ow( + options, + 
ow.object.exactShape({ + key: ow.optional.string.nonEmpty, + screenshotQuality: ow.optional.number, + saveScreenshot: ow.optional.boolean, + saveHtml: ow.optional.boolean, + keyValueStoreName: ow.optional.string, + config: ow.optional.object, + }), + ); const { key = 'SNAPSHOT', @@ -664,11 +701,17 @@ export async function saveSnapshot(page: Page, options: SaveSnapshotOptions = {} } = options; try { - const store = await KeyValueStore.open(keyValueStoreName, { config: config ?? Configuration.getGlobalConfig() }); + const store = await KeyValueStore.open(keyValueStoreName, { + config: config ?? Configuration.getGlobalConfig(), + }); if (saveScreenshot) { const screenshotName = `${key}.jpg`; - const screenshotBuffer = await page.screenshot({ fullPage: true, quality: screenshotQuality, type: 'jpeg' }); + const screenshotBuffer = await page.screenshot({ + fullPage: true, + quality: screenshotQuality, + type: 'jpeg', + }); await store.setValue(screenshotName, screenshotBuffer, { contentType: 'image/jpeg' }); } @@ -778,7 +821,9 @@ export interface PuppeteerContextUtils { * * @returns Promise that resolves to {@apilink BatchAddRequestsResult} object. */ - enqueueLinksByClickingElements(options: Omit): Promise; + enqueueLinksByClickingElements( + options: Omit, + ): Promise; /** * Forces the Puppeteer browser tab to block loading URLs that match a provided pattern. @@ -839,7 +884,10 @@ export interface PuppeteerContextUtils { * String rules are compared as page.url().includes(rule) while RegExp rules are evaluated as rule.test(page.url()). * @deprecated */ - cacheResponses(cache: Dictionary>, responseUrlRules: (string | RegExp)[]): Promise; + cacheResponses( + cache: Dictionary>, + responseUrlRules: (string | RegExp)[], + ): Promise; /** * Compiles a Puppeteer script into an async function that may be executed at any time @@ -946,40 +994,48 @@ export interface PuppeteerContextUtils { /** @internal */ export function registerUtilsToContext(context: PuppeteerCrawlingContext): void { - context.injectFile = async (filePath: string, options?: InjectFileOptions) => injectFile(context.page, filePath, options); - context.injectJQuery = (async () => { + context.injectFile = async (filePath: string, options?: InjectFileOptions) => + injectFile(context.page, filePath, options); + context.injectJQuery = async () => { if (context.request.state === RequestState.BEFORE_NAV) { - log.warning('Using injectJQuery() in preNavigationHooks leads to unstable results. Use it in a postNavigationHook or a requestHandler instead.'); + log.warning( + 'Using injectJQuery() in preNavigationHooks leads to unstable results. 
Use it in a postNavigationHook or a requestHandler instead.', + ); await injectJQuery(context.page); return; } await injectJQuery(context.page, { surviveNavigations: false }); - }); + }; context.parseWithCheerio = async () => parseWithCheerio(context.page); // eslint-disable-next-line max-len - context.enqueueLinksByClickingElements = async (options: Omit) => enqueueLinksByClickingElements({ - page: context.page, - requestQueue: context.crawler.requestQueue!, - ...options, - }); + context.enqueueLinksByClickingElements = async ( + options: Omit, + ) => + enqueueLinksByClickingElements({ + page: context.page, + requestQueue: context.crawler.requestQueue!, + ...options, + }); context.blockRequests = async (options?: BlockRequestsOptions) => blockRequests(context.page, options); context.blockResources = async (resourceTypes?: string[]) => blockResources(context.page, resourceTypes); - context.cacheResponses = async (cache: Dictionary>, responseUrlRules: (string | RegExp)[]) => { + context.cacheResponses = async ( + cache: Dictionary>, + responseUrlRules: (string | RegExp)[], + ) => { return cacheResponses(context.page, cache, responseUrlRules); }; context.compileScript = (scriptString: string, ctx?: Dictionary) => compileScript(scriptString, ctx); - context.addInterceptRequestHandler = async (handler: InterceptHandler) => addInterceptRequestHandler(context.page, handler); - context.removeInterceptRequestHandler = async (handler: InterceptHandler) => removeInterceptRequestHandler(context.page, handler); + context.addInterceptRequestHandler = async (handler: InterceptHandler) => + addInterceptRequestHandler(context.page, handler); + context.removeInterceptRequestHandler = async (handler: InterceptHandler) => + removeInterceptRequestHandler(context.page, handler); context.infiniteScroll = async (options?: InfiniteScrollOptions) => infiniteScroll(context.page, options); - context.saveSnapshot = async (options?: SaveSnapshotOptions) => saveSnapshot(context.page, { ...options, config: context.crawler.config }); + context.saveSnapshot = async (options?: SaveSnapshotOptions) => + saveSnapshot(context.page, { ...options, config: context.crawler.config }); context.closeCookieModals = async () => closeCookieModals(context.page); } -export { - enqueueLinksByClickingElements, - addInterceptRequestHandler, - removeInterceptRequestHandler, -}; +export { enqueueLinksByClickingElements, addInterceptRequestHandler, removeInterceptRequestHandler }; /** @internal */ export const puppeteerUtils = { diff --git a/packages/puppeteer-crawler/tsconfig.build.json b/packages/puppeteer-crawler/tsconfig.build.json index 856db0f2100a..2e04e9e9f921 100644 --- a/packages/puppeteer-crawler/tsconfig.build.json +++ b/packages/puppeteer-crawler/tsconfig.build.json @@ -1,7 +1,7 @@ { - "extends": "../../tsconfig.build.json", - "compilerOptions": { - "outDir": "./dist" - }, - "include": ["src/**/*"] + "extends": "../../tsconfig.build.json", + "compilerOptions": { + "outDir": "./dist" + }, + "include": ["src/**/*"] } diff --git a/packages/templates/.eslintrc.json b/packages/templates/.eslintrc.json index a1e94417375c..b4c4cbd98a86 100644 --- a/packages/templates/.eslintrc.json +++ b/packages/templates/.eslintrc.json @@ -1,9 +1,9 @@ { - "root": true, - "extends": "../../.eslintrc.json", - "rules": { - "no-console": 0, - "@typescript-eslint/no-shadow": 0, - "@typescript-eslint/consistent-type-imports": 0 - } + "root": true, + "extends": "../../.eslintrc.json", + "rules": { + "no-console": 0, + "@typescript-eslint/no-shadow": 0, + 
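The helpers wired up by registerUtilsToContext() above are consumed straight from the crawling context; a sketch with a made-up start URL, snapshot key and selector:

    import { PuppeteerCrawler } from '@crawlee/puppeteer';

    const crawler = new PuppeteerCrawler({
        async requestHandler({ parseWithCheerio, saveSnapshot, enqueueLinksByClickingElements }) {
            // Cheerio view of the rendered page, via the context helper bound above.
            const $ = await parseWithCheerio();
            console.log($('title').text());

            // Screenshot and HTML stored under the given key, using the crawler config
            // that registerUtilsToContext() passes through.
            await saveSnapshot({ key: 'LISTING', saveHtml: true });

            // Click pagination links and enqueue whatever they navigate to.
            await enqueueLinksByClickingElements({ selector: 'a.next' });
        },
    });

    await crawler.run(['https://example.com']);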
"@typescript-eslint/consistent-type-imports": 0 + } } diff --git a/packages/templates/manifest.json b/packages/templates/manifest.json index 710a9be350b9..cfac04b32599 100644 --- a/packages/templates/manifest.json +++ b/packages/templates/manifest.json @@ -1,110 +1,110 @@ { - "templates": [ - { - "name": "getting-started-ts", - "description": "Getting started example [TypeScript]", - "files": [ - "src/main.ts", - ".dockerignore", - ".gitignore", - "Dockerfile", - "package.json", - "README.md", - "tsconfig.json" - ] - }, - { - "name": "getting-started-js", - "description": "Getting started example [JavaScript]", - "files": [ - "src/main.js", - ".dockerignore", - ".gitignore", - "Dockerfile", - "package.json", - "README.md" - ] - }, - { - "name": "cheerio-ts", - "description": "CheerioCrawler template project [TypeScript]", - "files": [ - "src/main.ts", - "src/routes.ts", - ".dockerignore", - ".gitignore", - "Dockerfile", - "package.json", - "README.md", - "tsconfig.json" - ] - }, - { - "name": "playwright-ts", - "description": "PlaywrightCrawler template project [TypeScript]", - "files": [ - "src/main.ts", - "src/routes.ts", - ".dockerignore", - ".gitignore", - "Dockerfile", - "package.json", - "README.md", - "tsconfig.json" - ] - }, - { - "name": "puppeteer-ts", - "description": "PuppeteerCrawler template project [TypeScript]", - "files": [ - "src/main.ts", - "src/routes.ts", - ".dockerignore", - ".gitignore", - "Dockerfile", - "package.json", - "README.md", - "tsconfig.json" - ] - }, - { - "name": "cheerio-js", - "description": "CheerioCrawler template project [JavaScript]", - "files": [ - "src/main.js", - "src/routes.js", - ".dockerignore", - ".gitignore", - "Dockerfile", - "package.json", - "README.md" - ] - }, - { - "name": "playwright-js", - "description": "PlaywrightCrawler template project [JavaScript]", - "files": [ - "src/main.js", - "src/routes.js", - ".dockerignore", - ".gitignore", - "Dockerfile", - "package.json", - "README.md" - ] - }, - { - "name": "puppeteer-js", - "description": "PuppeteerCrawler template project [JavaScript]", - "files": [ - "src/main.js", - "src/routes.js", - ".dockerignore", - ".gitignore", - "Dockerfile", - "package.json", - "README.md" - ] - } - ] + "templates": [ + { + "name": "getting-started-ts", + "description": "Getting started example [TypeScript]", + "files": [ + "src/main.ts", + ".dockerignore", + ".gitignore", + "Dockerfile", + "package.json", + "README.md", + "tsconfig.json" + ] + }, + { + "name": "getting-started-js", + "description": "Getting started example [JavaScript]", + "files": [ + "src/main.js", + ".dockerignore", + ".gitignore", + "Dockerfile", + "package.json", + "README.md" + ] + }, + { + "name": "cheerio-ts", + "description": "CheerioCrawler template project [TypeScript]", + "files": [ + "src/main.ts", + "src/routes.ts", + ".dockerignore", + ".gitignore", + "Dockerfile", + "package.json", + "README.md", + "tsconfig.json" + ] + }, + { + "name": "playwright-ts", + "description": "PlaywrightCrawler template project [TypeScript]", + "files": [ + "src/main.ts", + "src/routes.ts", + ".dockerignore", + ".gitignore", + "Dockerfile", + "package.json", + "README.md", + "tsconfig.json" + ] + }, + { + "name": "puppeteer-ts", + "description": "PuppeteerCrawler template project [TypeScript]", + "files": [ + "src/main.ts", + "src/routes.ts", + ".dockerignore", + ".gitignore", + "Dockerfile", + "package.json", + "README.md", + "tsconfig.json" + ] + }, + { + "name": "cheerio-js", + "description": "CheerioCrawler template project [JavaScript]", + 
"files": [ + "src/main.js", + "src/routes.js", + ".dockerignore", + ".gitignore", + "Dockerfile", + "package.json", + "README.md" + ] + }, + { + "name": "playwright-js", + "description": "PlaywrightCrawler template project [JavaScript]", + "files": [ + "src/main.js", + "src/routes.js", + ".dockerignore", + ".gitignore", + "Dockerfile", + "package.json", + "README.md" + ] + }, + { + "name": "puppeteer-js", + "description": "PuppeteerCrawler template project [JavaScript]", + "files": [ + "src/main.js", + "src/routes.js", + ".dockerignore", + ".gitignore", + "Dockerfile", + "package.json", + "README.md" + ] + } + ] } diff --git a/packages/templates/scripts/validate-manifest.mjs b/packages/templates/scripts/validate-manifest.mjs index 608ef9c27fa2..bccf62bb9491 100644 --- a/packages/templates/scripts/validate-manifest.mjs +++ b/packages/templates/scripts/validate-manifest.mjs @@ -35,10 +35,19 @@ for (const manifestTemplate of manifest.templates) { } catch (err) { if (err.code === 'ENOENT') { hasError = true; - console.error(`${colors.grey(`[${colors.yellow(manifestTemplate.name)}]:`)} Failed to find file ${colors.yellow(requiredFile)}`); + console.error( + `${colors.grey(`[${colors.yellow(manifestTemplate.name)}]:`)} Failed to find file ${colors.yellow( + requiredFile, + )}`, + ); console.error(err); } else { - console.warn(`${colors.grey(`[${colors.yellow(manifestTemplate.name)}]:`)} Failed to read file ${colors.yellow(requiredFile)}`, err); + console.warn( + `${colors.grey(`[${colors.yellow(manifestTemplate.name)}]:`)} Failed to read file ${colors.yellow( + requiredFile, + )}`, + err, + ); } } } diff --git a/packages/templates/src/index.ts b/packages/templates/src/index.ts index 5287f16a069b..942aac9576f8 100644 --- a/packages/templates/src/index.ts +++ b/packages/templates/src/index.ts @@ -36,26 +36,26 @@ export interface TemplateFile { export async function fetchManifest(): Promise { const rawManifest = await new Promise((resolve, reject) => { - https.get(MANIFEST_URL, (res) => { - let json = ''; - res - .on('data', (chunk) => { + https + .get(MANIFEST_URL, (res) => { + let json = ''; + res.on('data', (chunk) => { json += chunk; }) - .once('end', () => { - if (res.statusCode === 200) { - try { - const data = JSON.parse(json); - resolve(data); - } catch (e) { - reject(e); + .once('end', () => { + if (res.statusCode === 200) { + try { + const data = JSON.parse(json); + resolve(data); + } catch (e) { + reject(e); + } + } else { + reject(new Error(`Status: ${res.statusCode}\n${json}`)); } - } else { - reject(new Error(`Status: ${res.statusCode}\n${json}`)); - } - }) - .on('error', (err) => reject(err)); - }) + }) + .on('error', (err) => reject(err)); + }) .on('error', (err) => reject(err)); }); diff --git a/packages/templates/tsconfig.build.json b/packages/templates/tsconfig.build.json index e47ba56a77b0..2e04e9e9f921 100644 --- a/packages/templates/tsconfig.build.json +++ b/packages/templates/tsconfig.build.json @@ -1,9 +1,7 @@ { - "extends": "../../tsconfig.build.json", - "compilerOptions": { - "outDir": "./dist" - }, - "include": [ - "src/**/*" - ] + "extends": "../../tsconfig.build.json", + "compilerOptions": { + "outDir": "./dist" + }, + "include": ["src/**/*"] } diff --git a/packages/types/src/storages.ts b/packages/types/src/storages.ts index 4710d1de54aa..4d2af1739eb5 100644 --- a/packages/types/src/storages.ts +++ b/packages/types/src/storages.ts @@ -5,7 +5,6 @@ import type { AllowedHttpMethods, Dictionary } from './utility-types'; * {@apilink RequestQueue} functions as well as 
{@apilink enqueueLinks}. */ } export interface QueueOperationInfo { - /** Indicates if request was already present in the queue. */ wasAlreadyPresent: boolean; @@ -14,7 +13,6 @@ export interface QueueOperationInfo { /** The ID of the added request */ requestId: string; - } export interface DatasetCollectionClientOptions { diff --git a/packages/types/tsconfig.build.json b/packages/types/tsconfig.build.json index 856db0f2100a..2e04e9e9f921 100644 --- a/packages/types/tsconfig.build.json +++ b/packages/types/tsconfig.build.json @@ -1,7 +1,7 @@ { - "extends": "../../tsconfig.build.json", - "compilerOptions": { - "outDir": "./dist" - }, - "include": ["src/**/*"] + "extends": "../../tsconfig.build.json", + "compilerOptions": { + "outDir": "./dist" + }, + "include": ["src/**/*"] }
diff --git a/packages/utils/src/internals/blocked.ts b/packages/utils/src/internals/blocked.ts
index 2a4d4fc68297..07c2c922bdf9 100644
--- a/packages/utils/src/internals/blocked.ts
+++ b/packages/utils/src/internals/blocked.ts
@@ -1,6 +1,4 @@
-export const CLOUDFLARE_RETRY_CSS_SELECTORS = [
-    '#turnstile-wrapper iframe[src^="https://challenges.cloudflare.com"]',
-];
+export const CLOUDFLARE_RETRY_CSS_SELECTORS = ['#turnstile-wrapper iframe[src^="https://challenges.cloudflare.com"]'];

 /**
  * CSS selectors for elements that should trigger a retry, as the crawler is likely getting blocked.
diff --git a/packages/utils/src/internals/cheerio.ts b/packages/utils/src/internals/cheerio.ts
index f306db4bec80..5c49626af586 100644
--- a/packages/utils/src/internals/cheerio.ts
+++ b/packages/utils/src/internals/cheerio.ts
@@ -8,7 +8,8 @@ export type CheerioRoot = ReturnType<typeof cheerio.load>;

 // NOTE: We are skipping 'noscript' since its content is evaluated as text, instead of HTML elements. That damages the results.
 const SKIP_TAGS_REGEX = /^(script|style|canvas|svg|noscript)$/i;
-const BLOCK_TAGS_REGEX = /^(p|h1|h2|h3|h4|h5|h6|ol|ul|li|pre|address|blockquote|dl|div|fieldset|form|table|tr|select|option)$/i;
+const BLOCK_TAGS_REGEX =
+    /^(p|h1|h2|h3|h4|h5|h6|ol|ul|li|pre|address|blockquote|dl|div|fieldset|form|table|tr|select|option)$/i;

 /**
  * The function converts an HTML document to plain text.
@@ -41,7 +42,10 @@ const BLOCK_TAGS_REGEX = /^(p|h1|h2|h3|h4|h5|h6|ol|ul|li|pre|address|blockquote|
 export function htmlToText(htmlOrCheerioElement: string | CheerioRoot): string {
     if (!htmlOrCheerioElement) return '';

-    const $ = typeof htmlOrCheerioElement === 'function' ? htmlOrCheerioElement : cheerio.load(htmlOrCheerioElement, { decodeEntities: true });
+    const $ =
+        typeof htmlOrCheerioElement === 'function'
+            ? htmlOrCheerioElement
+            : cheerio.load(htmlOrCheerioElement, { decodeEntities: true });
     let text = '';

     const process = (elems: Dictionary) => {
@@ -105,12 +109,12 @@ export function extractUrlsFromCheerio($: CheerioAPI, selector: string = 'a', ba
             // Throw a meaningful error when only a relative URL would be extracted instead of waiting for the Request to fail later.
             const isHrefAbsolute = /^[a-z][a-z0-9+.-]*:/.test(href); // Grabbed this in 'is-absolute-url' package.
             if (!isHrefAbsolute && !baseUrl) {
-                throw new Error(`An extracted URL: ${href} is relative and baseUrl is not set. `
-                    + 'Provide a baseUrl to automatically resolve relative URLs.');
+                throw new Error(
+                    `An extracted URL: ${href} is relative and baseUrl is not set. ` +
+                        'Provide a baseUrl to automatically resolve relative URLs.',
+                );
             }

-            return baseUrl
-                ? tryAbsoluteURL(href, baseUrl)
-                : href;
+            return baseUrl ?
tryAbsoluteURL(href, baseUrl) : href; }) .filter(Boolean) as string[]; } diff --git a/packages/utils/src/internals/debug.ts b/packages/utils/src/internals/debug.ts index 8d71f8fef535..efacc7e055d5 100644 --- a/packages/utils/src/internals/debug.ts +++ b/packages/utils/src/internals/debug.ts @@ -48,7 +48,10 @@ export function createRequestDebugInfo( retryCount: request.retryCount, errorMessages: request.errorMessages, // Puppeteer response has .status() function and NodeJS response, statusCode property. - statusCode: 'status' in response && response.status instanceof Function ? response.status() : (response as IncomingMessage).statusCode, + statusCode: + 'status' in response && response.status instanceof Function + ? response.status() + : (response as IncomingMessage).statusCode, ...additionalFields, }; } diff --git a/packages/utils/src/internals/error_tracker.ts b/packages/utils/src/internals/error_tracker.ts index 259a4c33b5bd..35277e070cff 100644 --- a/packages/utils/src/internals/error_tracker.ts +++ b/packages/utils/src/internals/error_tracker.ts @@ -3,7 +3,7 @@ import { inspect } from 'node:util'; /** * Node.js Error interface */ - interface ErrnoException extends Error { +interface ErrnoException extends Error { errno?: number | undefined; code?: string | number | undefined; path?: string | undefined; @@ -39,11 +39,7 @@ const getPathFromStackTrace = (stack: string[]) => { for (const line of stack) { const path = extractPathFromStackTraceLine(line); - if ( - path.startsWith('node:') - || path.includes('/node_modules/') - || path.includes('\\node_modules\\') - ) { + if (path.startsWith('node:') || path.includes('/node_modules/') || path.includes('\\node_modules\\')) { continue; } @@ -69,7 +65,12 @@ const getStackTraceGroup = (error: ErrnoException, storage: Record x.trim()).join('\n') : getPathFromStackTrace(stack!.slice(sliceAt)); + normalizedStackTrace = showFullStack + ? stack! + .slice(sliceAt) + .map((x) => x.trim()) + .join('\n') + : getPathFromStackTrace(stack!.slice(sliceAt)); } if (!normalizedStackTrace) { @@ -184,7 +185,7 @@ const normalizedCalculatePlaceholder = (a: string[], b: string[]) => { const output = calculatePlaceholder(a, b); // We can't be too general - if ((arrayCount(output, '_') / output.length) >= 0.5) { + if (arrayCount(output, '_') / output.length >= 0.5) { return ['_']; } @@ -193,10 +194,7 @@ const normalizedCalculatePlaceholder = (a: string[], b: string[]) => { // Merge A (missing placeholders) into B (can contain placeholders but does not have to) const mergeMessages = (a: string, b: string, storage: Record) => { - const placeholder = normalizedCalculatePlaceholder( - a.split(' '), - b.split(' '), - ).join(' '); + const placeholder = normalizedCalculatePlaceholder(a.split(' '), b.split(' ')).join(' '); if (placeholder === '_') { return undefined; @@ -223,9 +221,14 @@ const getErrorMessageGroup = (error: ErrnoException, storage: Record { - ow(options, ow.object.exactShape({ - url: ow.string.url, - encoding: ow.optional.string, - urlRegExp: ow.optional.regExp, - proxyUrl: ow.optional.string, - })); + ow( + options, + ow.object.exactShape({ + url: ow.string.url, + encoding: ow.optional.string, + urlRegExp: ow.optional.regExp, + proxyUrl: ow.optional.string, + }), + ); const { url, encoding = 'utf8', urlRegExp = URL_NO_COMMAS_REGEX, proxyUrl } = options; // Try to detect wrong urls and fix them. Currently, detects only sharing url instead of csv download one. 
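A short sketch of the validation reflowed above in use; the list URL is a placeholder:

    import { downloadListOfUrls } from '@crawlee/utils';

    // Downloads a plain-text/CSV file and extracts every URL found in it,
    // matching with URL_NO_COMMAS_REGEX unless a custom urlRegExp is passed.
    const urls = await downloadListOfUrls({ url: 'https://example.com/urls.txt' });
    console.log(`Fetched ${urls.length} URLs`);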
@@ -69,10 +72,13 @@ export interface ExtractUrlsOptions { * Collects all URLs in an arbitrary string to an array, optionally using a custom regular expression. */ export function extractUrls(options: ExtractUrlsOptions): string[] { - ow(options, ow.object.exactShape({ - string: ow.string, - urlRegExp: ow.optional.regExp, - })); + ow( + options, + ow.object.exactShape({ + string: ow.string, + urlRegExp: ow.optional.regExp, + }), + ); const lines = options.string.split('\n'); const result: string[] = []; const urlRegExp = options.urlRegExp ?? URL_NO_COMMAS_REGEX; @@ -89,7 +95,7 @@ export function extractUrls(options: ExtractUrlsOptions): string[] { */ export function tryAbsoluteURL(href: string, baseUrl: string): string | undefined { try { - return (new URL(href, baseUrl)).href; + return new URL(href, baseUrl).href; } catch { return undefined; } diff --git a/packages/utils/src/internals/general.ts b/packages/utils/src/internals/general.ts index 65dd8814530c..4bfec532a3a5 100644 --- a/packages/utils/src/internals/general.ts +++ b/packages/utils/src/internals/general.ts @@ -5,22 +5,30 @@ import { setTimeout } from 'node:timers/promises'; * Default regular expression to match URLs in a string that may be plain text, JSON, CSV or other. It supports common URL characters * and does not support URLs containing commas or spaces. The URLs also may contain Unicode letters (not symbols). */ -export const URL_NO_COMMAS_REGEX = RegExp('https?://(www\\.)?[\\p{L}0-9][-\\p{L}0-9@:%._\\+~#=]{0,254}[\\p{L}0-9]\\.[a-z]{2,63}(:\\d{1,5})?(/[-\\p{L}0-9@:%_\\+.~#?&//=\\(\\)]*)?', 'giu'); // eslint-disable-line +export const URL_NO_COMMAS_REGEX = RegExp( + 'https?://(www\\.)?[\\p{L}0-9][-\\p{L}0-9@:%._\\+~#=]{0,254}[\\p{L}0-9]\\.[a-z]{2,63}(:\\d{1,5})?(/[-\\p{L}0-9@:%_\\+.~#?&//=\\(\\)]*)?', + 'giu', +); // eslint-disable-line /** * Regular expression that, in addition to the default regular expression `URL_NO_COMMAS_REGEX`, supports matching commas in URL path and query. * Note, however, that this may prevent parsing URLs from comma delimited lists, or the URLs may become malformed. 
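For reference, extractUrls() is the synchronous counterpart of the download helper; the input text is made up:

    import { extractUrls } from '@crawlee/utils';

    // Scans each line of the string with the URL regular expression.
    const found = extractUrls({ string: 'Docs live at https://crawlee.dev and https://example.com' });
    console.log(found); // e.g. ['https://crawlee.dev', 'https://example.com']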
*/ -export const URL_WITH_COMMAS_REGEX = RegExp('https?://(www\\.)?[\\p{L}0-9][-\\p{L}0-9@:%._\\+~#=]{0,254}[\\p{L}0-9]\\.[a-z]{2,63}(:\\d{1,5})?(/[-\\p{L}0-9@:%_\\+,.~#?&//=\\(\\)]*)?', 'giu'); // eslint-disable-line +export const URL_WITH_COMMAS_REGEX = RegExp( + 'https?://(www\\.)?[\\p{L}0-9][-\\p{L}0-9@:%._\\+~#=]{0,254}[\\p{L}0-9]\\.[a-z]{2,63}(:\\d{1,5})?(/[-\\p{L}0-9@:%_\\+,.~#?&//=\\(\\)]*)?', + 'giu', +); // eslint-disable-line let isDockerPromiseCache: Promise | undefined; async function createIsDockerPromise() { - const promise1 = fs.stat('/.dockerenv') + const promise1 = fs + .stat('/.dockerenv') .then(() => true) .catch(() => false); - const promise2 = fs.readFile('/proc/self/cgroup', 'utf8') + const promise2 = fs + .readFile('/proc/self/cgroup', 'utf8') .then((content) => content.includes('docker')) .catch(() => false); @@ -44,12 +52,14 @@ export async function isDocker(forceReset?: boolean): Promise { * @ignore */ export function weightedAvg(arrValues: number[], arrWeights: number[]): number { - const result = arrValues.map((value, i) => { - const weight = arrWeights[i]; - const sum = value * weight; + const result = arrValues + .map((value, i) => { + const weight = arrWeights[i]; + const sum = value * weight; - return [sum, weight]; - }).reduce((p, c) => [p[0] + c[0], p[1] + c[1]], [0, 0]); + return [sum, weight]; + }) + .reduce((p, c) => [p[0] + c[0], p[1] + c[1]], [0, 0]); return result[0] / result[1]; } @@ -83,9 +93,7 @@ export function snakeCaseToCamelCase(snakeCaseStr: string): string { .toLowerCase() .split('_') .map((part, index) => { - return index > 0 - ? part.charAt(0).toUpperCase() + part.slice(1) - : part; + return index > 0 ? part.charAt(0).toUpperCase() + part.slice(1) : part; }) .join(''); } diff --git a/packages/utils/src/internals/memory-info.ts b/packages/utils/src/internals/memory-info.ts index f3a98d028318..c4c2f9972e54 100644 --- a/packages/utils/src/internals/memory-info.ts +++ b/packages/utils/src/internals/memory-info.ts @@ -55,10 +55,9 @@ export async function getMemoryInfo(): Promise { // lambda does *not* have `ps` and other command line tools // required to extract memory usage. - const isLambdaEnvironment = process.platform === 'linux' - && !!process.env.AWS_LAMBDA_FUNCTION_MEMORY_SIZE; + const isLambdaEnvironment = process.platform === 'linux' && !!process.env.AWS_LAMBDA_FUNCTION_MEMORY_SIZE; - const isDockerVar = !isLambdaEnvironment && await isDocker(); + const isDockerVar = !isLambdaEnvironment && (await isDocker()); let mainProcessBytes = -1; let childProcessesBytes = 0; @@ -140,9 +139,11 @@ export async function getMemoryInfo(): Promise { freeBytes = totalBytes - usedBytes; } catch (err) { // log.deprecated logs a warning only once - log.deprecated('Your environment is Docker, but your system does not support memory cgroups. ' - + 'If you\'re running containers with limited memory, memory auto-scaling will not work properly.\n\n' - + `Cause: ${(err as Error).message}`); + log.deprecated( + 'Your environment is Docker, but your system does not support memory cgroups. 
' + + "If you're running containers with limited memory, memory auto-scaling will not work properly.\n\n" + + `Cause: ${(err as Error).message}`, + ); totalBytes = totalmem(); freeBytes = freemem(); usedBytes = totalBytes - freeBytes; diff --git a/packages/utils/src/internals/open_graph_parser.ts b/packages/utils/src/internals/open_graph_parser.ts index b3e40e813ad3..ad6d8a371cf0 100644 --- a/packages/utils/src/internals/open_graph_parser.ts +++ b/packages/utils/src/internals/open_graph_parser.ts @@ -17,7 +17,8 @@ type OpenGraphResult = string | string[] | Dictionary; * @param item The item to assign to the key. * @returns Either an empty object or an object with the content provided. */ -const optionalSpread = (key: string, item: any) => (item !== undefined && !!Object.values(item)?.length ? { [key]: item } : {}); +const optionalSpread = (key: string, item: any) => + item !== undefined && !!Object.values(item)?.length ? { [key]: item } : {}; const OPEN_GRAPH_PROPERTIES: OpenGraphProperty[] = [ { @@ -371,15 +372,18 @@ const parseOpenGraphProperty = (property: OpenGraphProperty, $: CheerioAPI): str // "Value" is appended to the end of the property name to make it more clear, and to prevent things such // as `videoInfo.actor.actor` to grab the actor's name. ...optionalSpread(`${property.outputName}Value`, content), - ...property.children.reduce((acc, curr) => { - const parsed = parseOpenGraphProperty(curr, $); - if (parsed === undefined) return acc; + ...property.children.reduce( + (acc, curr) => { + const parsed = parseOpenGraphProperty(curr, $); + if (parsed === undefined) return acc; - return { - ...acc, - ...optionalSpread(curr.outputName, parseOpenGraphProperty(curr, $)), - }; - }, {} as Dictionary), + return { + ...acc, + ...optionalSpread(curr.outputName, parseOpenGraphProperty(curr, $)), + }; + }, + {} as Dictionary, + ), }; }; @@ -396,10 +400,13 @@ export function parseOpenGraph($: CheerioAPI, additionalProperties?: OpenGraphPr export function parseOpenGraph(item: CheerioAPI | string, additionalProperties?: OpenGraphProperty[]) { const $ = typeof item === 'string' ? 
load(item) : item; - return [...(additionalProperties || []), ...OPEN_GRAPH_PROPERTIES].reduce((acc, curr) => { - return { - ...acc, - ...optionalSpread(curr.outputName, parseOpenGraphProperty(curr, $)), - }; - }, {} as Dictionary); + return [...(additionalProperties || []), ...OPEN_GRAPH_PROPERTIES].reduce( + (acc, curr) => { + return { + ...acc, + ...optionalSpread(curr.outputName, parseOpenGraphProperty(curr, $)), + }; + }, + {} as Dictionary, + ); } diff --git a/packages/utils/src/internals/robots.ts b/packages/utils/src/internals/robots.ts index 30a69e519468..a7b95881aa70 100644 --- a/packages/utils/src/internals/robots.ts +++ b/packages/utils/src/internals/robots.ts @@ -71,7 +71,17 @@ export class RobotsFile { return new RobotsFile(robotsParser(url.toString(), response.body), proxyUrl); } catch (e) { if (e instanceof HTTPError && e.response.statusCode === 404) { - return new RobotsFile({ isAllowed() { return true; }, getSitemaps() { return []; } }, proxyUrl); + return new RobotsFile( + { + isAllowed() { + return true; + }, + getSitemaps() { + return []; + }, + }, + proxyUrl, + ); } throw e; } diff --git a/packages/utils/src/internals/sitemap.ts b/packages/utils/src/internals/sitemap.ts index 3c9f890307e1..dcf0c7fe5ee2 100644 --- a/packages/utils/src/internals/sitemap.ts +++ b/packages/utils/src/internals/sitemap.ts @@ -24,7 +24,10 @@ class SitemapTxtParser extends Writable { private decoder: StringDecoder = new StringDecoder('utf8'); private buffer: string = ''; - constructor(private parsingState: ParsingState, private onEnd: () => void) { + constructor( + private parsingState: ParsingState, + private onEnd: () => void, + ) { super(); } @@ -74,7 +77,11 @@ class SitemapTxtParser extends Writable { export class Sitemap { constructor(readonly urls: string[]) {} - protected static createXmlParser(parsingState: ParsingState, onEnd: () => void, onError: (error: Error) => void): SAXStream { + protected static createXmlParser( + parsingState: ParsingState, + onEnd: () => void, + onError: (error: Error) => void, + ): SAXStream { const parser = sax.createStream(true); parser.on('opentag', (node) => { diff --git a/packages/utils/src/internals/social.ts b/packages/utils/src/internals/social.ts index d78516f4ba34..6260b945e6ad 100644 --- a/packages/utils/src/internals/social.ts +++ b/packages/utils/src/internals/social.ts @@ -4,7 +4,8 @@ import { htmlToText } from './cheerio'; // Regex inspired by https://zapier.com/blog/extract-links-email-phone-regex/ // eslint-disable-next-line max-len -const EMAIL_REGEX_STRING = '(?:[a-z0-9!#$%&\'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&\'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\\])'; +const EMAIL_REGEX_STRING = + '(?:[a-z0-9!#$%&\'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&\'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\\])'; /** * Regular expression to exactly match a single email address. 
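The matching helpers built on this pattern read roughly as follows; a sketch assuming the helpers are exposed as the `social` namespace export, with made-up input:

    import { social } from '@crawlee/utils';

    const text = 'Write to alice@example.com or bob@example.org.';
    // Returns every e-mail address matched by the global e-mail regex, or [].
    console.log(social.emailsFromText(text));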
@@ -28,7 +29,7 @@ const EMAIL_URL_PREFIX_REGEX = /^mailto:/i; * If no emails are found, the function returns an empty array. */ export function emailsFromText(text: string): string[] { - if (typeof text as unknown !== 'string') return []; + if ((typeof text as unknown) !== 'string') return []; return text.match(EMAIL_REGEX_GLOBAL) || []; } @@ -122,9 +123,9 @@ const SKIP_PHONE_REGEX = new RegExp(`^(${SKIP_PHONE_REGEXS.join('|')})$`, 'i'); * If no phone numbers are found, the function returns an empty array. */ export function phonesFromText(text: string): string[] { - if (typeof text as unknown !== 'string') return []; + if ((typeof text as unknown) !== 'string') return []; - let phones = text.match(PHONE_REGEX_GLOBAL) as string[] || []; + let phones = (text.match(PHONE_REGEX_GLOBAL) as string[]) || []; phones = phones.filter((phone) => { if (!phone) return false; @@ -167,31 +168,39 @@ export function phonesFromUrls(urls: string[]): string[] { // They are used to prevent matching URLs in strings like "blahttps://www.example.com" // eslint-disable-next-line max-len -const LINKEDIN_REGEX_STRING = '(? discords: [], }; - if (typeof html as unknown !== 'string') return result; + if ((typeof html as unknown) !== 'string') return result; const $ = cheerio.load(html, { decodeEntities: true }); if (data) data.$ = $; @@ -393,22 +402,22 @@ export const INSTAGRAM_REGEX = new RegExp(`^${INSTAGRAM_REGEX_STRING}$`, 'i'); * instagr.am/old_prague * ``` * -* If the profile URL contains subdirectories or query parameters, the regular expression -* extracts just the base part of the profile URL. For example, from text such as: -* ``` -* https://www.instagram.com/cristiano/followers -* ``` -* the expression extracts just the following base URL: -* ``` -* https://www.instagram.com/cristiano -* ``` -* -* The regular expression does NOT match the following URLs: -* ``` -* https://www.instagram.com/explore/ -* https://www.instagram.com/_n/ -* https://www.instagram.com/_u/ -* ``` + * If the profile URL contains subdirectories or query parameters, the regular expression + * extracts just the base part of the profile URL. 
For example, from text such as:
+ * ```
+ * https://www.instagram.com/cristiano/followers
+ * ```
+ * the expression extracts just the following base URL:
+ * ```
+ * https://www.instagram.com/cristiano
+ * ```
+ *
+ * The regular expression does NOT match the following URLs:
+ * ```
+ * https://www.instagram.com/explore/
+ * https://www.instagram.com/_n/
+ * https://www.instagram.com/_u/
+ * ```
  *
  * Example usage:
  * ```
diff --git a/packages/utils/test/robots.test.ts b/packages/utils/test/robots.test.ts
index 3faee2e90c74..7ed14a590173 100644
--- a/packages/utils/test/robots.test.ts
+++ b/packages/utils/test/robots.test.ts
@@ -6,23 +6,27 @@ import { RobotsFile } from '../src/internals/robots';
 describe('RobotsFile', () => {
     beforeEach(() => {
         nock.disableNetConnect();
-        nock('http://not-exists.com').persist()
+        nock('http://not-exists.com')
+            .persist()
             .get('/robots.txt')
-            .reply(200, [
-                'User-agent: *',
-                'Disallow: *deny_all/',
-                'crawl-delay: 10',
-                'User-agent: Googlebot',
-                'Disallow: *deny_googlebot/',
-                'crawl-delay: 1',
-                'user-agent: Mozilla',
-                'crawl-delay: 2',
-                'sitemap: http://not-exists.com/sitemap_1.xml',
-                'sitemap: http://not-exists.com/sitemap_2.xml',
-            ].join('\n'))
+            .reply(
+                200,
+                [
+                    'User-agent: *',
+                    'Disallow: *deny_all/',
+                    'crawl-delay: 10',
+                    'User-agent: Googlebot',
+                    'Disallow: *deny_googlebot/',
+                    'crawl-delay: 1',
+                    'user-agent: Mozilla',
+                    'crawl-delay: 2',
+                    'sitemap: http://not-exists.com/sitemap_1.xml',
+                    'sitemap: http://not-exists.com/sitemap_2.xml',
+                ].join('\n'),
+            )
             .get('*')
             .reply(404);
     });
@@ -46,7 +50,10 @@ describe('RobotsFile', () => {
     it('extracts sitemap urls', async () => {
         const robots = await RobotsFile.find('http://not-exists.com/robots.txt');
-        expect(robots.getSitemaps()).toEqual(['http://not-exists.com/sitemap_1.xml', 'http://not-exists.com/sitemap_2.xml']);
+        expect(robots.getSitemaps()).toEqual([
+            'http://not-exists.com/sitemap_1.xml',
+            'http://not-exists.com/sitemap_2.xml',
+        ]);
     });

     it('parses allow/deny directives from explicitly provided robots.txt contents', async () => {
diff --git a/packages/utils/test/sitemap.test.ts b/packages/utils/test/sitemap.test.ts
index a766fec6c6af..b5a8de001a47 100644
--- a/packages/utils/test/sitemap.test.ts
+++ b/packages/utils/test/sitemap.test.ts
@@ -6,89 +6,117 @@ describe('Sitemap', () => {
     beforeEach(() => {
         nock.disableNetConnect();
-        nock('http://not-exists.com').persist()
+        nock('http://not-exists.com')
+            .persist()
             .get('/sitemap_child.xml')
-            .reply(200, [
-                '<?xml version="1.0" encoding="UTF-8"?>',
-                '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">',
-                '<url>',
-                '<loc>http://not-exists.com/</loc>',
-                '<lastmod>2005-01-01</lastmod>',
-                '<changefreq>monthly</changefreq>',
-                '<priority>0.8</priority>',
-                '</url>',
-                '<url>',
-                '<loc>http://not-exists.com/catalog?item=12&amp;desc=vacation_hawaii</loc>',
-                '<changefreq>weekly</changefreq>',
-                '</url>',
-                '<url>',
-                '<loc>http://not-exists.com/catalog?item=73&amp;desc=vacation_new_zealand</loc>',
-                '<lastmod>2004-12-23</lastmod>',
-                '<changefreq>weekly</changefreq>',
-                '</url>',
-                '<url>',
-                '<loc>http://not-exists.com/catalog?item=74&amp;desc=vacation_newfoundland</loc>',
-                '<lastmod>2004-12-23T18:00:15+00:00</lastmod>',
-                '<priority>0.3</priority>',
-                '</url>',
-                '<url>',
-                '<loc>http://not-exists.com/catalog?item=83&amp;desc=vacation_usa</loc>',
-                '<lastmod>2004-11-23</lastmod>',
-                '</url>',
-                '</urlset>',
-            ].join('\n'))
+            .reply(
+                200,
+                [
+                    '<?xml version="1.0" encoding="UTF-8"?>',
+                    '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">',
+                    '<url>',
+                    '<loc>http://not-exists.com/</loc>',
+                    '<lastmod>2005-01-01</lastmod>',
+                    '<changefreq>monthly</changefreq>',
+                    '<priority>0.8</priority>',
+                    '</url>',
+                    '<url>',
+                    '<loc>http://not-exists.com/catalog?item=12&amp;desc=vacation_hawaii</loc>',
+                    '<changefreq>weekly</changefreq>',
+                    '</url>',
+                    '<url>',
+                    '<loc>http://not-exists.com/catalog?item=73&amp;desc=vacation_new_zealand</loc>',
+                    '<lastmod>2004-12-23</lastmod>',
+                    '<changefreq>weekly</changefreq>',
+                    '</url>',
+                    '<url>',
+                    '<loc>http://not-exists.com/catalog?item=74&amp;desc=vacation_newfoundland</loc>',
+                    '<lastmod>2004-12-23T18:00:15+00:00</lastmod>',
+                    '<priority>0.3</priority>',
+                    '</url>',
+                    '<url>',
+                    '<loc>http://not-exists.com/catalog?item=83&amp;desc=vacation_usa</loc>',
+                    '<lastmod>2004-11-23</lastmod>',
+                    '</url>',
+                    '</urlset>',
+                ].join('\n'),
+            )
             .get('/sitemap_child.xml.gz')
-            .reply(200, Buffer.from([
-                'H4sIAAAAAAAAA62S306DMBTG73kK0gtvDLSFLSKWcucTzOulKR00QottGZtPbxfQEEWXqElzkvMv3y/fKSlPXRsehbFSqwLgGIFQKK4rqeoCPO0eowyUNCCDaa1woR9WtgCNc30O4TiOsZVOdKy3sTY1tLzxiYVzEaL4HkzLPraa03lRaReJk7TOxlx3kMBLz08w6zpd0QShbYSwf74z1wLCG6ZqcTDihXZauaY9E7ioBaQ3UhvpzhTFGYEfWUDgBHANgzPHWl2XF/gCJzes6x8qYXlxZL7l/dk3bGRSvuMuxEchnr/w/Eb2Ll2RVWLcvwrWMlWtWLWJcBIl6TdW/R/ZZp3soAdV/Yy2w1mOUI63tz4itCRd3Cz9882yNfMGy9bJ8CfTZkU4fXUavAGtDs17GwMAAA==',
-            ].join('\n'), 'base64'))
+            .reply(
+                200,
+                Buffer.from(
+                    [
+                        'H4sIAAAAAAAAA62S306DMBTG73kK0gtvDLSFLSKWcucTzOulKR00QottGZtPbxfQEEWXqElzkvMv',
+                        '3y/fKSlPXRsehbFSqwLgGIFQKK4rqeoCPO0eowyUNCCDaa1woR9WtgCNc30O4TiOsZVOdKy3sTY1',
+                        'tLzxiYVzEaL4HkzLPraa03lRaReJk7TOxlx3kMBLz08w6zpd0QShbYSwf74z1wLCG6ZqcTDihXZa',
+                        'uaY9E7ioBaQ3UhvpzhTFGYEfWUDgBHANgzPHWl2XF/gCJzes6x8qYXlxZL7l/dk3bGRSvuMuxEch',
+                        'nr/w/Eb2Ll2RVWLcvwrWMlWtWLWJcBIl6TdW/R/ZZp3soAdV/Yy2w1mOUI63tz4itCRd3Cz9882y',
+                        'NfMGy9bJ8CfTZkU4fXUavAGtDs17GwMAAA==',
+                    ].join('\n'),
+                    'base64',
+                ),
+            )
            .get('/invalid_sitemap_child.xml.gz')
-            .reply(200, Buffer.from([
-                'H4sIAAAAAAAAA62S306DMBTG73kK0gtvDLSFLSKWcucTzOulKR00QottGZtPbxfQEEWXqElzkvMv',
-                'NfMGy9bJ8CfTZkU4fXUavAGtDs17GwMAAA==',
-            ].join('\n'), 'base64'))
+            .reply(
+                200,
+                Buffer.from(
+                    [
+                        'H4sIAAAAAAAAA62S306DMBTG73kK0gtvDLSFLSKWcucTzOulKR00QottGZtPbxfQEEWXqElzkvMv',
+                        'NfMGy9bJ8CfTZkU4fXUavAGtDs17GwMAAA==',
+                    ].join('\n'),
+                    'base64',
+                ),
+            )
             .get('/sitemap_parent.xml')
-            .reply(200, [
-                '<?xml version="1.0" encoding="UTF-8"?>',
-                '<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">',
-                '<sitemap>',
-                '<loc>http://not-exists.com/sitemap_child.xml</loc>',
-                '<lastmod>2004-12-23</lastmod>',
-                '</sitemap>',
-                '</sitemapindex>',
-            ].join('\n'))
+            .reply(
+                200,
+                [
+                    '<?xml version="1.0" encoding="UTF-8"?>',
+                    '<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">',
+                    '<sitemap>',
+                    '<loc>http://not-exists.com/sitemap_child.xml</loc>',
+                    '<lastmod>2004-12-23</lastmod>',
+                    '</sitemap>',
+                    '</sitemapindex>',
+                ].join('\n'),
+            )
             .get('/not_actual_xml.xml')
-            .reply(200, [
-                '<HTML><HEAD><meta http-equiv="content-type" content="text/html;charset=utf-8">',
-                '<TITLE>301 Moved</TITLE></HEAD><BODY>',
-                '<H1>301 Moved</H1>',
-                'The document has moved',
-                '<A HREF="...">here</A>.',
-                '</BODY></HTML>',
-            ].join('\n'))
+            .reply(
+                200,
+                [
+                    '<HTML><HEAD><meta http-equiv="content-type" content="text/html;charset=utf-8">',
+                    '<TITLE>301 Moved</TITLE></HEAD><BODY>',
+                    '<H1>301 Moved</H1>',
+                    'The document has moved',
+                    '<A HREF="...">here</A>.',
+                    '</BODY></HTML>',
+                ].join('\n'),
+            )
             .get('/sitemap.xml')
-            .reply(200, [
-                '<?xml version="1.0" encoding="UTF-8"?>',
-                '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">',
-                '<url>',
-                '<loc>http://not-exists.com/catalog?item=80&amp;desc=vacation_turkey</loc>',
-                '<lastmod>2004-11-23</lastmod>',
-                '</url>',
-                '<url>',
-                '<loc>http://not-exists.com/catalog?item=81&amp;desc=vacation_maledives</loc>',
-                '<lastmod>2004-11-23</lastmod>',
-                '</url>',
-                '</urlset>',
-            ].join('\n'))
+            .reply(
+                200,
+                [
+                    '<?xml version="1.0" encoding="UTF-8"?>',
+                    '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">',
+                    '<url>',
+                    '<loc>http://not-exists.com/catalog?item=80&amp;desc=vacation_turkey</loc>',
+                    '<lastmod>2004-11-23</lastmod>',
+                    '</url>',
+                    '<url>',
+                    '<loc>http://not-exists.com/catalog?item=81&amp;desc=vacation_maledives</loc>',
+                    '<lastmod>2004-11-23</lastmod>',
+                    '</url>',
+                    '</urlset>',
+                ].join('\n'),
+            )
             .get('/sitemap.txt')
-            .reply(200, [
-                'http://not-exists.com/catalog?item=78&desc=vacation_crete',
-                'http://not-exists.com/catalog?item=79&desc=vacation_somalia',
-            ].join('\n'))
+            .reply(
+                200,
+                [
+                    'http://not-exists.com/catalog?item=78&desc=vacation_crete',
+                    'http://not-exists.com/catalog?item=79&desc=vacation_somalia',
+                ].join('\n'),
+            )
             .get('*')
             .reply(404);
     });
@@ -100,42 +128,46 @@
     it('extracts urls from sitemaps', async () => {
         const sitemap = await Sitemap.load('http://not-exists.com/sitemap_child.xml');
-        expect(new Set(sitemap.urls)).toEqual(new Set([
-            'http://not-exists.com/',
-            'http://not-exists.com/catalog?item=12&desc=vacation_hawaii',
-            'http://not-exists.com/catalog?item=73&desc=vacation_new_zealand',
-            'http://not-exists.com/catalog?item=74&desc=vacation_newfoundland',
-            'http://not-exists.com/catalog?item=83&desc=vacation_usa',
-        ]));
+        expect(new Set(sitemap.urls)).toEqual(
+            new Set([
+                'http://not-exists.com/',
+                'http://not-exists.com/catalog?item=12&desc=vacation_hawaii',
+                'http://not-exists.com/catalog?item=73&desc=vacation_new_zealand',
+                'http://not-exists.com/catalog?item=74&desc=vacation_newfoundland',
+                'http://not-exists.com/catalog?item=83&desc=vacation_usa',
+            ]),
+        );
     });

     it('extracts urls from gzipped sitemaps', async () => {
         const sitemap = await Sitemap.load('http://not-exists.com/sitemap_child.xml.gz');
-        expect(new Set(sitemap.urls)).toEqual(new Set([
-            'http://not-exists.com/',
-            'http://not-exists.com/catalog?item=12&desc=vacation_hawaii',
-            'http://not-exists.com/catalog?item=73&desc=vacation_new_zealand',
-            'http://not-exists.com/catalog?item=74&desc=vacation_newfoundland',
-            'http://not-exists.com/catalog?item=83&desc=vacation_usa',
-        ]));
+        expect(new Set(sitemap.urls)).toEqual(
+            new Set([
'http://not-exists.com/', + 'http://not-exists.com/catalog?item=12&desc=vacation_hawaii', + 'http://not-exists.com/catalog?item=73&desc=vacation_new_zealand', + 'http://not-exists.com/catalog?item=74&desc=vacation_newfoundland', + 'http://not-exists.com/catalog?item=83&desc=vacation_usa', + ]), + ); }); it('does not break on invalid xml', async () => { @@ -145,19 +177,23 @@ describe('Sitemap', () => { it('autodetects sitemaps', async () => { const sitemap = await Sitemap.tryCommonNames('http://not-exists.com/arbitrary_url?search=xyz'); - expect(new Set(sitemap.urls)).toEqual(new Set([ - 'http://not-exists.com/catalog?item=80&desc=vacation_turkey', - 'http://not-exists.com/catalog?item=81&desc=vacation_maledives', - 'http://not-exists.com/catalog?item=78&desc=vacation_crete', - 'http://not-exists.com/catalog?item=79&desc=vacation_somalia', - ])); + expect(new Set(sitemap.urls)).toEqual( + new Set([ + 'http://not-exists.com/catalog?item=80&desc=vacation_turkey', + 'http://not-exists.com/catalog?item=81&desc=vacation_maledives', + 'http://not-exists.com/catalog?item=78&desc=vacation_crete', + 'http://not-exists.com/catalog?item=79&desc=vacation_somalia', + ]), + ); }); it('handles sitemap.txt correctly', async () => { const sitemap = await Sitemap.load('http://not-exists.com/sitemap.txt'); - expect(new Set(sitemap.urls)).toEqual(new Set([ - 'http://not-exists.com/catalog?item=78&desc=vacation_crete', - 'http://not-exists.com/catalog?item=79&desc=vacation_somalia', - ])); + expect(new Set(sitemap.urls)).toEqual( + new Set([ + 'http://not-exists.com/catalog?item=78&desc=vacation_crete', + 'http://not-exists.com/catalog?item=79&desc=vacation_somalia', + ]), + ); }); }); diff --git a/packages/utils/tsconfig.build.json b/packages/utils/tsconfig.build.json index 4c23c1922004..e71691608ee3 100644 --- a/packages/utils/tsconfig.build.json +++ b/packages/utils/tsconfig.build.json @@ -1,8 +1,8 @@ { - "extends": "../../tsconfig.build.json", - "compilerOptions": { - "outDir": "./dist" - }, - "include": ["src/**/*"], - "rootDir": "./src" + "extends": "../../tsconfig.build.json", + "compilerOptions": { + "outDir": "./dist" + }, + "include": ["src/**/*"], + "rootDir": "./src" } diff --git a/renovate.json b/renovate.json index e6c5509c4bd0..5b3924d724c7 100644 --- a/renovate.json +++ b/renovate.json @@ -1,41 +1,28 @@ { - "extends": [ - "config:base", - ":semanticCommitTypeAll(chore)" - ], - "pinVersions": false, - "separateMajorMinor": false, - "dependencyDashboard": false, - "semanticCommits": "enabled", - "lockFileMaintenance": { - "enabled": true, - "schedule": [ - "before 2am" - ], - "automerge": true, - "automergeType": "branch" - }, - "constraints": { - "npm": "^8.0.0" - }, - "packageRules": [ - { - "matchUpdateTypes": [ - "patch", - "minor" - ], - "matchCurrentVersion": "!/^0/", - "groupName": "patch/minor dependencies", - "groupSlug": "all-non-major", - "automerge": true, - "automergeType": "branch" - } - ], - "schedule": [ - "every weekday" - ], - "ignoreDeps": [ - "crawlee", - "docusaurus-plugin-typedoc-api" - ] + "extends": ["config:base", ":semanticCommitTypeAll(chore)"], + "pinVersions": false, + "separateMajorMinor": false, + "dependencyDashboard": false, + "semanticCommits": "enabled", + "lockFileMaintenance": { + "enabled": true, + "schedule": ["before 2am"], + "automerge": true, + "automergeType": "branch" + }, + "constraints": { + "npm": "^8.0.0" + }, + "packageRules": [ + { + "matchUpdateTypes": ["patch", "minor"], + "matchCurrentVersion": "!/^0/", + "groupName": "patch/minor dependencies", 
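The behaviour these fixtures exercise boils down to calls like the following sketch; the domain is a placeholder:

    import { Sitemap } from '@crawlee/utils';

    // Follows sitemap indexes, gunzips *.xml.gz children and reads sitemap.txt too.
    const sitemap = await Sitemap.load('https://example.com/sitemap.xml');
    console.log(sitemap.urls);

    // When the sitemap location is unknown, common paths can be probed instead.
    const guessed = await Sitemap.tryCommonNames('https://example.com/some/page');
    console.log(guessed.urls);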
+ "groupSlug": "all-non-major", + "automerge": true, + "automergeType": "branch" + } + ], + "schedule": ["every weekday"], + "ignoreDeps": ["crawlee", "docusaurus-plugin-typedoc-api"] } diff --git a/scripts/actions/docker-images/main.ts b/scripts/actions/docker-images/main.ts index 4410c4055ff9..28fcced4b85f 100644 --- a/scripts/actions/docker-images/main.ts +++ b/scripts/actions/docker-images/main.ts @@ -36,8 +36,9 @@ const crawleeVersion: string = JSON.parse( ).version; debug(`Crawlee version: ${crawleeVersion}`); -const apifyVersion: string = JSON.parse(await readFile(new URL('../../../package.json', import.meta.url), 'utf-8')).devDependencies?.apify - ?? 'latest'; +const apifyVersion: string = + JSON.parse(await readFile(new URL('../../../package.json', import.meta.url), 'utf-8')).devDependencies?.apify ?? + 'latest'; debug(`Apify version: ${apifyVersion}`); const lastPlaywrightVersions = await fetchModuleVersions('playwright', 5); @@ -76,9 +77,7 @@ if (process.env.CRAWLEE_BETA_VERSION) { newState.puppeteerVersions = state.puppeteerVersions; } else if (process.env.TRIGGER_LATEST === 'true') { info(`👀 Re-triggering a full Docker build`); - debug( - ` Crawlee:${crawleeVersion} Apify:${apifyVersion}`, - ); + debug(` Crawlee:${crawleeVersion} Apify:${apifyVersion}`); // Keep the old state in place newState.playwrightVersions = state.playwrightVersions; @@ -165,7 +164,9 @@ if (process.env.CRAWLEE_BETA_VERSION) { } } else { for (const [index, newPuppeteerVersion] of lastPuppeteerVersions.entries()) { - info(` 👀 Scheduling build for puppeteer: ${newPuppeteerVersion} and crawlee ${crawleeVersion} for deploy`); + info( + ` 👀 Scheduling build for puppeteer: ${newPuppeteerVersion} and crawlee ${crawleeVersion} for deploy`, + ); apiCalls.push({ eventType: EventType.Puppeteer, diff --git a/scripts/actions/docker-images/state.json b/scripts/actions/docker-images/state.json index cd0044955b32..61130bea85d9 100644 --- a/scripts/actions/docker-images/state.json +++ b/scripts/actions/docker-images/state.json @@ -1,17 +1,5 @@ { - "playwrightVersions": [ - "1.40.1", - "1.41.0", - "1.41.1", - "1.41.2", - "1.42.0" - ], - "puppeteerVersions": [ - "21.11.0", - "22.0.0", - "22.1.0", - "22.2.0", - "22.3.0" - ], + "playwrightVersions": ["1.40.1", "1.41.0", "1.41.1", "1.41.2", "1.42.0"], + "puppeteerVersions": ["21.11.0", "22.0.0", "22.1.0", "22.2.0", "22.3.0"], "crawleeVersion": "3.8.1" -} \ No newline at end of file +} diff --git a/scripts/copy.ts b/scripts/copy.ts index f79958c4cf1d..a5fe0c35642b 100644 --- a/scripts/copy.ts +++ b/scripts/copy.ts @@ -62,7 +62,9 @@ function getNextVersion() { if (versions.some((v) => v === version)) { // eslint-disable-next-line no-console - console.error(`before-deploy: A release with version ${version} already exists. Please increment version accordingly.`); + console.error( + `before-deploy: A release with version ${version} already exists. 
Please increment version accordingly.`, + ); process.exit(1); } diff --git a/scripts/typescript_fixes.mjs b/scripts/typescript_fixes.mjs index c5585ac0ddc3..d8b9ca56eb34 100644 --- a/scripts/typescript_fixes.mjs +++ b/scripts/typescript_fixes.mjs @@ -13,20 +13,20 @@ for (const filepath of files) { for (const line of input) { /* eslint-disable no-cond-assign */ - if (match = line.match(/^([^']+)'node\/([^$]+)/)) { + if ((match = line.match(/^([^']+)'node\/([^$]+)/))) { output.push(`${match[1]} '${match[2]}`); changed = true; } else if ( // playwright/puppeteer/got-scraping import - line.match(/^([^']+)'(playwright|puppeteer|got-scraping)'/) + line.match(/^([^']+)'(playwright|puppeteer|got-scraping)'/) || // proxy-per-page reexport of puppeteer - || line.match(/: Puppeteer\.\w+/) + line.match(/: Puppeteer\.\w+/) || // don't ask me why, but this one is needed too ¯\_(ツ)_/¯ - || line.match(/^export interface (PlaywrightHook|PuppeteerHook)/) + line.match(/^export interface (PlaywrightHook|PuppeteerHook)/) || // /// from newer nodenext resolutions - || line.match(/^\/\/\/ /) + line.match(/^\/\/\/ /) || // import("something") from compatibility with ES2022 module -.- - || line.match(/import\("([^"]+)"(?:.*)?\)/) + line.match(/import\("([^"]+)"(?:.*)?\)/) ) { output.push('// @ts-ignore optional peer dependency or compatibility with es2022'); output.push(line); diff --git a/test/browser-pool/browser-plugins/plugins.test.ts b/test/browser-pool/browser-plugins/plugins.test.ts index e47c61319fa8..08275a3b6893 100644 --- a/test/browser-pool/browser-plugins/plugins.test.ts +++ b/test/browser-pool/browser-plugins/plugins.test.ts @@ -3,7 +3,14 @@ import http from 'http'; import type { AddressInfo } from 'net'; import { promisify } from 'util'; -import { PuppeteerPlugin, PlaywrightPlugin, PuppeteerController, PlaywrightController, PlaywrightBrowser, LaunchContext } from '@crawlee/browser-pool'; +import { + PuppeteerPlugin, + PlaywrightPlugin, + PuppeteerController, + PlaywrightController, + PlaywrightBrowser, + LaunchContext, +} from '@crawlee/browser-pool'; import type { UnwrapPromise, CommonLibrary } from '@crawlee/browser-pool'; import playwright from 'playwright'; import type { Server as ProxyChainServer } from 'proxy-chain'; @@ -32,11 +39,15 @@ const runPluginTest = < P extends typeof PlaywrightPlugin | typeof PuppeteerPlugin, C extends typeof PuppeteerController | typeof PlaywrightController, L extends CommonLibrary, ->(Plugin: P, Controller: C, library: L) => { +>( + Plugin: P, + Controller: C, + library: L, +) => { let plugin = new Plugin(library as never); describe(`${plugin.constructor.name} - ${'name' in library ? 
library.name!() : ''} general`, () => { - let browser: playwright.Browser | UnwrapPromise> | undefined; + let browser: playwright.Browser | UnwrapPromise> | undefined; beforeEach(() => { plugin = new Plugin(library as never); @@ -120,7 +131,9 @@ const runPluginTest = < browserController.activate(); const page = await browserController.newPage(); - await browserController.setCookies(page as never, [{ name: 'TEST', value: 'TESTER-COOKIE', url: serverAddress }]); + await browserController.setCookies(page as never, [ + { name: 'TEST', value: 'TESTER-COOKIE', url: serverAddress }, + ]); await page.goto(serverAddress, { waitUntil: 'domcontentloaded' }); const cookies = await browserController.getCookies(page as never); @@ -145,7 +158,9 @@ const runPluginTest = < expect(false).toBe(true); } catch (error: any) { - expect(error.message).toBe('A new page can be created with provided context only when using incognito pages or experimental containers.'); + expect(error.message).toBe( + 'A new page can be created with provided context only when using incognito pages or experimental containers.', + ); } }); }); @@ -279,9 +294,7 @@ describe('Plugins', () => { const userAgent = 'HelloWorld'; const launchOptions = { - args: [ - `--user-agent=${userAgent}`, - ], + args: [`--user-agent=${userAgent}`], }; const launchContext = plugin.createLaunchContext({ launchOptions }); @@ -334,14 +347,17 @@ describe('Plugins', () => { const proxyUrl = `http://127.0.0.2:${unprotectedProxy.port}`; const plugin = new PlaywrightPlugin(playwright[browserName]); - const launchOptions = browserName === 'chromium' ? { - args: [ - // Exclude loopback interface from proxy bypass list, - // so the request to localhost goes through proxy. - // This way there's no need for a 3rd party server. - '--proxy-bypass-list=<-loopback>', - ], - } : undefined; + const launchOptions = + browserName === 'chromium' + ? { + args: [ + // Exclude loopback interface from proxy bypass list, + // so the request to localhost goes through proxy. + // This way there's no need for a 3rd party server. + '--proxy-bypass-list=<-loopback>', + ], + } + : undefined; const context = plugin.createLaunchContext({ proxyUrl, @@ -364,14 +380,17 @@ describe('Plugins', () => { const proxyUrl = `http://foo:bar@127.0.0.3:${protectedProxy.port}`; const plugin = new PlaywrightPlugin(playwright[browserName]); - const launchOptions = browserName === 'chromium' ? { - args: [ - // Exclude loopback interface from proxy bypass list, - // so the request to localhost goes through proxy. - // This way there's no need for a 3rd party server. - '--proxy-bypass-list=<-loopback>', - ], - } : undefined; + const launchOptions = + browserName === 'chromium' + ? { + args: [ + // Exclude loopback interface from proxy bypass list, + // so the request to localhost goes through proxy. + // This way there's no need for a 3rd party server. 
+ '--proxy-bypass-list=<-loopback>', + ], + } + : undefined; const context = plugin.createLaunchContext({ proxyUrl, @@ -558,14 +577,21 @@ describe('Plugins', () => { const launchContext = plugin.createLaunchContext(); browser = await plugin.launch(launchContext); - await expect(browser.newContext()) - .rejects - .toThrow('Function `newContext()` is not available in incognito mode'); + await expect(browser.newContext()).rejects.toThrow( + 'Function `newContext()` is not available in incognito mode', + ); }); test('should have same public interface as playwright browserType', async () => { const plugin = new PlaywrightPlugin(playwright[browserName]); - const originalFunctionNames = ['close', 'contexts', 'isConnected', 'newContext', 'newPage', 'version'] as const; + const originalFunctionNames = [ + 'close', + 'contexts', + 'isConnected', + 'newContext', + 'newPage', + 'version', + ] as const; const launchContext = plugin.createLaunchContext({ useIncognitoPages: true }); browser = await plugin.launch(launchContext); diff --git a/test/browser-pool/browser-pool.test.ts b/test/browser-pool/browser-pool.test.ts index 0240d0799c30..8dca9a6b455a 100644 --- a/test/browser-pool/browser-pool.test.ts +++ b/test/browser-pool/browser-pool.test.ts @@ -20,39 +20,27 @@ import { PuppeteerPlugin } from '../../packages/browser-pool/src/puppeteer/puppe const fingerprintingMatrix: [string, PlaywrightPlugin | PuppeteerPlugin][] = [ [ 'Playwright - persistent', - new PlaywrightPlugin( - playwright.chromium, - { - useIncognitoPages: false, - }, - ), + new PlaywrightPlugin(playwright.chromium, { + useIncognitoPages: false, + }), ], [ 'Playwright - Incognito', - new PlaywrightPlugin( - playwright.chromium, - { - useIncognitoPages: true, - }, - ), + new PlaywrightPlugin(playwright.chromium, { + useIncognitoPages: true, + }), ], [ 'Puppeteer - Persistent', - new PuppeteerPlugin( - puppeteer, - { - useIncognitoPages: false, - }, - ), + new PuppeteerPlugin(puppeteer, { + useIncognitoPages: false, + }), ], [ 'Puppeteer - Incognito', - new PuppeteerPlugin( - puppeteer, - { - useIncognitoPages: true, - }, - ), + new PuppeteerPlugin(puppeteer, { + useIncognitoPages: true, + }), ], ]; // Tests could be generated from this blueprint for each plugin @@ -141,11 +129,9 @@ describe.each([ expect(spy).toBeCalledTimes(4); spy.mockReset(); - await expect(addTimeoutToPromise( - async () => browserPool.newPage(), - 10, - 'opening new page timed out', - )).rejects.toThrowError('opening new page timed out'); + await expect( + addTimeoutToPromise(async () => browserPool.newPage(), 10, 'opening new page timed out'), + ).rejects.toThrowError('opening new page timed out'); // We terminated early enough so only preLaunchHooks were not executed, // thanks to `tryCancel()` calls after each await. 
If we did not run
@@ -240,10 +226,7 @@ describe.each([
             browserPlugin: plugin,
         };
 
-        await Promise.all([
-            browserPool.newPage(usePlugin),
-            browserPool.newPage(usePlugin),
-        ]);
+        await Promise.all([browserPool.newPage(usePlugin), browserPool.newPage(usePlugin)]);
 
         expect(browserPool.activeBrowserControllers.size).toBe(2);
 
@@ -271,9 +254,11 @@ describe.each([
         expect(browserPool.retiredBrowserControllers.size).toBe(1);
 
         await page.close();
-        await new Promise<void>((resolve) => setTimeout(() => {
-            resolve();
-        }, 1000));
+        await new Promise<void>((resolve) =>
+            setTimeout(() => {
+                resolve();
+            }, 1000),
+        );
 
         expect(browserPool['_closeRetiredBrowserWithNoPages']).toHaveBeenCalled();
         expect(controller.close).toHaveBeenCalled();
@@ -309,7 +294,12 @@ describe.each([
         const page = await browserPool.newPage();
         const pageId = browserPool.getPageId(page)!;
         const { launchContext } = browserPool.getBrowserControllerByPage(page)!;
-        expect(browserPool['_executeHooks']).toHaveBeenNthCalledWith(1, browserPool.preLaunchHooks, pageId, launchContext);
+        expect(browserPool['_executeHooks']).toHaveBeenNthCalledWith(
+            1,
+            browserPool.preLaunchHooks,
+            pageId,
+            launchContext,
+        );
     });
 
     // We had a problem where if the first newPage() call, which launches
@@ -318,7 +308,9 @@ describe.each([
     test('error in hook does not leave browser stuck in limbo', async () => {
         const errorMessage = 'pre-launch failed';
         browserPool.preLaunchHooks = [
-            async () => { throw new Error(errorMessage); },
+            async () => {
+                throw new Error(errorMessage);
+            },
         ];
 
         const attempts = 5;
@@ -347,8 +339,12 @@ describe.each([
         const pageId = browserPool.getPageId(page)!;
         const browserController = browserPool.getBrowserControllerByPage(page)!;
 
-        expect(browserPool['_executeHooks'])
-            .toHaveBeenNthCalledWith(2, browserPool.postLaunchHooks, pageId, browserController);
+        expect(browserPool['_executeHooks']).toHaveBeenNthCalledWith(
+            2,
+            browserPool.postLaunchHooks,
+            pageId,
+            browserController,
+        );
     });
 
     // We had a problem where if the first newPage() call, which launches
@@ -423,7 +419,12 @@ describe.each([
         const page = await browserPool.newPage();
         const browserController = browserPool.getBrowserControllerByPage(page);
-        expect(browserPool['_executeHooks']).toHaveBeenNthCalledWith(4, browserPool.postPageCreateHooks, page, browserController);
+        expect(browserPool['_executeHooks']).toHaveBeenNthCalledWith(
+            4,
+            browserPool.postPageCreateHooks,
+            page,
+            browserController,
+        );
     });
 });
 
@@ -439,7 +440,12 @@ describe.each([
         await page.close();
 
         const browserController = browserPool.getBrowserControllerByPage(page);
-        expect(browserPool['_executeHooks']).toHaveBeenNthCalledWith(5, browserPool.prePageCloseHooks, page, browserController);
+        expect(browserPool['_executeHooks']).toHaveBeenNthCalledWith(
+            5,
+            browserPool.prePageCloseHooks,
+            page,
+            browserController,
+        );
     });
 });
 
@@ -456,7 +462,12 @@ describe.each([
         await page.close();
 
         const browserController = browserPool.getBrowserControllerByPage(page);
-        expect(browserPool['_executeHooks']).toHaveBeenNthCalledWith(6, browserPool.postPageCloseHooks, pageId, browserController);
+        expect(browserPool['_executeHooks']).toHaveBeenNthCalledWith(
+            6,
+            browserPool.postPageCloseHooks,
+            pageId,
+            browserController,
+        );
     });
 });
 
@@ -522,7 +533,8 @@ describe.each([
             };
         });
         // @ts-expect-error mistypings
-        const { fingerprint } = browserController!.launchContext!.fingerprint as BrowserFingerprintWithHeaders;
+        const { fingerprint } = browserController!.launchContext!
+ .fingerprint as BrowserFingerprintWithHeaders; expect(data.hardwareConcurrency).toBe(fingerprint?.navigator.hardwareConcurrency); expect(data.userAgent).toBe(fingerprint?.navigator.userAgent); @@ -540,12 +552,11 @@ describe.each([ describe('caching', () => { const commonOptions = { - browserPlugins: [new PlaywrightPlugin( - playwright.chromium, - { + browserPlugins: [ + new PlaywrightPlugin(playwright.chromium, { useIncognitoPages: true, - }, - )], + }), + ], }; let browserPoolCache: BrowserPool; @@ -611,12 +622,11 @@ describe.each([ }); describe('generator configuration', () => { const commonOptions = { - browserPlugins: [new PlaywrightPlugin( - playwright.firefox, - { + browserPlugins: [ + new PlaywrightPlugin(playwright.firefox, { useIncognitoPages: true, - }, - )], + }), + ], }; let browserPoolConfig: BrowserPool; afterEach(async () => { diff --git a/test/core/autoscaling/autoscaled_pool.test.ts b/test/core/autoscaling/autoscaled_pool.test.ts index e1bad5826527..7855491960e6 100644 --- a/test/core/autoscaling/autoscaled_pool.test.ts +++ b/test/core/autoscaling/autoscaled_pool.test.ts @@ -322,7 +322,9 @@ describe('AutoscaledPool', () => { let count = 0; const pool = new AutoscaledPool({ maxConcurrency: 1, - runTaskFunction: async () => { count++; }, + runTaskFunction: async () => { + count++; + }, isFinishedFunction: async () => false, isTaskReadyFunction: async () => { if (count > 1) throw new Error('some-ready-error'); @@ -341,9 +343,12 @@ describe('AutoscaledPool', () => { // Run the pool and close it after 3s. const pool = new AutoscaledPool({ minConcurrency: 3, - runTaskFunction: async () => sleep(1).then(() => { count++; }), + runTaskFunction: async () => + sleep(1).then(() => { + count++; + }), isFinishedFunction: isFinished, - isTaskReadyFunction: async () => !await isFinished(), + isTaskReadyFunction: async () => !(await isFinished()), }); // @ts-expect-error Overwriting readonly private prop @@ -366,8 +371,16 @@ describe('AutoscaledPool', () => { maxConcurrency: 1, runTaskFunction: async () => { await sleep(1); - if (counter === 10) { isTaskReady = false; setTimeout(() => { isTaskReady = true; }, 10); } - if (counter === 19) { isTaskReady = false; isFinished = true; } + if (counter === 10) { + isTaskReady = false; + setTimeout(() => { + isTaskReady = true; + }, 10); + } + if (counter === 19) { + isTaskReady = false; + isFinished = true; + } counter++; finished.push(Date.now()); }, @@ -409,7 +422,10 @@ describe('AutoscaledPool', () => { return null; } }, - isFinishedFunction: async () => { finished = true; return true; }, + isFinishedFunction: async () => { + finished = true; + return true; + }, isTaskReadyFunction: async () => !aborted, }); await pool.run(); @@ -445,7 +461,9 @@ describe('AutoscaledPool', () => { let count = 0; const results: number[] = []; let pauseResolve: (value: unknown) => void; - const pausePromise = new Promise((res) => { pauseResolve = res; }); + const pausePromise = new Promise((res) => { + pauseResolve = res; + }); const pool = new AutoscaledPool({ maybeRunIntervalSecs: 0.01, @@ -462,7 +480,9 @@ describe('AutoscaledPool', () => { let finished = false; const runPromise = pool.run(); - void runPromise.then(() => { finished = true; }); + void runPromise.then(() => { + finished = true; + }); await pausePromise; expect(count).toBe(20); expect(finished).toBe(false); diff --git a/test/core/autoscaling/snapshotter.test.ts b/test/core/autoscaling/snapshotter.test.ts index ddaf2941832a..5976b27702fe 100644 --- a/test/core/autoscaling/snapshotter.test.ts +++ 
b/test/core/autoscaling/snapshotter.test.ts @@ -125,12 +125,14 @@ describe('Snapshotter', () => { test('correctly marks CPU overloaded using OS metrics', async () => { const cpusMock = vitest.spyOn(os, 'cpus'); - const fakeCpu = [{ - times: { - idle: 0, - other: 0, + const fakeCpu = [ + { + times: { + idle: 0, + other: 0, + }, }, - }]; + ]; const { times } = fakeCpu[0]; cpusMock.mockReturnValue(fakeCpu as any); @@ -211,7 +213,9 @@ describe('Snapshotter', () => { } as MemoryInfo; const getMemoryInfo = async () => ({ ...memoryData }); vitest.spyOn(LocalEventManager.prototype as any, '_getMemoryInfo').mockImplementation(getMemoryInfo); - vitest.spyOn(Snapshotter.prototype as any, '_getMemoryInfo').mockResolvedValueOnce({ totalBytes: toBytes(10000) }); + vitest + .spyOn(Snapshotter.prototype as any, '_getMemoryInfo') + .mockResolvedValueOnce({ totalBytes: toBytes(10000) }); const config = new Configuration({ availableMemoryRatio: 1 }); const snapshotter = new Snapshotter({ config, maxUsedMemoryRatio: 0.5 }); @@ -243,7 +247,9 @@ describe('Snapshotter', () => { }); test('correctly logs critical memory overload', async () => { - vitest.spyOn(Snapshotter.prototype as any, '_getMemoryInfo').mockResolvedValueOnce({ totalBytes: toBytes(10000) }); + vitest + .spyOn(Snapshotter.prototype as any, '_getMemoryInfo') + .mockResolvedValueOnce({ totalBytes: toBytes(10000) }); const config = new Configuration({ availableMemoryRatio: 1 }); const snapshotter = new Snapshotter({ config, maxUsedMemoryRatio: 0.5 }); await snapshotter.start(); @@ -322,9 +328,11 @@ describe('Snapshotter', () => { const snapshot = eventLoopSnapshots[eventLoopSnapshots.length - 1 - i]; expect(sample).toEqual(snapshot); } - const diffBetween = eventLoopSample[eventLoopSample.length - 1].createdAt.getTime() - - eventLoopSnapshots[eventLoopSnapshots.length - 1].createdAt.getTime(); - const diffWithin = eventLoopSample[0].createdAt.getTime() - eventLoopSample[eventLoopSample.length - 1].createdAt.getTime(); + const diffBetween = + eventLoopSample[eventLoopSample.length - 1].createdAt.getTime() - + eventLoopSnapshots[eventLoopSnapshots.length - 1].createdAt.getTime(); + const diffWithin = + eventLoopSample[0].createdAt.getTime() - eventLoopSample[eventLoopSample.length - 1].createdAt.getTime(); expect(diffBetween).toBeLessThan(SAMPLE_SIZE_MILLIS); expect(diffWithin).toBeLessThan(SAMPLE_SIZE_MILLIS); }); diff --git a/test/core/autoscaling/system_status.test.ts b/test/core/autoscaling/system_status.test.ts index 4afe7b866591..19f78517afbf 100644 --- a/test/core/autoscaling/system_status.test.ts +++ b/test/core/autoscaling/system_status.test.ts @@ -13,7 +13,12 @@ describe('SystemStatus', () => { }); class MockSnapshotter { - constructor(readonly memSnapshots: any[], readonly loopSnapshots: any[], readonly cpuSnapshots: any[], readonly clientSnapshots: any[]) {} + constructor( + readonly memSnapshots: any[], + readonly loopSnapshots: any[], + readonly cpuSnapshots: any[], + readonly clientSnapshots: any[], + ) {} getMemorySample(offset: number) { return this.memSnapshots.slice(-offset); diff --git a/test/core/crawlers/adaptive_playwright_crawler.test.ts b/test/core/crawlers/adaptive_playwright_crawler.test.ts index 489ad2b3307a..a563ca16761f 100644 --- a/test/core/crawlers/adaptive_playwright_crawler.test.ts +++ b/test/core/crawlers/adaptive_playwright_crawler.test.ts @@ -3,9 +3,7 @@ import type { AddressInfo } from 'net'; import { KeyValueStore } from '@crawlee/core'; import type { AdaptivePlaywrightCrawlerOptions } from 
'@crawlee/playwright'; -import { - AdaptivePlaywrightCrawler, RequestList, -} from '@crawlee/playwright'; +import { AdaptivePlaywrightCrawler, RequestList } from '@crawlee/playwright'; import express from 'express'; import { startExpressAppPromise } from 'test/shared/_helper'; import { MemoryStorageEmulator } from 'test/shared/MemoryStorageEmulator'; @@ -81,32 +79,45 @@ describe('AdaptivePlaywrightCrawler', () => { // Test setup helpers const makeOneshotCrawler = async ( - options: Required> & Partial, + options: Required> & + Partial, sources: string[], - ) => new AdaptivePlaywrightCrawler({ - renderingTypeDetectionRatio: 0.1, - maxConcurrency: 1, - maxRequestRetries: 0, - maxRequestsPerCrawl: 1, - requestList: await RequestList.open({ sources }), - ...options, - }); + ) => + new AdaptivePlaywrightCrawler({ + renderingTypeDetectionRatio: 0.1, + maxConcurrency: 1, + maxRequestRetries: 0, + maxRequestsPerCrawl: 1, + requestList: await RequestList.open({ sources }), + ...options, + }); - const makeRiggedRenderingTypePredictor = (prediction: {detectionProbabilityRecommendation: number; renderingType: 'clientOnly' | 'static'}) => ({ + const makeRiggedRenderingTypePredictor = (prediction: { + detectionProbabilityRecommendation: number; + renderingType: 'clientOnly' | 'static'; + }) => ({ predict: vi.fn((_url: URL) => prediction), storeResult: vi.fn((_url: URL, _label: string | unknown, _renderingType: string) => {}), }); describe('should detect page rendering type', () => { - test.each([['/static', 'static'], ['/dynamic', 'clientOnly']] as const)('for %s', async (path, expectedType) => { - const renderingTypePredictor = makeRiggedRenderingTypePredictor({ detectionProbabilityRecommendation: 1, renderingType: 'clientOnly' }); + test.each([ + ['/static', 'static'], + ['/dynamic', 'clientOnly'], + ] as const)('for %s', async (path, expectedType) => { + const renderingTypePredictor = makeRiggedRenderingTypePredictor({ + detectionProbabilityRecommendation: 1, + renderingType: 'clientOnly', + }); const url = new URL(`http://${HOSTNAME}:${port}${path}`); - const requestHandler: AdaptivePlaywrightCrawlerOptions['requestHandler'] = vi.fn(async ({ pushData, querySelector }) => { - await pushData({ - heading: (await querySelector('h1')).text(), - }); - }); + const requestHandler: AdaptivePlaywrightCrawlerOptions['requestHandler'] = vi.fn( + async ({ pushData, querySelector }) => { + await pushData({ + heading: (await querySelector('h1')).text(), + }); + }, + ); const crawler = await makeOneshotCrawler( { @@ -131,7 +142,10 @@ describe('AdaptivePlaywrightCrawler', () => { }); test('should not store detection results on non-detection runs', async () => { - const renderingTypePredictor = makeRiggedRenderingTypePredictor({ detectionProbabilityRecommendation: 0, renderingType: 'static' }); + const renderingTypePredictor = makeRiggedRenderingTypePredictor({ + detectionProbabilityRecommendation: 0, + renderingType: 'static', + }); const url = new URL(`http://${HOSTNAME}:${port}/static`); const crawler = await makeOneshotCrawler( @@ -149,17 +163,23 @@ describe('AdaptivePlaywrightCrawler', () => { }); test('should retry with browser if result checker returns false', async () => { - const renderingTypePredictor = makeRiggedRenderingTypePredictor({ detectionProbabilityRecommendation: 0, renderingType: 'static' }); + const renderingTypePredictor = makeRiggedRenderingTypePredictor({ + detectionProbabilityRecommendation: 0, + renderingType: 'static', + }); const url = new URL(`http://${HOSTNAME}:${port}/dynamic`); - 
const requestHandler: AdaptivePlaywrightCrawlerOptions['requestHandler'] = vi.fn(async ({ pushData, querySelector }) => { - await pushData({ - heading: (await querySelector('h1')).text(), - }); - }); + const requestHandler: AdaptivePlaywrightCrawlerOptions['requestHandler'] = vi.fn( + async ({ pushData, querySelector }) => { + await pushData({ + heading: (await querySelector('h1')).text(), + }); + }, + ); const resultChecker: AdaptivePlaywrightCrawlerOptions['resultChecker'] = vi.fn( - (result) => result.datasetItems.length > 0 && result.datasetItems.every(({ item }) => item.heading?.length > 0), + (result) => + result.datasetItems.length > 0 && result.datasetItems.every(({ item }) => item.heading?.length > 0), ); const crawler = await makeOneshotCrawler( @@ -178,13 +198,21 @@ describe('AdaptivePlaywrightCrawler', () => { }); describe('should enqueue links correctly', () => { - test.each([['/static', 'static'], ['/dynamic', 'clientOnly']] as const)('for %s', async (path, renderingType) => { - const renderingTypePredictor = makeRiggedRenderingTypePredictor({ detectionProbabilityRecommendation: 0, renderingType }); + test.each([ + ['/static', 'static'], + ['/dynamic', 'clientOnly'], + ] as const)('for %s', async (path, renderingType) => { + const renderingTypePredictor = makeRiggedRenderingTypePredictor({ + detectionProbabilityRecommendation: 0, + renderingType, + }); const url = new URL(`http://${HOSTNAME}:${port}${path}`); - const requestHandler: AdaptivePlaywrightCrawlerOptions['requestHandler'] = vi.fn(async ({ enqueueLinks }) => { - await enqueueLinks(); - }); + const requestHandler: AdaptivePlaywrightCrawlerOptions['requestHandler'] = vi.fn( + async ({ enqueueLinks }) => { + await enqueueLinks(); + }, + ); const crawler = await makeOneshotCrawler( { @@ -197,18 +225,23 @@ describe('AdaptivePlaywrightCrawler', () => { await crawler.run(); const enqueuedUrls = (await localStorageEmulator.getRequestQueueItems()).map((item) => item.url); - expect(new Set(enqueuedUrls)).toEqual(new Set([ - `http://${HOSTNAME}:${port}/static?q=1`, - `http://${HOSTNAME}:${port}/static?q=2`, - `http://${HOSTNAME}:${port}/static?q=3`, - `http://${HOSTNAME}:${port}/static?q=4`, - `http://${HOSTNAME}:${port}/static?q=5`, - ])); + expect(new Set(enqueuedUrls)).toEqual( + new Set([ + `http://${HOSTNAME}:${port}/static?q=1`, + `http://${HOSTNAME}:${port}/static?q=2`, + `http://${HOSTNAME}:${port}/static?q=3`, + `http://${HOSTNAME}:${port}/static?q=4`, + `http://${HOSTNAME}:${port}/static?q=5`, + ]), + ); }); }); test('should persist crawler state', async () => { - const renderingTypePredictor = makeRiggedRenderingTypePredictor({ detectionProbabilityRecommendation: 0, renderingType: 'static' }); + const renderingTypePredictor = makeRiggedRenderingTypePredictor({ + detectionProbabilityRecommendation: 0, + renderingType: 'static', + }); const requestHandler: AdaptivePlaywrightCrawlerOptions['requestHandler'] = vi.fn(async ({ useState }) => { const state = await useState({ count: 0 }); @@ -234,14 +267,19 @@ describe('AdaptivePlaywrightCrawler', () => { }); test('should persist key-value store changes', async () => { - const renderingTypePredictor = makeRiggedRenderingTypePredictor({ detectionProbabilityRecommendation: 0, renderingType: 'static' }); - - const requestHandler: AdaptivePlaywrightCrawlerOptions['requestHandler'] = vi.fn(async ({ request, getKeyValueStore }) => { - const store = await getKeyValueStore(); - const search = new URLSearchParams(new URL(request.url).search); - store.setValue(search.get('q'), { 
content: 42 });
+        const renderingTypePredictor = makeRiggedRenderingTypePredictor({
+            detectionProbabilityRecommendation: 0,
+            renderingType: 'static',
         });
 
+        const requestHandler: AdaptivePlaywrightCrawlerOptions['requestHandler'] = vi.fn(
+            async ({ request, getKeyValueStore }) => {
+                const store = await getKeyValueStore();
+                const search = new URLSearchParams(new URL(request.url).search);
+                store.setValue(search.get('q'), { content: 42 });
+            },
+        );
+
         const crawler = await makeOneshotCrawler(
             {
                 requestHandler,
@@ -264,7 +302,10 @@
     });
 
     test('should not allow direct key-value store manipulation', async () => {
-        const renderingTypePredictor = makeRiggedRenderingTypePredictor({ detectionProbabilityRecommendation: 0, renderingType: 'static' });
+        const renderingTypePredictor = makeRiggedRenderingTypePredictor({
+            detectionProbabilityRecommendation: 0,
+            renderingType: 'static',
+        });
 
         const requestHandler: AdaptivePlaywrightCrawlerOptions['requestHandler'] = vi.fn(async () => {
             const store = await KeyValueStore.open();
@@ -281,9 +322,7 @@ describe('AdaptivePlaywrightCrawler', () => {
                 maxRequestRetries: 0,
                 failedRequestHandler,
             },
-            [
-                `http://${HOSTNAME}:${port}/static`,
-            ],
+            [`http://${HOSTNAME}:${port}/static`],
         );
 
         await crawler.run();
diff --git a/test/core/crawlers/basic_browser_crawler.ts b/test/core/crawlers/basic_browser_crawler.ts
index 65d9c07f9eb0..acbefc04bb4a 100644
--- a/test/core/crawlers/basic_browser_crawler.ts
+++ b/test/core/crawlers/basic_browser_crawler.ts
@@ -3,12 +3,19 @@ import type { PuppeteerCrawlingContext, PuppeteerCrawlerOptions, PuppeteerGoToOp
 import { BrowserCrawler } from '@crawlee/puppeteer';
 import type { HTTPResponse, LaunchOptions } from 'puppeteer';
 
-export class BrowserCrawlerTest extends BrowserCrawler<{ browserPlugins: [PuppeteerPlugin] }, LaunchOptions, PuppeteerCrawlingContext> {
+export class BrowserCrawlerTest extends BrowserCrawler<
+    { browserPlugins: [PuppeteerPlugin] },
+    LaunchOptions,
+    PuppeteerCrawlingContext
+> {
     constructor(options: Partial<PuppeteerCrawlerOptions> = {}) {
         super(options as any);
     }
 
-    protected async _navigationHandler(ctx: PuppeteerCrawlingContext, gotoOptions: PuppeteerGoToOptions): Promise<HTTPResponse | null> {
+    protected async _navigationHandler(
+        ctx: PuppeteerCrawlingContext,
+        gotoOptions: PuppeteerGoToOptions,
+    ): Promise<HTTPResponse | null> {
         return ctx.page.goto(ctx.request.url, gotoOptions);
     }
 }
diff --git a/test/core/crawlers/basic_crawler.test.ts b/test/core/crawlers/basic_crawler.test.ts
index 055b05b92ca0..851d375c0496 100644
--- a/test/core/crawlers/basic_crawler.test.ts
+++ b/test/core/crawlers/basic_crawler.test.ts
@@ -6,10 +6,7 @@ import { readFile, rm } from 'node:fs/promises';
 import { join } from 'path';
 
 import log from '@apify/log';
-import type {
-    CrawlingContext,
-    ErrorHandler,
-    RequestHandler } from '@crawlee/basic';
+import type { CrawlingContext, ErrorHandler, RequestHandler } from '@crawlee/basic';
 import {
     Request,
     RequestQueue,
@@ -161,39 +158,35 @@ describe('BasicCrawler', () => {
         const requestList = await RequestList.open(null, []);
         const requestHandler = async () => {};
 
-        const results = await Promise.all([
-            new BasicCrawler({
-                requestList,
-                requestHandler,
-                ...shorthandOptions,
-            }),
-            new BasicCrawler({
-                requestList,
-                requestHandler,
-                autoscaledPoolOptions,
+        const results = await Promise.all(
+            [
+                new BasicCrawler({
+                    requestList,
+                    requestHandler,
+                    ...shorthandOptions,
+                }),
+                new BasicCrawler({
+                    requestList,
+                    requestHandler,
+                    autoscaledPoolOptions,
+                }),
+                new BasicCrawler({
requestList, + requestHandler, + ...shorthandOptions, + autoscaledPoolOptions, + }), + ].map(async (c) => { + await c.run(); + return collectResults(c); }), - new BasicCrawler({ - requestList, - requestHandler, - ...shorthandOptions, - autoscaledPoolOptions, - }), - ].map(async (c) => { - await c.run(); - return collectResults(c); - })); - - expect(results[0]).toEqual( - expect.objectContaining(shorthandOptions), ); - expect(results[1]).toEqual( - expect.objectContaining(autoscaledPoolOptions), - ); + expect(results[0]).toEqual(expect.objectContaining(shorthandOptions)); - expect(results[2]).toEqual( - expect.objectContaining(shorthandOptions), - ); + expect(results[1]).toEqual(expect.objectContaining(autoscaledPoolOptions)); + + expect(results[2]).toEqual(expect.objectContaining(shorthandOptions)); }); test('auto-saved state object', async () => { @@ -224,54 +217,61 @@ describe('BasicCrawler', () => { expect(await requestList.isEmpty()).toBe(true); }); - test.each([EventType.MIGRATING, EventType.ABORTING])('should pause on %s event and persist RequestList state', async (event) => { - const sources = [...Array(500).keys()].map((index) => ({ url: `https://example.com/${index + 1}` })); + test.each([EventType.MIGRATING, EventType.ABORTING])( + 'should pause on %s event and persist RequestList state', + async (event) => { + const sources = [...Array(500).keys()].map((index) => ({ url: `https://example.com/${index + 1}` })); - let persistResolve: (value?: unknown) => void; - const persistPromise = new Promise((res) => { persistResolve = res; }); + let persistResolve: (value?: unknown) => void; + const persistPromise = new Promise((res) => { + persistResolve = res; + }); - // Mock the calls to persist sources. - const getValueSpy = vitest.spyOn(KeyValueStore.prototype, 'getValue'); - const setValueSpy = vitest.spyOn(KeyValueStore.prototype, 'setValue'); - getValueSpy.mockResolvedValue(null); + // Mock the calls to persist sources. + const getValueSpy = vitest.spyOn(KeyValueStore.prototype, 'getValue'); + const setValueSpy = vitest.spyOn(KeyValueStore.prototype, 'setValue'); + getValueSpy.mockResolvedValue(null); - const processed: { url: string }[] = []; - const requestList = await RequestList.open('reqList', sources); - const requestHandler: RequestHandler = async ({ request }) => { - if (request.url.endsWith('200')) events.emit(event); - processed.push({ url: request.url }); - }; + const processed: { url: string }[] = []; + const requestList = await RequestList.open('reqList', sources); + const requestHandler: RequestHandler = async ({ request }) => { + if (request.url.endsWith('200')) events.emit(event); + processed.push({ url: request.url }); + }; - const basicCrawler = new BasicCrawler({ - requestList, - minConcurrency: 25, - maxConcurrency: 25, - requestHandler, - }); + const basicCrawler = new BasicCrawler({ + requestList, + minConcurrency: 25, + maxConcurrency: 25, + requestHandler, + }); - let finished = false; - // Mock the call to persist state. - setValueSpy.mockImplementationOnce(persistResolve as any); - // The crawler will pause after 200 requests - const runPromise = basicCrawler.run(); - void runPromise.then(() => { finished = true; }); + let finished = false; + // Mock the call to persist state. 
+ setValueSpy.mockImplementationOnce(persistResolve as any); + // The crawler will pause after 200 requests + const runPromise = basicCrawler.run(); + void runPromise.then(() => { + finished = true; + }); - // need to monkeypatch the stats class, otherwise it will never finish - basicCrawler.stats.persistState = async () => Promise.resolve(); - await persistPromise; + // need to monkeypatch the stats class, otherwise it will never finish + basicCrawler.stats.persistState = async () => Promise.resolve(); + await persistPromise; - expect(finished).toBe(false); - expect(await requestList.isFinished()).toBe(false); - expect(await requestList.isEmpty()).toBe(false); - expect(processed.length).toBe(200); + expect(finished).toBe(false); + expect(await requestList.isFinished()).toBe(false); + expect(await requestList.isEmpty()).toBe(false); + expect(processed.length).toBe(200); - expect(getValueSpy).toBeCalled(); - expect(setValueSpy).toBeCalled(); + expect(getValueSpy).toBeCalled(); + expect(setValueSpy).toBeCalled(); - // clean up - // @ts-expect-error Accessing private method - await basicCrawler.autoscaledPool._destroy(); - }); + // clean up + // @ts-expect-error Accessing private method + await basicCrawler.autoscaledPool._destroy(); + }, + ); test('should retry failed requests', async () => { const sources = [ @@ -406,9 +406,7 @@ describe('BasicCrawler', () => { }); test('should correctly track request.state', async () => { - const sources = [ - { url: 'http://example.com/1' }, - ]; + const sources = [{ url: 'http://example.com/1' }]; const requestList = await RequestList.open(null, sources); const requestStates: RequestState[] = []; @@ -431,7 +429,11 @@ describe('BasicCrawler', () => { await basicCrawler.run(); - expect(requestStates).toEqual([RequestState.REQUEST_HANDLER, RequestState.ERROR_HANDLER, RequestState.REQUEST_HANDLER]); + expect(requestStates).toEqual([ + RequestState.REQUEST_HANDLER, + RequestState.ERROR_HANDLER, + RequestState.REQUEST_HANDLER, + ]); }); test('should use errorHandler', async () => { @@ -662,7 +664,8 @@ describe('BasicCrawler', () => { vitest.spyOn(requestQueue, 'handledCount').mockResolvedValueOnce(0); - vitest.spyOn(requestQueue, 'addRequest') + vitest + .spyOn(requestQueue, 'addRequest') .mockResolvedValueOnce({ requestId: 'id-0' } as any) .mockResolvedValueOnce({ requestId: 'id-1' } as any) .mockResolvedValueOnce({ requestId: 'id-2' } as any); @@ -671,7 +674,8 @@ describe('BasicCrawler', () => { const request1 = new Request({ id: 'id-1', ...sources[1] }); const request2 = new Request({ id: 'id-2', ...sources[2] }); - vitest.spyOn(requestQueue, 'fetchNextRequest') + vitest + .spyOn(requestQueue, 'fetchNextRequest') .mockResolvedValueOnce(request0) .mockResolvedValueOnce(request1) .mockResolvedValueOnce(request2) @@ -679,17 +683,19 @@ describe('BasicCrawler', () => { .mockResolvedValueOnce(request1) .mockResolvedValueOnce(request1); - const markReqHandled = vitest.spyOn(requestQueue, 'markRequestHandled').mockReturnValue(Promise.resolve() as any); + const markReqHandled = vitest + .spyOn(requestQueue, 'markRequestHandled') + .mockReturnValue(Promise.resolve() as any); const reclaimReq = vitest.spyOn(requestQueue, 'reclaimRequest').mockReturnValue(Promise.resolve() as any); - vitest.spyOn(requestQueue, 'isEmpty') + vitest + .spyOn(requestQueue, 'isEmpty') .mockResolvedValueOnce(false) .mockResolvedValueOnce(false) .mockResolvedValueOnce(false) .mockResolvedValueOnce(true); - vitest.spyOn(requestQueue, 'isFinished') - .mockResolvedValueOnce(true); + 
vitest.spyOn(requestQueue, 'isFinished').mockResolvedValueOnce(true); await basicCrawler.run(); @@ -757,7 +763,8 @@ describe('BasicCrawler', () => { const request1 = new Request({ url: 'http://example.com/1' }); vitest.spyOn(requestQueue, 'handledCount').mockReturnValue(Promise.resolve() as any); - const markRequestHandled = vitest.spyOn(requestQueue, 'markRequestHandled') + const markRequestHandled = vitest + .spyOn(requestQueue, 'markRequestHandled') .mockReturnValue(Promise.resolve() as any); const isFinishedOrig = vitest.spyOn(requestQueue, 'isFinished'); @@ -767,7 +774,9 @@ describe('BasicCrawler', () => { setTimeout(() => queue.push(request0), 10); setTimeout(() => queue.push(request1), 100); - setTimeout(() => { isFinished = true; }, 150); + setTimeout(() => { + isFinished = true; + }, 150); await basicCrawler.run(); @@ -803,7 +812,8 @@ describe('BasicCrawler', () => { const request1 = new Request({ url: 'http://example.com/1' }); vitest.spyOn(requestQueue, 'handledCount').mockReturnValue(Promise.resolve() as any); - const markRequestHandled = vitest.spyOn(requestQueue, 'markRequestHandled') + const markRequestHandled = vitest + .spyOn(requestQueue, 'markRequestHandled') .mockReturnValue(Promise.resolve() as any); const isFinishedOrig = vitest.spyOn(requestQueue, 'isFinished'); @@ -813,7 +823,9 @@ describe('BasicCrawler', () => { setTimeout(() => queue.push(request0), 10); setTimeout(() => queue.push(request1), 100); - setTimeout(() => { void basicCrawler.teardown(); }, 300); + setTimeout(() => { + void basicCrawler.teardown(); + }, 300); await basicCrawler.run(); @@ -883,7 +895,7 @@ describe('BasicCrawler', () => { requestQueue.isEmpty = async () => false; requestQueue.isFinished = async () => false; - requestQueue.fetchNextRequest = async () => (new Request({ id: 'id', url: 'http://example.com' })); + requestQueue.fetchNextRequest = async () => new Request({ id: 'id', url: 'http://example.com' }); // @ts-expect-error Overriding the method for testing purposes requestQueue.markRequestHandled = async () => {}; @@ -1400,8 +1412,7 @@ describe('BasicCrawler', () => { await crawler.pushData(payload); - expect((await crawler.getData()).items) - .toEqual(payload); + expect((await crawler.getData()).items).toEqual(payload); }); test('export data', async () => { @@ -1418,20 +1429,22 @@ describe('BasicCrawler', () => { const csv = await readFile(`${tmpDir}/result.csv`); expect(csv.toString()).toBe('foo,baz\nbar,123\nbar,123\nbar,123\n'); const json = await readFile(`${tmpDir}/result.json`); - expect(json.toString()).toBe('[\n' - + ' {\n' - + ' "foo": "bar",\n' - + ' "baz": 123\n' - + ' },\n' - + ' {\n' - + ' "foo": "bar",\n' - + ' "baz": 123\n' - + ' },\n' - + ' {\n' - + ' "foo": "bar",\n' - + ' "baz": 123\n' - + ' }\n' - + ']\n'); + expect(json.toString()).toBe( + '[\n' + + ' {\n' + + ' "foo": "bar",\n' + + ' "baz": 123\n' + + ' },\n' + + ' {\n' + + ' "foo": "bar",\n' + + ' "baz": 123\n' + + ' },\n' + + ' {\n' + + ' "foo": "bar",\n' + + ' "baz": 123\n' + + ' }\n' + + ']\n', + ); await rm(`${tmpDir}/result.csv`); await rm(`${tmpDir}/result.json`); @@ -1442,12 +1455,13 @@ describe('BasicCrawler', () => { requestHandler: async ({ pushData }) => pushData(payload), }); - await crawler.run([{ - url: `http://${HOSTNAME}:${port}`, - }]); + await crawler.run([ + { + url: `http://${HOSTNAME}:${port}`, + }, + ]); - expect((await crawler.getData()).items) - .toEqual(payload); + expect((await crawler.getData()).items).toEqual(payload); }); test("Crawlers with different Configurations don't share 
Datasets", async () => { @@ -1457,16 +1471,20 @@ describe('BasicCrawler', () => { await crawlerA.pushData(getPayload('A')); await crawlerB.pushData(getPayload('B')); - expect((await crawlerA.getData()).items) - .toEqual(getPayload('A')); + expect((await crawlerA.getData()).items).toEqual(getPayload('A')); - expect((await crawlerB.getData()).items) - .toEqual(getPayload('B')); + expect((await crawlerB.getData()).items).toEqual(getPayload('B')); }); test('Crawlers with different Configurations run separately', async () => { - const crawlerA = new BasicCrawler({ requestHandler: () => {} }, new Configuration({ persistStorage: false })); - const crawlerB = new BasicCrawler({ requestHandler: () => {} }, new Configuration({ persistStorage: false })); + const crawlerA = new BasicCrawler( + { requestHandler: () => {} }, + new Configuration({ persistStorage: false }), + ); + const crawlerB = new BasicCrawler( + { requestHandler: () => {} }, + new Configuration({ persistStorage: false }), + ); await crawlerA.run([{ url: `http://${HOSTNAME}:${port}` }]); await crawlerB.run([{ url: `http://${HOSTNAME}:${port}` }]); diff --git a/test/core/crawlers/browser_crawler.test.ts b/test/core/crawlers/browser_crawler.test.ts index c72c0d7bee2a..3ec2540db2d8 100644 --- a/test/core/crawlers/browser_crawler.test.ts +++ b/test/core/crawlers/browser_crawler.test.ts @@ -4,11 +4,7 @@ import { ENV_VARS } from '@apify/consts'; import log from '@apify/log'; import { BROWSER_POOL_EVENTS, BrowserPool, OperatingSystemsName, PuppeteerPlugin } from '@crawlee/browser-pool'; import { BLOCKED_STATUS_CODES } from '@crawlee/core'; -import type { - PuppeteerCrawlingContext, - PuppeteerGoToOptions, - PuppeteerRequestHandler, -} from '@crawlee/puppeteer'; +import type { PuppeteerCrawlingContext, PuppeteerGoToOptions, PuppeteerRequestHandler } from '@crawlee/puppeteer'; import { AutoscaledPool, EnqueueStrategy, @@ -110,9 +106,7 @@ describe('BrowserCrawler', () => { test('should teardown browser pool', async () => { const requestList = await RequestList.open({ - sources: [ - { url: 'http://example.com/?q=1' }, - ], + sources: [{ url: 'http://example.com/?q=1' }], }); const browserCrawler = new BrowserCrawlerTest({ browserPoolOptions: { @@ -131,19 +125,19 @@ describe('BrowserCrawler', () => { test('should retire session after TimeoutError', async () => { const requestList = await RequestList.open({ - sources: [ - { url: 'http://example.com/?q=1' }, - ], + sources: [{ url: 'http://example.com/?q=1' }], }); class TimeoutError extends Error {} let sessionGoto: Session; - const browserCrawler = new class extends BrowserCrawlerTest { - protected override async _navigationHandler(ctx: PuppeteerCrawlingContext): Promise { + const browserCrawler = new (class extends BrowserCrawlerTest { + protected override async _navigationHandler( + ctx: PuppeteerCrawlingContext, + ): Promise { vitest.spyOn(ctx.session, 'markBad'); sessionGoto = ctx.session; throw new TimeoutError(); } - }({ + })({ browserPoolOptions: { browserPlugins: [puppeteerPlugin], }, @@ -159,19 +153,20 @@ describe('BrowserCrawler', () => { test('should evaluate preNavigationHooks', async () => { const requestList = await RequestList.open({ - sources: [ - { url: 'http://example.com/?q=1' }, - ], + sources: [{ url: 'http://example.com/?q=1' }], }); let isEvaluated = false; - const browserCrawler = new class extends BrowserCrawlerTest { + const browserCrawler = new (class extends BrowserCrawlerTest { // eslint-disable-next-line max-len - protected override async _navigationHandler(ctx: 
PuppeteerCrawlingContext, gotoOptions: PuppeteerGoToOptions): Promise { + protected override async _navigationHandler( + ctx: PuppeteerCrawlingContext, + gotoOptions: PuppeteerGoToOptions, + ): Promise { isEvaluated = ctx.hookFinished as boolean; return ctx.page.goto(ctx.request.url, gotoOptions); } - }({ + })({ browserPoolOptions: { browserPlugins: [puppeteerPlugin], }, @@ -194,9 +189,7 @@ describe('BrowserCrawler', () => { test('should evaluate postNavigationHooks', async () => { const requestList = await RequestList.open({ - sources: [ - { url: `${serverAddress}/?q=1` }, - ], + sources: [{ url: `${serverAddress}/?q=1` }], }); let isEvaluated = false; @@ -225,9 +218,7 @@ describe('BrowserCrawler', () => { test('errorHandler has open page', async () => { const requestList = await RequestList.open({ - sources: [ - { url: `${serverAddress}/?q=1` }, - ], + sources: [{ url: `${serverAddress}/?q=1` }], }); const result: string[] = []; @@ -253,9 +244,7 @@ describe('BrowserCrawler', () => { }); test('should correctly track request.state', async () => { - const sources = [ - { url: `${serverAddress}/?q=1` }, - ]; + const sources = [{ url: `${serverAddress}/?q=1` }]; const requestList = await RequestList.open(null, sources); const requestStates: RequestState[] = []; @@ -299,18 +288,19 @@ describe('BrowserCrawler', () => { test('should allow modifying gotoOptions by pre navigation hooks', async () => { const requestList = await RequestList.open({ - sources: [ - { url: `${serverAddress}/?q=1` }, - ], + sources: [{ url: `${serverAddress}/?q=1` }], }); let optionsGoto: PuppeteerGoToOptions; - const browserCrawler = new class extends BrowserCrawlerTest { + const browserCrawler = new (class extends BrowserCrawlerTest { // eslint-disable-next-line max-len - protected override async _navigationHandler(ctx: PuppeteerCrawlingContext, gotoOptions: PuppeteerGoToOptions): Promise { + protected override async _navigationHandler( + ctx: PuppeteerCrawlingContext, + gotoOptions: PuppeteerGoToOptions, + ): Promise { optionsGoto = gotoOptions; return ctx.page.goto(ctx.request.url, gotoOptions); } - }({ + })({ browserPoolOptions: { browserPlugins: [puppeteerPlugin], }, @@ -333,9 +323,7 @@ describe('BrowserCrawler', () => { test('should ignore errors in Page.close()', async () => { for (let i = 0; i < 2; i++) { const requestList = await RequestList.open({ - sources: [ - { url: `${serverAddress}/?q=1` }, - ], + sources: [{ url: `${serverAddress}/?q=1` }], }); let failedCalled = false; @@ -365,9 +353,7 @@ describe('BrowserCrawler', () => { test('should respect the requestHandlerTimeoutSecs option', async () => { const requestList = await RequestList.open({ - sources: [ - { url: `${serverAddress}/?q=1` }, - ], + sources: [{ url: `${serverAddress}/?q=1` }], }); const callSpy = vitest.fn(); @@ -393,9 +379,7 @@ describe('BrowserCrawler', () => { test('should not throw without SessionPool', async () => { const requestList = await RequestList.open({ - sources: [ - { url: 'http://example.com/?q=1' }, - ], + sources: [{ url: 'http://example.com/?q=1' }], }); const browserCrawler = new BrowserCrawlerTest({ browserPoolOptions: { @@ -404,7 +388,6 @@ describe('BrowserCrawler', () => { requestList, useSessionPool: false, requestHandler: async () => {}, - }); expect(browserCrawler).toBeDefined(); @@ -412,9 +395,7 @@ describe('BrowserCrawler', () => { test('should correctly set session pool options', async () => { const requestList = await RequestList.open({ - sources: [ - { url: 'http://example.com/?q=1' }, - ], + sources: [{ url: 
'http://example.com/?q=1' }], }); const crawler = new BrowserCrawlerTest({ @@ -466,7 +447,12 @@ describe('BrowserCrawler', () => { }, preNavigationHooks: [ async ({ session, page }) => { - await page.setCookie({ name: 'TEST', value: '12321312312', domain: 'example.com', expires: Date.now() + 100000 }); + await page.setCookie({ + name: 'TEST', + value: '12321312312', + domain: 'example.com', + expires: Date.now() + 100000, + }); goToPageSessions.push(session); }, ], @@ -645,9 +631,7 @@ describe('BrowserCrawler', () => { test('should retire browser with session', async () => { const requestList = await RequestList.open({ - sources: [ - { url: 'http://example.com/?q=1' }, - ], + sources: [{ url: 'http://example.com/?q=1' }], }); let resolve: (value?: unknown) => void; @@ -655,8 +639,10 @@ describe('BrowserCrawler', () => { resolve = r; }); let called = false; - const browserCrawler = new class extends BrowserCrawlerTest { - protected override async _navigationHandler(ctx: PuppeteerCrawlingContext): Promise { + const browserCrawler = new (class extends BrowserCrawlerTest { + protected override async _navigationHandler( + ctx: PuppeteerCrawlingContext, + ): Promise { ctx.crawler.browserPool.on(BROWSER_POOL_EVENTS.BROWSER_RETIRED, () => { resolve(); called = true; @@ -664,7 +650,7 @@ describe('BrowserCrawler', () => { ctx.session.retire(); return ctx.page.goto(ctx.request.url); } - }({ + })({ browserPoolOptions: { browserPlugins: [puppeteerPlugin], }, @@ -683,9 +669,7 @@ describe('BrowserCrawler', () => { test('should allow using fingerprints from browser pool', async () => { const requestList = await RequestList.open({ - sources: [ - { url: `${serverAddress}/?q=1` }, - ], + sources: [{ url: `${serverAddress}/?q=1` }], }); const browserCrawler = new BrowserCrawlerTest({ browserPoolOptions: { @@ -823,8 +807,7 @@ describe('BrowserCrawler', () => { retireBrowserAfterPageCount: 1, }, requestList, - requestHandler: async () => { - }, + requestHandler: async () => {}, proxyConfiguration, maxRequestRetries: 0, maxConcurrency: 1, @@ -847,11 +830,15 @@ describe('BrowserCrawler', () => { test('proxy rotation on error works as expected', async () => { const goodProxyUrl = 'http://good.proxy'; - const proxyConfiguration = new ProxyConfiguration({ proxyUrls: ['http://localhost', 'http://localhost:1234', goodProxyUrl] }); + const proxyConfiguration = new ProxyConfiguration({ + proxyUrls: ['http://localhost', 'http://localhost:1234', goodProxyUrl], + }); const requestHandler = vitest.fn(); - const browserCrawler = new class extends BrowserCrawlerTest { - protected override async _navigationHandler(ctx: PuppeteerCrawlingContext): Promise { + const browserCrawler = new (class extends BrowserCrawlerTest { + protected override async _navigationHandler( + ctx: PuppeteerCrawlingContext, + ): Promise { const { session } = ctx; const proxyInfo = await this.proxyConfiguration.newProxyInfo(session?.id); @@ -861,7 +848,7 @@ describe('BrowserCrawler', () => { return null; } - }({ + })({ browserPoolOptions: { browserPlugins: [puppeteerPlugin], }, @@ -878,15 +865,19 @@ describe('BrowserCrawler', () => { }); test('proxy rotation on error respects maxSessionRotations, calls failedRequestHandler', async () => { - const proxyConfiguration = new ProxyConfiguration({ proxyUrls: ['http://localhost', 'http://localhost:1234'] }); + const proxyConfiguration = new ProxyConfiguration({ + proxyUrls: ['http://localhost', 'http://localhost:1234'], + }); const failedRequestHandler = vitest.fn(); /** * The first increment is the base case 
when the proxy is retrieved for the first time. */ let numberOfRotations = -requestList.length(); - const browserCrawler = new class extends BrowserCrawlerTest { - protected override async _navigationHandler(ctx: PuppeteerCrawlingContext): Promise { + const browserCrawler = new (class extends BrowserCrawlerTest { + protected override async _navigationHandler( + ctx: PuppeteerCrawlingContext, + ): Promise { const { session } = ctx; const proxyInfo = await this.proxyConfiguration.newProxyInfo(session?.id); @@ -898,7 +889,7 @@ describe('BrowserCrawler', () => { return null; } - }({ + })({ browserPoolOptions: { browserPlugins: [puppeteerPlugin], }, @@ -918,10 +909,13 @@ describe('BrowserCrawler', () => { test('proxy rotation logs the original proxy error', async () => { const proxyConfiguration = new ProxyConfiguration({ proxyUrls: ['http://localhost:1234'] }); - const proxyError = 'Proxy responded with 400 - Bad request. Also, this error message contains some useful payload.'; + const proxyError = + 'Proxy responded with 400 - Bad request. Also, this error message contains some useful payload.'; - const crawler = new class extends BrowserCrawlerTest { - protected override async _navigationHandler(ctx: PuppeteerCrawlingContext): Promise { + const crawler = new (class extends BrowserCrawlerTest { + protected override async _navigationHandler( + ctx: PuppeteerCrawlingContext, + ): Promise { const { session } = ctx; const proxyInfo = await this.proxyConfiguration.newProxyInfo(session?.id); @@ -931,7 +925,7 @@ describe('BrowserCrawler', () => { return null; } - }({ + })({ browserPoolOptions: { browserPlugins: [puppeteerPlugin], }, @@ -948,7 +942,9 @@ describe('BrowserCrawler', () => { expect(spy).toBeCalled(); // eslint-disable-next-line max-len - expect(spy.mock.calls[0][0]).toEqual('When using RequestList and RequestQueue at the same time, you should instantiate both explicitly and provide them in the crawler options, to ensure correctly handled restarts of the crawler.'); + expect(spy.mock.calls[0][0]).toEqual( + 'When using RequestList and RequestQueue at the same time, you should instantiate both explicitly and provide them in the crawler options, to ensure correctly handled restarts of the crawler.', + ); expect(spy.mock.calls[1][0]).toEqual(expect.stringContaining(proxyError)); }); }); @@ -997,7 +993,7 @@ describe('BrowserCrawler', () => { expect(crawlingContext.session).toBeInstanceOf(Session); expect(typeof crawlingContext.page).toBe('object'); expect(crawlingContext.crawler).toBeInstanceOf(BrowserCrawlerTest); - expect((crawlingContext.crawler).browserPool).toBeInstanceOf(BrowserPool); + expect(crawlingContext.crawler.browserPool).toBeInstanceOf(BrowserPool); expect(crawlingContext.hasOwnProperty('response')).toBe(true); expect(crawlingContext.error).toBeInstanceOf(Error); diff --git a/test/core/crawlers/cheerio_crawler.test.ts b/test/core/crawlers/cheerio_crawler.test.ts index aa0dc169e467..43158e78f66e 100644 --- a/test/core/crawlers/cheerio_crawler.test.ts +++ b/test/core/crawlers/cheerio_crawler.test.ts @@ -2,12 +2,7 @@ import type { IncomingHttpHeaders, Server } from 'http'; import { Readable } from 'stream'; import log, { Log, LogLevel } from '@apify/log'; -import type { - CheerioRequestHandler, - CheerioCrawlingContext, - ProxyInfo, - Source, -} from '@crawlee/cheerio'; +import type { CheerioRequestHandler, CheerioCrawlingContext, ProxyInfo, Source } from '@crawlee/cheerio'; import { AutoscaledPool, CheerioCrawler, @@ -197,13 +192,13 @@ describe('CheerioCrawler', () => { }); // 
eslint-disable-next-line max-len - await expect(cheerioCrawler.run()).rejects.toThrow("Route not found for label 'undefined'. You must set up a route for this label or a default route. Use `requestHandler`, `router.addHandler` or `router.addDefaultHandler`."); + await expect(cheerioCrawler.run()).rejects.toThrow( + "Route not found for label 'undefined'. You must set up a route for this label or a default route. Use `requestHandler`, `router.addHandler` or `router.addDefaultHandler`.", + ); }); test('should ignore ssl by default', async () => { - const sources = [ - { url: 'http://example.com/?q=1' }, - ]; + const sources = [{ url: 'http://example.com/?q=1' }]; const requestList = await RequestList.open(null, sources); const requestHandler = () => {}; @@ -366,10 +361,7 @@ describe('CheerioCrawler', () => { describe('should not timeout by the default httpRequest timeoutSecs', () => { it('when navigationTimeoutSecs is greater than 30', async () => { - const sources = [ - { url: `${serverAddress}/timeout?a=12` }, - { url: `${serverAddress}/timeout?a=23` }, - ]; + const sources = [{ url: `${serverAddress}/timeout?a=12` }, { url: `${serverAddress}/timeout?a=23` }]; const processed: Request[] = []; const failed: Request[] = []; const requestList = await RequestList.open(null, sources); @@ -454,10 +446,12 @@ describe('CheerioCrawler', () => { expect(handlePageInvocationCount).toBe(0); expect(errorMessages).toHaveLength(4); - errorMessages.forEach((msg) => expect(msg).toMatch( - ' Content-Type text/plain, but only text/html, text/xml, application/xhtml+xml, application/xml, application/json are allowed.' - + ' Skipping resource.', - )); + errorMessages.forEach((msg) => + expect(msg).toMatch( + ' Content-Type text/plain, but only text/html, text/xml, application/xhtml+xml, application/xml, application/json are allowed.' + + ' Skipping resource.', + ), + ); }); test('when statusCode >= 500 and text/html is received', async () => { @@ -525,7 +519,9 @@ describe('CheerioCrawler', () => { expect(handlePageInvocationCount).toBe(0); expect(errorMessages).toHaveLength(4); errorMessages.forEach((msg) => { - expect(msg).toMatch('is not available in the format requested by the Accept header. Skipping resource.'); + expect(msg).toMatch( + 'is not available in the format requested by the Accept header. 
Skipping resource.', + ); }); }); }); @@ -769,10 +765,12 @@ describe('CheerioCrawler', () => { test('proxy rotation on error works as expected', async () => { const goodProxyUrl = 'http://good.proxy'; - const proxyConfiguration = new ProxyConfiguration({ proxyUrls: ['http://localhost', 'http://localhost:1234', goodProxyUrl] }); + const proxyConfiguration = new ProxyConfiguration({ + proxyUrls: ['http://localhost', 'http://localhost:1234', goodProxyUrl], + }); const check = vitest.fn(); - const crawler = new class extends CheerioCrawler { + const crawler = new (class extends CheerioCrawler { protected override async _requestFunction(...args: any[]): Promise { check(...args); @@ -782,7 +780,7 @@ describe('CheerioCrawler', () => { throw new Error('Proxy responded with 400 - Bad request'); } - }({ + })({ maxSessionRotations: 2, maxConcurrency: 1, useSessionPool: true, @@ -795,7 +793,9 @@ describe('CheerioCrawler', () => { }); test('proxy rotation on error respects maxSessionRotations, calls failedRequestHandler', async () => { - const proxyConfiguration = new ProxyConfiguration({ proxyUrls: ['http://localhost', 'http://localhost:1234'] }); + const proxyConfiguration = new ProxyConfiguration({ + proxyUrls: ['http://localhost', 'http://localhost:1234'], + }); /** * The first increment is the base case when the proxy is retrieved for the first time. @@ -826,7 +826,8 @@ describe('CheerioCrawler', () => { test('proxy rotation logs the original proxy error', async () => { const proxyConfiguration = new ProxyConfiguration({ proxyUrls: ['http://localhost:1234'] }); - const proxyError = 'Proxy responded with 400 - Bad request. Also, this error message contains some useful payload.'; + const proxyError = + 'Proxy responded with 400 - Bad request. Also, this error message contains some useful payload.'; const crawler = new CheerioCrawler({ proxyConfiguration, @@ -962,7 +963,9 @@ describe('CheerioCrawler', () => { expect(failed.length).toBe(4); failed.forEach((request) => { - expect(request.errorMessages[0].includes(`Request blocked - received ${code} status code`)).toBeTruthy(); + expect( + request.errorMessages[0].includes(`Request blocked - received ${code} status code`), + ).toBeTruthy(); }); } }); @@ -975,11 +978,12 @@ describe('CheerioCrawler', () => { useSessionPool: false, persistCookiesPerSession: true, maxRequestRetries: 0, - requestHandler: () => { - }, + requestHandler: () => {}, }); } catch (e) { - expect((e as Error).message).toEqual('You cannot use "persistCookiesPerSession" without "useSessionPool" set to true.'); + expect((e as Error).message).toEqual( + 'You cannot use "persistCookiesPerSession" without "useSessionPool" set to true.', + ); } }); @@ -987,10 +991,13 @@ describe('CheerioCrawler', () => { const cookie = 'SESSID=abcd123'; const requests: Request[] = []; const crawler = new CheerioCrawler({ - requestList: await getRequestListForMock({ - headers: { 'set-cookie': cookie, 'content-type': 'text/html' }, - statusCode: 200, - }, '/getRawHeaders'), + requestList: await getRequestListForMock( + { + headers: { 'set-cookie': cookie, 'content-type': 'text/html' }, + statusCode: 200, + }, + '/getRawHeaders', + ), useSessionPool: true, persistCookiesPerSession: true, sessionPoolOptions: { @@ -1001,7 +1008,6 @@ describe('CheerioCrawler', () => { requestHandler: ({ request }) => { requests.push(request); }, - }); await crawler.run(); @@ -1018,10 +1024,12 @@ describe('CheerioCrawler', () => { const responses: unknown[] = []; const gotOptions: OptionsInit[] = []; const crawler = new 
CheerioCrawler({ - requestList: await RequestList.open(null, [{ - url: `${serverAddress}/special/headers`, - headers: { cookie: 'foo=bar2; baz=123' }, - }]), + requestList: await RequestList.open(null, [ + { + url: `${serverAddress}/special/headers`, + headers: { cookie: 'foo=bar2; baz=123' }, + }, + ]), useSessionPool: true, persistCookiesPerSession: false, sessionPoolOptions: { @@ -1030,9 +1038,11 @@ describe('CheerioCrawler', () => { requestHandler: ({ json }) => { responses.push(json); }, - preNavigationHooks: [(_context, options) => { - gotOptions.push(options); - }], + preNavigationHooks: [ + (_context, options) => { + gotOptions.push(options); + }, + ], }); const sessSpy = vitest.spyOn(Session.prototype, 'getCookieString'); @@ -1055,10 +1065,12 @@ describe('CheerioCrawler', () => { test('should work with cookies adjusted on `context.request` in pre-nav hook', async () => { const responses: unknown[] = []; const crawler = new CheerioCrawler({ - requestList: await RequestList.open(null, [{ - url: `${serverAddress}/special/headers`, - headers: { cookie: 'foo=bar2; baz=123' }, - }]), + requestList: await RequestList.open(null, [ + { + url: `${serverAddress}/special/headers`, + headers: { cookie: 'foo=bar2; baz=123' }, + }, + ]), useSessionPool: true, persistCookiesPerSession: false, sessionPoolOptions: { @@ -1067,9 +1079,11 @@ describe('CheerioCrawler', () => { requestHandler: ({ json }) => { responses.push(json); }, - preNavigationHooks: [({ request }) => { - request.headers.Cookie = 'foo=override; coo=kie'; - }], + preNavigationHooks: [ + ({ request }) => { + request.headers.Cookie = 'foo=override; coo=kie'; + }, + ], }); await crawler.run(); @@ -1085,17 +1099,21 @@ describe('CheerioCrawler', () => { const requests: Request[] = []; const responses: unknown[] = []; const crawler = new CheerioCrawler({ - requestList: await RequestList.open(null, [{ - url: `${serverAddress}/special/headers`, - }]), + requestList: await RequestList.open(null, [ + { + url: `${serverAddress}/special/headers`, + }, + ]), useSessionPool: true, requestHandler: async ({ json, request }) => { responses.push(json); requests.push(request); }, - preNavigationHooks: [({ request }) => { - request.headers.Cookie = 'foo=override; coo=kie'; - }], + preNavigationHooks: [ + ({ request }) => { + request.headers.Cookie = 'foo=override; coo=kie'; + }, + ], }); await crawler.run(); @@ -1126,8 +1144,12 @@ describe('CheerioCrawler', () => { ]); expect(cookie2).toBe('Foo=bar1; other=cookie1; coo=kie; foo=bar3; baz=123; Other=cookie2'); expect(deprecatedSpy).toBeCalledTimes(3); - expect(deprecatedSpy).toBeCalledWith(`Found cookies with similar name during cookie merging: 'foo' and 'Foo'`); - expect(deprecatedSpy).toBeCalledWith(`Found cookies with similar name during cookie merging: 'Other' and 'other'`); + expect(deprecatedSpy).toBeCalledWith( + `Found cookies with similar name during cookie merging: 'foo' and 'Foo'`, + ); + expect(deprecatedSpy).toBeCalledWith( + `Found cookies with similar name during cookie merging: 'Other' and 'other'`, + ); deprecatedSpy.mockClear(); const cookie3 = mergeCookies('https://example.com', [ @@ -1137,14 +1159,16 @@ describe('CheerioCrawler', () => { ]); expect(cookie3).toBe('foo=bar2; Other=cookie2; Coo=kie; baz=123; Foo=bar3; coo=kee'); expect(deprecatedSpy).toBeCalledTimes(2); - expect(deprecatedSpy).toBeCalledWith(`Found cookies with similar name during cookie merging: 'Foo' and 'foo'`); - expect(deprecatedSpy).toBeCalledWith(`Found cookies with similar name during cookie merging: 'coo' and 
'Coo'`);
+        expect(deprecatedSpy).toBeCalledWith(
+            `Found cookies with similar name during cookie merging: 'Foo' and 'foo'`,
+        );
+        expect(deprecatedSpy).toBeCalledWith(
+            `Found cookies with similar name during cookie merging: 'coo' and 'Coo'`,
+        );
     });
 
     test('should use sessionId in proxyUrl when the session pool is enabled', async () => {
-        const sourcesNew = [
-            { url: 'http://example.com/?q=1' },
-        ];
+        const sourcesNew = [{ url: 'http://example.com/?q=1' }];
 
         const requestListNew = await RequestList.open({ sources: sourcesNew });
         let usedSession: Session;
@@ -1262,10 +1286,8 @@ describe('CheerioCrawler', () => {
         const cheerioCrawler = new CheerioCrawler({
             requestList,
             maxRequestRetries: 0,
-            requestHandler: () => {
-            },
-            failedRequestHandler: () => {
-            },
+            requestHandler: () => {},
+            failedRequestHandler: () => {},
         });
         expect(
             // @ts-expect-error Validating JS side checks
@@ -1283,10 +1305,9 @@ describe('CheerioCrawler', () => {
             requestHandler: () => {},
             failedRequestHandler: () => {},
         });
-        expect(
-            () => cheerioCrawler.use(extension),
-        )
-            .toThrow('DummyExtension tries to set property "doesNotExist" that is not configurable on CheerioCrawler instance.');
+        expect(() => cheerioCrawler.use(extension)).toThrow(
+            'DummyExtension tries to set property "doesNotExist" that is not configurable on CheerioCrawler instance.',
+        );
     });
 
     test('should override crawler properties', () => {
@@ -1298,10 +1319,8 @@ describe('CheerioCrawler', () => {
             requestList,
             useSessionPool: false,
             maxRequestRetries: 0,
-            requestHandler: () => {
-            },
-            failedRequestHandler: () => {
-            },
+            requestHandler: () => {},
+            failedRequestHandler: () => {},
         });
         // @ts-expect-error Accessing private prop
         expect(cheerioCrawler.useSessionPool).toEqual(false);
diff --git a/test/core/crawlers/crawler_extension.test.ts b/test/core/crawlers/crawler_extension.test.ts
index ab7e082d9595..6949953d13b5 100644
--- a/test/core/crawlers/crawler_extension.test.ts
+++ b/test/core/crawlers/crawler_extension.test.ts
@@ -2,10 +2,12 @@ import { CrawlerExtension } from '@crawlee/core';
 
 describe('CrawlerExtension', () => {
     test('should work', () => {
-        class MyExtension extends CrawlerExtension { }
+        class MyExtension extends CrawlerExtension {}
         const myExtension = new MyExtension();
         expect(myExtension.name).toEqual('MyExtension');
-        expect(() => myExtension.getCrawlerOptions()).toThrow(`${myExtension.name} has not implemented "getCrawlerOptions" method.`);
+        expect(() => myExtension.getCrawlerOptions()).toThrow(
+            `${myExtension.name} has not implemented "getCrawlerOptions" method.`,
+        );
         expect(myExtension.log.info).toBeDefined();
         // @ts-expect-error Accessing private prop
         expect(myExtension.log.options.prefix).toEqual('MyExtension');
diff --git a/test/core/crawlers/dom_crawler.test.ts b/test/core/crawlers/dom_crawler.test.ts
index 2b7e665c1859..1549ce9c5ffb 100644
--- a/test/core/crawlers/dom_crawler.test.ts
+++ b/test/core/crawlers/dom_crawler.test.ts
@@ -23,10 +23,12 @@ beforeAll(async () => {
         }
     });
 
-    await new Promise<void>((resolve) => server.listen(() => {
-        url = `http://127.0.0.1:${(server.address() as AddressInfo).port}`;
-        resolve();
-    }));
+    await new Promise<void>((resolve) =>
+        server.listen(() => {
+            url = `http://127.0.0.1:${(server.address() as AddressInfo).port}`;
+            resolve();
+        }),
+    );
 });
 
 afterAll(async (cb) => {
@@ -55,8 +57,5 @@ test('works', async () => {
 
     await crawler.run([url]);
 
-    expect(results).toStrictEqual([
-        'Example Domain',
-        'Hello, world!',
-    ]);
+    expect(results).toStrictEqual(['Example Domain', 'Hello, world!']);
}); diff --git a/test/core/crawlers/http_crawler.test.ts b/test/core/crawlers/http_crawler.test.ts index b8678713e0d7..513c8bbf3281 100644 --- a/test/core/crawlers/http_crawler.test.ts +++ b/test/core/crawlers/http_crawler.test.ts @@ -71,10 +71,12 @@ beforeAll(async () => { } }); - await new Promise((resolve) => server.listen(() => { - url = `http://127.0.0.1:${(server.address() as AddressInfo).port}`; - resolve(); - })); + await new Promise((resolve) => + server.listen(() => { + url = `http://127.0.0.1:${(server.address() as AddressInfo).port}`; + resolve(); + }), + ); }); afterAll(async () => { @@ -218,9 +220,7 @@ test('handles cookies from redirects', async () => { await crawler.run([`${url}/redirectAndCookies`]); - expect(results).toStrictEqual([ - 'foo=bar', - ]); + expect(results).toStrictEqual(['foo=bar']); }); test('handles cookies from redirects - no empty cookie header', async () => { @@ -337,10 +337,12 @@ test('should work with delete requests', async () => { }, }); - await cheerioCrawler.run([{ - url: `${url}`, - method: 'DELETE', - }]); + await cheerioCrawler.run([ + { + url: `${url}`, + method: 'DELETE', + }, + ]); expect(failed).toHaveLength(0); }); @@ -351,15 +353,17 @@ test('should retry on 403 even with disallowed content-type', async () => { const crawler = new HttpCrawler({ maxConcurrency: 1, maxRequestRetries: 1, - preNavigationHooks: [async ({ request }) => { - // mock 403 response with octet stream on first request attempt, but not on - // subsequent retries, so the request should eventually succeed - if (request.retryCount === 0) { - request.url = `${url}/403-with-octet-stream`; - } else { - request.url = url; - } - }], + preNavigationHooks: [ + async ({ request }) => { + // mock 403 response with octet stream on first request attempt, but not on + // subsequent retries, so the request should eventually succeed + if (request.retryCount === 0) { + request.url = `${url}/403-with-octet-stream`; + } else { + request.url = url; + } + }, + ], requestHandler: async ({ request }) => { succeeded.push(request); }, diff --git a/test/core/crawlers/playwright_crawler.test.ts b/test/core/crawlers/playwright_crawler.test.ts index 5d2c5282b3f9..5ba46d62a2c9 100644 --- a/test/core/crawlers/playwright_crawler.test.ts +++ b/test/core/crawlers/playwright_crawler.test.ts @@ -3,15 +3,8 @@ import type { AddressInfo } from 'net'; import os from 'os'; import log from '@apify/log'; -import type { - PlaywrightGotoOptions, - PlaywrightRequestHandler, - Request, -} from '@crawlee/playwright'; -import { - PlaywrightCrawler, - RequestList, -} from '@crawlee/playwright'; +import type { PlaywrightGotoOptions, PlaywrightRequestHandler, Request } from '@crawlee/playwright'; +import { PlaywrightCrawler, RequestList } from '@crawlee/playwright'; import express from 'express'; import playwright from 'playwright'; import { MemoryStorageEmulator } from 'test/shared/MemoryStorageEmulator'; @@ -124,11 +117,12 @@ describe('PlaywrightCrawler', () => { requestList, maxRequestRetries: 0, maxConcurrency: 1, - requestHandler: () => { - }, - preNavigationHooks: [(_context, gotoOptions) => { - options = gotoOptions; - }], + requestHandler: () => {}, + preNavigationHooks: [ + (_context, gotoOptions) => { + options = gotoOptions; + }, + ], navigationTimeoutSecs: timeoutSecs, }); diff --git a/test/core/crawlers/puppeteer_crawler.test.ts b/test/core/crawlers/puppeteer_crawler.test.ts index a8f762147f07..7de9d2aa6779 100644 --- a/test/core/crawlers/puppeteer_crawler.test.ts +++ 
b/test/core/crawlers/puppeteer_crawler.test.ts @@ -6,18 +6,8 @@ import os from 'os'; import { promisify } from 'util'; import log from '@apify/log'; -import type { - PuppeteerCrawlingContext, - PuppeteerGoToOptions, - Request, -} from '@crawlee/puppeteer'; -import { - ProxyConfiguration, - PuppeteerCrawler, - RequestList, - RequestQueue, - Session, -} from '@crawlee/puppeteer'; +import type { PuppeteerCrawlingContext, PuppeteerGoToOptions, Request } from '@crawlee/puppeteer'; +import { ProxyConfiguration, PuppeteerCrawler, RequestList, RequestQueue, Session } from '@crawlee/puppeteer'; import type { Cookie } from '@crawlee/types'; import { sleep } from '@crawlee/utils'; import type { Server as ProxyChainServer } from 'proxy-chain'; @@ -106,7 +96,12 @@ describe('PuppeteerCrawler', () => { asserts.push(response.status() === 200); request.userData.title = await page.title(); processed.push(request); - asserts.push(!response.request().headers()['user-agent'].match(/headless/i)); + asserts.push( + !response + .request() + .headers() + ['user-agent'].match(/headless/i), + ); asserts.push(!(await page.evaluate(() => window.navigator.webdriver))); }; @@ -145,9 +140,11 @@ describe('PuppeteerCrawler', () => { maxRequestRetries: 0, maxConcurrency: 1, requestHandler: () => {}, - preNavigationHooks: [(_context, gotoOptions) => { - options = gotoOptions; - }], + preNavigationHooks: [ + (_context, gotoOptions) => { + options = gotoOptions; + }, + ], navigationTimeoutSecs: timeoutSecs, }); @@ -157,7 +154,8 @@ describe('PuppeteerCrawler', () => { test('should throw if launchOptions.proxyUrl is supplied', async () => { try { - new PuppeteerCrawler({ //eslint-disable-line + new PuppeteerCrawler({ + //eslint-disable-line requestList, maxRequestRetries: 0, maxConcurrency: 1, @@ -167,7 +165,9 @@ describe('PuppeteerCrawler', () => { requestHandler: () => {}, }); } catch (e) { - expect((e as Error).message).toMatch('PuppeteerCrawlerOptions.launchContext.proxyUrl is not allowed in PuppeteerCrawler.'); + expect((e as Error).message).toMatch( + 'PuppeteerCrawlerOptions.launchContext.proxyUrl is not allowed in PuppeteerCrawler.', + ); } expect.hasAssertions(); @@ -250,26 +250,14 @@ describe('PuppeteerCrawler', () => { expect(requestHandler).not.toBeCalled(); const warnings = logWarningSpy.mock.calls.map((call) => [call[0].split('\n')[0], call[1].retryCount]); expect(warnings).toEqual([ - [ - 'Reclaiming failed request back to the list or queue. Navigation timed out after 0.005 seconds.', - 1, - ], - [ - 'Reclaiming failed request back to the list or queue. Navigation timed out after 0.005 seconds.', - 2, - ], - [ - 'Reclaiming failed request back to the list or queue. Navigation timed out after 0.005 seconds.', - 3, - ], + ['Reclaiming failed request back to the list or queue. Navigation timed out after 0.005 seconds.', 1], + ['Reclaiming failed request back to the list or queue. Navigation timed out after 0.005 seconds.', 2], + ['Reclaiming failed request back to the list or queue. Navigation timed out after 0.005 seconds.', 3], ]); const errors = logErrorSpy.mock.calls.map((call) => [call[0], call[1].retryCount]); expect(errors).toEqual([ - [ - 'Request failed and reached maximum retries. Navigation timed out after 0.005 seconds.', - undefined, - ], + ['Request failed and reached maximum retries. 
Navigation timed out after 0.005 seconds.', undefined], ]); }); @@ -307,26 +295,14 @@ describe('PuppeteerCrawler', () => { expect(requestHandler).not.toBeCalled(); const warnings = logWarningSpy.mock.calls.map((call) => [call[0].split('\n')[0], call[1].retryCount]); expect(warnings).toEqual([ - [ - 'Reclaiming failed request back to the list or queue. Navigation timed out after 0.005 seconds.', - 1, - ], - [ - 'Reclaiming failed request back to the list or queue. Navigation timed out after 0.005 seconds.', - 2, - ], - [ - 'Reclaiming failed request back to the list or queue. Navigation timed out after 0.005 seconds.', - 3, - ], + ['Reclaiming failed request back to the list or queue. Navigation timed out after 0.005 seconds.', 1], + ['Reclaiming failed request back to the list or queue. Navigation timed out after 0.005 seconds.', 2], + ['Reclaiming failed request back to the list or queue. Navigation timed out after 0.005 seconds.', 3], ]); const errors = logErrorSpy.mock.calls.map((call) => [call[0], call[1].retryCount]); expect(errors).toEqual([ - [ - 'Request failed and reached maximum retries. Navigation timed out after 0.005 seconds.', - undefined, - ], + ['Request failed and reached maximum retries. Navigation timed out after 0.005 seconds.', undefined], ]); }); diff --git a/test/core/enqueue_links/click_elements.test.ts b/test/core/enqueue_links/click_elements.test.ts index c55798a27fd6..07f407d46b15 100644 --- a/test/core/enqueue_links/click_elements.test.ts +++ b/test/core/enqueue_links/click_elements.test.ts @@ -53,12 +53,7 @@ const testCases = [ }, ]; -testCases.forEach(({ - caseName, - launchBrowser, - clickElements, - utils, -}) => { +testCases.forEach(({ caseName, launchBrowser, clickElements, utils }) => { describe(`${caseName}: enqueueLinksByClickingElements()`, () => { let browser: PPBrowser | PWBrowser; let server: Server; @@ -106,7 +101,7 @@ testCases.forEach(({ return request; }, waitForPageIdleSecs: 0.025, - maxWaitForPageIdleSecs: 0.250, + maxWaitForPageIdleSecs: 0.25, }); expect(enqueued).toHaveLength(1); expect(enqueued[0].url).toMatch(`${serverAddress}/`); @@ -115,7 +110,7 @@ testCases.forEach(({ }); test('accepts forefront option', async () => { - const addedRequests: {request: Source; options: RequestQueueOperationOptions}[] = []; + const addedRequests: { request: Source; options: RequestQueueOperationOptions }[] = []; const requestQueue = new RequestQueue({ id: 'xxx', client: Configuration.getStorageClient() }); requestQueue.addRequests = async (requests, options) => { addedRequests.push(...requests.map((request) => ({ request, options }))); @@ -137,7 +132,7 @@ testCases.forEach(({ requestQueue, selector: 'a', waitForPageIdleSecs: 0.025, - maxWaitForPageIdleSecs: 0.250, + maxWaitForPageIdleSecs: 0.25, forefront: true, }); expect(addedRequests).toHaveLength(2); @@ -337,10 +332,7 @@ testCases.forEach(({ expect( await page.evaluate(() => { const textarea = document.querySelector('textarea'); - return textarea.value.substring( - textarea.selectionStart, - textarea.selectionEnd, - ); + return textarea.value.substring(textarea.selectionStart, textarea.selectionEnd); }), ).toBe(text); }); @@ -366,9 +358,11 @@ testCases.forEach(({ `; await page.setContent(html); - const interceptedRequests = await clickElements.clickElementsAndInterceptNavigationRequests(getOpts({ - selector: 'a', - })); + const interceptedRequests = await clickElements.clickElementsAndInterceptNavigationRequests( + getOpts({ + selector: 'a', + }), + ); expect(interceptedRequests).toHaveLength(1); 
expect(interceptedRequests[0].url).toMatch(`${serverAddress}/`); expect(page.url()).toBe('about:blank'); @@ -503,7 +497,8 @@ testCases.forEach(({ }; (browser as PPBrowser).on('targetcreated', (target) => { counts.create++; - if ((clickElements as typeof puppeteerClickElements).isTargetRelevant(page, target)) spawnedTarget = target; + if ((clickElements as typeof puppeteerClickElements).isTargetRelevant(page, target)) + spawnedTarget = target; }); browser.on('targetdestroyed', (target) => { counts.destroy++; @@ -525,10 +520,16 @@ testCases.forEach(({ }); } - clickElements.clickElementsAndInterceptNavigationRequests(getOpts({ - waitForPageIdleMillis: 1000, - maxWaitForPageIdleMillis: 5000, - })).catch(() => { /* will throw because afterEach will close the page */ }); + clickElements + .clickElementsAndInterceptNavigationRequests( + getOpts({ + waitForPageIdleMillis: 1000, + maxWaitForPageIdleMillis: 5000, + }), + ) + .catch(() => { + /* will throw because afterEach will close the page */ + }); }); expect(callCounts.create).toBe(1); @@ -546,10 +547,12 @@ testCases.forEach(({ `; await page.setContent(html); - const interceptedRequests = await clickElements.clickElementsAndInterceptNavigationRequests(getOpts({ - waitForPageIdleMillis: 1000, - maxWaitForPageIdleMillis: 5000, - })); + const interceptedRequests = await clickElements.clickElementsAndInterceptNavigationRequests( + getOpts({ + waitForPageIdleMillis: 1000, + maxWaitForPageIdleMillis: 5000, + }), + ); await new Promise((r) => setTimeout(r, 1000)); expect(interceptedRequests).toHaveLength(1); expect(interceptedRequests[0].url).toBe(`${serverAddress}/`); diff --git a/test/core/enqueue_links/enqueue_links.test.ts b/test/core/enqueue_links/enqueue_links.test.ts index bbbecc95e8a5..420889a6d955 100644 --- a/test/core/enqueue_links/enqueue_links.test.ts +++ b/test/core/enqueue_links/enqueue_links.test.ts @@ -67,15 +67,12 @@ describe('enqueueLinks()', () => { log.setLevel(ll); }); - describe.each([ - [launchPuppeteer], - [launchPlaywright], - ] as const)('using %s', (method) => { + describe.each([[launchPuppeteer], [launchPlaywright]] as const)('using %s', (method) => { let browser: PuppeteerBrowser | PlaywrightBrowser; let page: PuppeteerPage | PlaywrightPage; beforeEach(async () => { - browser = await method({ launchOptions: { headless: true } }) as PlaywrightBrowser | PuppeteerBrowser; + browser = (await method({ launchOptions: { headless: true } })) as PlaywrightBrowser | PuppeteerBrowser; page = await browser.newPage(); await page.setContent(HTML); }); @@ -114,10 +111,7 @@ describe('enqueueLinks()', () => { test('works with globs', async () => { const { enqueued, requestQueue } = createRequestQueueMock(); - const globs = [ - 'https://example.com/**/*', - { glob: '?(http|https)://cool.com/', method: 'POST' as const }, - ]; + const globs = ['https://example.com/**/*', { glob: '?(http|https)://cool.com/', method: 'POST' as const }]; await browserCrawlerEnqueueLinks({ options: { @@ -162,12 +156,14 @@ describe('enqueueLinks()', () => { { glob: '?(http|https)://cool.com/', method: 'POST' as const }, ]; - await expect(browserCrawlerEnqueueLinks({ - options: { selector: '.click', globs }, - page, - requestQueue, - originalRequestUrl: 'https://example.com', - })).resolves.not.toThrow(); + await expect( + browserCrawlerEnqueueLinks({ + options: { selector: '.click', globs }, + page, + requestQueue, + originalRequestUrl: 'https://example.com', + }), + ).resolves.not.toThrow(); expect(enqueued).toHaveLength(3); }); @@ -232,10 +228,7 @@ 
describe('enqueueLinks()', () => { test('works with exclude glob', async () => { const { enqueued, requestQueue } = createRequestQueueMock(); - const globs = [ - 'https://example.com/**/*', - { glob: '?(http|https)://cool.com/', method: 'POST' as const }, - ]; + const globs = ['https://example.com/**/*', { glob: '?(http|https)://cool.com/', method: 'POST' as const }]; const exclude = ['**/first']; @@ -273,10 +266,7 @@ describe('enqueueLinks()', () => { test('works with exclude regexp', async () => { const { enqueued, requestQueue } = createRequestQueueMock(); - const globs = [ - 'https://example.com/**/*', - { glob: '?(http|https)://cool.com/', method: 'POST' as const }, - ]; + const globs = ['https://example.com/**/*', { glob: '?(http|https)://cool.com/', method: 'POST' as const }]; const exclude = [/first/]; @@ -353,18 +343,17 @@ describe('enqueueLinks()', () => { test('throws with RegExp pseudoUrls', async () => { const { enqueued, requestQueue } = createRequestQueueMock(); - const pseudoUrls = [ - /https:\/\/example\.com\/(\w|-|\/)*/, - /(http|https):\/\/cool\.com\//, - ]; + const pseudoUrls = [/https:\/\/example\.com\/(\w|-|\/)*/, /(http|https):\/\/cool\.com\//]; - await expect(browserCrawlerEnqueueLinks({ - // @ts-expect-error Type 'RegExp[]' is not assignable to type 'PseudoUrlInput[]' - options: { selector: '.click', pseudoUrls }, - page, - requestQueue, - originalRequestUrl: 'https://example.com', - })).rejects.toThrow(/to be of type `string` but received type `RegExp`/); + await expect( + browserCrawlerEnqueueLinks({ + // @ts-expect-error Type 'RegExp[]' is not assignable to type 'PseudoUrlInput[]' + options: { selector: '.click', pseudoUrls }, + page, + requestQueue, + originalRequestUrl: 'https://example.com', + }), + ).rejects.toThrow(/to be of type `string` but received type `RegExp`/); }); test('works with undefined pseudoUrls[]', async () => { @@ -398,12 +387,14 @@ describe('enqueueLinks()', () => { test('throws with null pseudoUrls[]', async () => { const { enqueued, requestQueue } = createRequestQueueMock(); - await expect(browserCrawlerEnqueueLinks({ - options: { selector: '.click', pseudoUrls: null }, - page, - requestQueue, - originalRequestUrl: 'https://example.com', - })).rejects.toThrow(/Expected property `pseudoUrls` to be of type `array` but received type `null`/); + await expect( + browserCrawlerEnqueueLinks({ + options: { selector: '.click', pseudoUrls: null }, + page, + requestQueue, + originalRequestUrl: 'https://example.com', + }), + ).rejects.toThrow(/Expected property `pseudoUrls` to be of type `array` but received type `null`/); }); test('works with empty pseudoUrls[]', async () => { @@ -436,18 +427,16 @@ describe('enqueueLinks()', () => { test('throws with sparse pseudoUrls[]', async () => { const { enqueued, requestQueue } = createRequestQueueMock(); - const pseudoUrls = [ - 'https://example.com/[(\\w|-|/)*]', - null, - '[http|https]://cool.com/', - ]; - - await expect(browserCrawlerEnqueueLinks({ - options: { selector: '.click', pseudoUrls }, - page, - requestQueue, - originalRequestUrl: 'https://example.com', - })).rejects.toThrow(/\(array `pseudoUrls`\) Any predicate failed with the following errors/); + const pseudoUrls = ['https://example.com/[(\\w|-|/)*]', null, '[http|https]://cool.com/']; + + await expect( + browserCrawlerEnqueueLinks({ + options: { selector: '.click', pseudoUrls }, + page, + requestQueue, + originalRequestUrl: 'https://example.com', + }), + ).rejects.toThrow(/\(array `pseudoUrls`\) Any predicate failed with the following 
errors/); expect(enqueued).toHaveLength(0); }); @@ -542,10 +531,7 @@ describe('enqueueLinks()', () => { test('correctly works with transformRequestFunction', async () => { const { enqueued, requestQueue } = createRequestQueueMock(); - const pseudoUrls = [ - 'https://example.com/[(\\w|-|/)*]', - '[http|https]://cool.com/', - ]; + const pseudoUrls = ['https://example.com/[(\\w|-|/)*]', '[http|https]://cool.com/']; await browserCrawlerEnqueueLinks({ options: { @@ -639,12 +625,14 @@ describe('enqueueLinks()', () => { { glob: ' ' }, ]; - await expect(cheerioCrawlerEnqueueLinks({ - options: { selector: '.click', globs }, - $, - requestQueue, - originalRequestUrl: 'https://example.com', - })).resolves.not.toThrow(); + await expect( + cheerioCrawlerEnqueueLinks({ + options: { selector: '.click', globs }, + $, + requestQueue, + originalRequestUrl: 'https://example.com', + }), + ).resolves.not.toThrow(); expect(enqueued).toHaveLength(3); }); @@ -728,18 +716,17 @@ describe('enqueueLinks()', () => { test('throws with RegExp pseudoUrls', async () => { const { enqueued, requestQueue } = createRequestQueueMock(); - const pseudoUrls = [ - /https:\/\/example\.com\/(\w|-|\/)*/, - /(http|https):\/\/cool\.com\//, - ]; + const pseudoUrls = [/https:\/\/example\.com\/(\w|-|\/)*/, /(http|https):\/\/cool\.com\//]; - await expect(cheerioCrawlerEnqueueLinks({ - // @ts-expect-error Type 'RegExp[]' is not assignable to type 'PseudoUrlInput[]' - options: { selector: '.click', pseudoUrls }, - $, - requestQueue, - originalRequestUrl: 'https://example.com', - })).rejects.toThrow(/to be of type `string` but received type `RegExp`/); + await expect( + cheerioCrawlerEnqueueLinks({ + // @ts-expect-error Type 'RegExp[]' is not assignable to type 'PseudoUrlInput[]' + options: { selector: '.click', pseudoUrls }, + $, + requestQueue, + originalRequestUrl: 'https://example.com', + }), + ).rejects.toThrow(/to be of type `string` but received type `RegExp`/); }); test('works with undefined pseudoUrls[]', async () => { @@ -772,12 +759,14 @@ describe('enqueueLinks()', () => { test('throws with null pseudoUrls[]', async () => { const { enqueued, requestQueue } = createRequestQueueMock(); - await expect(cheerioCrawlerEnqueueLinks({ - options: { selector: '.click', pseudoUrls: null }, - $, - requestQueue, - originalRequestUrl: 'https://example.com', - })).rejects.toThrow(/Expected property `pseudoUrls` to be of type `array` but received type `null`/); + await expect( + cheerioCrawlerEnqueueLinks({ + options: { selector: '.click', pseudoUrls: null }, + $, + requestQueue, + originalRequestUrl: 'https://example.com', + }), + ).rejects.toThrow(/Expected property `pseudoUrls` to be of type `array` but received type `null`/); }); test('works with empty pseudoUrls[]', async () => { @@ -810,18 +799,16 @@ describe('enqueueLinks()', () => { test('throws with sparse pseudoUrls[]', async () => { const { enqueued, requestQueue } = createRequestQueueMock(); - const pseudoUrls = [ - 'https://example.com/[(\\w|-|/)*]', - null, - '[http|https]://cool.com/', - ]; - - await expect(cheerioCrawlerEnqueueLinks({ - options: { selector: '.click', pseudoUrls }, - $, - requestQueue, - originalRequestUrl: 'https://example.com', - })).rejects.toThrow(/\(array `pseudoUrls`\) Any predicate failed with the following errors/); + const pseudoUrls = ['https://example.com/[(\\w|-|/)*]', null, '[http|https]://cool.com/']; + + await expect( + cheerioCrawlerEnqueueLinks({ + options: { selector: '.click', pseudoUrls }, + $, + requestQueue, + originalRequestUrl: 
'https://example.com', + }), + ).rejects.toThrow(/\(array `pseudoUrls`\) Any predicate failed with the following errors/); expect(enqueued).toHaveLength(0); }); @@ -938,10 +925,7 @@ describe('enqueueLinks()', () => { test('correctly works with transformRequestFunction', async () => { const { enqueued, requestQueue } = createRequestQueueMock(); - const pseudoUrls = [ - 'https://example.com/[(\\w|-|/)*]', - '[http|https]://cool.com/', - ]; + const pseudoUrls = ['https://example.com/[(\\w|-|/)*]', '[http|https]://cool.com/']; await cheerioCrawlerEnqueueLinks({ options: { diff --git a/test/core/enqueue_links/shared.test.ts b/test/core/enqueue_links/shared.test.ts index a0f13905d55f..3f0fd3239652 100644 --- a/test/core/enqueue_links/shared.test.ts +++ b/test/core/enqueue_links/shared.test.ts @@ -88,7 +88,9 @@ describe('Enqueue links shared functions', () => { }; const requestOptions = createRequestOptions(sources); - const requests = createRequests(requestOptions, urlPatternObjects).map(transformRequestFunction).filter((r) => !!r); + const requests = createRequests(requestOptions, urlPatternObjects) + .map(transformRequestFunction) + .filter((r) => !!r); expect(requests).toHaveLength(2); requests.forEach((r) => { @@ -106,7 +108,9 @@ describe('Enqueue links shared functions', () => { const globPattern = 'https://example.com/**/*'; expect(() => validateGlobPattern(globPattern)).not.toThrow(); const emptyGlobPattern = ''; - expect(() => validateGlobPattern(emptyGlobPattern)).toThrow(/Cannot parse Glob pattern '': it must be an non-empty string/); + expect(() => validateGlobPattern(emptyGlobPattern)).toThrow( + /Cannot parse Glob pattern '': it must be an non-empty string/, + ); }); }); }); diff --git a/test/core/error_tracker.test.ts b/test/core/error_tracker.test.ts index d51dec7e8a00..ade00956e7f0 100644 --- a/test/core/error_tracker.test.ts +++ b/test/core/error_tracker.test.ts @@ -20,7 +20,11 @@ const s = (stack: string) => { const atIndex = stack.indexOf('at '); const index = evalIndex === -1 ? 
atIndex : evalIndex; - return stack.slice(index).split('\n').map((line) => line.trim()).join('\n'); + return stack + .slice(index) + .split('\n') + .map((line) => line.trim()) + .join('\n'); }; // A case for @@ -100,10 +104,14 @@ test('works', () => { tracker.add(g(e)); expect(tracker.result).toMatchObject({ - 'myscript.js:10:3': { // source - [e.code]: { // code - [e.name]: { // name - [e.message]: { // message + 'myscript.js:10:3': { + // source + [e.code]: { + // code + [e.name]: { + // name + [e.message]: { + // message count: 1, }, }, @@ -115,10 +123,14 @@ test('works', () => { tracker.add(g(e)); expect(tracker.result).toMatchObject({ - 'myscript.js:10:3': { // source - [e.code]: { // code - [e.name]: { // name - [e.message]: { // message + 'myscript.js:10:3': { + // source + [e.code]: { + // code + [e.name]: { + // name + [e.message]: { + // message count: 2, }, }, @@ -144,10 +156,14 @@ test('no code is null code', () => { tracker.add(g(e)); expect(tracker.result).toMatchObject({ - 'myscript.js:10:3': { // source - 'missing error code': { // code - [e.name]: { // name - [e.message]: { // message + 'myscript.js:10:3': { + // source + 'missing error code': { + // code + [e.name]: { + // name + [e.message]: { + // message count: 2, }, }, @@ -173,9 +189,12 @@ test('can hide error code', () => { tracker.add(g(errorRandomCode)); expect(tracker.result).toMatchObject({ - 'myscript.js:10:3': { // source - [e.name]: { // name - [e.message]: { // message + 'myscript.js:10:3': { + // source + [e.name]: { + // name + [e.message]: { + // message count: 2, }, }, @@ -200,9 +219,12 @@ test('can hide error name', () => { tracker.add(g(e)); expect(tracker.result).toMatchObject({ - 'myscript.js:10:3': { // source - [e.code]: { // code - [e.message]: { // message + 'myscript.js:10:3': { + // source + [e.code]: { + // code + [e.message]: { + // message count: 2, }, }, @@ -227,9 +249,12 @@ test('can hide error message', () => { tracker.add(g(errorRandomMessage)); expect(tracker.result).toMatchObject({ - 'myscript.js:10:3': { // source - [e.code]: { // code - [e.name]: { // name + 'myscript.js:10:3': { + // source + [e.code]: { + // code + [e.name]: { + // name count: 2, }, }, @@ -252,9 +277,12 @@ test('can hide error stack', () => { tracker.add(g(errorRandomStack)); expect(tracker.result).toMatchObject({ - 'ERR_INVALID_URL': { // code - 'TypeError': { // name - 'Invalid URL': { // message + ERR_INVALID_URL: { + // code + TypeError: { + // name + 'Invalid URL': { + // message count: 2, }, }, @@ -279,10 +307,14 @@ test('can display full stack', () => { tracker.add(g(e)); expect(tracker.result).toMatchObject({ - [s(e.stack)]: { // source - [e.code]: { // code - [e.name]: { // name - [e.message]: { // message + [s(e.stack)]: { + // source + [e.code]: { + // code + [e.name]: { + // name + [e.message]: { + // message count: 2, }, }, @@ -308,10 +340,14 @@ test('stack looks for user files first', () => { tracker.add(g(e)); expect(tracker.result).toMatchObject({ - 'myscript.js:10:3': { // source - [e.code]: { // code - [e.name]: { // name - [e.message]: { // message + 'myscript.js:10:3': { + // source + [e.code]: { + // code + [e.name]: { + // name + [e.message]: { + // message count: 2, }, }, @@ -336,10 +372,14 @@ test('can shorten the message to the first line', () => { tracker.add(e); expect(tracker.result).toMatchObject({ - 'myscript.js:10:3': { // source - [e.code]: { // code - [e.name]: { // name - [e.message.split('\n')[0]]: { // message + 'myscript.js:10:3': { + // source + [e.code]: { + // code + [e.name]: 
{ + // name + [e.message.split('\n')[0]]: { + // message count: 1, }, }, @@ -365,10 +405,14 @@ test('supports error.cause', () => { tracker.add(e); expect(tracker.result).toMatchObject({ - 'myscript.js:10:3': { // source - [e.code]: { // code - [e.name]: { // name - [e.message.split('\n')[0]]: { // message + 'myscript.js:10:3': { + // source + [e.code]: { + // code + [e.name]: { + // name + [e.message.split('\n')[0]]: { + // message count: 1, }, [e.cause.message]: { @@ -407,10 +451,14 @@ test('placeholder #1', () => { }); expect(tracker.result).toMatchObject({ - 'missing stack trace': { // source - 'missing error code': { // code - Error: { // name - 'Expected boolean, got _': { // message + 'missing stack trace': { + // source + 'missing error code': { + // code + Error: { + // name + 'Expected boolean, got _': { + // message count: 3, }, }, @@ -446,10 +494,14 @@ test('placeholder #2', () => { }); expect(tracker.result).toMatchObject({ - 'missing stack trace': { // source - 'missing error code': { // code - Error: { // name - 'Expected `boolean`, got _': { // message + 'missing stack trace': { + // source + 'missing error code': { + // code + Error: { + // name + 'Expected `boolean`, got _': { + // message count: 3, }, }, @@ -485,10 +537,14 @@ test('placeholder #3', () => { }); expect(tracker.result).toMatchObject({ - 'missing stack trace': { // source - 'missing error code': { // code - Error: { // name - '1 _ 3': { // message + 'missing stack trace': { + // source + 'missing error code': { + // code + Error: { + // name + '1 _ 3': { + // message count: 3, }, }, @@ -524,10 +580,14 @@ test('placeholder #4', () => { }); expect(tracker.result).toMatchObject({ - 'missing stack trace': { // source - 'missing error code': { // code - Error: { // name - '1 2 _': { // message + 'missing stack trace': { + // source + 'missing error code': { + // code + Error: { + // name + '1 2 _': { + // message count: 3, }, }, @@ -563,10 +623,14 @@ test('placeholder #5', () => { }); expect(tracker.result).toMatchObject({ - 'missing stack trace': { // source - 'missing error code': { // code - Error: { // name - '_ 2 3': { // message + 'missing stack trace': { + // source + 'missing error code': { + // code + Error: { + // name + '_ 2 3': { + // message count: 3, }, }, @@ -602,10 +666,14 @@ test('placeholder #6', () => { }); expect(tracker.result).toMatchObject({ - 'missing stack trace': { // source - 'missing error code': { // code - Error: { // name - 'The weather is _ today, _ the grass is _': { // message + 'missing stack trace': { + // source + 'missing error code': { + // code + Error: { + // name + 'The weather is _ today, _ the grass is _': { + // message count: 3, }, }, @@ -636,10 +704,14 @@ test('placeholder #7', () => { }); expect(tracker.result).toMatchObject({ - 'missing stack trace': { // source - 'missing error code': { // code - Error: { // name - 'Expected `boolean`, got `number`': { // message + 'missing stack trace': { + // source + 'missing error code': { + // code + Error: { + // name + 'Expected `boolean`, got `number`': { + // message count: 2, }, }, @@ -654,10 +726,14 @@ test('placeholder #7', () => { }); expect(tracker.result).toMatchObject({ - 'missing stack trace': { // source - 'missing error code': { // code - Error: { // name - 'Expected `boolean`, got _': { // message + 'missing stack trace': { + // source + 'missing error code': { + // code + Error: { + // name + 'Expected `boolean`, got _': { + // message count: 3, }, }, @@ -688,10 +764,14 @@ test('placeholder #8', () => { }); 
expect(tracker.result).toMatchObject({ - 'missing stack trace': { // source - 'missing error code': { // code - Error: { // name - 'Expected `boolean`, got `number`': { // message + 'missing stack trace': { + // source + 'missing error code': { + // code + Error: { + // name + 'Expected `boolean`, got `number`': { + // message count: 1, }, 'Expected `string`, got `null`': { @@ -725,10 +805,14 @@ test('placeholder #9', () => { }); const expected = { - 'missing stack trace': { // source - 'missing error code': { // code - Error: { // name - 'Unexpected `show` property in `options` object': { // message + 'missing stack trace': { + // source + 'missing error code': { + // code + Error: { + // name + 'Unexpected `show` property in `options` object': { + // message count: 1, }, 'Missing `display` in style': { diff --git a/test/core/playwright_utils.test.ts b/test/core/playwright_utils.test.ts index ca29ff875d6b..b6bb52fe3bb3 100644 --- a/test/core/playwright_utils.test.ts +++ b/test/core/playwright_utils.test.ts @@ -39,294 +39,317 @@ describe('playwrightUtils', () => { await localStorageEmulator.destroy(); }); - describe.each([ - [launchPlaywright, { launchOptions: { headless: true } }], - ] as const)('with %s', (launchName, launchContext) => { - test('injectFile()', async () => { - const browser2 = await launchName(launchContext); - const survive = async (browser: Browser) => { - // Survive navigations - const page = await browser.newPage(); - // @ts-expect-error - let result = await page.evaluate(() => window.injectedVariable === 42); - expect(result).toBe(false); - await playwrightUtils.injectFile(page, path.join(__dirname, '..', 'shared', 'data', 'inject_file.txt'), { surviveNavigations: true }); - // @ts-expect-error - result = await page.evaluate(() => window.injectedVariable); - expect(result).toBe(42); - await page.goto('about:chrome'); - // @ts-expect-error - result = await page.evaluate(() => window.injectedVariable); - expect(result).toBe(42); - await page.goto(serverAddress); - // @ts-expect-error - result = await page.evaluate(() => window.injectedVariable); - expect(result).toBe(42); - }; - const remove = async (browser: Browser) => { - // Remove with navigations - const page = await browser.newPage(); - // @ts-expect-error - let result = await page.evaluate(() => window.injectedVariable === 42); - expect(result).toBe(false); - await page.goto('about:chrome'); - // @ts-expect-error - result = await page.evaluate(() => window.injectedVariable === 42); - expect(result).toBe(false); - await playwrightUtils.injectFile(page, path.join(__dirname, '..', 'shared', 'data', 'inject_file.txt')); - // @ts-expect-error - result = await page.evaluate(() => window.injectedVariable); - expect(result).toBe(42); - await page.goto(serverAddress); - // @ts-expect-error - result = await page.evaluate(() => window.injectedVariable === 42); - expect(result).toBe(false); - }; - try { - await Promise.all([survive(browser2), remove(browser2)]); - } finally { - await browser2.close(); - } - }); - - test('injectJQuery()', async () => { - const browser = await launchName(launchContext); - - try { - const page = await browser.newPage(); - await page.goto('about:blank'); - - // NOTE: Chrome already defines window.$ as alias to document.querySelector(), - // (https://developers.google.com/web/tools/chrome-devtools/console/command-line-reference#queryselector) - const result1 = await page.evaluate(() => { - return { - // @ts-expect-error - isDefined: window.jQuery !== undefined, - }; - }); - 
expect(result1).toEqual({ - isDefined: false, - }); + describe.each([[launchPlaywright, { launchOptions: { headless: true } }]] as const)( + 'with %s', + (launchName, launchContext) => { + test('injectFile()', async () => { + const browser2 = await launchName(launchContext); + const survive = async (browser: Browser) => { + // Survive navigations + const page = await browser.newPage(); + // @ts-expect-error + let result = await page.evaluate(() => window.injectedVariable === 42); + expect(result).toBe(false); + await playwrightUtils.injectFile( + page, + path.join(__dirname, '..', 'shared', 'data', 'inject_file.txt'), + { surviveNavigations: true }, + ); + // @ts-expect-error + result = await page.evaluate(() => window.injectedVariable); + expect(result).toBe(42); + await page.goto('about:chrome'); + // @ts-expect-error + result = await page.evaluate(() => window.injectedVariable); + expect(result).toBe(42); + await page.goto(serverAddress); + // @ts-expect-error + result = await page.evaluate(() => window.injectedVariable); + expect(result).toBe(42); + }; + const remove = async (browser: Browser) => { + // Remove with navigations + const page = await browser.newPage(); + // @ts-expect-error + let result = await page.evaluate(() => window.injectedVariable === 42); + expect(result).toBe(false); + await page.goto('about:chrome'); + // @ts-expect-error + result = await page.evaluate(() => window.injectedVariable === 42); + expect(result).toBe(false); + await playwrightUtils.injectFile( + page, + path.join(__dirname, '..', 'shared', 'data', 'inject_file.txt'), + ); + // @ts-expect-error + result = await page.evaluate(() => window.injectedVariable); + expect(result).toBe(42); + await page.goto(serverAddress); + // @ts-expect-error + result = await page.evaluate(() => window.injectedVariable === 42); + expect(result).toBe(false); + }; + try { + await Promise.all([survive(browser2), remove(browser2)]); + } finally { + await browser2.close(); + } + }); - await playwrightUtils.injectJQuery(page); + test('injectJQuery()', async () => { + const browser = await launchName(launchContext); + + try { + const page = await browser.newPage(); + await page.goto('about:blank'); + + // NOTE: Chrome already defines window.$ as alias to document.querySelector(), + // (https://developers.google.com/web/tools/chrome-devtools/console/command-line-reference#queryselector) + const result1 = await page.evaluate(() => { + return { + // @ts-expect-error + isDefined: window.jQuery !== undefined, + }; + }); + expect(result1).toEqual({ + isDefined: false, + }); + + await playwrightUtils.injectJQuery(page); + + const result2 = await page.evaluate(() => { + /* global $ */ + return { + // @ts-expect-error + isDefined: window.jQuery === window.$, + // @ts-expect-error + text: $('h1').text(), + }; + }); + expect(result2).toEqual({ + isDefined: true, + text: '', + }); + + await page.reload(); + + const result3 = await page.evaluate(() => { + return { + // @ts-expect-error + isDefined: window.jQuery === window.$, + // @ts-expect-error + text: $('h1').text(), + }; + }); + expect(result3).toEqual({ + isDefined: true, + text: '', + }); + } finally { + await browser.close(); + } + }); - const result2 = await page.evaluate(() => { - /* global $ */ - return { - // @ts-expect-error - isDefined: window.jQuery === window.$, - // @ts-expect-error - text: $('h1').text(), - }; - }); - expect(result2).toEqual({ - isDefined: true, - text: '', - }); + test('parseWithCheerio() works', async () => { + const browser = await 
launchName(launchContext); + + try { + const page = await browser.newPage(); + await page.goto(serverAddress); - await page.reload(); + const $ = await playwrightUtils.parseWithCheerio(page); - const result3 = await page.evaluate(() => { - return { - // @ts-expect-error - isDefined: window.jQuery === window.$, - // @ts-expect-error - text: $('h1').text(), - }; + const title = $('h1').text().trim(); + expect(title).toBe('Example Domain'); + } finally { + await browser.close(); + } + }); + + describe('blockRequests()', () => { + let browser: Browser = null; + beforeAll(async () => { + browser = await launchName(launchContext); }); - expect(result3).toEqual({ - isDefined: true, - text: '', + afterAll(async () => { + await browser.close(); }); - } finally { - await browser.close(); - } - }); - - test('parseWithCheerio() works', async () => { - const browser = await launchName(launchContext); - - try { - const page = await browser.newPage(); - await page.goto(serverAddress); - - const $ = await playwrightUtils.parseWithCheerio(page); - - const title = $('h1').text().trim(); - expect(title).toBe('Example Domain'); - } finally { - await browser.close(); - } - }); - - describe('blockRequests()', () => { - let browser: Browser = null; - beforeAll(async () => { - browser = await launchName(launchContext); - }); - afterAll(async () => { - await browser.close(); - }); - test('works with default values', async () => { - const loadedUrls: string[] = []; + test('works with default values', async () => { + const loadedUrls: string[] = []; - const page = await browser.newPage(); - await playwrightUtils.blockRequests(page); - page.on('response', (response) => loadedUrls.push(response.url())); - await page.setContent(` + const page = await browser.newPage(); + await playwrightUtils.blockRequests(page); + page.on('response', (response) => loadedUrls.push(response.url())); + await page.setContent( + ` - `, { waitUntil: 'load' }); - expect(loadedUrls).toEqual([`${serverAddress}/script.js`]); - }); + `, + { waitUntil: 'load' }, + ); + expect(loadedUrls).toEqual([`${serverAddress}/script.js`]); + }); - test('works with overridden values', async () => { - const loadedUrls: string[] = []; + test('works with overridden values', async () => { + const loadedUrls: string[] = []; - const page = await browser.newPage(); - await playwrightUtils.blockRequests(page, { - urlPatterns: ['.css'], - }); - page.on('response', (response) => loadedUrls.push(response.url())); - await page.setContent(` + const page = await browser.newPage(); + await playwrightUtils.blockRequests(page, { + urlPatterns: ['.css'], + }); + page.on('response', (response) => loadedUrls.push(response.url())); + await page.setContent( + ` - `, { waitUntil: 'load' }); - expect(loadedUrls).toEqual(expect.arrayContaining([ - `${serverAddress}/image.png`, - `${serverAddress}/script.js`, - `${serverAddress}/image.gif`, - ])); - }); - }); - - test('gotoExtended() works', async () => { - const browser = await chromium.launch({ headless: true }); - - try { - const page = await browser.newPage(); - const request = new Request({ - url: `${serverAddress}/special/getDebug`, - method: 'POST', - headers: { - 'Content-Type': 'application/json; charset=utf-8', - }, - payload: '{ "foo": "bar" }', + `, + { waitUntil: 'load' }, + ); + expect(loadedUrls).toEqual( + expect.arrayContaining([ + `${serverAddress}/image.png`, + `${serverAddress}/script.js`, + `${serverAddress}/image.gif`, + ]), + ); }); - - const response = await playwrightUtils.gotoExtended(page, request); - - 
const { method, headers, bodyLength } = JSON.parse(await response.text()); - expect(method).toBe('POST'); - expect(bodyLength).toBe(16); - expect(headers['content-type']).toBe('application/json; charset=utf-8'); - } finally { - await browser.close(); - } - }, 60_000); - - describe('infiniteScroll()', () => { - function isAtBottom() { - return (window.innerHeight + window.pageYOffset) >= document.body.offsetHeight; - } - - let browser: Browser; - beforeAll(async () => { - browser = await chromium.launch({ headless: true }); - }); - afterAll(async () => { - await browser.close(); }); - let page: Page; - beforeEach(async () => { - page = await browser.newPage(); - let count = 0; - const content = Array(1000).fill(null).map(() => { - return `
<div style="border: 1px solid black">Div number: ${count++}</div>
`; + test('gotoExtended() works', async () => { + const browser = await chromium.launch({ headless: true }); + + try { + const page = await browser.newPage(); + const request = new Request({ + url: `${serverAddress}/special/getDebug`, + method: 'POST', + headers: { + 'Content-Type': 'application/json; charset=utf-8', + }, + payload: '{ "foo": "bar" }', + }); + + const response = await playwrightUtils.gotoExtended(page, request); + + const { method, headers, bodyLength } = JSON.parse(await response.text()); + expect(method).toBe('POST'); + expect(bodyLength).toBe(16); + expect(headers['content-type']).toBe('application/json; charset=utf-8'); + } finally { + await browser.close(); + } + }, 60_000); + + describe('infiniteScroll()', () => { + function isAtBottom() { + return window.innerHeight + window.pageYOffset >= document.body.offsetHeight; + } + + let browser: Browser; + beforeAll(async () => { + browser = await chromium.launch({ headless: true }); + }); + afterAll(async () => { + await browser.close(); }); - const contentHTML = `${content}`; - await page.setContent(contentHTML); - }); - afterEach(async () => { - await page.close(); - }); - test('works', async () => { - const before = await page.evaluate(isAtBottom); - expect(before).toBe(false); + let page: Page; + beforeEach(async () => { + page = await browser.newPage(); + let count = 0; + const content = Array(1000) + .fill(null) + .map(() => { + return `
<div style="border: 1px solid black">Div number: ${count++}</div>
`; + }); + const contentHTML = `${content}`; + await page.setContent(contentHTML); + }); + afterEach(async () => { + await page.close(); + }); - await playwrightUtils.infiniteScroll(page, { waitForSecs: 0 }); + test('works', async () => { + const before = await page.evaluate(isAtBottom); + expect(before).toBe(false); - const after = await page.evaluate(isAtBottom); - expect(after).toBe(true); - }); + await playwrightUtils.infiniteScroll(page, { waitForSecs: 0 }); - test('maxScrollHeight works', async () => { - const before = await page.evaluate(isAtBottom); - expect(before).toBe(false); + const after = await page.evaluate(isAtBottom); + expect(after).toBe(true); + }); - await playwrightUtils.infiniteScroll(page, { - waitForSecs: Infinity, - maxScrollHeight: 1000, - stopScrollCallback: async () => true, + test('maxScrollHeight works', async () => { + const before = await page.evaluate(isAtBottom); + expect(before).toBe(false); + + await playwrightUtils.infiniteScroll(page, { + waitForSecs: Infinity, + maxScrollHeight: 1000, + stopScrollCallback: async () => true, + }); + + const after = await page.evaluate(isAtBottom); + // It scrolls to the bottom in the first scroll so this is correct. + // The test passes because the Infinite waitForSecs is broken by the height requirement. + // If it didn't, the test would time out. + expect(after).toBe(true); }); - const after = await page.evaluate(isAtBottom); - // It scrolls to the bottom in the first scroll so this is correct. - // The test passes because the Infinite waitForSecs is broken by the height requirement. - // If it didn't, the test would time out. - expect(after).toBe(true); - }); + test('stopScrollCallback works', async () => { + const before = await page.evaluate(isAtBottom); + expect(before).toBe(false); - test('stopScrollCallback works', async () => { - const before = await page.evaluate(isAtBottom); - expect(before).toBe(false); + await playwrightUtils.infiniteScroll(page, { + waitForSecs: Infinity, + stopScrollCallback: async () => true, + }); - await playwrightUtils.infiniteScroll(page, { - waitForSecs: Infinity, - stopScrollCallback: async () => true, + const after = await page.evaluate(isAtBottom); + expect(after).toBe(true); }); + }); - const after = await page.evaluate(isAtBottom); - expect(after).toBe(true); + test('saveSnapshot() works', async () => { + const openKVSSpy = vitest.spyOn(KeyValueStore, 'open'); + const browser = await chromium.launch({ headless: true }); + + try { + const page = await browser.newPage(); + const contentHTML = + '
<html><head></head><body><div style="border: 1px solid black">Div number: 1</div></body></html>
'; + await page.setContent(contentHTML); + + const screenshot = await page.screenshot({ fullPage: true, type: 'jpeg', quality: 60 }); + + // Test saving both image and html + const object = { setValue: vitest.fn() }; + openKVSSpy.mockResolvedValue(object as any); + await playwrightUtils.saveSnapshot(page, { + key: 'TEST', + keyValueStoreName: 'TEST-STORE', + screenshotQuality: 60, + }); + + expect(object.setValue).toBeCalledWith('TEST.jpg', screenshot, { contentType: 'image/jpeg' }); + expect(object.setValue).toBeCalledWith('TEST.html', contentHTML, { contentType: 'text/html' }); + object.setValue.mockReset(); + + // Test saving only image + await playwrightUtils.saveSnapshot(page, { saveHtml: false }); + + // Default quality is 50 + const screenshot2 = await page.screenshot({ fullPage: true, type: 'jpeg', quality: 50 }); + expect(object.setValue).toBeCalledWith('SNAPSHOT.jpg', screenshot2, { contentType: 'image/jpeg' }); + } finally { + await browser.close(); + } }); - }); - - test('saveSnapshot() works', async () => { - const openKVSSpy = vitest.spyOn(KeyValueStore, 'open'); - const browser = await chromium.launch({ headless: true }); - - try { - const page = await browser.newPage(); - const contentHTML = '
<html><head></head><body><div style="border: 1px solid black">Div number: 1</div></body></html>
'; - await page.setContent(contentHTML); - - const screenshot = await page.screenshot({ fullPage: true, type: 'jpeg', quality: 60 }); - - // Test saving both image and html - const object = { setValue: vitest.fn() }; - openKVSSpy.mockResolvedValue(object as any); - await playwrightUtils.saveSnapshot(page, { key: 'TEST', keyValueStoreName: 'TEST-STORE', screenshotQuality: 60 }); - - expect(object.setValue).toBeCalledWith('TEST.jpg', screenshot, { contentType: 'image/jpeg' }); - expect(object.setValue).toBeCalledWith('TEST.html', contentHTML, { contentType: 'text/html' }); - object.setValue.mockReset(); - - // Test saving only image - await playwrightUtils.saveSnapshot(page, { saveHtml: false }); - - // Default quality is 50 - const screenshot2 = await page.screenshot({ fullPage: true, type: 'jpeg', quality: 50 }); - expect(object.setValue).toBeCalledWith('SNAPSHOT.jpg', screenshot2, { contentType: 'image/jpeg' }); - } finally { - await browser.close(); - } - }); - }); + }, + ); }); diff --git a/test/core/proxy_configuration.test.ts b/test/core/proxy_configuration.test.ts index 4cdea896a637..8877e404e515 100644 --- a/test/core/proxy_configuration.test.ts +++ b/test/core/proxy_configuration.test.ts @@ -40,8 +40,14 @@ describe('ProxyConfiguration', () => { }); test('newUrlFunction should correctly generate URLs', async () => { - const customUrls = ['http://proxy.com:1111', 'http://proxy.com:2222', 'http://proxy.com:3333', - 'http://proxy.com:4444', 'http://proxy.com:5555', 'http://proxy.com:6666']; + const customUrls = [ + 'http://proxy.com:1111', + 'http://proxy.com:2222', + 'http://proxy.com:3333', + 'http://proxy.com:4444', + 'http://proxy.com:5555', + 'http://proxy.com:6666', + ]; const newUrlFunction = () => { return customUrls.pop(); }; @@ -61,8 +67,14 @@ describe('ProxyConfiguration', () => { }); test('async newUrlFunction should work correctly', async () => { - const customUrls = ['http://proxy.com:1111', 'http://proxy.com:2222', 'http://proxy.com:3333', - 'http://proxy.com:4444', 'http://proxy.com:5555', 'http://proxy.com:6666']; + const customUrls = [ + 'http://proxy.com:1111', + 'http://proxy.com:2222', + 'http://proxy.com:3333', + 'http://proxy.com:4444', + 'http://proxy.com:5555', + 'http://proxy.com:6666', + ]; const newUrlFunction = async () => { await new Promise((r) => setTimeout(r, 5)); return customUrls.pop(); diff --git a/test/core/puppeteer_request_interception.test.ts b/test/core/puppeteer_request_interception.test.ts index dde96259b856..380483f71c5f 100644 --- a/test/core/puppeteer_request_interception.test.ts +++ b/test/core/puppeteer_request_interception.test.ts @@ -56,15 +56,15 @@ describe('utils.puppeteer.addInterceptRequestHandler|removeInterceptRequestHandl await browser.close(); } - expect(allUrls).toEqual(expect.arrayContaining([ - `${serverAddress}/script.js`, - `${serverAddress}/style.css`, - `${serverAddress}/image.png`, - ])); - - expect(loadedUrls).toEqual(expect.arrayContaining([ - `${serverAddress}/style.css`, - ])); + expect(allUrls).toEqual( + expect.arrayContaining([ + `${serverAddress}/script.js`, + `${serverAddress}/style.css`, + `${serverAddress}/image.png`, + ]), + ); + + expect(loadedUrls).toEqual(expect.arrayContaining([`${serverAddress}/style.css`])); }); test('should not propagate aborted/responded requests to following handlers', async () => { @@ -100,9 +100,7 @@ describe('utils.puppeteer.addInterceptRequestHandler|removeInterceptRequestHandl await browser.close(); } - expect(propagatedUrls).toEqual(expect.arrayContaining([ - 
`${serverAddress}/style.css`, - ])); + expect(propagatedUrls).toEqual(expect.arrayContaining([`${serverAddress}/style.css`])); }); test('should allow to modify request', async () => { @@ -175,7 +173,7 @@ describe('utils.puppeteer.addInterceptRequestHandler|removeInterceptRequestHandl // Override headers const headers = { ...request.headers(), - 'accept': 'text/html', + accept: 'text/html', 'accept-language': 'en-GB', 'upgrade-insecure-requests': '2', }; @@ -193,7 +191,9 @@ describe('utils.puppeteer.addInterceptRequestHandler|removeInterceptRequestHandl expect(typeof acceptLanguageIndex).toBe('number'); expect(rawHeadersArr[acceptLanguageIndex + 1]).toEqual('en-GB'); - const upgradeInsReqIndex = rawHeadersArr.findIndex((headerItem) => headerItem === 'Upgrade-Insecure-Requests'); + const upgradeInsReqIndex = rawHeadersArr.findIndex( + (headerItem) => headerItem === 'Upgrade-Insecure-Requests', + ); expect(typeof upgradeInsReqIndex).toBe('number'); expect(rawHeadersArr[upgradeInsReqIndex + 1]).toEqual('2'); @@ -233,28 +233,27 @@ describe('utils.puppeteer.removeInterceptRequestHandler()', () => { // Load with scripts and images disabled. await page.goto(`${serverAddress}/special/resources`, { waitUntil: 'networkidle0' }); - expect(loadedUrls).toEqual(expect.arrayContaining([ - `${serverAddress}/style.css`, - ])); + expect(loadedUrls).toEqual(expect.arrayContaining([`${serverAddress}/style.css`])); // Try it once again. await page.goto(`${serverAddress}/special/resources`, { waitUntil: 'networkidle0' }); - expect(loadedUrls).toEqual(expect.arrayContaining([ - `${serverAddress}/style.css`, - `${serverAddress}/style.css`, - ])); + expect(loadedUrls).toEqual( + expect.arrayContaining([`${serverAddress}/style.css`, `${serverAddress}/style.css`]), + ); // Enable images. await removeInterceptRequestHandler(page, abortImagesHandler); // Try to load once again if images appear there. 
await page.goto(`${serverAddress}/special/resources`, { waitUntil: 'networkidle0' }); - expect(loadedUrls).toEqual(expect.arrayContaining([ - `${serverAddress}/style.css`, - `${serverAddress}/style.css`, - `${serverAddress}/style.css`, - `${serverAddress}/image.png`, - ])); + expect(loadedUrls).toEqual( + expect.arrayContaining([ + `${serverAddress}/style.css`, + `${serverAddress}/style.css`, + `${serverAddress}/style.css`, + `${serverAddress}/image.png`, + ]), + ); } finally { await browser.close(); } diff --git a/test/core/puppeteer_utils.test.ts b/test/core/puppeteer_utils.test.ts index 4103ba99b36d..af56c561dbcd 100644 --- a/test/core/puppeteer_utils.test.ts +++ b/test/core/puppeteer_utils.test.ts @@ -39,405 +39,416 @@ describe('puppeteerUtils', () => { await localStorageEmulator.destroy(); }); - describe.each([ - [launchPuppeteer, { launchOptions: { headless: true } }], - ] as const)('with %s', (method, launchContext) => { - test('injectFile()', async () => { - const browser2 = await method(launchContext); - const survive = async (browser: Browser) => { - // Survive navigations - const page = await browser.newPage(); - // @ts-expect-error - let result = await page.evaluate(() => window.injectedVariable === 42); - expect(result).toBe(false); - await puppeteerUtils.injectFile(page, path.join(__dirname, '..', 'shared', 'data', 'inject_file.txt'), { surviveNavigations: true }); - // @ts-expect-error - result = await page.evaluate(() => window.injectedVariable); - expect(result).toBe(42); - await page.goto('about:chrome'); - // @ts-expect-error - result = await page.evaluate(() => window.injectedVariable); - expect(result).toBe(42); - await page.goto(serverAddress); - // @ts-expect-error - result = await page.evaluate(() => window.injectedVariable); - expect(result).toBe(42); - }; - const remove = async (browser: Browser) => { - // Remove with navigations - const page = await browser.newPage(); - // @ts-expect-error - let result = await page.evaluate(() => window.injectedVariable === 42); - expect(result).toBe(false); - await page.goto('about:chrome'); - // @ts-expect-error - result = await page.evaluate(() => window.injectedVariable === 42); - expect(result).toBe(false); - await puppeteerUtils.injectFile(page, path.join(__dirname, '..', 'shared', 'data', 'inject_file.txt')); - // @ts-expect-error - result = await page.evaluate(() => window.injectedVariable); - expect(result).toBe(42); - await page.goto(serverAddress); - // @ts-expect-error - result = await page.evaluate(() => window.injectedVariable === 42); - expect(result).toBe(false); - }; - try { - await Promise.all([survive(browser2), remove(browser2)]); - } finally { - await browser2.close(); - } - }); - - test('injectJQuery()', async () => { - const browser = await method(launchContext); - - try { - const page = await browser.newPage(); - await page.goto('about:blank'); - - // NOTE: Chrome already defines window.$ as alias to document.querySelector(), - // (https://developers.google.com/web/tools/chrome-devtools/console/command-line-reference#queryselector) - const result1 = await page.evaluate(() => { - return { - // @ts-expect-error - isDefined: window.jQuery !== undefined, - }; - }); - expect(result1).toEqual({ - isDefined: false, - }); + describe.each([[launchPuppeteer, { launchOptions: { headless: true } }]] as const)( + 'with %s', + (method, launchContext) => { + test('injectFile()', async () => { + const browser2 = await method(launchContext); + const survive = async (browser: Browser) => { + // Survive navigations + const 
page = await browser.newPage(); + // @ts-expect-error + let result = await page.evaluate(() => window.injectedVariable === 42); + expect(result).toBe(false); + await puppeteerUtils.injectFile( + page, + path.join(__dirname, '..', 'shared', 'data', 'inject_file.txt'), + { surviveNavigations: true }, + ); + // @ts-expect-error + result = await page.evaluate(() => window.injectedVariable); + expect(result).toBe(42); + await page.goto('about:chrome'); + // @ts-expect-error + result = await page.evaluate(() => window.injectedVariable); + expect(result).toBe(42); + await page.goto(serverAddress); + // @ts-expect-error + result = await page.evaluate(() => window.injectedVariable); + expect(result).toBe(42); + }; + const remove = async (browser: Browser) => { + // Remove with navigations + const page = await browser.newPage(); + // @ts-expect-error + let result = await page.evaluate(() => window.injectedVariable === 42); + expect(result).toBe(false); + await page.goto('about:chrome'); + // @ts-expect-error + result = await page.evaluate(() => window.injectedVariable === 42); + expect(result).toBe(false); + await puppeteerUtils.injectFile( + page, + path.join(__dirname, '..', 'shared', 'data', 'inject_file.txt'), + ); + // @ts-expect-error + result = await page.evaluate(() => window.injectedVariable); + expect(result).toBe(42); + await page.goto(serverAddress); + // @ts-expect-error + result = await page.evaluate(() => window.injectedVariable === 42); + expect(result).toBe(false); + }; + try { + await Promise.all([survive(browser2), remove(browser2)]); + } finally { + await browser2.close(); + } + }); + + test('injectJQuery()', async () => { + const browser = await method(launchContext); + + try { + const page = await browser.newPage(); + await page.goto('about:blank'); + + // NOTE: Chrome already defines window.$ as alias to document.querySelector(), + // (https://developers.google.com/web/tools/chrome-devtools/console/command-line-reference#queryselector) + const result1 = await page.evaluate(() => { + return { + // @ts-expect-error + isDefined: window.jQuery !== undefined, + }; + }); + expect(result1).toEqual({ + isDefined: false, + }); + + await puppeteerUtils.injectJQuery(page); + const result2 = await page.evaluate(() => { + /* global $ */ + return { + // @ts-expect-error + isDefined: window.jQuery === window.$, + // @ts-expect-error + text: $('h1').text(), + }; + }); + expect(result2).toEqual({ + isDefined: true, + text: '', + }); + + await page.reload(); + + const result3 = await page.evaluate(() => { + return { + // @ts-expect-error + isDefined: window.jQuery === window.$, + // @ts-expect-error + text: $('h1').text(), + }; + }); + expect(result3).toEqual({ + isDefined: true, + text: '', + }); + } finally { + await browser.close(); + } + }); - await puppeteerUtils.injectJQuery(page); - const result2 = await page.evaluate(() => { - /* global $ */ - return { - // @ts-expect-error - isDefined: window.jQuery === window.$, - // @ts-expect-error - text: $('h1').text(), - }; + test('parseWithCheerio() works', async () => { + const browser = await method(launchContext); + + try { + const page = await browser.newPage(); + await page.goto(serverAddress); + + const $ = await puppeteerUtils.parseWithCheerio(page); + + const title = $('h1').text().trim(); + expect(title).toBe('Example Domain'); + } finally { + await browser.close(); + } + }); + + describe('blockRequests()', () => { + let browser: Browser = null; + beforeAll(async () => { + browser = await method(launchContext); }); - 
expect(result2).toEqual({ - isDefined: true, - text: '', + afterAll(async () => { + await browser.close(); }); - await page.reload(); + test('works with default values', async () => { + const loadedUrls: string[] = []; - const result3 = await page.evaluate(() => { - return { - // @ts-expect-error - isDefined: window.jQuery === window.$, - // @ts-expect-error - text: $('h1').text(), - }; + const page = await browser.newPage(); + await puppeteerUtils.blockRequests(page); + page.on('response', (response) => loadedUrls.push(response.url())); + await page.goto(`${serverAddress}/special/resources`, { waitUntil: 'load' }); + expect(loadedUrls).toEqual([`${serverAddress}/special/resources`, `${serverAddress}/script.js`]); }); - expect(result3).toEqual({ - isDefined: true, - text: '', + + test('works with overridden values', async () => { + const loadedUrls: string[] = []; + + const page = await browser.newPage(); + await puppeteerUtils.blockRequests(page, { + urlPatterns: ['.css'], + }); + page.on('response', (response) => loadedUrls.push(response.url())); + await page.goto(`${serverAddress}/special/resources`, { waitUntil: 'load' }); + + expect(loadedUrls).toEqual( + expect.arrayContaining([ + `${serverAddress}/image.png`, + `${serverAddress}/script.js`, + `${serverAddress}/image.gif`, + ]), + ); }); - } finally { - await browser.close(); - } - }); - - test('parseWithCheerio() works', async () => { - const browser = await method(launchContext); - - try { - const page = await browser.newPage(); - await page.goto(serverAddress); - - const $ = await puppeteerUtils.parseWithCheerio(page); - - const title = $('h1').text().trim(); - expect(title).toBe('Example Domain'); - } finally { - await browser.close(); - } - }); - - describe('blockRequests()', () => { - let browser: Browser = null; - beforeAll(async () => { - browser = await method(launchContext); - }); - afterAll(async () => { - await browser.close(); - }); - test('works with default values', async () => { - const loadedUrls: string[] = []; - - const page = await browser.newPage(); - await puppeteerUtils.blockRequests(page); - page.on('response', (response) => loadedUrls.push(response.url())); - await page.goto(`${serverAddress}/special/resources`, { waitUntil: 'load' }); - expect(loadedUrls).toEqual([ - `${serverAddress}/special/resources`, - `${serverAddress}/script.js`, - ]); - }); + test('blockResources() supports default values', async () => { + const loadedUrls: string[] = []; - test('works with overridden values', async () => { - const loadedUrls: string[] = []; + const page = await browser.newPage(); + await puppeteerUtils.blockResources(page); + page.on('response', (response) => loadedUrls.push(response.url())); + await page.goto(`${serverAddress}/special/resources`, { waitUntil: 'load' }); - const page = await browser.newPage(); - await puppeteerUtils.blockRequests(page, { - urlPatterns: ['.css'], + expect(loadedUrls).toEqual(expect.arrayContaining([`${serverAddress}/script.js`])); }); - page.on('response', (response) => loadedUrls.push(response.url())); - await page.goto(`${serverAddress}/special/resources`, { waitUntil: 'load' }); - - expect(loadedUrls).toEqual(expect.arrayContaining([ - `${serverAddress}/image.png`, - `${serverAddress}/script.js`, - `${serverAddress}/image.gif`, - ])); - }); - test('blockResources() supports default values', async () => { - const loadedUrls: string[] = []; + test('blockResources() supports nondefault values', async () => { + const loadedUrls: string[] = []; - const page = await browser.newPage(); - 
await puppeteerUtils.blockResources(page); - page.on('response', (response) => loadedUrls.push(response.url())); - await page.goto(`${serverAddress}/special/resources`, { waitUntil: 'load' }); + const page = await browser.newPage(); + await puppeteerUtils.blockResources(page, ['script']); + page.on('response', (response) => loadedUrls.push(response.url())); + await page.goto(`${serverAddress}/special/resources`, { waitUntil: 'load' }); - expect(loadedUrls).toEqual(expect.arrayContaining([ - `${serverAddress}/script.js`, - ])); + expect(loadedUrls).toEqual( + expect.arrayContaining([`${serverAddress}/style.css`, `${serverAddress}/image.png`]), + ); + }); }); - test('blockResources() supports nondefault values', async () => { - const loadedUrls: string[] = []; + test('supports cacheResponses()', async () => { + const browser = await method(launchContext); + const cache: Dictionary> = {}; + + const getResourcesLoadedFromWiki = async () => { + let downloadedBytes = 0; + const page = await browser.newPage(); + await page.setDefaultNavigationTimeout(0); + // Cache all javascript files, png files and svg files + await puppeteerUtils.cacheResponses(page, cache, ['.js', /.+\.png/i, /.+\.svg/i]); + page.on('response', async (response) => { + if (cache[response.url()]) return; + try { + const buffer = await response.buffer(); + downloadedBytes += buffer.byteLength; + } catch (e) { + // do nothing + } + }); + await page.goto(`${serverAddress}/cacheable`, { waitUntil: 'networkidle0', timeout: 60e3 }); + await page.close(); + return downloadedBytes; + }; - const page = await browser.newPage(); - await puppeteerUtils.blockResources(page, ['script']); - page.on('response', (response) => loadedUrls.push(response.url())); - await page.goto(`${serverAddress}/special/resources`, { waitUntil: 'load' }); - - expect(loadedUrls).toEqual(expect.arrayContaining([ - `${serverAddress}/style.css`, - `${serverAddress}/image.png`, - ])); + try { + const bytesDownloadedOnFirstRun = await getResourcesLoadedFromWiki(); + const bytesDownloadedOnSecondRun = await getResourcesLoadedFromWiki(); + expect(bytesDownloadedOnSecondRun).toBeLessThan(bytesDownloadedOnFirstRun); + } finally { + await browser.close(); + } }); - }); - - test('supports cacheResponses()', async () => { - const browser = await method(launchContext); - const cache: Dictionary> = {}; - - const getResourcesLoadedFromWiki = async () => { - let downloadedBytes = 0; - const page = await browser.newPage(); - await page.setDefaultNavigationTimeout(0); - // Cache all javascript files, png files and svg files - await puppeteerUtils.cacheResponses(page, cache, ['.js', /.+\.png/i, /.+\.svg/i]); - page.on('response', async (response) => { - if (cache[response.url()]) return; + + test('cacheResponses() throws when rule with invalid type is provided', async () => { + const mockedPage = { + setRequestInterception: () => {}, + on: () => {}, + }; + + const testRuleType = async (value: string | RegExp) => { try { - const buffer = await response.buffer(); - downloadedBytes += buffer.byteLength; - } catch (e) { - // do nothing + await puppeteerUtils.cacheResponses(mockedPage as any, {}, [value]); + } catch (error) { + // this is valid path for this test + return; } - }); - await page.goto(`${serverAddress}/cacheable`, { waitUntil: 'networkidle0', timeout: 60e3 }); - await page.close(); - return downloadedBytes; - }; - - try { - const bytesDownloadedOnFirstRun = await getResourcesLoadedFromWiki(); - const bytesDownloadedOnSecondRun = await getResourcesLoadedFromWiki(); - 
expect(bytesDownloadedOnSecondRun).toBeLessThan(bytesDownloadedOnFirstRun); - } finally { - await browser.close(); - } - }); - - test('cacheResponses() throws when rule with invalid type is provided', async () => { - const mockedPage = { - setRequestInterception: () => {}, - on: () => {}, - }; - - const testRuleType = async (value: string | RegExp) => { + + expect(`Rule '${value}' should have thrown error`).toBe(''); + }; + + // @ts-expect-error + await testRuleType(0); + // @ts-expect-error + await testRuleType(1); + await testRuleType(null); + // @ts-expect-error + await testRuleType([]); + // @ts-expect-error + await testRuleType(['']); + // @ts-expect-error + await testRuleType(() => {}); + }); + + test('compileScript() works', async () => { + const { compileScript } = puppeteerUtils; + const scriptStringGood = 'await page.goto("about:blank"); return await page.content();'; + const scriptStringBad = 'for const while'; + const script = compileScript(scriptStringGood); + + expect(typeof script).toBe('function'); + expect(script.toString()).toEqual(`async ({ page, request }) => {${scriptStringGood}}`); + try { - await puppeteerUtils.cacheResponses(mockedPage as any, {}, [value]); - } catch (error) { - // this is valid path for this test - return; + compileScript(scriptStringBad); + throw new Error('Should fail.'); + } catch (err) { + // TODO figure out why the err.message comes out empty in the logs. + expect((err as Error).message).toMatch(/Unexpected token '?const'?/); } + const browser = await method(launchContext); + try { + const page = await browser.newPage(); + const content = await script({ page } as any); + expect(typeof content).toBe('string'); + expect(content).toBe(''); + } finally { + await browser.close(); + } + }); - expect(`Rule '${value}' should have thrown error`).toBe(''); - }; - - // @ts-expect-error - await testRuleType(0); - // @ts-expect-error - await testRuleType(1); - await testRuleType(null); - // @ts-expect-error - await testRuleType([]); - // @ts-expect-error - await testRuleType(['']); - // @ts-expect-error - await testRuleType(() => {}); - }); - - test('compileScript() works', async () => { - const { compileScript } = puppeteerUtils; - const scriptStringGood = 'await page.goto("about:blank"); return await page.content();'; - const scriptStringBad = 'for const while'; - const script = compileScript(scriptStringGood); - - expect(typeof script).toBe('function'); - expect(script.toString()).toEqual(`async ({ page, request }) => {${scriptStringGood}}`); - - try { - compileScript(scriptStringBad); - throw new Error('Should fail.'); - } catch (err) { - // TODO figure out why the err.message comes out empty in the logs. 
- expect((err as Error).message).toMatch(/Unexpected token '?const'?/); - } - const browser = await method(launchContext); - try { - const page = await browser.newPage(); - const content = await script({ page } as any); - expect(typeof content).toBe('string'); - expect(content).toBe(''); - } finally { - await browser.close(); - } - }); - - test('gotoExtended() works', async () => { - const browser = await method(launchContext); - - try { - const page = await browser.newPage(); - const request = new Request({ - url: `${serverAddress}/special/getDebug`, - method: 'POST', - headers: { - 'Content-Type': 'application/json; charset=utf-8', - }, - payload: '{ "foo": "bar" }', - }); + test('gotoExtended() works', async () => { + const browser = await method(launchContext); - const response = await puppeteerUtils.gotoExtended(page, request); - - // eslint-disable-next-line @typescript-eslint/no-shadow - const { method, headers, bodyLength } = JSON.parse(await response.text()); - expect(method).toBe('POST'); - expect(bodyLength).toBe(16); - expect(headers['content-type']).toBe('application/json; charset=utf-8'); - } finally { - await browser.close(); - } - }); - - describe('infiniteScroll()', () => { - function isAtBottom() { - return (window.innerHeight + window.pageYOffset) >= document.body.offsetHeight; - } - - let browser: Browser; - beforeAll(async () => { - browser = await launchPuppeteer({ launchOptions: { headless: true } }); - }); - afterAll(async () => { - await browser.close(); + try { + const page = await browser.newPage(); + const request = new Request({ + url: `${serverAddress}/special/getDebug`, + method: 'POST', + headers: { + 'Content-Type': 'application/json; charset=utf-8', + }, + payload: '{ "foo": "bar" }', + }); + + const response = await puppeteerUtils.gotoExtended(page, request); + + // eslint-disable-next-line @typescript-eslint/no-shadow + const { method, headers, bodyLength } = JSON.parse(await response.text()); + expect(method).toBe('POST'); + expect(bodyLength).toBe(16); + expect(headers['content-type']).toBe('application/json; charset=utf-8'); + } finally { + await browser.close(); + } }); - let page: Page; - beforeEach(async () => { - page = await browser.newPage(); - let count = 0; - const content = Array(1000).fill(null).map(() => { - return `
<div>Div number: ${count++}</div>
`; + describe('infiniteScroll()', () => { + function isAtBottom() { + return window.innerHeight + window.pageYOffset >= document.body.offsetHeight; + } + + let browser: Browser; + beforeAll(async () => { + browser = await launchPuppeteer({ launchOptions: { headless: true } }); + }); + afterAll(async () => { + await browser.close(); }); - const contentHTML = `${content}`; - await page.setContent(contentHTML); - }); - afterEach(async () => { - await page.close(); - }); - test('works', async () => { - const before = await page.evaluate(isAtBottom); - expect(before).toBe(false); + let page: Page; + beforeEach(async () => { + page = await browser.newPage(); + let count = 0; + const content = Array(1000) + .fill(null) + .map(() => { + return `
<div>Div number: ${count++}</div>
`; + }); + const contentHTML = `${content}`; + await page.setContent(contentHTML); + }); + afterEach(async () => { + await page.close(); + }); - await puppeteerUtils.infiniteScroll(page, { waitForSecs: 0 }); + test('works', async () => { + const before = await page.evaluate(isAtBottom); + expect(before).toBe(false); - const after = await page.evaluate(isAtBottom); - expect(after).toBe(true); - }); + await puppeteerUtils.infiniteScroll(page, { waitForSecs: 0 }); - test('maxScrollHeight works', async () => { - const before = await page.evaluate(isAtBottom); - expect(before).toBe(false); + const after = await page.evaluate(isAtBottom); + expect(after).toBe(true); + }); - await puppeteerUtils.infiniteScroll(page, { - waitForSecs: Infinity, - maxScrollHeight: 1000, - stopScrollCallback: async () => true, + test('maxScrollHeight works', async () => { + const before = await page.evaluate(isAtBottom); + expect(before).toBe(false); + + await puppeteerUtils.infiniteScroll(page, { + waitForSecs: Infinity, + maxScrollHeight: 1000, + stopScrollCallback: async () => true, + }); + + const after = await page.evaluate(isAtBottom); + // It scrolls to the bottom in the first scroll so this is correct. + // The test passes because the Infinite waitForSecs is broken by the height requirement. + // If it didn't, the test would time out. + expect(after).toBe(true); }); - const after = await page.evaluate(isAtBottom); - // It scrolls to the bottom in the first scroll so this is correct. - // The test passes because the Infinite waitForSecs is broken by the height requirement. - // If it didn't, the test would time out. - expect(after).toBe(true); - }); + test('stopScrollCallback works', async () => { + const before = await page.evaluate(isAtBottom); + expect(before).toBe(false); - test('stopScrollCallback works', async () => { - const before = await page.evaluate(isAtBottom); - expect(before).toBe(false); + await puppeteerUtils.infiniteScroll(page, { + waitForSecs: Infinity, + stopScrollCallback: async () => true, + }); - await puppeteerUtils.infiniteScroll(page, { - waitForSecs: Infinity, - stopScrollCallback: async () => true, + const after = await page.evaluate(isAtBottom); + expect(after).toBe(true); }); + }); - const after = await page.evaluate(isAtBottom); - expect(after).toBe(true); + it('saveSnapshot() works', async () => { + const openKVSSpy = vitest.spyOn(KeyValueStore, 'open'); + const browser = await method(launchContext); + + try { + const page = await browser.newPage(); + const contentHTML = + '
<div>Div number: 1</div>
'; + await page.setContent(contentHTML); + + const screenshot = await page.screenshot({ fullPage: true, type: 'jpeg', quality: 60 }); + + // Test saving both image and html + const object = { setValue: vitest.fn() }; + openKVSSpy.mockResolvedValue(object as any); + await puppeteerUtils.saveSnapshot(page, { + key: 'TEST', + keyValueStoreName: 'TEST-STORE', + screenshotQuality: 60, + }); + + expect(object.setValue).toBeCalledWith('TEST.jpg', screenshot, { contentType: 'image/jpeg' }); + expect(object.setValue).toBeCalledWith('TEST.html', contentHTML, { contentType: 'text/html' }); + object.setValue.mockReset(); + + // Test saving only image + await puppeteerUtils.saveSnapshot(page, { saveHtml: false }); + + // Default quality is 50 + const screenshot2 = await page.screenshot({ fullPage: true, type: 'jpeg', quality: 50 }); + expect(object.setValue).toBeCalledWith('SNAPSHOT.jpg', screenshot2, { contentType: 'image/jpeg' }); + } finally { + await browser.close(); + } }); - }); - - it('saveSnapshot() works', async () => { - const openKVSSpy = vitest.spyOn(KeyValueStore, 'open'); - const browser = await method(launchContext); - - try { - const page = await browser.newPage(); - const contentHTML = '
<div>Div number: 1</div>
'; - await page.setContent(contentHTML); - - const screenshot = await page.screenshot({ fullPage: true, type: 'jpeg', quality: 60 }); - - // Test saving both image and html - const object = { setValue: vitest.fn() }; - openKVSSpy.mockResolvedValue(object as any); - await puppeteerUtils.saveSnapshot(page, { key: 'TEST', keyValueStoreName: 'TEST-STORE', screenshotQuality: 60 }); - - expect(object.setValue).toBeCalledWith('TEST.jpg', screenshot, { contentType: 'image/jpeg' }); - expect(object.setValue).toBeCalledWith('TEST.html', contentHTML, { contentType: 'text/html' }); - object.setValue.mockReset(); - - // Test saving only image - await puppeteerUtils.saveSnapshot(page, { saveHtml: false }); - - // Default quality is 50 - const screenshot2 = await page.screenshot({ fullPage: true, type: 'jpeg', quality: 50 }); - expect(object.setValue).toBeCalledWith('SNAPSHOT.jpg', screenshot2, { contentType: 'image/jpeg' }); - } finally { - await browser.close(); - } - }); - }); + }, + ); }); diff --git a/test/core/request_list.test.ts b/test/core/request_list.test.ts index 539695b885fa..543dfa7c1348 100644 --- a/test/core/request_list.test.ts +++ b/test/core/request_list.test.ts @@ -1,12 +1,20 @@ import log from '@apify/log'; -import { Configuration, deserializeArray, EventType, KeyValueStore, ProxyConfiguration, Request, RequestList } from '@crawlee/core'; +import { + Configuration, + deserializeArray, + EventType, + KeyValueStore, + ProxyConfiguration, + Request, + RequestList, +} from '@crawlee/core'; import { sleep } from '@crawlee/utils'; import { MemoryStorageEmulator } from 'test/shared/MemoryStorageEmulator'; /** * Stand-in for underscore.js shuffle (weird, but how else?) */ -function shuffle(array: unknown[]) : unknown[] { +function shuffle(array: unknown[]): unknown[] { const out = [...array]; for (let i = out.length - 1; i > 0; i--) { const j = Math.floor(Math.random() * (i + 1)); @@ -132,15 +140,8 @@ describe('RequestList', () => { test('should correctly load list from hosted files in correct order', async () => { const spy = vitest.spyOn(RequestList.prototype as any, '_downloadListOfUrls'); - const list1 = [ - 'https://example.com', - 'https://google.com', - 'https://wired.com', - ]; - const list2 = [ - 'https://another.com', - 'https://page.com', - ]; + const list1 = ['https://example.com', 'https://google.com', 'https://wired.com']; + const list2 = ['https://another.com', 'https://page.com']; spy.mockImplementationOnce(() => new Promise((resolve) => setTimeout(resolve(list1) as any, 100)) as any); spy.mockResolvedValueOnce(list2); @@ -185,11 +186,7 @@ describe('RequestList', () => { }); test('should fix gdoc sharing url in `requestsFromUrl` automatically (GH issue #639)', async () => { - const list = [ - 'https://example.com', - 'https://google.com', - 'https://wired.com', - ]; + const list = ['https://example.com', 'https://google.com', 'https://wired.com']; const wrongUrls = [ 'https://docs.google.com/spreadsheets/d/11UGSBOSXy5Ov2WEP9nr4kSIxQJmH18zh-5onKtBsovU', 'https://docs.google.com/spreadsheets/d/11UGSBOSXy5Ov2WEP9nr4kSIxQJmH18zh-5onKtBsovU/', @@ -198,7 +195,8 @@ describe('RequestList', () => { 'https://docs.google.com/spreadsheets/d/11UGSBOSXy5Ov2WEP9nr4kSIxQJmH18zh-5onKtBsovU/?q=blablabla', 'https://docs.google.com/spreadsheets/d/11UGSBOSXy5Ov2WEP9nr4kSIxQJmH18zh-5onKtBsovU/edit#gid=0', ]; - const correctUrl = 'https://docs.google.com/spreadsheets/d/11UGSBOSXy5Ov2WEP9nr4kSIxQJmH18zh-5onKtBsovU/gviz/tq?tqx=out:csv'; + const correctUrl = + 
'https://docs.google.com/spreadsheets/d/11UGSBOSXy5Ov2WEP9nr4kSIxQJmH18zh-5onKtBsovU/gviz/tq?tqx=out:csv'; gotScrapingSpy.mockResolvedValue({ body: JSON.stringify(list) } as any); @@ -233,10 +231,7 @@ describe('RequestList', () => { }); test('should use the defined proxy server when using `requestsFromUrl`', async () => { - const proxyUrls = [ - 'http://proxyurl.usedforthe.download', - 'http://another.proxy.url', - ]; + const proxyUrls = ['http://proxyurl.usedforthe.download', 'http://another.proxy.url']; const spy = vitest.spyOn(RequestList.prototype as any, '_downloadListOfUrls'); spy.mockResolvedValue([]); @@ -311,11 +306,7 @@ describe('RequestList', () => { await requestList.reclaimRequest(request4); expect(requestList.getState()).toEqual({ - inProgress: [ - 'https://example.com/3', - 'https://example.com/4', - 'https://example.com/5', - ], + inProgress: ['https://example.com/3', 'https://example.com/4', 'https://example.com/5'], nextIndex: 5, nextUniqueKey: 'https://example.com/6', }); @@ -330,10 +321,7 @@ describe('RequestList', () => { await requestList.markRequestHandled(request5); expect(requestList.getState()).toEqual({ - inProgress: [ - 'https://example.com/3', - 'https://example.com/4', - ], + inProgress: ['https://example.com/3', 'https://example.com/4'], nextIndex: 5, nextUniqueKey: 'https://example.com/6', }); @@ -353,9 +341,7 @@ describe('RequestList', () => { await requestList.markRequestHandled(request4); expect(requestList.getState()).toEqual({ - inProgress: [ - 'https://example.com/3', - ], + inProgress: ['https://example.com/3'], nextIndex: 5, nextUniqueKey: 'https://example.com/6', }); @@ -387,9 +373,7 @@ describe('RequestList', () => { expect(request6.url).toBe('https://example.com/6'); expect(await requestList.fetchNextRequest()).toBe(null); expect(requestList.getState()).toEqual({ - inProgress: [ - 'https://example.com/6', - ], + inProgress: ['https://example.com/6'], nextIndex: 6, nextUniqueKey: null, }); @@ -404,9 +388,7 @@ describe('RequestList', () => { await requestList.reclaimRequest(request6); expect(requestList.getState()).toEqual({ - inProgress: [ - 'https://example.com/6', - ], + inProgress: ['https://example.com/6'], nextIndex: 6, nextUniqueKey: null, }); @@ -422,9 +404,7 @@ describe('RequestList', () => { expect(reclaimed6.url).toBe('https://example.com/6'); expect(requestList.getState()).toEqual({ - inProgress: [ - 'https://example.com/6', - ], + inProgress: ['https://example.com/6'], nextIndex: 6, nextUniqueKey: null, }); @@ -527,11 +507,7 @@ describe('RequestList', () => { expect(requestList.areRequestsPersisted).toBe(true); const opts2 = { - sources: [ - { url: 'https://test.com/1' }, - { url: 'https://test.com/2' }, - { url: 'https://test.com/3' }, - ], + sources: [{ url: 'https://test.com/1' }, { url: 'https://test.com/2' }, { url: 'https://test.com/3' }], persistRequestsKey: PERSIST_REQUESTS_KEY, }; @@ -590,11 +566,7 @@ describe('RequestList', () => { const state = { nextIndex: 2, nextUniqueKey: 'https://www.anychart.com', - inProgress: [ - 'https://www.ams360.com', - 'https://www.anybus.com', - 'https://www.anychart.com', - ], + inProgress: ['https://www.ams360.com', 'https://www.anybus.com', 'https://www.anychart.com'], }; const requestList = await RequestList.open({ @@ -797,12 +769,7 @@ describe('RequestList', () => { }); test('should throw on invalid parameters', async () => { - const args = [ - [], - ['x', {}], - ['x', 6, {}], - ['x', [], []], - ] as const; + const args = [[], ['x', {}], ['x', 6, {}], ['x', [], []]] as const; for (const arg 
of args) { try { // @ts-ignore @@ -814,9 +781,10 @@ describe('RequestList', () => { if (e.message.match('argument to be of type `string`')) { expect(e.message).toMatch('received type `undefined`'); } else if (e.message.match('argument to be of type `array`')) { - const isMatched = e.message.match('received type `Object`') - || e.message.match('received type `number`') - || e.message.match('received type `undefined`'); + const isMatched = + e.message.match('received type `Object`') || + e.message.match('received type `number`') || + e.message.match('received type `undefined`'); expect(isMatched).toBeTruthy(); } else if (e.message.match('argument to be of type `null`')) { expect(e.message).toMatch('received type `undefined`'); diff --git a/test/core/router.test.ts b/test/core/router.test.ts index 3dc9f3270d41..d05ed45cf1ee 100644 --- a/test/core/router.test.ts +++ b/test/core/router.test.ts @@ -94,10 +94,10 @@ describe('Router', () => { const logs: string[] = []; // it should be possible to define router inline when creating router const router = Router.create({ - 'A': async (ctx) => { + A: async (ctx) => { logs.push(`label A handled with url ${ctx.request.loadedUrl}`); }, - 'B': async (ctx) => { + B: async (ctx) => { logs.push(`label B handled with url ${ctx.request.loadedUrl}`); }, }); @@ -127,7 +127,9 @@ describe('Router', () => { router.addHandler('A', async (ctx) => {}); expect(() => router.addHandler('A', async (ctx) => {})).toThrow(); const log = { info: vitest.fn(), warn: vitest.fn(), debug: vitest.fn() }; - await expect(router({ request: { loadedUrl: 'https://example.com/C', label: 'C' }, log } as any)).rejects.toThrow(MissingRouteError); + await expect( + router({ request: { loadedUrl: 'https://example.com/C', label: 'C' }, log } as any), + ).rejects.toThrow(MissingRouteError); router.addDefaultHandler(async (ctx) => {}); expect(() => router.addDefaultHandler(async (ctx) => {})).toThrow(); }); @@ -150,7 +152,7 @@ describe('Router', () => { test('addHandler accepts userdata generic', async () => { const testType = (t: T): void => {}; - const router: Router> = { + const router: Router> = { addHandler: () => {}, addDefaultHandler: () => {}, } as any; @@ -159,7 +161,7 @@ describe('Router', () => { testType<'foo'>(ctx.request.userData.foo); }); - router.addHandler<{foo: 'bar'}>('2', (ctx) => { + router.addHandler<{ foo: 'bar' }>('2', (ctx) => { testType<'bar'>(ctx.request.userData.foo); }); @@ -167,7 +169,7 @@ describe('Router', () => { testType<'foo'>(ctx.request.userData.foo); }); - router.addDefaultHandler<{foo: 'bar'}>((ctx) => { + router.addDefaultHandler<{ foo: 'bar' }>((ctx) => { testType<'bar'>(ctx.request.userData.foo); }); }); diff --git a/test/core/session_pool/session.test.ts b/test/core/session_pool/session.test.ts index f36a0bf578a2..e8f964a5f58f 100644 --- a/test/core/session_pool/session.test.ts +++ b/test/core/session_pool/session.test.ts @@ -67,7 +67,10 @@ describe('Session - testing session behaviour ', () => { let error; try { - session.setCookiesFromResponse({ headers: { Cookie: 'invaldi*{*{*{*-----***@s' }, url: 'http://localhost:1337' }); + session.setCookiesFromResponse({ + headers: { Cookie: 'invaldi*{*{*{*-----***@s' }, + url: 'http://localhost:1337', + }); } catch (e) { error = e; } @@ -182,7 +185,9 @@ describe('Session - testing session behaviour ', () => { sessionPool.blockedStatusCodes.forEach((status) => { const sess = new Session({ sessionPool }); let isCalled; - const call = () => { isCalled = true; }; + const call = () => { + isCalled = true; + }; 
sess.retire = call; expect(sess.retireOnBlockedStatusCodes(status)).toBeTruthy(); expect(isCalled).toBeTruthy(); @@ -214,9 +219,7 @@ describe('Session - testing session behaviour ', () => { test('setCookies should work for session (with expiration date: -1) cookies', () => { const url = 'https://example.com'; - const cookies = [ - { name: 'session_cookie', value: 'session-cookie-value', expires: -1 }, - ]; + const cookies = [{ name: 'session_cookie', value: 'session-cookie-value', expires: -1 }]; session = new Session({ sessionPool }); session.setCookies(cookies, url); diff --git a/test/core/session_pool/session_pool.test.ts b/test/core/session_pool/session_pool.test.ts index 7283dcc09d90..0f2430e0535b 100644 --- a/test/core/session_pool/session_pool.test.ts +++ b/test/core/session_pool/session_pool.test.ts @@ -45,16 +45,17 @@ describe('SessionPool - testing session pool', () => { persistStateKeyValueStoreId: 'TEST', persistStateKey: 'SESSION_POOL_STATE2', - createSessionFunction: () => ({} as never), - + createSessionFunction: () => ({}) as never, }; sessionPool = new SessionPool(opts); await sessionPool.initialize(); await sessionPool.teardown(); - entries(opts).filter(([key]) => key !== 'sessionOptions').forEach(([key, value]) => { - expect(sessionPool[key]).toEqual(value); - }); + entries(opts) + .filter(([key]) => key !== 'sessionOptions') + .forEach(([key, value]) => { + expect(sessionPool[key]).toEqual(value); + }); // log is appended to sessionOptions after sessionPool instantiation // @ts-expect-error private symbol expect(sessionPool.sessionOptions).toEqual({ ...opts.sessionOptions, log: expect.any(Log) }); @@ -72,15 +73,16 @@ describe('SessionPool - testing session pool', () => { persistStateKeyValueStoreId: 'TEST', persistStateKey: 'SESSION_POOL_STATE2', - createSessionFunction: () => ({} as never), - + createSessionFunction: () => ({}) as never, }; sessionPool = await SessionPool.open(opts); await sessionPool.teardown(); - entries(opts).filter(([key]) => key !== 'sessionOptions').forEach(([key, value]) => { - expect(sessionPool[key]).toEqual(value); - }); + entries(opts) + .filter(([key]) => key !== 'sessionOptions') + .forEach(([key, value]) => { + expect(sessionPool[key]).toEqual(value); + }); // log is appended to sessionOptions after sessionPool instantiation // @ts-expect-error private symbol expect(sessionPool.sessionOptions).toEqual({ ...opts.sessionOptions, log: expect.any(Log) }); @@ -111,7 +113,8 @@ describe('SessionPool - testing session pool', () => { const oldPick = sessionPool._pickSession; //eslint-disable-line // @ts-expect-error Overriding private property - sessionPool._pickSession = () => { //eslint-disable-line + sessionPool._pickSession = () => { + //eslint-disable-line isCalled = true; return oldPick.bind(sessionPool)(); }; @@ -164,7 +167,9 @@ describe('SessionPool - testing session pool', () => { const kvStore = await KeyValueStore.open(); // @ts-expect-error private symbol - const sessionPoolSaved = await kvStore.getValue>(sessionPool.persistStateKey); + const sessionPoolSaved = await kvStore.getValue>( + sessionPool.persistStateKey, + ); entries(sessionPoolSaved).forEach(([key, value]) => { if (key !== 'sessions') { @@ -382,7 +387,9 @@ describe('SessionPool - testing session pool', () => { await sessionPool.addSession({ id: 'test-session' }); await sessionPool.addSession({ id: 'test-session' }); } catch (e) { - expect((e as Error).message).toBe("Cannot add session with id 'test-session' as it already exists in the pool"); + expect((e as 
Error).message).toBe( + "Cannot add session with id 'test-session' as it already exists in the pool", + ); } expect.assertions(1); }); diff --git a/test/core/session_pool/session_utils.test.ts b/test/core/session_pool/session_utils.test.ts index 4a9d432355ed..aab3f1a98a44 100644 --- a/test/core/session_pool/session_utils.test.ts +++ b/test/core/session_pool/session_utils.test.ts @@ -5,7 +5,10 @@ import { Cookie } from 'tough-cookie'; describe('getCookiesFromResponse', () => { test('should parse cookies if set-cookie is array', () => { const headers: Dictionary = {}; - const dummyCookies = ['CSRF=e8b667; Domain=example.com; Secure', 'id=a3fWa; Expires=Wed, 21 Oct 2015 07:28:00 GMT']; + const dummyCookies = [ + 'CSRF=e8b667; Domain=example.com; Secure', + 'id=a3fWa; Expires=Wed, 21 Oct 2015 07:28:00 GMT', + ]; headers['set-cookie'] = dummyCookies; const cookies = getCookiesFromResponse({ headers }); diff --git a/test/core/storages/dataset.test.ts b/test/core/storages/dataset.test.ts index f6770d4f8570..cdc709d47d08 100644 --- a/test/core/storages/dataset.test.ts +++ b/test/core/storages/dataset.test.ts @@ -29,33 +29,23 @@ describe('dataset', () => { client: storageClient, }); - const pushItemSpy = vitest - .spyOn(dataset.client, 'pushItems'); + const pushItemSpy = vitest.spyOn(dataset.client, 'pushItems'); const mockPushItems = pushItemSpy.mockResolvedValueOnce(null); await dataset.pushData({ foo: 'bar' }); expect(mockPushItems).toBeCalledTimes(1); - expect(mockPushItems).toBeCalledWith( - JSON.stringify({ foo: 'bar' }), - ); + expect(mockPushItems).toBeCalledWith(JSON.stringify({ foo: 'bar' })); const mockPushItems2 = pushItemSpy.mockResolvedValueOnce(null); - await dataset.pushData([ - { foo: 'hotel;' }, - { foo: 'restaurant' }, - ]); + await dataset.pushData([{ foo: 'hotel;' }, { foo: 'restaurant' }]); expect(mockPushItems2).toBeCalledTimes(2); - expect(mockPushItems2).toBeCalledWith( - JSON.stringify([{ foo: 'hotel;' }, { foo: 'restaurant' }]), - ); + expect(mockPushItems2).toBeCalledWith(JSON.stringify([{ foo: 'hotel;' }, { foo: 'restaurant' }])); - const mockDelete = vitest - .spyOn(dataset.client, 'delete') - .mockResolvedValueOnce(undefined); + const mockDelete = vitest.spyOn(dataset.client, 'delete').mockResolvedValueOnce(undefined); await dataset.drop(); @@ -75,10 +65,7 @@ describe('dataset', () => { mockPushItems.mockResolvedValueOnce(null); mockPushItems.mockResolvedValueOnce(null); - await dataset.pushData([ - { foo: half }, - { bar: half }, - ]); + await dataset.pushData([{ foo: half }, { bar: half }]); expect(mockPushItems).toBeCalledTimes(2); expect(mockPushItems).toHaveBeenNthCalledWith(1, JSON.stringify([{ foo: half }])); @@ -124,13 +111,7 @@ describe('dataset', () => { const full = mockData(MAX_PAYLOAD_SIZE_BYTES); const dataset = new Dataset({ id: 'some-id', client: storageClient }); try { - await dataset.pushData([ - { foo: 0 }, - { foo: 1 }, - { foo: 2 }, - { foo: full }, - { foo: 4 }, - ]); + await dataset.pushData([{ foo: 0 }, { foo: 1 }, { foo: 2 }, { foo: full }, { foo: 4 }]); throw new Error('Should fail!'); } catch (err) { expect(err).toBeInstanceOf(Error); @@ -145,10 +126,7 @@ describe('dataset', () => { }); const expected = { - items: [ - { foo: 'bar' }, - { foo: 'hotel' }, - ], + items: [{ foo: 'bar' }, { foo: 'hotel' }], limit: 2, total: 1000, offset: 3, @@ -168,14 +146,17 @@ describe('dataset', () => { expect(result).toEqual(expected); let e; - const spy = vitest.spyOn(dataset.client, 'listItems') - .mockImplementation(() => { throw new Error('Cannot create a 
string longer than 0x3fffffe7 characters'); }); + const spy = vitest.spyOn(dataset.client, 'listItems').mockImplementation(() => { + throw new Error('Cannot create a string longer than 0x3fffffe7 characters'); + }); try { await dataset.getData(); } catch (err) { e = err; } - expect((e as Error).message).toEqual('dataset.getData(): The response is too large for parsing. You can fix this by lowering the "limit" option.'); + expect((e as Error).message).toEqual( + 'dataset.getData(): The response is too large for parsing. You can fix this by lowering the "limit" option.', + ); }); test('getInfo() should work', async () => { @@ -205,10 +186,7 @@ describe('dataset', () => { }); const firstResolve = { - items: [ - { foo: 'a' }, - { foo: 'b' }, - ], + items: [{ foo: 'a' }, { foo: 'b' }], limit: 2, total: 4, offset: 0, @@ -217,10 +195,7 @@ describe('dataset', () => { }; const secondResolve = { - items: [ - { foo: 'c' }, - { foo: 'd' }, - ], + items: [{ foo: 'c' }, { foo: 'd' }], limit: 2, total: 4, offset: 2, @@ -252,19 +227,17 @@ describe('dataset', () => { const items: Dictionary[] = []; const indexes: number[] = []; - const result = await dataset.forEach((item, index) => { - items.push(item); - indexes.push(index); - }, { - limit: 2, - }); + const result = await dataset.forEach( + (item, index) => { + items.push(item); + indexes.push(index); + }, + { + limit: 2, + }, + ); expect(result).toEqual(undefined); - expect(items).toEqual([ - { foo: 'a' }, - { foo: 'b' }, - { foo: 'c' }, - { foo: 'd' }, - ]); + expect(items).toEqual([{ foo: 'a' }, { foo: 'b' }, { foo: 'c' }, { foo: 'd' }]); expect(indexes).toEqual([0, 1, 2, 3]); restoreAndVerify(); @@ -273,11 +246,14 @@ describe('dataset', () => { test('map() should work', async () => { const { dataset, restoreAndVerify } = getRemoteDataset(); - const result = await dataset.map((item, index) => { - return { index, bar: 'xxx', ...item }; - }, { - limit: 2, - }); + const result = await dataset.map( + (item, index) => { + return { index, bar: 'xxx', ...item }; + }, + { + limit: 2, + }, + ); expect(result).toEqual([ { foo: 'a', index: 0, bar: 'xxx' }, @@ -292,12 +268,15 @@ describe('dataset', () => { test('map() should support promises', async () => { const { dataset, restoreAndVerify } = getRemoteDataset(); - const result = await dataset.map(async (item, index) => { - const res = { index, bar: 'xxx', ...item }; - return Promise.resolve(res); - }, { - limit: 2, - }); + const result = await dataset.map( + async (item, index) => { + const res = { index, bar: 'xxx', ...item }; + return Promise.resolve(res); + }, + { + limit: 2, + }, + ); expect(result).toEqual([ { foo: 'a', index: 0, bar: 'xxx' }, @@ -312,14 +291,18 @@ describe('dataset', () => { test('reduce() should work', async () => { const { dataset, restoreAndVerify } = getRemoteDataset(); - const result = await dataset.reduce((memo, item, index) => { - item.index = index; - item.bar = 'xxx'; + const result = await dataset.reduce( + (memo, item, index) => { + item.index = index; + item.bar = 'xxx'; - return memo.concat(item); - }, [], { - limit: 2, - }); + return memo.concat(item); + }, + [], + { + limit: 2, + }, + ); expect(result).toEqual([ { foo: 'a', index: 0, bar: 'xxx' }, @@ -334,14 +317,18 @@ describe('dataset', () => { test('reduce() should support promises', async () => { const { dataset, restoreAndVerify } = getRemoteDataset(); - const result = await dataset.reduce(async (memo, item, index) => { - item.index = index; - item.bar = 'xxx'; + const result = await dataset.reduce( + async (memo, 
item, index) => { + item.index = index; + item.bar = 'xxx'; - return Promise.resolve(memo.concat(item)); - }, [], { - limit: 2, - }); + return Promise.resolve(memo.concat(item)); + }, + [], + { + limit: 2, + }, + ); expect(result).toEqual([ { foo: 'a', index: 0, bar: 'xxx' }, @@ -361,10 +348,7 @@ describe('dataset', () => { }); const mockListItems = vitest.spyOn(dataset.client, 'listItems'); mockListItems.mockResolvedValueOnce({ - items: [ - { foo: 4 }, - { foo: 5 }, - ], + items: [{ foo: 4 }, { foo: 5 }], limit: 2, total: 4, offset: 0, @@ -372,10 +356,7 @@ describe('dataset', () => { desc: false, }); mockListItems.mockResolvedValueOnce({ - items: [ - { foo: 4 }, - { foo: 1 }, - ], + items: [{ foo: 4 }, { foo: 1 }], limit: 2, total: 4, offset: 2, @@ -385,12 +366,16 @@ describe('dataset', () => { const calledForIndexes: number[] = []; - const result = await dataset.reduce(async (memo, item, index) => { - calledForIndexes.push(index); - return Promise.resolve(memo.foo > item.foo ? memo : item); - }, undefined, { - limit: 2, - }); + const result = await dataset.reduce( + async (memo, item, index) => { + calledForIndexes.push(index); + return Promise.resolve(memo.foo > item.foo ? memo : item); + }, + undefined, + { + limit: 2, + }, + ); expect(mockListItems).toBeCalledTimes(2); expect(mockListItems).toHaveBeenNthCalledWith(1, { @@ -414,16 +399,28 @@ describe('dataset', () => { client: storageClient, }); // @ts-expect-error JS-side validation - await expect(dataset.pushData()).rejects.toThrow('Expected `data` to be of type `object` but received type `undefined`'); + await expect(dataset.pushData()).rejects.toThrow( + 'Expected `data` to be of type `object` but received type `undefined`', + ); // @ts-expect-error JS-side validation - await expect(dataset.pushData('')).rejects.toThrow('Expected `data` to be of type `object` but received type `string`'); + await expect(dataset.pushData('')).rejects.toThrow( + 'Expected `data` to be of type `object` but received type `string`', + ); // @ts-expect-error JS-side validation - await expect(dataset.pushData(123)).rejects.toThrow('Expected `data` to be of type `object` but received type `number`'); + await expect(dataset.pushData(123)).rejects.toThrow( + 'Expected `data` to be of type `object` but received type `number`', + ); // @ts-expect-error JS-side validation - await expect(dataset.pushData(true)).rejects.toThrow('Expected `data` to be of type `object` but received type `boolean`'); + await expect(dataset.pushData(true)).rejects.toThrow( + 'Expected `data` to be of type `object` but received type `boolean`', + ); // @ts-expect-error JS-side validation - await expect(dataset.pushData(false)).rejects.toThrow('Expected `data` to be of type `object` but received type `boolean`'); - await expect(dataset.pushData(() => {})).rejects.toThrow('Data item is not an object. You can push only objects into a dataset.'); + await expect(dataset.pushData(false)).rejects.toThrow( + 'Expected `data` to be of type `object` but received type `boolean`', + ); + await expect(dataset.pushData(() => {})).rejects.toThrow( + 'Data item is not an object. 
You can push only objects into a dataset.', + ); const circularObj = {} as Dictionary; circularObj.xxx = circularObj; @@ -478,8 +475,12 @@ describe('dataset', () => { expect(chunkBySize(triple, size + 1)).toEqual(triple); expect(chunkBySize(triple, size + 2)).toEqual([chunk, chunk, chunk]); // Chunks smaller items together - expect(chunkBySize(triple, (2 * size) + 3)).toEqual([`[${json},${json}]`, chunk]); - expect(chunkBySize([...triple, ...triple], (2 * size) + 3)).toEqual([`[${json},${json}]`, `[${json},${json}]`, `[${json},${json}]`]); + expect(chunkBySize(triple, 2 * size + 3)).toEqual([`[${json},${json}]`, chunk]); + expect(chunkBySize([...triple, ...triple], 2 * size + 3)).toEqual([ + `[${json},${json}]`, + `[${json},${json}]`, + `[${json},${json}]`, + ]); }); describe('exportToJSON', () => { diff --git a/test/core/storages/key_value_store.test.ts b/test/core/storages/key_value_store.test.ts index dd01ab1d2c09..31941ebbe069 100644 --- a/test/core/storages/key_value_store.test.ts +++ b/test/core/storages/key_value_store.test.ts @@ -105,10 +105,16 @@ describe('KeyValueStore', () => { }); // @ts-expect-error JS-side validation - await expect(store.getValue()).rejects.toThrow('Expected argument to be of type `string` but received type `undefined`'); + await expect(store.getValue()).rejects.toThrow( + 'Expected argument to be of type `string` but received type `undefined`', + ); // @ts-expect-error JS-side validation - await expect(store.getValue({})).rejects.toThrow('Expected argument to be of type `string` but received type `Object`'); - await expect(store.getValue(null)).rejects.toThrow('Expected argument to be of type `string` but received type `null`'); + await expect(store.getValue({})).rejects.toThrow( + 'Expected argument to be of type `string` but received type `Object`', + ); + await expect(store.getValue(null)).rejects.toThrow( + 'Expected argument to be of type `string` but received type `null`', + ); await expect(store.getValue('')).rejects.toThrow('Expected string to not be empty'); }); @@ -136,10 +142,16 @@ describe('KeyValueStore', () => { }); // @ts-expect-error JS-side validation - await expect(store.recordExists()).rejects.toThrow('Expected argument to be of type `string` but received type `undefined`'); + await expect(store.recordExists()).rejects.toThrow( + 'Expected argument to be of type `string` but received type `undefined`', + ); // @ts-expect-error JS-side validation - await expect(store.recordExists({})).rejects.toThrow('Expected argument to be of type `string` but received type `Object`'); - await expect(store.recordExists(null)).rejects.toThrow('Expected argument to be of type `string` but received type `null`'); + await expect(store.recordExists({})).rejects.toThrow( + 'Expected argument to be of type `string` but received type `Object`', + ); + await expect(store.recordExists(null)).rejects.toThrow( + 'Expected argument to be of type `string` but received type `null`', + ); await expect(store.recordExists('')).rejects.toThrow('Expected string to not be empty'); }); @@ -162,60 +174,76 @@ describe('KeyValueStore', () => { }); // @ts-expect-error JS-side validation - await expect(store.setValue()).rejects.toThrow('Expected `key` to be of type `string` but received type `undefined`'); + await expect(store.setValue()).rejects.toThrow( + 'Expected `key` to be of type `string` but received type `undefined`', + ); await expect(store.setValue('', null)).rejects.toThrow('Expected string `key` to not be empty'); await expect(store.setValue('', 'some 
value')).rejects.toThrow('Expected string `key` to not be empty'); // @ts-expect-error JS-side validation - await expect(store.setValue({}, 'some value')) - .rejects.toThrow('Expected `key` to be of type `string` but received type `Object`'); + await expect(store.setValue({}, 'some value')).rejects.toThrow( + 'Expected `key` to be of type `string` but received type `Object`', + ); // @ts-expect-error JS-side validation - await expect(store.setValue(123, 'some value')) - .rejects.toThrow('Expected `key` to be of type `string` but received type `number`'); + await expect(store.setValue(123, 'some value')).rejects.toThrow( + 'Expected `key` to be of type `string` but received type `number`', + ); - const valueErrMsg = 'The "value" parameter must be a String, Buffer or Stream when "options.contentType" is specified'; + const valueErrMsg = + 'The "value" parameter must be a String, Buffer or Stream when "options.contentType" is specified'; await expect(store.setValue('key', {}, { contentType: 'image/png' })).rejects.toThrow(valueErrMsg); await expect(store.setValue('key', 12345, { contentType: 'image/png' })).rejects.toThrow(valueErrMsg); await expect(store.setValue('key', () => {}, { contentType: 'image/png' })).rejects.toThrow(valueErrMsg); // @ts-expect-error JS-side validation - await expect(store.setValue('key', {}, 123)) - .rejects.toThrow('Expected argument to be of type `object` but received type `number`'); + await expect(store.setValue('key', {}, 123)).rejects.toThrow( + 'Expected argument to be of type `object` but received type `number`', + ); // @ts-expect-error JS-side validation - await expect(store.setValue('key', {}, 'bla/bla')) - .rejects.toThrow('Expected argument to be of type `object` but received type `string`'); + await expect(store.setValue('key', {}, 'bla/bla')).rejects.toThrow( + 'Expected argument to be of type `object` but received type `string`', + ); // @ts-expect-error JS-side validation - await expect(store.setValue('key', {}, true)) - .rejects.toThrow('Expected argument to be of type `object` but received type `boolean`'); + await expect(store.setValue('key', {}, true)).rejects.toThrow( + 'Expected argument to be of type `object` but received type `boolean`', + ); const circularObj = {} as Dictionary; circularObj.xxx = circularObj; - const circularErrMsg = 'The "value" parameter cannot be stringified to JSON: Converting circular structure to JSON'; - const undefinedErrMsg = 'The "value" parameter was stringified to JSON and returned undefined. ' - + 'Make sure you\'re not trying to stringify an undefined value.'; + const circularErrMsg = + 'The "value" parameter cannot be stringified to JSON: Converting circular structure to JSON'; + const undefinedErrMsg = + 'The "value" parameter was stringified to JSON and returned undefined. 
' + + "Make sure you're not trying to stringify an undefined value."; await expect(store.setValue('key', circularObj)).rejects.toThrow(circularErrMsg); await expect(store.setValue('key', undefined)).rejects.toThrow(undefinedErrMsg); // @ts-expect-error JS-side validation await expect(store.setValue('key')).rejects.toThrow(undefinedErrMsg); const contTypeRedundantErrMsg = 'Expected property string `contentType` to not be empty in object'; - await expect(store.setValue('key', null, { contentType: 'image/png' })) - .rejects.toThrow('The "value" parameter must be a String, Buffer or Stream when "options.contentType" is specified.'); + await expect(store.setValue('key', null, { contentType: 'image/png' })).rejects.toThrow( + 'The "value" parameter must be a String, Buffer or Stream when "options.contentType" is specified.', + ); await expect(store.setValue('key', null, { contentType: '' })).rejects.toThrow(contTypeRedundantErrMsg); // @ts-expect-error Type '{}' is not assignable to type 'string'. - await expect(store.setValue('key', null, { contentType: {} })) - .rejects.toThrow('The "value" parameter must be a String, Buffer or Stream when "options.contentType" is specified.'); + await expect(store.setValue('key', null, { contentType: {} })).rejects.toThrow( + 'The "value" parameter must be a String, Buffer or Stream when "options.contentType" is specified.', + ); // @ts-expect-error Type 'number' is not assignable to type 'string'. - await expect(store.setValue('key', 'value', { contentType: 123 })) - .rejects.toThrow('Expected property `contentType` to be of type `string` but received type `number` in object'); + await expect(store.setValue('key', 'value', { contentType: 123 })).rejects.toThrow( + 'Expected property `contentType` to be of type `string` but received type `number` in object', + ); // @ts-expect-error Type '{}' is not assignable to type 'string'. - await expect(store.setValue('key', 'value', { contentType: {} })) - .rejects.toThrow('Expected property `contentType` to be of type `string` but received type `Object` in object'); + await expect(store.setValue('key', 'value', { contentType: {} })).rejects.toThrow( + 'Expected property `contentType` to be of type `string` but received type `Object` in object', + ); // @ts-expect-error Type 'Date' is not assignable to type 'string'. 
- await expect(store.setValue('key', 'value', { contentType: new Date() })) - .rejects.toThrow('Expected property `contentType` to be of type `string` but received type `Date` in object'); - await expect(store.setValue('key', 'value', { contentType: '' })) - .rejects.toThrow('Expected property string `contentType` to not be empty in object'); + await expect(store.setValue('key', 'value', { contentType: new Date() })).rejects.toThrow( + 'Expected property `contentType` to be of type `string` but received type `Date` in object', + ); + await expect(store.setValue('key', 'value', { contentType: '' })).rejects.toThrow( + 'Expected property string `contentType` to not be empty in object', + ); }); test('throws on invalid key', async () => { @@ -342,7 +370,7 @@ describe('KeyValueStore', () => { }); const mockSetRecord = vitest - // @ts-expect-error Accessing private property + // @ts-expect-error Accessing private property .spyOn(store.client, 'setRecord') .mockResolvedValueOnce(null); @@ -462,9 +490,12 @@ describe('KeyValueStore', () => { }); const results: [string, number, { size: number }][] = []; - await store.forEachKey(async (key, index, info) => { - results.push([key, index, info]); - }, { exclusiveStartKey: 'key0' }); + await store.forEachKey( + async (key, index, info) => { + results.push([key, index, info]); + }, + { exclusiveStartKey: 'key0' }, + ); expect(mockListKeys).toBeCalledTimes(3); expect(mockListKeys).toHaveBeenNthCalledWith(1, { exclusiveStartKey: 'key0' }); diff --git a/test/core/storages/request_queue.test.ts b/test/core/storages/request_queue.test.ts index 8ad854d9e114..a45bcff3aea5 100644 --- a/test/core/storages/request_queue.test.ts +++ b/test/core/storages/request_queue.test.ts @@ -37,9 +37,7 @@ describe('RequestQueue remote', () => { wasAlreadyHandled: false, wasAlreadyPresent: false, }; - const mockAddRequest = vitest - .spyOn(queue.client, 'addRequest') - .mockResolvedValueOnce(firstResolveValue); + const mockAddRequest = vitest.spyOn(queue.client, 'addRequest').mockResolvedValueOnce(firstResolveValue); const requestOptions = { url: 'http://example.com/a' }; const queueOperationInfo1 = await queue.addRequest(requestOptions); @@ -90,14 +88,21 @@ describe('RequestQueue remote', () => { expect(queue.inProgressCount()).toBe(1); // Test validations - await queue.markRequestHandled(new Request({ id: 'XXX', url: 'https://example.com' })) - .catch((err) => expect(err.message).toMatch(/Cannot mark request XXX as handled, because it is not in progress/)); - await queue.reclaimRequest(new Request({ id: 'XXX', url: 'https://example.com' })) + await queue + .markRequestHandled(new Request({ id: 'XXX', url: 'https://example.com' })) + .catch((err) => + expect(err.message).toMatch(/Cannot mark request XXX as handled, because it is not in progress/), + ); + await queue + .reclaimRequest(new Request({ id: 'XXX', url: 'https://example.com' })) .catch((err) => expect(err.message).toMatch(/Cannot reclaim request XXX, because it is not in progress/)); - await queue.addRequest(new Request({ id: 'id-already-set', url: 'https://example.com' })) - .catch((err) => expect(err.message).toMatch( - 'Expected property `id` to be of type `undefined` but received type `string` in object', - )); + await queue + .addRequest(new Request({ id: 'id-already-set', url: 'https://example.com' })) + .catch((err) => + expect(err.message).toMatch( + 'Expected property `id` to be of type `undefined` but received type `string` in object', + ), + ); // getRequest() returns null if object was not found. 
mockGetRequest.mockResolvedValueOnce(null); @@ -340,9 +345,7 @@ describe('RequestQueue remote', () => { // Query queue head with request A const listHeadMock = vitest.spyOn(queue.client, 'listHead'); listHeadMock.mockResolvedValueOnce({ - items: [ - { id: 'a', uniqueKey: 'aaa' }, - ], + items: [{ id: 'a', uniqueKey: 'aaa' }], } as never); expect(await queue.isEmpty()).toBe(false); @@ -403,9 +406,7 @@ describe('RequestQueue remote', () => { }); listHeadMock.mockResolvedValueOnce({ - items: [ - { id: 'a', uniqueKey: 'aaa' }, - ], + items: [{ id: 'a', uniqueKey: 'aaa' }], } as never); const fetchedRequest2 = await queue.fetchNextRequest(); @@ -646,9 +647,7 @@ describe('RequestQueue remote', () => { hadMultipleClients: false, }; - const getMock = vitest - .spyOn(queue.client, 'get') - .mockResolvedValueOnce(expected); + const getMock = vitest.spyOn(queue.client, 'get').mockResolvedValueOnce(expected); const result = await queue.getInfo(); expect(result).toEqual(expected); @@ -658,9 +657,7 @@ describe('RequestQueue remote', () => { test('drop() works', async () => { const queue = new RequestQueue({ id: 'some-id', name: 'some-name', client: storageClient }); - const deleteMock = vitest - .spyOn(queue.client, 'delete') - .mockResolvedValueOnce(undefined); + const deleteMock = vitest.spyOn(queue.client, 'delete').mockResolvedValueOnce(undefined); await queue.drop(); expect(deleteMock).toBeCalledTimes(1); @@ -717,15 +714,8 @@ describe('RequestQueue with requestsFromUrl', () => { test('should correctly load list from hosted files in correct order', async () => { const spy = vitest.spyOn(RequestQueue.prototype as any, '_downloadListOfUrls'); - const list1 = [ - 'https://example.com', - 'https://google.com', - 'https://wired.com', - ]; - const list2 = [ - 'https://another.com', - 'https://page.com', - ]; + const list1 = ['https://example.com', 'https://google.com', 'https://wired.com']; + const list2 = ['https://another.com', 'https://page.com']; spy.mockImplementationOnce(() => new Promise((resolve) => setTimeout(resolve(list1) as any, 100)) as any); spy.mockResolvedValueOnce(list2); @@ -767,11 +757,7 @@ describe('RequestQueue with requestsFromUrl', () => { }); test('should fix gdoc sharing url in `requestsFromUrl` automatically (GH issue #639)', async () => { - const list = [ - 'https://example.com', - 'https://google.com', - 'https://wired.com', - ]; + const list = ['https://example.com', 'https://google.com', 'https://wired.com']; const wrongUrls = [ 'https://docs.google.com/spreadsheets/d/11UGSBOSXy5Ov2WEP9nr4kSIxQJmH18zh-5onKtBsovU', 'https://docs.google.com/spreadsheets/d/11UGSBOSXy5Ov2WEP9nr4kSIxQJmH18zh-5onKtBsovU/', @@ -780,7 +766,8 @@ describe('RequestQueue with requestsFromUrl', () => { 'https://docs.google.com/spreadsheets/d/11UGSBOSXy5Ov2WEP9nr4kSIxQJmH18zh-5onKtBsovU/?q=blablabla', 'https://docs.google.com/spreadsheets/d/11UGSBOSXy5Ov2WEP9nr4kSIxQJmH18zh-5onKtBsovU/edit#gid=0', ]; - const correctUrl = 'https://docs.google.com/spreadsheets/d/11UGSBOSXy5Ov2WEP9nr4kSIxQJmH18zh-5onKtBsovU/gviz/tq?tqx=out:csv'; + const correctUrl = + 'https://docs.google.com/spreadsheets/d/11UGSBOSXy5Ov2WEP9nr4kSIxQJmH18zh-5onKtBsovU/gviz/tq?tqx=out:csv'; gotScrapingSpy.mockResolvedValue({ body: JSON.stringify(list) } as any); @@ -812,10 +799,7 @@ describe('RequestQueue with requestsFromUrl', () => { }); test('should use the defined proxy server when using `requestsFromUrl`', async () => { - const proxyUrls = [ - 'http://proxyurl.usedforthe.download', - 'http://another.proxy.url', - ]; + const proxyUrls = 
['http://proxyurl.usedforthe.download', 'http://another.proxy.url']; const spy = vitest.spyOn(RequestQueue.prototype as any, '_downloadListOfUrls'); spy.mockResolvedValue([]); @@ -838,7 +822,7 @@ describe('RequestQueue with requestsFromUrl', () => { describe('RequestQueue v2', () => { const totalRequestsPerTest = 50; - function calculateHistogram(requests: { uniqueKey: string }[]) : number[] { + function calculateHistogram(requests: { uniqueKey: string }[]): number[] { const histogram: number[] = []; for (const item of requests) { const key = item.uniqueKey; @@ -856,7 +840,9 @@ describe('RequestQueue v2', () => { } function getUniqueRequests(count: number) { - return new Array(count).fill(0).map((_, i) => new Request({ url: `http://example.com/${i}`, uniqueKey: String(i) })); + return new Array(count) + .fill(0) + .map((_, i) => new Request({ url: `http://example.com/${i}`, uniqueKey: String(i) })); } test('listAndLockHead works as expected', async () => { @@ -877,11 +863,17 @@ describe('RequestQueue v2', () => { const queue = await getEmptyQueue('lock-timers'); await queue.addRequests(getUniqueRequests(totalRequestsPerTest)); - const { items: firstFetch } = await queue.client.listAndLockHead({ limit: totalRequestsPerTest / 2, lockSecs: 60 }); + const { items: firstFetch } = await queue.client.listAndLockHead({ + limit: totalRequestsPerTest / 2, + lockSecs: 60, + }); vitest.advanceTimersByTime(65000); - const { items: secondFetch } = await queue.client.listAndLockHead({ limit: totalRequestsPerTest / 2, lockSecs: 60 }); + const { items: secondFetch } = await queue.client.listAndLockHead({ + limit: totalRequestsPerTest / 2, + lockSecs: 60, + }); const histogram = calculateHistogram([...firstFetch, ...secondFetch]); expect(histogram).toEqual(Array(totalRequestsPerTest / 2).fill(2)); diff --git a/test/e2e/.eslintrc.json b/test/e2e/.eslintrc.json index a81cd4b2650c..ab289062e7d9 100644 --- a/test/e2e/.eslintrc.json +++ b/test/e2e/.eslintrc.json @@ -2,16 +2,12 @@ "root": true, "extends": "@apify/eslint-config-ts", "parserOptions": { - "project": null, + "project": null, "ecmaVersion": 2022 }, - "ignorePatterns": [ - "node_modules", - "dist", - "**/*.d.ts" - ], + "ignorePatterns": ["node_modules", "dist", "**/*.d.ts"], "rules": { - "@typescript-eslint/ban-ts-comment": 0, + "@typescript-eslint/ban-ts-comment": 0, "import/extensions": 0, "import/no-extraneous-dependencies": 0 } diff --git a/test/e2e/automatic-persist-value/actor/main.js b/test/e2e/automatic-persist-value/actor/main.js index cc9a0f6f566a..2d516ac5ca6b 100644 --- a/test/e2e/automatic-persist-value/actor/main.js +++ b/test/e2e/automatic-persist-value/actor/main.js @@ -3,7 +3,10 @@ import { BasicCrawler } from '@crawlee/basic'; const mainOptions = { exit: Actor.isAtHome(), - storage: process.env.STORAGE_IMPLEMENTATION === 'LOCAL' ? new (await import('@apify/storage-local')).ApifyStorageLocal() : undefined, + storage: + process.env.STORAGE_IMPLEMENTATION === 'LOCAL' + ? 
new (await import('@apify/storage-local')).ApifyStorageLocal() + : undefined, }; await Actor.main(async () => { diff --git a/test/e2e/automatic-persist-value/test.mjs b/test/e2e/automatic-persist-value/test.mjs index b5bb21265411..329ac0574f80 100644 --- a/test/e2e/automatic-persist-value/test.mjs +++ b/test/e2e/automatic-persist-value/test.mjs @@ -16,4 +16,7 @@ await expect(item !== undefined, 'Key-value store auto-saved value is named "cra const parsed = JSON.parse(item.raw.toString()); await expect(typeof parsed === 'object' && parsed !== null, 'Key-value store auto-save value is a non-nullable object'); -await expect(parsed.crawlee === 'awesome!', 'Key-value store auto-save value has a property "crawlee" that is set to "awesome!"'); +await expect( + parsed.crawlee === 'awesome!', + 'Key-value store auto-save value has a property "crawlee" that is set to "awesome!"', +); diff --git a/test/e2e/autoscaling-max-tasks-per-minute/actor/main.js b/test/e2e/autoscaling-max-tasks-per-minute/actor/main.js index 2bd3e2bc1f9b..4b977736572a 100644 --- a/test/e2e/autoscaling-max-tasks-per-minute/actor/main.js +++ b/test/e2e/autoscaling-max-tasks-per-minute/actor/main.js @@ -8,7 +8,10 @@ const crawlerLogger = defaultLog.child({ const mainOptions = { exit: Actor.isAtHome(), - storage: process.env.STORAGE_IMPLEMENTATION === 'LOCAL' ? new (await import('@apify/storage-local')).ApifyStorageLocal() : undefined, + storage: + process.env.STORAGE_IMPLEMENTATION === 'LOCAL' + ? new (await import('@apify/storage-local')).ApifyStorageLocal() + : undefined, }; let crawlCalledAt = Date.now(); diff --git a/test/e2e/cheerio-default-ts/actor/.eslintrc.json b/test/e2e/cheerio-default-ts/actor/.eslintrc.json index 0c4c1ee86c95..337ad01eb73e 100644 --- a/test/e2e/cheerio-default-ts/actor/.eslintrc.json +++ b/test/e2e/cheerio-default-ts/actor/.eslintrc.json @@ -2,7 +2,7 @@ "root": true, "extends": "../../.eslintrc.json", "parserOptions": { - "project": "./test/e2e/cheerio-default-ts/actor/tsconfig.json", + "project": "./test/e2e/cheerio-default-ts/actor/tsconfig.json", "ecmaVersion": 2022 } } diff --git a/test/e2e/cheerio-default/actor/main.js b/test/e2e/cheerio-default/actor/main.js index 4144f74194ac..a1a8b9357111 100644 --- a/test/e2e/cheerio-default/actor/main.js +++ b/test/e2e/cheerio-default/actor/main.js @@ -3,13 +3,18 @@ import { CheerioCrawler, Dataset } from '@crawlee/cheerio'; const mainOptions = { exit: Actor.isAtHome(), - storage: process.env.STORAGE_IMPLEMENTATION === 'LOCAL' ? new (await import('@apify/storage-local')).ApifyStorageLocal() : undefined, + storage: + process.env.STORAGE_IMPLEMENTATION === 'LOCAL' + ? 
new (await import('@apify/storage-local')).ApifyStorageLocal() + : undefined, }; await Actor.main(async () => { const crawler = new CheerioCrawler({ statusMessageCallback: async (ctx) => { - return ctx.crawler.setStatusMessage(`this is status message from ${new Date().toISOString()}`, { level: 'INFO' }); + return ctx.crawler.setStatusMessage(`this is status message from ${new Date().toISOString()}`, { + level: 'INFO', + }); }, statusMessageLoggingInterval: 1, async requestHandler({ $, enqueueLinks, request, log }) { diff --git a/test/e2e/cheerio-enqueue-links-base/actor/main.js b/test/e2e/cheerio-enqueue-links-base/actor/main.js index 514fa7934933..ac2ad0848a44 100644 --- a/test/e2e/cheerio-enqueue-links-base/actor/main.js +++ b/test/e2e/cheerio-enqueue-links-base/actor/main.js @@ -3,7 +3,10 @@ import { CheerioCrawler, Dataset } from '@crawlee/cheerio'; const mainOptions = { exit: Actor.isAtHome(), - storage: process.env.STORAGE_IMPLEMENTATION === 'LOCAL' ? new (await import('@apify/storage-local')).ApifyStorageLocal() : undefined, + storage: + process.env.STORAGE_IMPLEMENTATION === 'LOCAL' + ? new (await import('@apify/storage-local')).ApifyStorageLocal() + : undefined, }; await Actor.main(async () => { diff --git a/test/e2e/cheerio-enqueue-links/actor/main.js b/test/e2e/cheerio-enqueue-links/actor/main.js index 8eea5b3f6c8a..2fdd1051af5c 100644 --- a/test/e2e/cheerio-enqueue-links/actor/main.js +++ b/test/e2e/cheerio-enqueue-links/actor/main.js @@ -4,7 +4,10 @@ import deepEqual from 'deep-equal'; const mainOptions = { exit: Actor.isAtHome(), - storage: process.env.STORAGE_IMPLEMENTATION === 'LOCAL' ? new (await import('@apify/storage-local')).ApifyStorageLocal() : undefined, + storage: + process.env.STORAGE_IMPLEMENTATION === 'LOCAL' + ? new (await import('@apify/storage-local')).ApifyStorageLocal() + : undefined, }; await Actor.main(async () => { diff --git a/test/e2e/cheerio-ignore-ssl-errors/actor/main.js b/test/e2e/cheerio-ignore-ssl-errors/actor/main.js index 55d3d514591d..0686f7044124 100644 --- a/test/e2e/cheerio-ignore-ssl-errors/actor/main.js +++ b/test/e2e/cheerio-ignore-ssl-errors/actor/main.js @@ -3,14 +3,20 @@ import { CheerioCrawler, Dataset } from '@crawlee/cheerio'; const mainOptions = { exit: Actor.isAtHome(), - storage: process.env.STORAGE_IMPLEMENTATION === 'LOCAL' ? new (await import('@apify/storage-local')).ApifyStorageLocal() : undefined, + storage: + process.env.STORAGE_IMPLEMENTATION === 'LOCAL' + ? 
new (await import('@apify/storage-local')).ApifyStorageLocal() + : undefined, }; await Actor.main(async () => { const crawler = new CheerioCrawler({ ignoreSslErrors: true, async requestHandler({ $, enqueueLinks, request, log }) { - const { url, userData: { label } } = request; + const { + url, + userData: { label }, + } = request; if (label === 'START') { log.info('Bad ssl page opened!'); diff --git a/test/e2e/cheerio-ignore-ssl-errors/test.mjs b/test/e2e/cheerio-ignore-ssl-errors/test.mjs index 7ca6b9b20597..235afc5f1717 100644 --- a/test/e2e/cheerio-ignore-ssl-errors/test.mjs +++ b/test/e2e/cheerio-ignore-ssl-errors/test.mjs @@ -7,7 +7,4 @@ const { stats, datasetItems } = await runActor(testActorDirname); await expect(stats.requestsFinished > 20, 'All requests finished'); await expect(datasetItems.length > 20, 'Minimum number of dataset items'); -await expect( - validateDataset(datasetItems, ['url', 'title']), - 'Dataset items validation', -); +await expect(validateDataset(datasetItems, ['url', 'title']), 'Dataset items validation'); diff --git a/test/e2e/cheerio-initial-cookies/actor/main.js b/test/e2e/cheerio-initial-cookies/actor/main.js index e58ac6243401..a1dcf6874477 100644 --- a/test/e2e/cheerio-initial-cookies/actor/main.js +++ b/test/e2e/cheerio-initial-cookies/actor/main.js @@ -26,25 +26,33 @@ const expectedCookies = [ const mainOptions = { exit: Actor.isAtHome(), - storage: process.env.STORAGE_IMPLEMENTATION === 'LOCAL' ? new (await import('@apify/storage-local')).ApifyStorageLocal() : undefined, + storage: + process.env.STORAGE_IMPLEMENTATION === 'LOCAL' + ? new (await import('@apify/storage-local')).ApifyStorageLocal() + : undefined, }; await Actor.main(async () => { const crawler = new CheerioCrawler({ additionalMimeTypes: ['application/json'], - preNavigationHooks: [({ session, request }, gotOptions) => { - session.setCookies([ - { - name: 'session', - value: 'true', - }, - ], request.url); - request.headers.cookie = 'hook_request=true'; + preNavigationHooks: [ + ({ session, request }, gotOptions) => { + session.setCookies( + [ + { + name: 'session', + value: 'true', + }, + ], + request.url, + ); + request.headers.cookie = 'hook_request=true'; - gotOptions.headers ??= {}; - gotOptions.headers.Cookie = 'got_options_upper_case=true'; - gotOptions.headers.cookie = 'got_options_lower_case=true'; - }], + gotOptions.headers ??= {}; + gotOptions.headers.Cookie = 'got_options_upper_case=true'; + gotOptions.headers.cookie = 'got_options_lower_case=true'; + }, + ], async requestHandler({ json }) { const initialCookiesLength = expectedCookies.length; @@ -56,7 +64,11 @@ await Actor.main(async () => { let numberOfMatchingCookies = 0; for (const cookie of expectedCookies) { - if (pageCookies.some((pageCookie) => pageCookie.name === cookie.name && pageCookie.value === cookie.value)) { + if ( + pageCookies.some( + (pageCookie) => pageCookie.name === cookie.name && pageCookie.value === cookie.value, + ) + ) { numberOfMatchingCookies++; } } diff --git a/test/e2e/cheerio-initial-cookies/test.mjs b/test/e2e/cheerio-initial-cookies/test.mjs index 6987142a00e4..e09a30125dde 100644 --- a/test/e2e/cheerio-initial-cookies/test.mjs +++ b/test/e2e/cheerio-initial-cookies/test.mjs @@ -9,6 +9,6 @@ await expect(stats.requestsFinished === 1, 'All requests finished'); await expect(datasetItems[0].numberOfMatchingCookies === 5, 'Number of page cookies'); await expect( datasetItems[0].numberOfMatchingCookies === datasetItems[0].initialCookiesLength, - `Page cookies match the initial defined cookies. 
Number of non-matching cookies is ` - + `${datasetItems[0].initialCookiesLength - datasetItems[0].numberOfMatchingCookies}`, + `Page cookies match the initial defined cookies. Number of non-matching cookies is ` + + `${datasetItems[0].initialCookiesLength - datasetItems[0].numberOfMatchingCookies}`, ); diff --git a/test/e2e/cheerio-max-requests/actor/main.js b/test/e2e/cheerio-max-requests/actor/main.js index 7299c3997c92..0b0e27bd2bfa 100644 --- a/test/e2e/cheerio-max-requests/actor/main.js +++ b/test/e2e/cheerio-max-requests/actor/main.js @@ -3,7 +3,10 @@ import { CheerioCrawler, Dataset } from '@crawlee/cheerio'; const mainOptions = { exit: Actor.isAtHome(), - storage: process.env.STORAGE_IMPLEMENTATION === 'LOCAL' ? new (await import('@apify/storage-local')).ApifyStorageLocal() : undefined, + storage: + process.env.STORAGE_IMPLEMENTATION === 'LOCAL' + ? new (await import('@apify/storage-local')).ApifyStorageLocal() + : undefined, }; await Actor.main(async () => { @@ -11,16 +14,23 @@ await Actor.main(async () => { maxRequestsPerCrawl: 10, autoscaledPoolOptions: { desiredConcurrency: 2 }, async requestHandler({ $, request }) { - const { url, userData: { label } } = request; + const { + url, + userData: { label }, + } = request; if (label === 'START') { - const links = $('a.card').toArray().map((item) => $(item).attr('href')); + const links = $('a.card') + .toArray() + .map((item) => $(item).attr('href')); for (const link of links) { const actorDetailUrl = `https://crawlee.dev${link}`; - await crawler.addRequests([{ - url: actorDetailUrl, - userData: { label: 'DETAIL' }, - }]); + await crawler.addRequests([ + { + url: actorDetailUrl, + userData: { label: 'DETAIL' }, + }, + ]); } } else if (label === 'DETAIL') { const uniqueIdentifier = url.split('/').slice(-2).join('/'); diff --git a/test/e2e/cheerio-page-info/actor/main.js b/test/e2e/cheerio-page-info/actor/main.js index dd8bbeef5777..3d4eaf06fe67 100644 --- a/test/e2e/cheerio-page-info/actor/main.js +++ b/test/e2e/cheerio-page-info/actor/main.js @@ -3,7 +3,10 @@ import { CheerioCrawler, createCheerioRouter, Dataset } from '@crawlee/cheerio'; const mainOptions = { exit: Actor.isAtHome(), - storage: process.env.STORAGE_IMPLEMENTATION === 'LOCAL' ? new (await import('@apify/storage-local')).ApifyStorageLocal() : undefined, + storage: + process.env.STORAGE_IMPLEMENTATION === 'LOCAL' + ? new (await import('@apify/storage-local')).ApifyStorageLocal() + : undefined, }; const router = createCheerioRouter(); diff --git a/test/e2e/cheerio-request-queue-v2/actor/main.js b/test/e2e/cheerio-request-queue-v2/actor/main.js index ee59ab9c54d9..fc5336197e69 100644 --- a/test/e2e/cheerio-request-queue-v2/actor/main.js +++ b/test/e2e/cheerio-request-queue-v2/actor/main.js @@ -3,7 +3,10 @@ import { CheerioCrawler, Dataset } from '@crawlee/cheerio'; const mainOptions = { exit: Actor.isAtHome(), - storage: process.env.STORAGE_IMPLEMENTATION === 'LOCAL' ? new (await import('@apify/storage-local')).ApifyStorageLocal() : undefined, + storage: + process.env.STORAGE_IMPLEMENTATION === 'LOCAL' + ? 
new (await import('@apify/storage-local')).ApifyStorageLocal() + : undefined, }; await Actor.main(async () => { diff --git a/test/e2e/cheerio-throw-on-ssl-errors/actor/main.js b/test/e2e/cheerio-throw-on-ssl-errors/actor/main.js index be3d34bead74..9cb577437158 100644 --- a/test/e2e/cheerio-throw-on-ssl-errors/actor/main.js +++ b/test/e2e/cheerio-throw-on-ssl-errors/actor/main.js @@ -3,14 +3,19 @@ import { CheerioCrawler, Dataset } from '@crawlee/cheerio'; const mainOptions = { exit: Actor.isAtHome(), - storage: process.env.STORAGE_IMPLEMENTATION === 'LOCAL' ? new (await import('@apify/storage-local')).ApifyStorageLocal() : undefined, + storage: + process.env.STORAGE_IMPLEMENTATION === 'LOCAL' + ? new (await import('@apify/storage-local')).ApifyStorageLocal() + : undefined, }; await Actor.main(async () => { const crawler = new CheerioCrawler({ ignoreSslErrors: false, async requestHandler({ $, enqueueLinks, request, log }) { - const { userData: { label } } = request; + const { + userData: { label }, + } = request; if (label === 'START') { log.info('Bad ssl page opened!'); diff --git a/test/e2e/input-json5/actor/main.js b/test/e2e/input-json5/actor/main.js index bd137ca0c34e..b554cf7c47dc 100644 --- a/test/e2e/input-json5/actor/main.js +++ b/test/e2e/input-json5/actor/main.js @@ -2,7 +2,10 @@ import { Actor, Dataset, KeyValueStore, log } from 'apify'; const mainOptions = { exit: Actor.isAtHome(), - storage: process.env.STORAGE_IMPLEMENTATION === 'LOCAL' ? new (await import('@apify/storage-local')).ApifyStorageLocal() : undefined, + storage: + process.env.STORAGE_IMPLEMENTATION === 'LOCAL' + ? new (await import('@apify/storage-local')).ApifyStorageLocal() + : undefined, }; await Actor.main(async () => { diff --git a/test/e2e/input-json5/test.mjs b/test/e2e/input-json5/test.mjs index 6a88d2d48f13..b2444904b5d4 100644 --- a/test/e2e/input-json5/test.mjs +++ b/test/e2e/input-json5/test.mjs @@ -10,8 +10,12 @@ await initialize(testActorDirname); const { datasetItems } = await runActor(testActorDirname); await expect(datasetItems.length === 1, 'Number of dataset items'); -await expect(JSON.stringify(datasetItems) === JSON.stringify([ - { - hello: 'world', - }, -]), 'Dataset items validation'); +await expect( + JSON.stringify(datasetItems) === + JSON.stringify([ + { + hello: 'world', + }, + ]), + 'Dataset items validation', +); diff --git a/test/e2e/jsdom-default-ts/actor/.eslintrc.json b/test/e2e/jsdom-default-ts/actor/.eslintrc.json index 7e54966d8bab..2e68e8d4d57f 100644 --- a/test/e2e/jsdom-default-ts/actor/.eslintrc.json +++ b/test/e2e/jsdom-default-ts/actor/.eslintrc.json @@ -2,7 +2,7 @@ "root": true, "extends": "../../.eslintrc.json", "parserOptions": { - "project": "./test/e2e/jsdom-default-ts/actor/tsconfig.json", + "project": "./test/e2e/jsdom-default-ts/actor/tsconfig.json", "ecmaVersion": 2022 } } diff --git a/test/e2e/jsdom-react-ts/actor/.eslintrc.json b/test/e2e/jsdom-react-ts/actor/.eslintrc.json index bc134bc2c21e..06b4e5962341 100644 --- a/test/e2e/jsdom-react-ts/actor/.eslintrc.json +++ b/test/e2e/jsdom-react-ts/actor/.eslintrc.json @@ -2,7 +2,7 @@ "root": true, "extends": "../../.eslintrc.json", "parserOptions": { - "project": "./test/e2e/jsdom-react-ts/actor/tsconfig.json", + "project": "./test/e2e/jsdom-react-ts/actor/tsconfig.json", "ecmaVersion": 2022 } } diff --git a/test/e2e/jsdom-react-ts/actor/main.ts b/test/e2e/jsdom-react-ts/actor/main.ts index 2006a0ea58cd..8cd788c1fb81 100644 --- a/test/e2e/jsdom-react-ts/actor/main.ts +++ b/test/e2e/jsdom-react-ts/actor/main.ts @@ 
-24,8 +24,6 @@ const crawler = new JSDOMCrawler({ }, }); -await crawler.run([ - 'https://ahfarmer.github.io/calculator/', -]); +await crawler.run(['https://ahfarmer.github.io/calculator/']); await Actor.exit({ exit: Actor.isAtHome() }); diff --git a/test/e2e/linkedom-default-ts/actor/.eslintrc.json b/test/e2e/linkedom-default-ts/actor/.eslintrc.json index d952e0677d4f..05856d29ddad 100644 --- a/test/e2e/linkedom-default-ts/actor/.eslintrc.json +++ b/test/e2e/linkedom-default-ts/actor/.eslintrc.json @@ -2,7 +2,7 @@ "root": true, "extends": "../../.eslintrc.json", "parserOptions": { - "project": "./test/e2e/linkedom-default-ts/actor/tsconfig.json", + "project": "./test/e2e/linkedom-default-ts/actor/tsconfig.json", "ecmaVersion": 2022 } } diff --git a/test/e2e/migration/actor/main.js b/test/e2e/migration/actor/main.js index 709e63a25ac7..b77d46e74f2f 100644 --- a/test/e2e/migration/actor/main.js +++ b/test/e2e/migration/actor/main.js @@ -8,7 +8,10 @@ process.env.CRAWLEE_PURGE_ON_START = '0'; const mainOptions = { exit: Actor.isAtHome(), - storage: process.env.STORAGE_IMPLEMENTATION === 'LOCAL' ? new (await import('@apify/storage-local')).ApifyStorageLocal() : undefined, + storage: + process.env.STORAGE_IMPLEMENTATION === 'LOCAL' + ? new (await import('@apify/storage-local')).ApifyStorageLocal() + : undefined, }; const thisFile = new URL(import.meta.url); diff --git a/test/e2e/playwright-chromium-experimental-containers/actor/main.js b/test/e2e/playwright-chromium-experimental-containers/actor/main.js index 197be73cdf4a..887cbb744956 100644 --- a/test/e2e/playwright-chromium-experimental-containers/actor/main.js +++ b/test/e2e/playwright-chromium-experimental-containers/actor/main.js @@ -6,7 +6,10 @@ process.exit(404); const mainOptions = { exit: Actor.isAtHome(), - storage: process.env.STORAGE_IMPLEMENTATION === 'LOCAL' ? new (await import('@apify/storage-local')).ApifyStorageLocal() : undefined, + storage: + process.env.STORAGE_IMPLEMENTATION === 'LOCAL' + ? new (await import('@apify/storage-local')).ApifyStorageLocal() + : undefined, }; await Actor.main(async () => { @@ -15,9 +18,11 @@ await Actor.main(async () => { launchContext: { experimentalContainers: true, }, - preNavigationHooks: [(_ctx, goToOptions) => { - goToOptions.waitUntil = 'networkidle'; - }], + preNavigationHooks: [ + (_ctx, goToOptions) => { + goToOptions.waitUntil = 'networkidle'; + }, + ], async requestHandler({ page }) { const content = await page.content(); await Dataset.pushData({ ip: content.match(/"clientIp":\s*"(.*)"/)?.[1] }); diff --git a/test/e2e/playwright-default/actor/main.js b/test/e2e/playwright-default/actor/main.js index 5ef039f1d871..dc10bacf8f9b 100644 --- a/test/e2e/playwright-default/actor/main.js +++ b/test/e2e/playwright-default/actor/main.js @@ -3,14 +3,19 @@ import { Dataset, PlaywrightCrawler } from '@crawlee/playwright'; const mainOptions = { exit: Actor.isAtHome(), - storage: process.env.STORAGE_IMPLEMENTATION === 'LOCAL' ? new (await import('@apify/storage-local')).ApifyStorageLocal() : undefined, + storage: + process.env.STORAGE_IMPLEMENTATION === 'LOCAL' + ? 
new (await import('@apify/storage-local')).ApifyStorageLocal() + : undefined, }; await Actor.main(async () => { const crawler = new PlaywrightCrawler({ - preNavigationHooks: [(_ctx, goToOptions) => { - goToOptions.waitUntil = 'networkidle'; - }], + preNavigationHooks: [ + (_ctx, goToOptions) => { + goToOptions.waitUntil = 'networkidle'; + }, + ], async requestHandler({ page, enqueueLinks, request }) { const { url } = request; const pageTitle = await page.title(); diff --git a/test/e2e/playwright-enqueue-links-base/actor/main.js b/test/e2e/playwright-enqueue-links-base/actor/main.js index 9db543080150..0a2c7a319c3a 100644 --- a/test/e2e/playwright-enqueue-links-base/actor/main.js +++ b/test/e2e/playwright-enqueue-links-base/actor/main.js @@ -3,7 +3,10 @@ import { PlaywrightCrawler, Dataset } from '@crawlee/playwright'; const mainOptions = { exit: Actor.isAtHome(), - storage: process.env.STORAGE_IMPLEMENTATION === 'LOCAL' ? new (await import('@apify/storage-local')).ApifyStorageLocal() : undefined, + storage: + process.env.STORAGE_IMPLEMENTATION === 'LOCAL' + ? new (await import('@apify/storage-local')).ApifyStorageLocal() + : undefined, }; await Actor.main(async () => { diff --git a/test/e2e/playwright-enqueue-links/actor/main.js b/test/e2e/playwright-enqueue-links/actor/main.js index 66d8a2d0747f..4bcd12de1574 100644 --- a/test/e2e/playwright-enqueue-links/actor/main.js +++ b/test/e2e/playwright-enqueue-links/actor/main.js @@ -5,7 +5,10 @@ process.env.APIFY_LOG_LEVEL = 'DEBUG'; const mainOptions = { exit: Actor.isAtHome(), - storage: process.env.STORAGE_IMPLEMENTATION === 'LOCAL' ? new (await import('@apify/storage-local')).ApifyStorageLocal() : undefined, + storage: + process.env.STORAGE_IMPLEMENTATION === 'LOCAL' + ? new (await import('@apify/storage-local')).ApifyStorageLocal() + : undefined, }; await Actor.main(async () => { diff --git a/test/e2e/playwright-firefox-experimental-containers/actor/main.js b/test/e2e/playwright-firefox-experimental-containers/actor/main.js index 2cf16259d1ea..a07251a8036d 100644 --- a/test/e2e/playwright-firefox-experimental-containers/actor/main.js +++ b/test/e2e/playwright-firefox-experimental-containers/actor/main.js @@ -7,7 +7,10 @@ process.exit(404); const mainOptions = { exit: Actor.isAtHome(), - storage: process.env.STORAGE_IMPLEMENTATION === 'LOCAL' ? new (await import('@apify/storage-local')).ApifyStorageLocal() : undefined, + storage: + process.env.STORAGE_IMPLEMENTATION === 'LOCAL' + ? new (await import('@apify/storage-local')).ApifyStorageLocal() + : undefined, }; await Actor.main(async () => { @@ -17,9 +20,11 @@ await Actor.main(async () => { launcher: playwright.firefox, experimentalContainers: true, }, - preNavigationHooks: [(_ctx, goToOptions) => { - goToOptions.waitUntil = 'networkidle'; - }], + preNavigationHooks: [ + (_ctx, goToOptions) => { + goToOptions.waitUntil = 'networkidle'; + }, + ], async requestHandler({ page }) { const content = await page.content(); await Dataset.pushData({ ip: content.match(/"clientIp":\s*"(.*)"/)?.[1] }); diff --git a/test/e2e/playwright-initial-cookies/actor/main.js b/test/e2e/playwright-initial-cookies/actor/main.js index 815aba135a13..0c1f2b05b027 100644 --- a/test/e2e/playwright-initial-cookies/actor/main.js +++ b/test/e2e/playwright-initial-cookies/actor/main.js @@ -18,22 +18,30 @@ const expectedCookies = [ const mainOptions = { exit: Actor.isAtHome(), - storage: process.env.STORAGE_IMPLEMENTATION === 'LOCAL' ? 
new (await import('@apify/storage-local')).ApifyStorageLocal() : undefined, + storage: + process.env.STORAGE_IMPLEMENTATION === 'LOCAL' + ? new (await import('@apify/storage-local')).ApifyStorageLocal() + : undefined, }; await Actor.main(async () => { const crawler = new PlaywrightCrawler({ - preNavigationHooks: [({ session, request }, goToOptions) => { - session.setCookies([ - { - name: 'session', - value: 'true', - }, - ], request.url); - request.headers.cookie = 'hook_request=true'; - - goToOptions.waitUntil = 'networkidle'; - }], + preNavigationHooks: [ + ({ session, request }, goToOptions) => { + session.setCookies( + [ + { + name: 'session', + value: 'true', + }, + ], + request.url, + ); + request.headers.cookie = 'hook_request=true'; + + goToOptions.waitUntil = 'networkidle'; + }, + ], async requestHandler({ page }) { const initialCookiesLength = expectedCookies.length; @@ -41,7 +49,11 @@ await Actor.main(async () => { let numberOfMatchingCookies = 0; for (const cookie of expectedCookies) { - if (pageCookies.some((pageCookie) => pageCookie.name === cookie.name && pageCookie.value === cookie.value)) { + if ( + pageCookies.some( + (pageCookie) => pageCookie.name === cookie.name && pageCookie.value === cookie.value, + ) + ) { numberOfMatchingCookies++; } } diff --git a/test/e2e/playwright-initial-cookies/test.mjs b/test/e2e/playwright-initial-cookies/test.mjs index a93bcf543052..a24cd3a3ef0e 100644 --- a/test/e2e/playwright-initial-cookies/test.mjs +++ b/test/e2e/playwright-initial-cookies/test.mjs @@ -9,6 +9,6 @@ await expect(stats.requestsFinished === 1, 'All requests finished'); await expect(datasetItems[0].numberOfMatchingCookies === 3, 'Number of page cookies'); await expect( datasetItems[0].numberOfMatchingCookies === datasetItems[0].initialCookiesLength, - `Page cookies match the initial defined cookies. Number of non-matching cookies is ` - + `${datasetItems[0].initialCookiesLength - datasetItems[0].numberOfMatchingCookies}`, + `Page cookies match the initial defined cookies. Number of non-matching cookies is ` + + `${datasetItems[0].initialCookiesLength - datasetItems[0].numberOfMatchingCookies}`, ); diff --git a/test/e2e/playwright-introduction-guide/actor/main.js b/test/e2e/playwright-introduction-guide/actor/main.js index 66c5ce498627..5d232eed6614 100644 --- a/test/e2e/playwright-introduction-guide/actor/main.js +++ b/test/e2e/playwright-introduction-guide/actor/main.js @@ -1,7 +1,12 @@ import { Actor } from 'apify'; import { Dataset, createPlaywrightRouter, PlaywrightCrawler } from '@crawlee/playwright'; -await Actor.init({ storage: process.env.STORAGE_IMPLEMENTATION === 'LOCAL' ? new (await import('@apify/storage-local')).ApifyStorageLocal() : undefined }); +await Actor.init({ + storage: + process.env.STORAGE_IMPLEMENTATION === 'LOCAL' + ? new (await import('@apify/storage-local')).ApifyStorageLocal() + : undefined, +}); // createPlaywrightRouter() is only a helper to get better // intellisense and typings. You can use Router.create() too. 
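// A minimal sketch of the equivalence mentioned in the comment above, assuming
// crawlee's public Router API (names as used elsewhere in this patch):
//
//     import { Router, type PlaywrightCrawlingContext } from '@crawlee/playwright';
//
//     // createPlaywrightRouter() is just a typed shorthand for:
//     const router = Router.create<PlaywrightCrawlingContext>();
//     router.addHandler('DETAIL', async ({ page }) => { /* scrape the detail page */ });
//     router.addDefaultHandler(async ({ enqueueLinks }) => { await enqueueLinks(); });
//
// Handlers are dispatched by request.label, with addDefaultHandler() as the fallback.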
@@ -13,9 +18,7 @@ router.addHandler('DETAIL', async ({ request, page, log }) => { const manufacturer = urlPart[0].split('-')[0]; // 'sennheiser' const title = await page.locator('.product-meta h1').textContent(); - const sku = await page - .locator('span.product-meta__sku-number') - .textContent(); + const sku = await page.locator('span.product-meta__sku-number').textContent(); const priceElement = page .locator('span.price') diff --git a/test/e2e/proxy-rotation/actor/main.js b/test/e2e/proxy-rotation/actor/main.js index 00741af34972..db66b0ad1cb6 100644 --- a/test/e2e/proxy-rotation/actor/main.js +++ b/test/e2e/proxy-rotation/actor/main.js @@ -3,7 +3,10 @@ import { Dataset, KeyValueStore, PuppeteerCrawler } from '@crawlee/puppeteer'; const mainOptions = { exit: Actor.isAtHome(), - storage: process.env.STORAGE_IMPLEMENTATION === 'LOCAL' ? new (await import('@apify/storage-local')).ApifyStorageLocal() : undefined, + storage: + process.env.STORAGE_IMPLEMENTATION === 'LOCAL' + ? new (await import('@apify/storage-local')).ApifyStorageLocal() + : undefined, }; await Actor.main(async () => { @@ -24,8 +27,7 @@ await Actor.main(async () => { }, }); - await crawler.run(Array.from( - { length: 5 }, - (_, i) => ({ url: 'https://api.apify.com/v2/browser-info', uniqueKey: `${i}` }), - )); + await crawler.run( + Array.from({ length: 5 }, (_, i) => ({ url: 'https://api.apify.com/v2/browser-info', uniqueKey: `${i}` })), + ); }, mainOptions); diff --git a/test/e2e/puppeteer-default/actor/main.js b/test/e2e/puppeteer-default/actor/main.js index 2a4badbcb46c..fde7a900ab50 100644 --- a/test/e2e/puppeteer-default/actor/main.js +++ b/test/e2e/puppeteer-default/actor/main.js @@ -3,14 +3,19 @@ import { Dataset, PuppeteerCrawler } from '@crawlee/puppeteer'; const mainOptions = { exit: Actor.isAtHome(), - storage: process.env.STORAGE_IMPLEMENTATION === 'LOCAL' ? new (await import('@apify/storage-local')).ApifyStorageLocal() : undefined, + storage: + process.env.STORAGE_IMPLEMENTATION === 'LOCAL' + ? new (await import('@apify/storage-local')).ApifyStorageLocal() + : undefined, }; await Actor.main(async () => { const crawler = new PuppeteerCrawler({ - preNavigationHooks: [(_ctx, goToOptions) => { - goToOptions.waitUntil = ['networkidle2']; - }], + preNavigationHooks: [ + (_ctx, goToOptions) => { + goToOptions.waitUntil = ['networkidle2']; + }, + ], async requestHandler({ page, enqueueLinks, request, infiniteScroll }) { await infiniteScroll(); const { url } = request; diff --git a/test/e2e/puppeteer-enqueue-links/actor/main.js b/test/e2e/puppeteer-enqueue-links/actor/main.js index 63fbdfda4069..63bf0424ac75 100644 --- a/test/e2e/puppeteer-enqueue-links/actor/main.js +++ b/test/e2e/puppeteer-enqueue-links/actor/main.js @@ -4,7 +4,10 @@ import deepEqual from 'deep-equal'; const mainOptions = { exit: Actor.isAtHome(), - storage: process.env.STORAGE_IMPLEMENTATION === 'LOCAL' ? new (await import('@apify/storage-local')).ApifyStorageLocal() : undefined, + storage: + process.env.STORAGE_IMPLEMENTATION === 'LOCAL' + ? 
new (await import('@apify/storage-local')).ApifyStorageLocal() + : undefined, }; await Actor.main(async () => { diff --git a/test/e2e/puppeteer-ignore-ssl-errors/actor/main.js b/test/e2e/puppeteer-ignore-ssl-errors/actor/main.js index f0e677dd1e81..1b6f54498f84 100644 --- a/test/e2e/puppeteer-ignore-ssl-errors/actor/main.js +++ b/test/e2e/puppeteer-ignore-ssl-errors/actor/main.js @@ -3,17 +3,25 @@ import { Dataset, PuppeteerCrawler } from '@crawlee/puppeteer'; const mainOptions = { exit: Actor.isAtHome(), - storage: process.env.STORAGE_IMPLEMENTATION === 'LOCAL' ? new (await import('@apify/storage-local')).ApifyStorageLocal() : undefined, + storage: + process.env.STORAGE_IMPLEMENTATION === 'LOCAL' + ? new (await import('@apify/storage-local')).ApifyStorageLocal() + : undefined, }; await Actor.main(async () => { const crawler = new PuppeteerCrawler({ launchContext: { launchOptions: { ignoreHTTPSErrors: true } }, - preNavigationHooks: [(_ctx, goToOptions) => { - goToOptions.waitUntil = ['networkidle2']; - }], + preNavigationHooks: [ + (_ctx, goToOptions) => { + goToOptions.waitUntil = ['networkidle2']; + }, + ], async requestHandler({ page, enqueueLinks, request, log }) { - const { url, userData: { label } } = request; + const { + url, + userData: { label }, + } = request; if (label === 'START') { log.info('Bad ssl page opened!'); diff --git a/test/e2e/puppeteer-initial-cookies/actor/main.js b/test/e2e/puppeteer-initial-cookies/actor/main.js index fa5c423b7b02..89ddc231990b 100644 --- a/test/e2e/puppeteer-initial-cookies/actor/main.js +++ b/test/e2e/puppeteer-initial-cookies/actor/main.js @@ -18,22 +18,30 @@ const expectedCookies = [ const mainOptions = { exit: Actor.isAtHome(), - storage: process.env.STORAGE_IMPLEMENTATION === 'LOCAL' ? new (await import('@apify/storage-local')).ApifyStorageLocal() : undefined, + storage: + process.env.STORAGE_IMPLEMENTATION === 'LOCAL' + ? 
new (await import('@apify/storage-local')).ApifyStorageLocal() + : undefined, }; await Actor.main(async () => { const crawler = new PuppeteerCrawler({ - preNavigationHooks: [({ session, request }, goToOptions) => { - session.setCookies([ - { - name: 'session', - value: 'true', - }, - ], request.url); - request.headers.cookie = 'hook_request=true'; - - goToOptions.waitUntil = ['networkidle2']; - }], + preNavigationHooks: [ + ({ session, request }, goToOptions) => { + session.setCookies( + [ + { + name: 'session', + value: 'true', + }, + ], + request.url, + ); + request.headers.cookie = 'hook_request=true'; + + goToOptions.waitUntil = ['networkidle2']; + }, + ], async requestHandler({ page }) { const initialCookiesLength = expectedCookies.length; @@ -41,7 +49,11 @@ await Actor.main(async () => { let numberOfMatchingCookies = 0; for (const cookie of expectedCookies) { - if (pageCookies.some((pageCookie) => pageCookie.name === cookie.name && pageCookie.value === cookie.value)) { + if ( + pageCookies.some( + (pageCookie) => pageCookie.name === cookie.name && pageCookie.value === cookie.value, + ) + ) { numberOfMatchingCookies++; } } diff --git a/test/e2e/puppeteer-initial-cookies/test.mjs b/test/e2e/puppeteer-initial-cookies/test.mjs index a93bcf543052..a24cd3a3ef0e 100644 --- a/test/e2e/puppeteer-initial-cookies/test.mjs +++ b/test/e2e/puppeteer-initial-cookies/test.mjs @@ -9,6 +9,6 @@ await expect(stats.requestsFinished === 1, 'All requests finished'); await expect(datasetItems[0].numberOfMatchingCookies === 3, 'Number of page cookies'); await expect( datasetItems[0].numberOfMatchingCookies === datasetItems[0].initialCookiesLength, - `Page cookies match the initial defined cookies. Number of non-matching cookies is ` - + `${datasetItems[0].initialCookiesLength - datasetItems[0].numberOfMatchingCookies}`, + `Page cookies match the initial defined cookies. Number of non-matching cookies is ` + + `${datasetItems[0].initialCookiesLength - datasetItems[0].numberOfMatchingCookies}`, ); diff --git a/test/e2e/puppeteer-page-info/actor/main.js b/test/e2e/puppeteer-page-info/actor/main.js index 88cad2b45ce0..95ca1f9715f1 100644 --- a/test/e2e/puppeteer-page-info/actor/main.js +++ b/test/e2e/puppeteer-page-info/actor/main.js @@ -3,20 +3,28 @@ import { Dataset, PuppeteerCrawler } from '@crawlee/puppeteer'; const mainOptions = { exit: Actor.isAtHome(), - storage: process.env.STORAGE_IMPLEMENTATION === 'LOCAL' ? new (await import('@apify/storage-local')).ApifyStorageLocal() : undefined, + storage: + process.env.STORAGE_IMPLEMENTATION === 'LOCAL' + ? 
new (await import('@apify/storage-local')).ApifyStorageLocal() + : undefined, }; await Actor.main(async () => { const crawler = new PuppeteerCrawler({ - preNavigationHooks: [(_ctx, goToOptions) => { - goToOptions.waitUntil = ['networkidle2']; - }], + preNavigationHooks: [ + (_ctx, goToOptions) => { + goToOptions.waitUntil = ['networkidle2']; + }, + ], async requestHandler({ page, enqueueLinks, request }) { - const { userData: { label } } = request; + const { + userData: { label }, + } = request; if (label === 'START') { await enqueueLinks({ - globs: ['**/examples/accept-user-input'], userData: { label: 'DETAIL' }, + globs: ['**/examples/accept-user-input'], + userData: { label: 'DETAIL' }, }); } @@ -25,18 +33,10 @@ await Actor.main(async () => { const uniqueIdentifier = url.split('/').slice(-2).join('/'); - const titleP = page.$eval('header h1', ((el) => el.textContent)); - const firstParagraphP = page.$eval('header + p', ((el) => el.textContent)); + const titleP = page.$eval('header h1', (el) => el.textContent); + const firstParagraphP = page.$eval('header + p', (el) => el.textContent); const modifiedDateP = page.$eval('.theme-last-updated time', (el) => el.getAttribute('datetime')); - const [ - title, - description, - modifiedDate, - ] = await Promise.all([ - titleP, - firstParagraphP, - modifiedDateP, - ]); + const [title, description, modifiedDate] = await Promise.all([titleP, firstParagraphP, modifiedDateP]); await Dataset.pushData({ url, uniqueIdentifier, title, description, modifiedDate }); } diff --git a/test/e2e/puppeteer-store-pagination-jquery/actor/main.js b/test/e2e/puppeteer-store-pagination-jquery/actor/main.js index 288579abe12c..3ceaf9db9fab 100644 --- a/test/e2e/puppeteer-store-pagination-jquery/actor/main.js +++ b/test/e2e/puppeteer-store-pagination-jquery/actor/main.js @@ -3,20 +3,28 @@ import { Dataset, PuppeteerCrawler } from '@crawlee/puppeteer'; const mainOptions = { exit: Actor.isAtHome(), - storage: process.env.STORAGE_IMPLEMENTATION === 'LOCAL' ? new (await import('@apify/storage-local')).ApifyStorageLocal() : undefined, + storage: + process.env.STORAGE_IMPLEMENTATION === 'LOCAL' + ? 
new (await import('@apify/storage-local')).ApifyStorageLocal() + : undefined, }; await Actor.main(async () => { const crawler = new PuppeteerCrawler({ maxRequestsPerCrawl: 10, - preNavigationHooks: [async ({ page }, goToOptions) => { - await page.evaluateOnNewDocument(() => { - localStorage.setItem('themeExitPopup', 'true'); - }); - goToOptions.waitUntil = ['networkidle2']; - }], + preNavigationHooks: [ + async ({ page }, goToOptions) => { + await page.evaluateOnNewDocument(() => { + localStorage.setItem('themeExitPopup', 'true'); + }); + goToOptions.waitUntil = ['networkidle2']; + }, + ], async requestHandler({ page, request, log, enqueueLinks, injectJQuery }) { - const { url, userData: { label } } = request; + const { + url, + userData: { label }, + } = request; if (label === 'START') { log.info('Store opened'); @@ -51,10 +59,10 @@ await Actor.main(async () => { const price = Number(rawPrice.replaceAll(',', '')); - const inStock = $('span.product-form__inventory') - .first() - .filter((_, el) => $(el).text().includes('In stock')) - .length !== 0; + const inStock = + $('span.product-form__inventory') + .first() + .filter((_, el) => $(el).text().includes('In stock')).length !== 0; return { title: $('.product-meta h1').text(), @@ -71,5 +79,7 @@ await Actor.main(async () => { }, }); - await crawler.run([{ url: 'https://warehouse-theme-metal.myshopify.com/collections/all-tvs', userData: { label: 'START' } }]); + await crawler.run([ + { url: 'https://warehouse-theme-metal.myshopify.com/collections/all-tvs', userData: { label: 'START' } }, + ]); }, mainOptions); diff --git a/test/e2e/puppeteer-store-pagination/actor/main.js b/test/e2e/puppeteer-store-pagination/actor/main.js index 2aaaf7236325..086d7bb784bb 100644 --- a/test/e2e/puppeteer-store-pagination/actor/main.js +++ b/test/e2e/puppeteer-store-pagination/actor/main.js @@ -1,16 +1,23 @@ import { Actor } from 'apify'; import { Dataset, PuppeteerCrawler } from '@crawlee/puppeteer'; -await Actor.init({ storage: process.env.STORAGE_IMPLEMENTATION === 'LOCAL' ? new (await import('@apify/storage-local')).ApifyStorageLocal() : undefined }); +await Actor.init({ + storage: + process.env.STORAGE_IMPLEMENTATION === 'LOCAL' + ? 
new (await import('@apify/storage-local')).ApifyStorageLocal() + : undefined, +}); const crawler = new PuppeteerCrawler({ maxRequestsPerCrawl: 10, - preNavigationHooks: [async ({ page }, goToOptions) => { - await page.evaluateOnNewDocument(() => { - localStorage.setItem('themeExitPopup', 'true'); - }); - goToOptions.waitUntil = ['networkidle2']; - }], + preNavigationHooks: [ + async ({ page }, goToOptions) => { + await page.evaluateOnNewDocument(() => { + localStorage.setItem('themeExitPopup', 'true'); + }); + goToOptions.waitUntil = ['networkidle2']; + }, + ], }); crawler.router.addHandler('START', async ({ log, enqueueLinks, page }) => { @@ -38,8 +45,14 @@ crawler.router.addHandler('DETAIL', async ({ log, page, request: { url } }) => { const urlPart = url.split('/').slice(-1); // ['sennheiser-mke-440-professional-stereo-shotgun-microphone-mke-440'] const manufacturer = urlPart[0].split('-')[0]; // 'sennheiser' - const title = await page.locator('.product-meta h1').map((el) => el.textContent).wait(); - const sku = await page.locator('span.product-meta__sku-number').map((el) => el.textContent).wait(); + const title = await page + .locator('.product-meta h1') + .map((el) => el.textContent) + .wait(); + const sku = await page + .locator('span.product-meta__sku-number') + .map((el) => el.textContent) + .wait(); const rawPriceString = await page .locator('span.price') @@ -53,7 +66,7 @@ crawler.router.addHandler('DETAIL', async ({ log, page, request: { url } }) => { const inStock = await page .locator('span.product-form__inventory') .filter((el) => el.textContent.includes('In stock')) - .map((el) => (!!el)) + .map((el) => !!el) .wait(); const results = { @@ -68,6 +81,8 @@ crawler.router.addHandler('DETAIL', async ({ log, page, request: { url } }) => { await Dataset.pushData(results); }); -await crawler.run([{ url: 'https://warehouse-theme-metal.myshopify.com/collections/all-tvs', userData: { label: 'START' } }]); +await crawler.run([ + { url: 'https://warehouse-theme-metal.myshopify.com/collections/all-tvs', userData: { label: 'START' } }, +]); await Actor.exit({ exit: Actor.isAtHome() }); diff --git a/test/e2e/puppeteer-throw-on-ssl-errors/actor/main.js b/test/e2e/puppeteer-throw-on-ssl-errors/actor/main.js index 48fc71dd9739..49b739463b4d 100644 --- a/test/e2e/puppeteer-throw-on-ssl-errors/actor/main.js +++ b/test/e2e/puppeteer-throw-on-ssl-errors/actor/main.js @@ -3,17 +3,25 @@ import { Dataset, PuppeteerCrawler } from '@crawlee/puppeteer'; const mainOptions = { exit: Actor.isAtHome(), - storage: process.env.STORAGE_IMPLEMENTATION === 'LOCAL' ? new (await import('@apify/storage-local')).ApifyStorageLocal() : undefined, + storage: + process.env.STORAGE_IMPLEMENTATION === 'LOCAL' + ? 
new (await import('@apify/storage-local')).ApifyStorageLocal() + : undefined, }; await Actor.main(async () => { const crawler = new PuppeteerCrawler({ launchContext: { launchOptions: { ignoreHTTPSErrors: false } }, // This is the default - preNavigationHooks: [(_ctx, goToOptions) => { - goToOptions.waitUntil = ['networkidle2']; - }], + preNavigationHooks: [ + (_ctx, goToOptions) => { + goToOptions.waitUntil = ['networkidle2']; + }, + ], async requestHandler({ page, enqueueLinks, request, log }) { - const { url, userData: { label } } = request; + const { + url, + userData: { label }, + } = request; if (label === 'START') { log.info('Bad ssl page opened!'); diff --git a/test/e2e/request-queue-zero-concurrency/actor/main.js b/test/e2e/request-queue-zero-concurrency/actor/main.js index 977970f4b675..fe48e2bce6e5 100644 --- a/test/e2e/request-queue-zero-concurrency/actor/main.js +++ b/test/e2e/request-queue-zero-concurrency/actor/main.js @@ -7,7 +7,10 @@ process.env.CRAWLEE_INTERNAL_TIMEOUT = '30000'; const mainOptions = { exit: Actor.isAtHome(), - storage: process.env.STORAGE_IMPLEMENTATION === 'LOCAL' ? new (await import('@apify/storage-local')).ApifyStorageLocal() : undefined, + storage: + process.env.STORAGE_IMPLEMENTATION === 'LOCAL' + ? new (await import('@apify/storage-local')).ApifyStorageLocal() + : undefined, }; // RequestQueue auto-reset when stuck with requests in progress diff --git a/test/e2e/request-skip-navigation/actor/main.js b/test/e2e/request-skip-navigation/actor/main.js index ef002ecfa69e..e34dfc2c9242 100644 --- a/test/e2e/request-skip-navigation/actor/main.js +++ b/test/e2e/request-skip-navigation/actor/main.js @@ -21,7 +21,10 @@ const r3 = new Request({ const mainOptions = { exit: Actor.isAtHome(), - storage: process.env.STORAGE_IMPLEMENTATION === 'LOCAL' ? new (await import('@apify/storage-local')).ApifyStorageLocal() : undefined, + storage: + process.env.STORAGE_IMPLEMENTATION === 'LOCAL' + ? new (await import('@apify/storage-local')).ApifyStorageLocal() + : undefined, }; // Persisting internal settings of `Request`. @@ -30,7 +33,11 @@ await Actor.main(async () => { let navigationCounter = 0; const crawler = new CheerioCrawler({ - preNavigationHooks: [() => { navigationCounter++; }], + preNavigationHooks: [ + () => { + navigationCounter++; + }, + ], async requestHandler({ request }) { requestCounter++; if (request.skipNavigation) { diff --git a/test/e2e/run.mjs b/test/e2e/run.mjs index 8a9b5dc1c69f..c63bbfb25928 100644 --- a/test/e2e/run.mjs +++ b/test/e2e/run.mjs @@ -74,12 +74,19 @@ async function run() { } if (!seenFirst) { - console.log(`${colors.red('[fatal]')} test ${colors.yellow(`[${dir.name}]`)} did not call "initialize(import.meta.url)"!`); + console.log( + `${colors.red('[fatal]')} test ${colors.yellow( + `[${dir.name}]`, + )} did not call "initialize(import.meta.url)"!`, + ); worker.terminate(); return; } - if (process.env.STORAGE_IMPLEMENTATION === 'PLATFORM' && (str.startsWith('[build]') || str.startsWith('[run]') || str.startsWith('[kv]'))) { + if ( + process.env.STORAGE_IMPLEMENTATION === 'PLATFORM' && + (str.startsWith('[build]') || str.startsWith('[run]') || str.startsWith('[kv]')) + ) { const platformStatsMessage = str.match(/\[(?:run|build|kv)] (.*)/); if (platformStatsMessage) { console.log(`${colors.yellow(`[${dir.name}] `)}${colors.grey(platformStatsMessage[1])}`); @@ -109,7 +116,11 @@ async function run() { const took = (Date.now() - now) / 1000; const status = code === 0 ? 'success' : 'failure'; const color = code === 0 ? 
'green' : 'red'; - console.log(`${colors.yellow(`[${dir.name}] `)}${colors[color](`Test finished with status: ${status} `)}${colors.grey(`[took ${took}s]`)}`); + console.log( + `${colors.yellow(`[${dir.name}] `)}${colors[color]( + `Test finished with status: ${status} `, + )}${colors.grey(`[took ${took}s]`)}`, + ); if (['MEMORY', 'LOCAL'].includes(process.env.STORAGE_IMPLEMENTATION)) { await clearStorage(`${basePath}/${dir.name}`); diff --git a/test/e2e/session-rotation/actor/main.js b/test/e2e/session-rotation/actor/main.js index 58d1a19b9f52..c9b3ba5e0e1e 100644 --- a/test/e2e/session-rotation/actor/main.js +++ b/test/e2e/session-rotation/actor/main.js @@ -3,7 +3,10 @@ import { PlaywrightCrawler } from '@crawlee/playwright'; const mainOptions = { exit: Actor.isAtHome(), - storage: process.env.STORAGE_IMPLEMENTATION === 'LOCAL' ? new (await import('@apify/storage-local')).ApifyStorageLocal() : undefined, + storage: + process.env.STORAGE_IMPLEMENTATION === 'LOCAL' + ? new (await import('@apify/storage-local')).ApifyStorageLocal() + : undefined, }; await Actor.main(async () => { diff --git a/test/e2e/session-rotation/test.mjs b/test/e2e/session-rotation/test.mjs index 5ffd7300b384..5ff4a618c8b4 100644 --- a/test/e2e/session-rotation/test.mjs +++ b/test/e2e/session-rotation/test.mjs @@ -7,6 +7,6 @@ const { datasetItems } = await runActor(testActorDirname, 4096); await expect(datasetItems.length === 11, 'Retried correct number of times'); await expect( - datasetItems.map( - (session) => datasetItems.filter((s) => s.id === session.id), - ).every((x) => x.length <= 2), 'No session used more than three times'); + datasetItems.map((session) => datasetItems.filter((s) => s.id === session.id)).every((x) => x.length <= 2), + 'No session used more than three times', +); diff --git a/test/e2e/tools.mjs b/test/e2e/tools.mjs index ae18eda80da1..927238012138 100644 --- a/test/e2e/tools.mjs +++ b/test/e2e/tools.mjs @@ -84,7 +84,9 @@ export async function runActor(dirName, memory = 4096) { try { execSync('npx -y apify-cli@beta push --no-prompt', { cwd: dirName }); } catch (err) { - console.error(colors.red(`Failed to push actor to the Apify platform. (signal ${colors.yellow(err.signal)})`)); + console.error( + colors.red(`Failed to push actor to the Apify platform. (signal ${colors.yellow(err.signal)})`), + ); if (err.stdout) { console.log(colors.grey(` STDOUT: `), err.stdout); @@ -119,7 +121,9 @@ export async function runActor(dirName, memory = 4096) { let runId; try { - const { data: { id: foundRunId } } = await gotClient(`https://api.apify.com/v2/acts/${id}/runs`, { + const { + data: { id: foundRunId }, + } = await gotClient(`https://api.apify.com/v2/acts/${id}/runs`, { method: 'POST', searchParams: { memory, @@ -137,7 +141,9 @@ export async function runActor(dirName, memory = 4096) { runId = foundRunId; } catch (err) { - console.error(colors.red(`Failed to start actor run on the Apify platform. (code ${colors.yellow(err.code)})`)); + console.error( + colors.red(`Failed to start actor run on the Apify platform. 
(code ${colors.yellow(err.code)})`), + ); if (err.response) { console.log(colors.grey(` RESPONSE: `), err.response.body || err.response.rawBody?.toString('utf-8')); @@ -166,14 +172,16 @@ export async function runActor(dirName, memory = 4096) { console.log(`[kv] View storage: https://console.apify.com/storage/key-value/${kvResult.id}`); } - const entries = await Promise.all(keyValueItems.map(async ({ key }) => { - const record = await client.keyValueStore(kvResult.id).getRecord(key, { buffer: true }); + const entries = await Promise.all( + keyValueItems.map(async ({ key }) => { + const record = await client.keyValueStore(kvResult.id).getRecord(key, { buffer: true }); - return { - name: record.key, - raw: record.value, - }; - })); + return { + name: record.key, + raw: record.value, + }; + }), + ); return entries.filter(({ name }) => !isPrivateEntry(name)); } @@ -181,10 +189,7 @@ export async function runActor(dirName, memory = 4096) { return undefined; }; - const { - startedAt: buildStartedAt, - finishedAt: buildFinishedAt, - } = await client.build(buildId).get(); + const { startedAt: buildStartedAt, finishedAt: buildFinishedAt } = await client.build(buildId).get(); const buildTook = (buildFinishedAt.getTime() - buildStartedAt.getTime()) / 1000; console.log(`[build] View build log: https://api.apify.com/v2/logs/${buildId} [build took ${buildTook}s]`); @@ -217,7 +222,10 @@ export async function runActor(dirName, memory = 4096) { if (input) { await Actor.init({ // @ts-ignore installed only optionally run `run.mjs` script - storage: process.env.STORAGE_IMPLEMENTATION === 'LOCAL' ? new (await import('@apify/storage-local')).ApifyStorageLocal() : undefined, + storage: + process.env.STORAGE_IMPLEMENTATION === 'LOCAL' + ? new (await import('@apify/storage-local')).ApifyStorageLocal() + : undefined, }); await Actor.setValue('INPUT', input, { contentType }); } @@ -310,7 +318,9 @@ export async function getApifyToken() { const authPath = join(homedir(), '.apify', 'auth.json'); if (!existsSync(authPath)) { - throw new Error('You need to be logged in with your Apify account to run E2E tests. Call "apify login" to fix that.'); + throw new Error( + 'You need to be logged in with your Apify account to run E2E tests. 
Call "apify login" to fix that.', + ); } const { token } = await fs.readJSON(authPath); diff --git a/test/shared/_helper.ts b/test/shared/_helper.ts index 9a2bfc1b84dd..8e77fc5773ff 100644 --- a/test/shared/_helper.ts +++ b/test/shared/_helper.ts @@ -16,13 +16,14 @@ export const startExpressAppPromise = async (app: Application, port: number) => export const responseSamples = { json: { foo: 'bar' }, - xml: '\n' - + '\n' - + '\n' - + ' https://apify.com\n' - + ' Web Scraping, Data Extraction and Automation · Apify\n' - + '\n' - + '', + xml: + '\n' + + '\n' + + '\n' + + ' https://apify.com\n' + + ' Web Scraping, Data Extraction and Automation · Apify\n' + + '\n' + + '', complexXml: fs.readFileSync(path.join(__dirname, 'data/complex.xml'), 'utf-8'), image: fs.readFileSync(path.join(__dirname, 'data/apify.png')), html: ` @@ -176,9 +177,11 @@ console.log('Hello world!'); export async function runExampleComServer(): Promise<[Server, number]> { const app = express(); - app.use(bodyParser.urlencoded({ - extended: true, - })); + app.use( + bodyParser.urlencoded({ + extended: true, + }), + ); app.use(bodyParser.json()); const special = express.Router(); @@ -219,9 +222,7 @@ export async function runExampleComServer(): Promise<[Server, number]> { }); special.post('/jsonError', (_req, res) => { - res - .status(500) - .json({ message: 'CUSTOM_ERROR' }); + res.status(500).json({ message: 'CUSTOM_ERROR' }); }); special.get('/mirror', (_req, res) => { @@ -265,10 +266,7 @@ export async function runExampleComServer(): Promise<[Server, number]> { }); special.get('/cloudflareBlocking', async (_req, res) => { - res - .type('html') - .status(403) - .send(responseSamples.cloudflareBlocking); + res.type('html').status(403).send(responseSamples.cloudflareBlocking); }); })(); diff --git a/test/shared/data/html_to_text_test_data.ts b/test/shared/data/html_to_text_test_data.ts index bbe18af5a6e4..c7832f97e380 100644 --- a/test/shared/data/html_to_text_test_data.ts +++ b/test/shared/data/html_to_text_test_data.ts @@ -4,11 +4,12 @@ // We're keeping this text as a JS string, because git and other // tools do magic with line endings and it can break tests. // E.g. LF -> CRLF on Win or auto-trimming of lines in editors. -export const text = 'Let\'s start with a simple text. \n' + - 'The ships hung in the sky, much the way that bricks don\'t. \n' + - 'These aren\'t the Droids you\'re looking for\n' + - 'I\'m sorry, Dave. I\'m afraid I can\'t do that.\n' + - 'I\'m sorry, Dave. I\'m afraid I can\'t do that.\n' + +export const text = + "Let's start with a simple text. \n" + + "The ships hung in the sky, much the way that bricks don't. \n" + + "These aren't the Droids you're looking for\n" + + "I'm sorry, Dave. I'm afraid I can't do that.\n" + + "I'm sorry, Dave. I'm afraid I can't do that.\n" + 'A1\tA2\tA3\t\n' + 'B1\tB2\tB3\tB 4\t\n' + 'This is some text with inline elements and HTML entities (>bla<) \n' + @@ -23,7 +24,7 @@ export const text = 'Let\'s start with a simple text. \n' + ' block should be kept\n' + ' pre-formatted.\n' + 'The Greatest Science Fiction Quotes Of All Time \n' + - 'Don\'t know, I don\'t know such stuff. I just do eyes, ju-, ju-, just eyes... just genetic design, just eyes. You Nexus, huh? I design your eyes.' + "Don't know, I don't know such stuff. I just do eyes, ju-, ju-, just eyes... just genetic design, just eyes. You Nexus, huh? I design your eyes."; export const html = ` @@ -119,4 +120,4 @@ But, just eyes. You Nexus, huh? I design your eyes.

-`
+`;
diff --git a/test/utils/cheerio.test.ts b/test/utils/cheerio.test.ts
index 7ec79857f953..a3d7dc0c404d 100644
--- a/test/utils/cheerio.test.ts
+++ b/test/utils/cheerio.test.ts
@@ -53,7 +53,10 @@ describe('htmlToText()', () => {
         checkHtmlToText('<div><h1>Header 1</h1> <br> <h2>Header 2</h2></div>', 'Header 1\n\nHeader 2');
         checkHtmlToText('<div><h1>Header 1</h1> \n <br> \n <h2>Header 2</h2></div>', 'Header 1\n\nHeader 2');
         checkHtmlToText('<div><h1>Header 1</h1> \n <br> <br> \n <h2>Header 2</h2></div>', 'Header 1\n\n\nHeader 2');
-        checkHtmlToText('<div><h1>Header 1</h1> \n <br> <br> <br> \n <h2>Header 2</h2></div>', 'Header 1\n\n\n\nHeader 2');
+        checkHtmlToText(
+            '<div><h1>Header 1</h1> \n <br> <br> <br> \n <h2>Header 2</h2></div>',
+            'Header 1\n\n\n\nHeader 2',
+        );
         checkHtmlToText('<div>Div</div><p>Paragraph</p>', 'Div\nParagraph');
         checkHtmlToText('<div>Div1</div><div>Div2</div>
', 'Div1\nDiv2');
diff --git a/test/utils/extract-urls.test.ts b/test/utils/extract-urls.test.ts
index 234f2f36c11a..a9c634947bba 100644
--- a/test/utils/extract-urls.test.ts
+++ b/test/utils/extract-urls.test.ts
@@ -1,11 +1,7 @@
 import fs from 'node:fs';
 import path from 'node:path';
 
-import {
-    downloadListOfUrls,
-    extractUrls,
-    URL_WITH_COMMAS_REGEX,
-} from '@crawlee/utils';
+import { downloadListOfUrls, extractUrls, URL_WITH_COMMAS_REGEX } from '@crawlee/utils';
 
 vitest.mock('@crawlee/utils/src/internals/gotScraping', async () => {
     return {
@@ -22,13 +18,18 @@ const gotScrapingSpy = vitest.mocked(gotScraping);
 describe('downloadListOfUrls()', () => {
     test('downloads a list of URLs', async () => {
         const text = fs.readFileSync(path.join(baseDataPath, 'simple_url_list.txt'), 'utf8');
-        const arr = text.trim().split(/[\r\n]+/g).map((u) => u.trim());
+        const arr = text
+            .trim()
+            .split(/[\r\n]+/g)
+            .map((u) => u.trim());
 
         gotScrapingSpy.mockResolvedValueOnce({ body: text });
 
-        await expect(downloadListOfUrls({
-            url: 'http://www.nowhere12345.com',
-        })).resolves.toEqual(arr);
+        await expect(
+            downloadListOfUrls({
+                url: 'http://www.nowhere12345.com',
+            }),
+        ).resolves.toEqual(arr);
     });
 });
 
@@ -41,16 +42,21 @@ describe('extractUrls()', () => {
 
     const getURLData = (filename: string) => {
         const string = fs.readFileSync(path.join(baseDataPath, filename), 'utf8');
-        const array = string.trim().split(/[\r\n]+/g).map((u) => u.trim());
+        const array = string
+            .trim()
+            .split(/[\r\n]+/g)
+            .map((u) => u.trim());
         return { string, array };
     };
 
-    const makeJSON = ({ string, array }: { string: string; array: string[] }) => JSON.stringify({
-        one: [{ http: string }],
-        two: array.map((url) => ({ num: 123, url })),
-    });
+    const makeJSON = ({ string, array }: { string: string; array: string[] }) =>
+        JSON.stringify({
+            one: [{ http: string }],
+            two: array.map((url) => ({ num: 123, url })),
+        });
 
-    const makeCSV = (array: string[], delimiter?: string) => array.map((url) => ['ABC', 233, url, '.'].join(delimiter || ',')).join('\n');
+    const makeCSV = (array: string[], delimiter?: string) =>
+        array.map((url) => ['ABC', 233, url, '.'].join(delimiter || ',')).join('\n');
 
     const makeText = (array: string[]) => {
         const text = fs.readFileSync(path.join(baseDataPath, 'lipsum.txt'), 'utf8').split('');
diff --git a/test/utils/general.test.ts b/test/utils/general.test.ts
index 1c64f34e77c9..a6760fe93c00 100644
--- a/test/utils/general.test.ts
+++ b/test/utils/general.test.ts
@@ -6,7 +6,9 @@ import { isDocker, weightedAvg, sleep, snakeCaseToCamelCase } from '@crawlee/uti
 describe('isDocker()', () => {
     test('works for dockerenv && cgroup', async () => {
         const statMock = vitest.spyOn(asyncFs, 'stat').mockImplementationOnce(async () => Promise.resolve(null));
-        const readMock = vitest.spyOn(asyncFs, 'readFile').mockImplementationOnce(async () => Promise.resolve('something ... docker ... something'));
+        const readMock = vitest
+            .spyOn(asyncFs, 'readFile')
+            .mockImplementationOnce(async () => Promise.resolve('something ... docker ... something'));
 
         const is = await isDocker(true);
 
@@ -15,7 +17,9 @@ describe('isDocker()', () => {
 
     test('works for dockerenv', async () => {
         const statMock = vitest.spyOn(asyncFs, 'stat').mockImplementationOnce(async () => Promise.resolve(null));
-        const readMock = vitest.spyOn(asyncFs, 'readFile').mockImplementationOnce(async () => Promise.resolve('something ... ... something'));
+        const readMock = vitest
+            .spyOn(asyncFs, 'readFile')
+            .mockImplementationOnce(async () => Promise.resolve('something ... ... something'));
 
         const is = await isDocker(true);
 
@@ -23,8 +27,12 @@ describe('isDocker()', () => {
     });
 
     test('works for cgroup', async () => {
-        const statMock = vitest.spyOn(asyncFs, 'stat').mockImplementationOnce(async () => Promise.reject(new Error('no.')));
-        const readMock = vitest.spyOn(asyncFs, 'readFile').mockImplementationOnce(async () => Promise.resolve('something ... docker ... something'));
+        const statMock = vitest
+            .spyOn(asyncFs, 'stat')
+            .mockImplementationOnce(async () => Promise.reject(new Error('no.')));
+        const readMock = vitest
+            .spyOn(asyncFs, 'readFile')
+            .mockImplementationOnce(async () => Promise.resolve('something ... docker ... something'));
 
         const is = await isDocker(true);
 
@@ -32,8 +40,12 @@ describe('isDocker()', () => {
     });
 
     test('works for nothing', async () => {
-        const statMock = vitest.spyOn(asyncFs, 'stat').mockImplementationOnce(async () => Promise.reject(new Error('no.')));
-        const readMock = vitest.spyOn(asyncFs, 'readFile').mockImplementationOnce(async () => Promise.resolve('something ... ... something'));
+        const statMock = vitest
+            .spyOn(asyncFs, 'stat')
+            .mockImplementationOnce(async () => Promise.reject(new Error('no.')));
+        const readMock = vitest
+            .spyOn(asyncFs, 'readFile')
+            .mockImplementationOnce(async () => Promise.resolve('something ... ... something'));
 
         const is = await isDocker(true);
 
@@ -46,7 +58,7 @@ describe('weightedAvg()', () => {
         expect(weightedAvg([10, 10, 10], [1, 1, 1])).toBe(10);
         expect(weightedAvg([5, 10, 15], [1, 1, 1])).toBe(10);
         expect(weightedAvg([10, 10, 10], [0.5, 1, 1.5])).toBe(10);
-        expect(weightedAvg([29, 35, 89], [13, 91, 3])).toEqual(((29 * 13) + (35 * 91) + (89 * 3)) / (13 + 91 + 3));
+        expect(weightedAvg([29, 35, 89], [13, 91, 3])).toEqual((29 * 13 + 35 * 91 + 89 * 3) / (13 + 91 + 3));
         expect(weightedAvg([], [])).toEqual(NaN);
         expect(weightedAvg([1], [0])).toEqual(NaN);
         expect(weightedAvg([], [1])).toEqual(NaN);
@@ -72,10 +84,10 @@ describe('sleep()', () => {
 describe('snakeCaseToCamelCase()', () => {
     test('should camel case all sneaky cases of snake case', () => {
         const tests = {
-            'aaa_bbb_': 'aaaBbb',
+            aaa_bbb_: 'aaaBbb',
             '': '',
-            'AaA_bBb_cCc': 'aaaBbbCcc',
-            'a_1_b_1a': 'a1B1a',
+            AaA_bBb_cCc: 'aaaBbbCcc',
+            a_1_b_1a: 'a1B1a',
         };
 
         Object.entries(tests).forEach(([snakeCase, camelCase]) => {
diff --git a/test/utils/social.test.ts b/test/utils/social.test.ts
index 5f78f632f79a..11f1c38661b6 100644
--- a/test/utils/social.test.ts
+++ b/test/utils/social.test.ts
@@ -47,30 +47,33 @@ describe('utils.social', () => {
 
         test('extracts emails correctly', () => {
             testEmailsFromText(' info@example.com ', ['info@example.com']);
 
-            testEmailsFromText(`
+            testEmailsFromText(
+                `
                 info@example.com
                 info+something@example.NET
                 john.bob.dole@some-domain.co.uk
-            `, [
-                'info@example.com',
-                'info+something@example.NET',
-                'john.bob.dole@some-domain.co.uk',
-            ]);
+            `,
+                ['info@example.com', 'info+something@example.NET', 'john.bob.dole@some-domain.co.uk'],
+            );
 
-            testEmailsFromText(`
+            testEmailsFromText(
+                `
                 this'is'also'valid'email@EXAMPLE.travel
                 easy-address@some-domain.co.uk \n\n easy-address@some-domain.co.uk
                 not @ an.email.com
                 @also.not.an.email
-            `, [
-                'this\'is\'also\'valid\'email@EXAMPLE.travel',
-                'easy-address@some-domain.co.uk',
-                'easy-address@some-domain.co.uk',
-            ]);
+            `,
+                [
+                    "this'is'also'valid'email@EXAMPLE.travel",
+                    'easy-address@some-domain.co.uk',
+                    'easy-address@some-domain.co.uk',
+                ],
+            );
 
-            testEmailsFromText(' some.super.long.email.address@some.super.long.domain.name.co.br ',
-                ['some.super.long.email.address@some.super.long.domain.name.co.br']);
+            testEmailsFromText(' some.super.long.email.address@some.super.long.domain.name.co.br ', [
+                'some.super.long.email.address@some.super.long.domain.name.co.br',
+            ]);
         });
     });
 
@@ -102,35 +105,29 @@ describe('utils.social', () => {
             // @ts-expect-error invalid input type
             expect(emailsFromUrls([1, 2, {}, 'fwef', null, undefined])).toEqual([]);
 
-            expect(emailsFromUrls([
-                'mailto:info@example.com',
-            ])).toEqual([
-                'info@example.com',
-            ]);
-
-            expect(emailsFromUrls([
-                'http://www.example.com',
-                'mailto:info@example.com',
-                'mailto:info@example.com',
-                'email.without.mailto.prefix@example.com',
-                '',
-                '\n\n\n',
-            ])).toEqual([
-                'info@example.com',
-                'info@example.com',
-            ]);
-
-            expect(emailsFromUrls([
-                'http://www.example.com',
-                'mailto:info@example.com',
-                'mailto:info@example.com',
-                'email.without.mailto.prefix@example.com',
-                '',
-                '\n\n\n',
-            ])).toEqual([
-                'info@example.com',
-                'info@example.com',
-            ]);
+            expect(emailsFromUrls(['mailto:info@example.com'])).toEqual(['info@example.com']);
+
+            expect(
+                emailsFromUrls([
+                    'http://www.example.com',
+                    'mailto:info@example.com',
+                    'mailto:info@example.com',
+                    'email.without.mailto.prefix@example.com',
+                    '',
+                    '\n\n\n',
+                ]),
+            ).toEqual(['info@example.com', 'info@example.com']);
+
+            expect(
+                emailsFromUrls([
+                    'http://www.example.com',
+                    'mailto:info@example.com',
+                    'mailto:info@example.com',
+                    'email.without.mailto.prefix@example.com',
+                    '',
+                    '\n\n\n',
+                ]),
+            ).toEqual(['info@example.com', 'info@example.com']);
         });
     });
 
@@ -153,7 +150,8 @@ describe('utils.social', () => {
         });
 
         test('extracts phones correctly', () => {
-            testPhonesFromText(`
+            testPhonesFromText(
+                `
                 +420775123456
                 +420775123456
                 +420 775 123 456
@@ -162,88 +160,97 @@ describe('utils.social', () => {
                 775123456
                 775123456
                 00420775123456
                 1234567
                 1234567890
                 +44 7911 123456
-            `, [
-                '+420775123456',
-                '+420775123456',
-                '+420 775 123 456',
-                '775123456',
-                '775123456',
-                '00420775123456',
-                '1234567',
-                '1234567890',
-                '+44 7911 123456',
-            ]);
+            `,
+                [
+                    '+420775123456',
+                    '+420775123456',
+                    '+420 775 123 456',
+                    '775123456',
+                    '775123456',
+                    '00420775123456',
+                    '1234567',
+                    '1234567890',
+                    '+44 7911 123456',
+                ],
+            );
-            testPhonesFromText(`
+            testPhonesFromText(
+                `
                 413-577-1234
                 00413-577-1234
                 981-413-777-8888
                 413.233.2343 +413.233.2343 or 413 233 2343
                 562-3113
                 123456789
                 401 311 7898
                 123456789
-            `, [
-                '413-577-1234',
-                '00413-577-1234',
-                '981-413-777-8888',
-                '413.233.2343',
-                '+413.233.2343',
-                '413 233 2343',
-                '562-3113',
-                '123456789',
-                '401 311 7898',
-                '123456789',
-            ]);
+            `,
+                [
+                    '413-577-1234',
+                    '00413-577-1234',
+                    '981-413-777-8888',
+                    '413.233.2343',
+                    '+413.233.2343',
+                    '413 233 2343',
+                    '562-3113',
+                    '123456789',
+                    '401 311 7898',
+                    '123456789',
+                ],
+            );
-            testPhonesFromText(`
+            testPhonesFromText(
+                `
                 1 (413) 555-2378
                 +1 (413) 555-2378
                 1(413)555-2378
                 001 (413) 555-2378
                 1 (413) 555 2378
                 1(413)555-2378 or 1(413)555.2378 or 1 (413) 555-2378 or 1 (413) 555 2378 or (303) 494-2320
-            `, [
-                '1 (413) 555-2378',
-                '+1 (413) 555-2378',
-                '1(413)555-2378',
-                '001 (413) 555-2378',
-                '1 (413) 555 2378',
-                '1(413)555-2378',
-                '1(413)555.2378',
-                '1 (413) 555-2378',
-                '1 (413) 555 2378',
-                '(303) 494-2320',
-            ]);
+            `,
+                [
+                    '1 (413) 555-2378',
+                    '+1 (413) 555-2378',
+                    '1(413)555-2378',
+                    '001 (413) 555-2378',
+                    '1 (413) 555 2378',
+                    '1(413)555-2378',
+                    '1(413)555.2378',
+                    '1 (413) 555-2378',
+                    '1 (413) 555 2378',
+                    '(303) 494-2320',
+                ],
+            );
-            testPhonesFromText(`
+            testPhonesFromText(
+                `
                 123-456-789
                 123 456 789
                 123.456.789
                 123.456.789.123 +123.456.789.123
-            `, [
-                '123-456-789',
-                '123 456 789',
-                '123.456.789',
-                '123.456.789.123',
-                '+123.456.789.123',
-            ]);
+            `,
+                ['123-456-789', '123 456 789', '123.456.789', '123.456.789.123', '+123.456.789.123'],
+            );
-            testPhonesFromText(`
+            testPhonesFromText(
+                `
                 (000)000-0000
                 (000)000 0000
                 (000)000.0000
                 (000) 000-0000
                 (000) 000 0000
                 (000) 000.0000
-            `, [
-                '(000)000-0000',
-                '(000)000 0000',
-                '(000)000.0000',
-                '(000) 000-0000',
-                '(000) 000 0000',
-                '(000) 000.0000',
-            ]);
+            `,
+                [
+                    '(000)000-0000',
+                    '(000)000 0000',
+                    '(000)000.0000',
+                    '(000) 000-0000',
+                    '(000) 000 0000',
+                    '(000) 000.0000',
+                ],
+            );
-            testPhonesFromText(`
+            testPhonesFromText(
+                `
                 000-0000
                 000 0000
                 000.0000
@@ -251,22 +258,20 @@ describe('utils.social', () => {
                 0000000
                 0000000000
                 (000)0000000
-            `, [
-                '000-0000',
-                '000 0000',
-                '000.0000',
-                '0000000',
-                '0000000000',
-                '(000)0000000',
-            ]);
+            `,
+                ['000-0000', '000 0000', '000.0000', '0000000', '0000000000', '(000)0000000'],
+            );
         });
 
         test('skips invalid phones', () => {
-            testPhonesFromText(`
+            testPhonesFromText(
+                `
                 2018-10-11
                 123 456789
                 345
                 1 2 3 4 5 6 7 8
-            `, []);
+            `,
+                [],
+            );
         });
     });
 
@@ -298,20 +303,22 @@ describe('utils.social', () => {
             // @ts-expect-error invalid input type
             expect(phonesFromUrls([1, 2, {}, 'fwef', null, undefined])).toEqual([]);
 
-            expect(phonesFromUrls([
-                'tel:12345678',
-                'tel:/22345678', //
-                'tel://32345678',
-                'PHONE:42345678', //
-                'phone:/52345678',
-                'phone://62345678',
-                'telephone:72345678',
-                'telephone:/82345678',
-                'telephone://92345678',
-                'callto:97345678',
-                'CALLTO:/+98345678',
-                'callto://9992345678',
-            ])).toEqual([
+            expect(
+                phonesFromUrls([
+                    'tel:12345678',
+                    'tel:/22345678', //
+                    'tel://32345678',
+                    'PHONE:42345678', //
+                    'phone:/52345678',
+                    'phone://62345678',
+                    'telephone:72345678',
+                    'telephone:/82345678',
+                    'telephone://92345678',
+                    'callto:97345678',
+                    'CALLTO:/+98345678',
+                    'callto://9992345678',
+                ]),
+            ).toEqual([
                 '12345678',
                 '22345678',
                 '32345678',
@@ -326,18 +333,17 @@ describe('utils.social', () => {
                 '92345678',
                 '97345678',
                 '+98345678',
                 '9992345678',
             ]);
 
-            expect(phonesFromUrls([
-                'https://www.example.com',
-                'ftp://www.example.com',
-                '1234567',
-                '+42055555567',
-                'tel://+42012345678',
-                'tel://+420.123.456',
-                'http://www.example.com',
-            ])).toEqual([
-                '+42012345678',
-                '+420.123.456',
-            ]);
+            expect(
+                phonesFromUrls([
+                    'https://www.example.com',
+                    'ftp://www.example.com',
+                    '1234567',
+                    '+42055555567',
+                    'tel://+42012345678',
+                    'tel://+420.123.456',
+                    'http://www.example.com',
+                ]),
+            ).toEqual(['+42012345678', '+420.123.456']);
         });
     });
 
@@ -370,8 +376,9 @@ describe('utils.social', () => {
         test('works', () => {
             expect(parseHandlesFromHtml('')).toEqual(EMPTY_RESULT);
             expect(parseHandlesFromHtml(' ')).toEqual(EMPTY_RESULT);
-            const html = 'use the data in this [YouTube Video](https://www.youtube.com/watch?v=BsidLZKdYWQ).\\n\\n## Sample result\\n'
-                + 'use the data in this [YouTube Video](https://www.youtube.com/watch?v=BsidLZKd123).\\\\n\\\\n## Sample result\\\\n';
+            const html =
+                'use the data in this [YouTube Video](https://www.youtube.com/watch?v=BsidLZKdYWQ).\\n\\n## Sample result\\n' +
+                'use the data in this [YouTube Video](https://www.youtube.com/watch?v=BsidLZKd123).\\\\n\\\\n## Sample result\\\\n';
             expect(parseHandlesFromHtml(html)).toMatchObject({
                 youtubes: [
                     'https://www.youtube.com/watch?v=BsidLZKd123',
@@ -379,7 +386,8 @@ describe('utils.social', () => {
                 ],
             });
 
-            expect(parseHandlesFromHtml(`
+            expect(
+                parseHandlesFromHtml(`
 
 
                 Bla
@@ -447,11 +455,9 @@ describe('utils.social', () => {
 
                 Join our Discord community
 
-            `)).toEqual({
-                discords: [
-                    'discord.gg/discord-developers',
-                    'https://discord.com/invite/jyEM2PRvMU/',
-                ],
+            `),
+            ).toEqual({
+                discords: ['discord.gg/discord-developers', 'https://discord.com/invite/jyEM2PRvMU/'],
                 emails: ['alice@example.com', 'bob@example.com', 'carl@example.com', 'david@example.com'],
                 phones: ['+42077533333'],
                 phonesUncertain: [
@@ -491,11 +497,7 @@ describe('utils.social', () => {
                     'https://www.tiktok.com/trending?shareId=1234567890123456789/',
                     'm.tiktok.com/v/1234567890123456789',
                 ],
-                twitters: [
-                    'https://www.twitter.com/apify',
-                    'twitter.com/betasomething',
-                    'twitter.com/cblabla/',
-                ],
+                twitters: ['https://www.twitter.com/apify', 'twitter.com/betasomething', 'twitter.com/cblabla/'],
                 facebooks: [
                     'facebook.com/carl.username123/',
                     'fb.com/dada5678',
@@ -503,15 +505,14 @@ describe('utils.social', () => {
                     'https://www.facebook.com/bob.username123/',
                     'https://www.facebook.com/profile.php?id=1155802082',
                 ],
-                youtubes: [
-                    'https://youtu.be/kM7YfhfkiEE',
-                ],
+                youtubes: ['https://youtu.be/kM7YfhfkiEE'],
             });
         });
 
         test('data is set correctly', () => {
            const data = {} as any;
-            parseHandlesFromHtml(`
+            parseHandlesFromHtml(
+                `
 
 
                 Bla
@@ -520,7 +521,9 @@ describe('utils.social', () => {
 
                 Body content
 
-            `, data);
+            `,
+                data,
+            );
 
             expect(data.$('body').text().trim()).toBe('Body content');
             expect(data.text.trim()).toBe('Body content');
@@ -549,7 +552,10 @@ describe('utils.social', () => {
             expect(EMAIL_REGEX.test('dummy')).toBe(false);
 
             expect(EMAIL_REGEX_GLOBAL.test('bob@example.com')).toBe(true);
-            expect('bob@example.com alice@example.com'.match(EMAIL_REGEX_GLOBAL)).toEqual(['bob@example.com', 'alice@example.com']);
+            expect('bob@example.com alice@example.com'.match(EMAIL_REGEX_GLOBAL)).toEqual([
+                'bob@example.com',
+                'alice@example.com',
+            ]);
             expect(''.match(EMAIL_REGEX_GLOBAL)).toBe(null);
             expect(' dummy '.match(EMAIL_REGEX_GLOBAL)).toBe(null);
 
@@ -598,29 +604,32 @@ describe('utils.social', () => {
             expect(LINKEDIN_REGEX.test('0linkedin.com/in/bobnewman')).toBe(false);
             expect(LINKEDIN_REGEX.test('https://www.linkedin.com/in/bobnewman/?param=bla')).toBe(false);
             expect(LINKEDIN_REGEX.test('://linkedin.com/in/bobnewman')).toBe(false);
-            expect(LINKEDIN_REGEX.test('https://www.linkedin.com/in/bob https://www.linkedin.com/in/alice')).toBe(false);
+            expect(LINKEDIN_REGEX.test('https://www.linkedin.com/in/bob https://www.linkedin.com/in/alice')).toBe(
+                false,
+            );
 
             expect(LINKEDIN_REGEX_GLOBAL.test('https://www.linkedin.com/in/bobnewman')).toBe(true);
-            expect(`
+            expect(
+                `
                 https://www.linkedin.com/in/bobnewman
                 "http://ie.linkedin.com/in/alicenewman"
                 https://www.linkedin.com/in/someverylongnamesomeverylongnamesomeverylongnamesomeverylongnamesomeverylongnamesomeverylongname
                 linkedin.com/in/carlnewman
-            `.match(LINKEDIN_REGEX_GLOBAL)).toEqual([
+            `.match(LINKEDIN_REGEX_GLOBAL),
+            ).toEqual([
                 'https://www.linkedin.com/in/bobnewman',
                 'http://ie.linkedin.com/in/alicenewman',
                 'linkedin.com/in/carlnewman',
             ]);
-            expect(`
+            expect(
+                `
                 -https://www.linkedin.com/in/bobnewman/sub-dir
                 :http://ie.linkedin.com/in/alicenewman?param=1
                 xlinkedin.com/in/carlnewman
                 alinkedin.com/in/carlnewman
                 _linkedin.com/in/carlnewman
-            `.match(LINKEDIN_REGEX_GLOBAL)).toEqual([
-                'https://www.linkedin.com/in/bobnewman/',
-                'http://ie.linkedin.com/in/alicenewman',
-            ]);
+            `.match(LINKEDIN_REGEX_GLOBAL),
+            ).toEqual(['https://www.linkedin.com/in/bobnewman/', 'http://ie.linkedin.com/in/alicenewman']);
             expect(''.match(LINKEDIN_REGEX_GLOBAL)).toBe(null);
         });
     });
@@ -667,29 +676,32 @@ describe('utils.social', () => {
             expect(social.INSTAGRAM_REGEX.test('https://www.instagram.com/_u/')).toBe(false);
             expect(INSTAGRAM_REGEX.test('https://www.instagram.com/old_prague/?param=bla')).toBe(false);
             expect(INSTAGRAM_REGEX.test('://www.instagram.com/old_prague')).toBe(false);
-            expect(INSTAGRAM_REGEX.test('http://www.instagram.com/old_prague http://www.instagram.com/old_brno')).toBe(false);
+            expect(INSTAGRAM_REGEX.test('http://www.instagram.com/old_prague http://www.instagram.com/old_brno')).toBe(
+                false,
+            );
 
             expect(INSTAGRAM_REGEX_GLOBAL.test('https://www.instagram.com/old_prague')).toBe(true);
-            expect(`
+            expect(
+                `
                 https://www.instagram.com/old_prague
                 https://www.instagram.com/someverylongusernamethatisnotgood
                 "instagram.com/old_brno"
                 http://instagr.am/old_plzen
-            `.match(INSTAGRAM_REGEX_GLOBAL)).toEqual([
+            `.match(INSTAGRAM_REGEX_GLOBAL),
+            ).toEqual([
                 'https://www.instagram.com/old_prague',
                 'instagram.com/old_brno',
                 'http://instagr.am/old_plzen',
             ]);
-            expect(`
+            expect(
+                `
                 -https://www.instagram.com/old_prague/sub-dir
                 instagr.am/old_plzen?param=1
                 xinstagram.com/old_brno
                 ainstagram.com/old_brno
                 _instagram.com/old_brno
-            `.match(INSTAGRAM_REGEX_GLOBAL)).toEqual([
-                'https://www.instagram.com/old_prague/',
-                'instagr.am/old_plzen',
-            ]);
+            `.match(INSTAGRAM_REGEX_GLOBAL),
+            ).toEqual(['https://www.instagram.com/old_prague/', 'instagr.am/old_plzen']);
             expect(''.match(INSTAGRAM_REGEX_GLOBAL)).toBe(null);
         });
     });
@@ -735,28 +747,24 @@ describe('utils.social', () => {
            expect(TWITTER_REGEX.test('https://www.twitter.com/privacy/')).toBe(false);
 
            expect(TWITTER_REGEX_GLOBAL.test('https://www.twitter.com/apify')).toBe(true);
-            expect(`
+            expect(
+                `
                 https://www.twitter.com/apify
                 www.twitter.com/jack/sub-dir
                 www.twitter.com/invalidverylongtwitterhandlenotgood
                 twitter.com/bob123?param=1
-            `.match(TWITTER_REGEX_GLOBAL)).toEqual([
-                'https://www.twitter.com/apify',
-                'www.twitter.com/jack/',
-                'twitter.com/bob123',
-            ]);
-            expect(`
+            `.match(TWITTER_REGEX_GLOBAL),
+            ).toEqual(['https://www.twitter.com/apify', 'www.twitter.com/jack/', 'twitter.com/bob123']);
+            expect(
+                `
                 -https://www.twitter.com/apify
                 twitter.com/jack
                 twitter.com/carl123
                 xtwitter.com/bob
                 atwitter.com/bob
                 _twitter.com/bob
-            `.match(TWITTER_REGEX_GLOBAL)).toEqual([
-                'https://www.twitter.com/apify',
-                'twitter.com/jack',
-                'twitter.com/carl123',
-            ]);
+            `.match(TWITTER_REGEX_GLOBAL),
+            ).toEqual(['https://www.twitter.com/apify', 'twitter.com/jack', 'twitter.com/carl123']);
             expect(''.match(TWITTER_REGEX_GLOBAL)).toBe(null);
         });
     });
@@ -789,7 +797,9 @@ describe('utils.social', () => {
             // Test there is just on matching group for the username
             expect('https://www.facebook.com/someusername/'.match(FACEBOOK_REGEX)[1]).toBe('someusername');
             expect('https://www.facebook.com/someusername'.match(FACEBOOK_REGEX)[1]).toBe('someusername');
-            expect('https://www.facebook.com/profile.php?id=1155802082'.match(FACEBOOK_REGEX)[1]).toBe('profile.php?id=1155802082');
+            expect('https://www.facebook.com/profile.php?id=1155802082'.match(FACEBOOK_REGEX)[1]).toBe(
+                'profile.php?id=1155802082',
+            );
             expect('fb.com/someusername'.match(FACEBOOK_REGEX)[1]).toBe('someusername');
 
             expect(FACEBOOK_REGEX.test('')).toBe(false);
@@ -807,34 +817,32 @@ describe('utils.social', () => {
             expect(FACEBOOK_REGEX.test('https://www.facebook.com/someusername?param=bla')).toBe(false);
             expect(FACEBOOK_REGEX.test('://www.facebook.com/someusername')).toBe(false);
-            expect(FACEBOOK_REGEX.test('https://www.facebook.com/someusername https://www.facebook.com/jack')).toBe(false);
+            expect(FACEBOOK_REGEX.test('https://www.facebook.com/someusername https://www.facebook.com/jack')).toBe(
+                false,
+            );
             expect(FACEBOOK_REGEX.test('https://www.facebook.com/groups')).toBe(false);
             expect(FACEBOOK_REGEX.test('https://www.facebook.com/events')).toBe(false);
             expect(FACEBOOK_REGEX.test('https://www.facebook.com/policies/')).toBe(false);
 
             expect(FACEBOOK_REGEX_GLOBAL.test('https://www.facebook.com/someusername')).toBe(true);
-            expect(`
+            expect(
+                `
                 https://www.facebook.com/someusername?param=123
                 www.facebook.com/another123/sub-dir
                 https://www.facebook.com/waytoolongusernamewaytoolongusernamewaytoolongusernamewaytoolongusernamewaytoolongusername
                 fb.com/bob123
-            `.match(FACEBOOK_REGEX_GLOBAL)).toEqual([
-                'https://www.facebook.com/someusername',
-                'www.facebook.com/another123/',
-                'fb.com/bob123',
-            ]);
-            expect(`
+            `.match(FACEBOOK_REGEX_GLOBAL),
+            ).toEqual(['https://www.facebook.com/someusername', 'www.facebook.com/another123/', 'fb.com/bob123']);
+            expect(
+                `
                 -https://www.facebook.com/someusername/
                 facebook.com/jack4567
                 fb.com/carl123
                 xfacebook.com/bob
                 afacebook.com/bob
                 _facebook.com/bob
-            `.match(FACEBOOK_REGEX_GLOBAL)).toEqual([
-                'https://www.facebook.com/someusername/',
-                'facebook.com/jack4567',
-                'fb.com/carl123',
-            ]);
+            `.match(FACEBOOK_REGEX_GLOBAL),
+            ).toEqual(['https://www.facebook.com/someusername/', 'facebook.com/jack4567', 'fb.com/carl123']);
 
             expect(''.match(FACEBOOK_REGEX_GLOBAL)).toBe(null);
         });
     });
@@ -856,7 +864,9 @@ describe('utils.social', () => {
             expect(YOUTUBE_REGEX.test('https://www.youtube.com/user/pewdiepie')).toBe(true);
 
             expect(YOUTUBE_REGEX.test('://www.youtube.com/c/TrapNation')).toBe(false);
-            expect(YOUTUBE_REGEX.test('https://youtu.be/kM7YfhfkiEE https://www.youtube.com/user/pewdiepie')).toBe(false);
+            expect(YOUTUBE_REGEX.test('https://youtu.be/kM7YfhfkiEE https://www.youtube.com/user/pewdiepie')).toBe(
+                false,
+            );
             expect(YOUTUBE_REGEX.test('xyoutu.be/kM7YfhfkiEE')).toBe(false);
             expect(YOUTUBE_REGEX.test('-https://www.youtube.com/user/pewdiepie')).toBe(false);
 
@@ -864,10 +874,13 @@ describe('utils.social', () => {
             expect('https://www.youtube.com/watch?v=kM7YfhfkiEE'.match(social.YOUTUBE_REGEX)[1]).toBe('kM7YfhfkiEE');
             expect('https://youtu.be/kM7YfhfkiEE'.match(social.YOUTUBE_REGEX)[1]).toBe('kM7YfhfkiEE');
             expect('https://www.youtube.com/c/TrapNation'.match(social.YOUTUBE_REGEX)[1]).toBe('TrapNation');
-            expect('https://www.youtube.com/channel/UCklie6BM0fhFvzWYqQVoCTA'.match(social.YOUTUBE_REGEX)[1]).toBe('UCklie6BM0fhFvzWYqQVoCTA');
+            expect('https://www.youtube.com/channel/UCklie6BM0fhFvzWYqQVoCTA'.match(social.YOUTUBE_REGEX)[1]).toBe(
+                'UCklie6BM0fhFvzWYqQVoCTA',
+            );
             expect('https://www.youtube.com/user/pewdiepie'.match(social.YOUTUBE_REGEX)[1]).toBe('pewdiepie');
 
-            expect(`
+            expect(
+                `
                 https://www.youtube.com/apify/
                 -https://www.youtube.com/someusername/
                 youtube.com/jack4567
@@ -878,16 +891,16 @@ describe('utils.social', () => {
                 www.youtube.com/c/TrapNation
                 https://www.youtube.com/channel/UCklie6BM0fhFvzWYqQVoCTA
                 youtube.com/user/pewdiepie
-            `.match(YOUTUBE_REGEX_GLOBAL))
-                .toEqual([
-                    'https://www.youtube.com/apify',
-                    'https://www.youtube.com/someusername',
-                    'youtube.com/jack4567',
-                    'https://www.youtube.com/watch?v=kM7YfhfkiEE',
-                    'www.youtube.com/c/TrapNation',
-                    'https://www.youtube.com/channel/UCklie6BM0fhFvzWYqQVoCTA',
-                    'youtube.com/user/pewdiepie',
-                ]);
+            `.match(YOUTUBE_REGEX_GLOBAL),
+            ).toEqual([
+                'https://www.youtube.com/apify',
+                'https://www.youtube.com/someusername',
+                'youtube.com/jack4567',
+                'https://www.youtube.com/watch?v=kM7YfhfkiEE',
+                'www.youtube.com/c/TrapNation',
+                'https://www.youtube.com/channel/UCklie6BM0fhFvzWYqQVoCTA',
+                'youtube.com/user/pewdiepie',
+            ]);
         });
     });
     describe('TIKTOK_REGEX', () => {
@@ -913,12 +926,17 @@ describe('utils.social', () => {
             expect(TIKTOK_REGEX.test('0https://www.tiktok.com/trending?shareId=123456789')).toBe(false);
 
             // Test there is just one matching group for video id or username
-            expect('https://www.tiktok.com/trending?shareId=123456789'.match(TIKTOK_REGEX)[1]).toBe('trending?shareId=123456789');
+            expect('https://www.tiktok.com/trending?shareId=123456789'.match(TIKTOK_REGEX)[1]).toBe(
+                'trending?shareId=123456789',
+            );
             expect('www.tiktok.com/embed/123456789/'.match(TIKTOK_REGEX)[1]).toBe('embed/123456789');
             expect('tiktok.com/@jack'.match(TIKTOK_REGEX)[1]).toBe('@jack');
-            expect('https://www.tiktok.com/@username/video/123456789'.match(TIKTOK_REGEX)[1]).toBe('@username/video/123456789');
+            expect('https://www.tiktok.com/@username/video/123456789'.match(TIKTOK_REGEX)[1]).toBe(
+                '@username/video/123456789',
+            );
 
-            expect(`
+            expect(
+                `
                 https://www.tiktok.com/trending?shareId=123456789
                 www.tiktok.com/embed/123456789/
                 m.tiktok.com/v/123456789
@@ -931,17 +949,17 @@ describe('utils.social', () => {
                 https://www.tiktok.com/@jack1234/invalidSubpath/
                 https://www.tiktok.com/trending?shareId=1234567898904582904537057328079034789063454432789054378
                 https://www.tiktok.com/@userWithLongVideoName/video/123456789890458290453705732807903478904327890543654645365478
-            `.match(TIKTOK_REGEX_GLOBAL))
-                .toEqual([
-                    'https://www.tiktok.com/trending?shareId=123456789',
-                    'www.tiktok.com/embed/123456789/',
-                    'm.tiktok.com/v/123456789',
-                    'tiktok.com/@user',
-                    'https://www.tiktok.com/@username/video/123456789',
-                    'https://www.tiktok.com/@username/video/82347868',
-                    'https://www.tiktok.com/@jack1234/',
-                    'https://www.tiktok.com/@userWithLongVideoName/',
-                ]);
+            `.match(TIKTOK_REGEX_GLOBAL),
+            ).toEqual([
+                'https://www.tiktok.com/trending?shareId=123456789',
+                'www.tiktok.com/embed/123456789/',
+                'm.tiktok.com/v/123456789',
+                'tiktok.com/@user',
+                'https://www.tiktok.com/@username/video/123456789',
+                'https://www.tiktok.com/@username/video/82347868',
+                'https://www.tiktok.com/@jack1234/',
+                'https://www.tiktok.com/@userWithLongVideoName/',
+            ]);
         });
     });
 
@@ -974,7 +992,8 @@ describe('utils.social', () => {
             expect('pinterest.com/user_name.gold'.match(PINTEREST_REGEX)[1]).toBe('user_name.gold');
             expect('https://cz.pinterest.com/username/board'.match(PINTEREST_REGEX)[1]).toBe('username/board');
 
-            expect(`
+            expect(
+                `
                 https://pinterest.com/pin/123456789
                 -https://pinterest.com/pin/10084556789/
                 https://www.pinterest.cz/pin/123456789
@@ -984,17 +1003,17 @@ describe('utils.social', () => {
                 pinterest.com/user_name.gold
                 https://cz.pinterest.com/user/board
                 https://www.pinterest.cz/pin/nonNumericPinId
-            `.match(PINTEREST_REGEX_GLOBAL))
-                .toEqual([
-                    'https://pinterest.com/pin/123456789',
-                    'https://pinterest.com/pin/10084556789/',
-                    'https://www.pinterest.cz/pin/123456789',
-                    'https://www.pinterest.com/user',
-                    'https://uk.pinterest.com/user',
-                    'https://www.pinterest.co.uk/user',
-                    'pinterest.com/user_name.gold',
-                    'https://cz.pinterest.com/user/board',
-                ]);
+            `.match(PINTEREST_REGEX_GLOBAL),
+            ).toEqual([
+                'https://pinterest.com/pin/123456789',
+                'https://pinterest.com/pin/10084556789/',
+                'https://www.pinterest.cz/pin/123456789',
+                'https://www.pinterest.com/user',
+                'https://uk.pinterest.com/user',
+                'https://www.pinterest.co.uk/user',
+                'pinterest.com/user_name.gold',
+                'https://cz.pinterest.com/user/board',
+            ]);
         });
     });
 
@@ -1012,8 +1031,12 @@ describe('utils.social', () => {
             expect(DISCORD_REGEX.test('https://discord.gg/discord-developers')).toBe(true);
             expect(DISCORD_REGEX.test('https://discord.com/invite/jyEM2PRvMU')).toBe(true);
             expect(DISCORD_REGEX.test('https://discordapp.com/channels/231496023303957476')).toBe(true);
-            expect(DISCORD_REGEX.test('https://discord.com/channels/231496023303957476/2332823543826404586')).toBe(true);
-            expect(DISCORD_REGEX.test('https://ptb.discord.com/channels/231496023303957476/2332823543826404586')).toBe(true);
+            expect(DISCORD_REGEX.test('https://discord.com/channels/231496023303957476/2332823543826404586')).toBe(
+                true,
+            );
+            expect(DISCORD_REGEX.test('https://ptb.discord.com/channels/231496023303957476/2332823543826404586')).toBe(
+                true,
+            );
             expect(DISCORD_REGEX.test('ptb.discord.com/invite/jyEM2PRvMU')).toBe(true);
             expect(DISCORD_REGEX.test('canary.discord.com/invite/jyEM2PRvMU')).toBe(true);
 
@@ -1022,26 +1045,35 @@ describe('utils.social', () => {
             expect(DISCORD_REGEX.test('-discordapp.com/channels/231496023303957476/')).toBe(false);
 
             // Test there is just on matching group for the channel or invite (matches discord.* / discordapp.* prefix as well as they differ)
-            expect('https://discord.gg/discord-developers'.match(DISCORD_REGEX)[1]).toBe('discord.gg/discord-developers');
-            expect('https://discord.com/invite/jyEM2PRvMU'.match(DISCORD_REGEX)[1]).toBe('discord.com/invite/jyEM2PRvMU');
-            expect('https://discordapp.com/channels/231496023303957476'.match(DISCORD_REGEX)[1]).toBe('discordapp.com/channels/231496023303957476');
-            expect('https://discord.com/channels/231496023303957476/2332823543826404586'.match(DISCORD_REGEX)[1]).toBe('discord.com/channels/231496023303957476/2332823543826404586');
-
-            expect(`
+            expect('https://discord.gg/discord-developers'.match(DISCORD_REGEX)[1]).toBe(
+                'discord.gg/discord-developers',
+            );
+            expect('https://discord.com/invite/jyEM2PRvMU'.match(DISCORD_REGEX)[1]).toBe(
+                'discord.com/invite/jyEM2PRvMU',
+            );
+            expect('https://discordapp.com/channels/231496023303957476'.match(DISCORD_REGEX)[1]).toBe(
+                'discordapp.com/channels/231496023303957476',
+            );
+            expect('https://discord.com/channels/231496023303957476/2332823543826404586'.match(DISCORD_REGEX)[1]).toBe(
+                'discord.com/channels/231496023303957476/2332823543826404586',
+            );
+
+            expect(
+                `
                 https://discord.gg/discord-developers/
                 https://discord.com/invite/jyEM2PRvMU
                 -https://discordapp.com/channels/231496023303957476/
                 https://discord.com/channels/231496023303957476/2332823543826404586
                 discord.gg/discord-developers
                 https://discordapp.com/channels/nonNumbericChannelId
-            `.match(DISCORD_REGEX_GLOBAL))
-                .toEqual([
-                    'https://discord.gg/discord-developers/',
-                    'https://discord.com/invite/jyEM2PRvMU',
-                    'https://discordapp.com/channels/231496023303957476/',
-                    'https://discord.com/channels/231496023303957476/2332823543826404586',
-                    'discord.gg/discord-developers',
-                ]);
+            `.match(DISCORD_REGEX_GLOBAL),
+            ).toEqual([
+                'https://discord.gg/discord-developers/',
+                'https://discord.com/invite/jyEM2PRvMU',
+                'https://discordapp.com/channels/231496023303957476/',
+                'https://discord.com/channels/231496023303957476/2332823543826404586',
+                'discord.gg/discord-developers',
+            ]);
         });
     });
 });
diff --git a/tsconfig.build.json b/tsconfig.build.json
index 6583b7277812..a60757218988 100644
--- a/tsconfig.build.json
+++ b/tsconfig.build.json
@@ -1,25 +1,16 @@
 {
-  "extends": "@apify/tsconfig",
-  "compilerOptions": {
-    "target": "ES2020",
-    "lib": [
-      "ESNext",
-      "DOM",
-      "ES2020"
-    ],
-    "baseUrl": ".",
-    "allowJs": true,
-    "skipLibCheck": true,
-    "resolveJsonModule": false,
-    "emitDecoratorMetadata": false,
-    "module": "Node16",
-    "moduleResolution": "Node16"
-  },
-  "include": [
-    "./packages/*/src/**/*"
-  ],
-  "exclude": [
-    "**/node_modules",
-    "**/dist"
-  ]
+    "extends": "@apify/tsconfig",
+    "compilerOptions": {
+        "target": "ES2020",
+        "lib": ["ESNext", "DOM", "ES2020"],
+        "baseUrl": ".",
+        "allowJs": true,
+        "skipLibCheck": true,
+        "resolveJsonModule": false,
+        "emitDecoratorMetadata": false,
+        "module": "Node16",
+        "moduleResolution": "Node16"
+    },
+    "include": ["./packages/*/src/**/*"],
+    "exclude": ["**/node_modules", "**/dist"]
 }
diff --git a/turbo.json b/turbo.json
index 5ff0b5c053ee..43a29aeb0068 100644
--- a/turbo.json
+++ b/turbo.json
@@ -1,24 +1,16 @@
 {
-  "pipeline": {
-    "build": {
-      "dependsOn": [
-        "^build"
-      ],
-      "outputs": [
-        "dist/**"
-      ]
-    },
-    "copy": {
-      "dependsOn": [
-        "^copy"
-      ],
-      "outputs": [
-        "dist/**"
-      ]
-    },
-    "clean": {
-      "cache": false,
-      "outputs": []
-    }
-  }
+    "pipeline": {
+        "build": {
+            "dependsOn": ["^build"],
+            "outputs": ["dist/**"]
+        },
+        "copy": {
+            "dependsOn": ["^copy"],
+            "outputs": ["dist/**"]
+        },
+        "clean": {
+            "cache": false,
+            "outputs": []
+        }
+    }
 }
diff --git a/vitest.config.ts b/vitest.config.ts
index aff67df59fac..4fe7bfd0074f 100644
--- a/vitest.config.ts
+++ b/vitest.config.ts
@@ -24,11 +24,7 @@ export default defineConfig({
         coverage: {
             provider: 'v8',
             reporter: ['text', 'lcov', 'cobertura'],
-            exclude: [
-                '**/node_modules/**',
-                '**/dist/**',
-                '**/test/**',
-            ],
+            exclude: ['**/node_modules/**', '**/dist/**', '**/test/**'],
         },
         restoreMocks: true,
         ...threads,