diff --git a/packages/basic-crawler/src/internals/basic-crawler.ts b/packages/basic-crawler/src/internals/basic-crawler.ts
index 3ca127e19252..74616eaf3c8d 100644
--- a/packages/basic-crawler/src/internals/basic-crawler.ts
+++ b/packages/basic-crawler/src/internals/basic-crawler.ts
@@ -40,6 +40,7 @@ import {
     validators,
     RetryRequestError,
     SessionError,
+    Dataset,
 } from '@crawlee/core';
 import type { Dictionary, Awaitable, BatchAddRequestsResult, SetStatusMessageOptions } from '@crawlee/types';
 import { ROTATE_PROXY_ERRORS } from '@crawlee/utils';
@@ -874,6 +875,29 @@ export class BasicCrawler
+    /**
+     * Pushes data to the default crawler {@apilink Dataset} by calling {@apilink Dataset.pushData}.
+     */
+    async pushData(...args: Parameters<Dataset['pushData']>): Promise<void> {
+        const dataset = await Dataset.open(undefined, { config: this.config });
+        return dataset.pushData(...args);
+    }
+
+    /**
+     * Retrieves the default crawler {@apilink Dataset} by calling {@apilink Dataset.open}.
+     */
+    async getDataset(): Promise<Dataset> {
+        return Dataset.open(undefined, { config: this.config });
+    }
+
+    /**
+     * Retrieves data from the default crawler {@apilink Dataset} by calling {@apilink Dataset.getData}.
+     */
+    async getData(...args: Parameters<Dataset['getData']>): ReturnType<Dataset['getData']> {
+        const dataset = await this.getDataset();
+        return dataset.getData(...args);
+    }
+
     protected async _init(): Promise<void> {
         if (!this.events.isInitialized()) {
             await this.events.init();
@@ -1068,6 +1092,9 @@ export class BasicCrawler
+            pushData: async (...args: Parameters<Dataset['pushData']>) => {
+                return this.pushData(...args);
+            },
             sendRequest: async (overrideOptions?: OptionsInit) => {
                 const cookieJar = session ? {
                     getCookieString: async (url: string) => session!.getCookieString(url),
diff --git a/packages/core/src/crawlers/crawler_commons.ts b/packages/core/src/crawlers/crawler_commons.ts
index dba824fe4d35..4a9e9c1291e5 100644
--- a/packages/core/src/crawlers/crawler_commons.ts
+++ b/packages/core/src/crawlers/crawler_commons.ts
@@ -6,6 +6,7 @@ import type { Log } from '../log';
 import type { ProxyInfo } from '../proxy_configuration';
 import type { Request } from '../request';
 import type { Session } from '../session_pool/session';
+import { type Dataset } from '../storages';
 
 // eslint-disable-next-line @typescript-eslint/ban-types
 export interface CrawlingContext<Crawler = unknown, UserData extends Dictionary = Dictionary> extends Record<string & {}, unknown> {
@@ -51,6 +52,15 @@ export interface CrawlingContext
+    /**
+     * This function allows you to push data to the default {@apilink Dataset} currently used by the crawler.
+     *
+     * Shortcut for `crawler.pushData()`.
+     *
+     * @param [data] Data to be pushed to the default dataset.
+     */
+    pushData(...args: Parameters<Dataset['pushData']>): Promise<void>;
+
     /**
      * Fires HTTP request via [`got-scraping`](https://crawlee.dev/docs/guides/got-scraping), allowing to override the request
      * options on the fly.
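Note: the following usage sketch is illustrative only and is not part of the patch. It assumes a Crawlee v3 project where BasicCrawler is exported from the crawlee package and top-level await is available; the URL and the pushed fields are placeholders.

import { BasicCrawler } from 'crawlee';

// Illustrative sketch of the helpers added above (not part of the diff).
const crawler = new BasicCrawler({
    async requestHandler({ request, pushData }) {
        // context.pushData() is a shortcut for crawler.pushData(),
        // which writes to the crawler's default Dataset.
        await pushData({ url: request.url });
    },
});

await crawler.run(['https://example.org/']);

// crawler.getData() reads the same default Dataset back.
const { items } = await crawler.getData();
console.log(items);

// crawler.getDataset() exposes the Dataset instance itself.
const dataset = await crawler.getDataset();
console.log((await dataset.getData()).count);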
diff --git a/test/core/crawlers/basic_crawler.test.ts b/test/core/crawlers/basic_crawler.test.ts
index 7d078d997826..cb13ff2d757d 100644
--- a/test/core/crawlers/basic_crawler.test.ts
+++ b/test/core/crawlers/basic_crawler.test.ts
@@ -1381,4 +1381,56 @@ describe('BasicCrawler', () => {
             expect(crawler).toBeTruthy();
         });
     });
+
+    describe('Dataset helpers, crawler paralellism', () => {
+        const payload: Dictionary[] = [{ foo: 'bar' }];
+        const getPayload: (id: string) => Dictionary[] = (id) => [{ foo: id }];
+
+        test('should expose default Dataset methods', async () => {
+            const crawler = new BasicCrawler();
+
+            await crawler.pushData(payload);
+
+            expect((await crawler.getData()).items)
+                .toEqual(payload);
+        });
+
+        test('should expose pushData helper', async () => {
+            const crawler = new BasicCrawler({
+                requestHandler: ({ pushData }) => pushData(payload),
+            });
+
+            await crawler.run([{
+                url: `http://${HOSTNAME}:${port}`,
+            }]);
+
+            expect((await crawler.getData()).items)
+                .toEqual(payload);
+        });
+
+        test("Crawlers with different Configurations don't share Datasets", async () => {
+            const crawlerA = new BasicCrawler({}, new Configuration({ persistStorage: false }));
+            const crawlerB = new BasicCrawler({}, new Configuration({ persistStorage: false }));
+
+            await crawlerA.pushData(getPayload('A'));
+            await crawlerB.pushData(getPayload('B'));
+
+            expect((await crawlerA.getData()).items)
+                .toEqual(getPayload('A'));
+
+            expect((await crawlerB.getData()).items)
+                .toEqual(getPayload('B'));
+        });
+
+        test('Crawlers with different Configurations run separately', async () => {
+            const crawlerA = new BasicCrawler({ requestHandler: () => {} }, new Configuration({ persistStorage: false }));
+            const crawlerB = new BasicCrawler({ requestHandler: () => {} }, new Configuration({ persistStorage: false }));
+
+            await crawlerA.run([{ url: `http://${HOSTNAME}:${port}` }]);
+            await crawlerB.run([{ url: `http://${HOSTNAME}:${port}` }]);
+
+            expect(crawlerA.stats.state.requestsFinished).toBe(1);
+            expect(crawlerB.stats.state.requestsFinished).toBe(1);
+        });
+    });
 });
diff --git a/test/core/multiple_crawlers.test.ts b/test/core/multiple_crawlers.test.ts
deleted file mode 100644
index cd40f7cb7af0..000000000000
--- a/test/core/multiple_crawlers.test.ts
+++ /dev/null
@@ -1,31 +0,0 @@
-import { MemoryStorage } from '@crawlee/memory-storage';
-import { CheerioCrawler, Configuration } from 'crawlee';
-
-describe('multiple crawlers', () => {
-    test('Crawler instances with different StorageClients do not affect each other', async () => {
-        const getCrawler = () => {
-            return new CheerioCrawler({
-                requestHandler: async () => {},
-            }, new Configuration({
-                storageClient: new MemoryStorage({
-                    persistStorage: false,
-                }),
-            }));
-        };
-
-        const a = getCrawler();
-
-        await a.run([
-            { url: 'https://example.org/' },
-        ]);
-
-        const b = getCrawler();
-
-        await b.run([
-            { url: 'https://example.org/' },
-        ]);
-
-        expect(a.stats.state.requestsFinished).toBe(1);
-        expect(b.stats.state.requestsFinished).toBe(1);
-    });
-});