feat(core): add default dataset helpers to BasicCrawler (#2057)
This enables users to do:
```javascript
import { CheerioCrawler } from 'crawlee';

const crawler = new CheerioCrawler();

await crawler.pushData({ test: 'abc' });
const data = await crawler.getData();
```

and to use the new `pushData` helper from inside the request handler:

```javascript
// ...
requestHandler: async ({ pushData }) => {
    // `title` and `url` are assumed to be extracted earlier in the handler.
    await pushData({ title, url });
},
// ...
```

All of the new methods respect the crawler's `Configuration` instance (by
using the appropriate `storageClient`).
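
As the test cases in this commit demonstrate, this means two crawlers constructed with separate `Configuration` instances write to separate default datasets. A minimal sketch of that isolation, based on those tests (`persistStorage: false` keeps storage in memory):

```javascript
import { BasicCrawler, Configuration } from 'crawlee';

// Each crawler resolves its default Dataset through its own
// Configuration's storage client, so the records never mix.
const crawlerA = new BasicCrawler({}, new Configuration({ persistStorage: false }));
const crawlerB = new BasicCrawler({}, new Configuration({ persistStorage: false }));

await crawlerA.pushData({ foo: 'A' });
await crawlerB.pushData({ foo: 'B' });

console.log((await crawlerA.getData()).items); // [{ foo: 'A' }]
console.log((await crawlerB.getData()).items); // [{ foo: 'B' }]
```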

---------

Co-authored-by: Martin Adámek <[email protected]>
barjin and B4nan authored Aug 30, 2023
1 parent 3f4c863 commit e2a7544
Showing 4 changed files with 89 additions and 31 deletions.
27 changes: 27 additions & 0 deletions packages/basic-crawler/src/internals/basic-crawler.ts
@@ -40,6 +40,7 @@ import {
validators,
RetryRequestError,
SessionError,
Dataset,
} from '@crawlee/core';
import type { Dictionary, Awaitable, BatchAddRequestsResult, SetStatusMessageOptions } from '@crawlee/types';
import { ROTATE_PROXY_ERRORS } from '@crawlee/utils';
@@ -874,6 +875,29 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
return requestQueue.addRequestsBatched(requests, options);
}

/**
* Pushes data to the default crawler {@apilink Dataset} by calling {@apilink Dataset.pushData}.
*/
async pushData(...args: Parameters<Dataset['pushData']>): Promise<void> {
const dataset = await Dataset.open(undefined, { config: this.config });
return dataset.pushData(...args);
}

/**
* Retrieves the default crawler {@apilink Dataset} by calling {@apilink Dataset.open}.
*/
async getDataset(): Promise<Dataset> {
return Dataset.open(undefined, { config: this.config });
}

/**
* Retrieves data from the default crawler {@apilink Dataset} by calling {@apilink Dataset.getData}.
*/
async getData(...args: Parameters<Dataset['getData']>): ReturnType<Dataset['getData']> {
const dataset = await this.getDataset();
return dataset.getData(...args);
}

protected async _init(): Promise<void> {
if (!this.events.isInitialized()) {
await this.events.init();
@@ -1068,6 +1092,9 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
...options,
});
},
pushData: async (...args: Parameters<Dataset['pushData']>) => {
return this.pushData(...args);
},
sendRequest: async (overrideOptions?: OptionsInit) => {
const cookieJar = session ? {
getCookieString: async (url: string) => session!.getCookieString(url),
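Taken together, the new `pushData`, `getDataset`, and `getData` methods let callers work with the default dataset without opening it by hand. A minimal usage sketch (the URL is only a placeholder; `BasicCrawler` performs no HTTP itself, so the handler just records each request):

```javascript
import { BasicCrawler } from 'crawlee';

const crawler = new BasicCrawler({
    async requestHandler({ request }) {
        // Writes to the default Dataset resolved via this crawler's Configuration.
        await crawler.pushData({ url: request.url });
    },
});

await crawler.run(['https://example.com']);

// getData() proxies Dataset.getData() on the same default dataset.
const { items } = await crawler.getData();
console.log(items); // the single record pushed by the handler

// getDataset() returns the Dataset instance itself for anything else.
const dataset = await crawler.getDataset();
await dataset.drop();
```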
10 changes: 10 additions & 0 deletions packages/core/src/crawlers/crawler_commons.ts
@@ -6,6 +6,7 @@ import type { Log } from '../log';
import type { ProxyInfo } from '../proxy_configuration';
import type { Request } from '../request';
import type { Session } from '../session_pool/session';
import { type Dataset } from '../storages';

// eslint-disable-next-line @typescript-eslint/ban-types
export interface CrawlingContext<Crawler = unknown, UserData extends Dictionary = Dictionary> extends Record<string & {}, unknown> {
@@ -51,6 +52,15 @@ export interface CrawlingContext<Crawler = unknown, UserData extends Dictionary
*/
enqueueLinks(options?: EnqueueLinksOptions): Promise<BatchAddRequestsResult>;

/**
* This function allows you to push data to the default {@apilink Dataset} currently used by the crawler.
*
* Shortcut for `crawler.pushData()`.
*
* @param [data] Data to be pushed to the default dataset.
*/
pushData(...args: Parameters<Dataset['pushData']>): Promise<void>;

/**
* Fires an HTTP request via [`got-scraping`](https://crawlee.dev/docs/guides/got-scraping), allowing you to override the request
* options on the fly.
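Since the helper lives on the shared `CrawlingContext` interface, it is available in every crawler class's request handler, not just `BasicCrawler`'s. A sketch with `CheerioCrawler` (the target site and selector are illustrative):

```javascript
import { CheerioCrawler } from 'crawlee';

const crawler = new CheerioCrawler({
    async requestHandler({ request, $, pushData }) {
        // Context-level shortcut for crawler.pushData().
        await pushData({
            url: request.loadedUrl,
            title: $('title').text(),
        });
    },
});

await crawler.run(['https://crawlee.dev']);
```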
52 changes: 52 additions & 0 deletions test/core/crawlers/basic_crawler.test.ts
@@ -1381,4 +1381,56 @@ describe('BasicCrawler', () => {
expect(crawler).toBeTruthy();
});
});

describe('Dataset helpers, crawler parallelism', () => {
const payload: Dictionary<any>[] = [{ foo: 'bar' }];
const getPayload: (id: string) => Dictionary<any>[] = (id) => [{ foo: id }];

test('should expose default Dataset methods', async () => {
const crawler = new BasicCrawler();

await crawler.pushData(payload);

expect((await crawler.getData()).items)
.toEqual(payload);
});

test('should expose pushData helper', async () => {
const crawler = new BasicCrawler({
requestHandler: ({ pushData }) => pushData(payload),
});

await crawler.run([{
url: `http://${HOSTNAME}:${port}`,
}]);

expect((await crawler.getData()).items)
.toEqual(payload);
});

test("Crawlers with different Configurations don't share Datasets", async () => {
const crawlerA = new BasicCrawler({}, new Configuration({ persistStorage: false }));
const crawlerB = new BasicCrawler({}, new Configuration({ persistStorage: false }));

await crawlerA.pushData(getPayload('A'));
await crawlerB.pushData(getPayload('B'));

expect((await crawlerA.getData()).items)
.toEqual(getPayload('A'));

expect((await crawlerB.getData()).items)
.toEqual(getPayload('B'));
});

test('Crawlers with different Configurations run separately', async () => {
const crawlerA = new BasicCrawler({ requestHandler: () => {} }, new Configuration({ persistStorage: false }));
const crawlerB = new BasicCrawler({ requestHandler: () => {} }, new Configuration({ persistStorage: false }));

await crawlerA.run([{ url: `http://${HOSTNAME}:${port}` }]);
await crawlerB.run([{ url: `http://${HOSTNAME}:${port}` }]);

expect(crawlerA.stats.state.requestsFinished).toBe(1);
expect(crawlerB.stats.state.requestsFinished).toBe(1);
});
});
});
31 changes: 0 additions & 31 deletions test/core/multiple_crawlers.test.ts

This file was deleted.
