From 8d6946c8ba419a021ba32c800efdd25fe05df7e1 Mon Sep 17 00:00:00 2001 From: Sviatozar Petrenko Date: Thu, 24 Aug 2023 18:03:07 +0300 Subject: [PATCH] refactor: make `purgeDefaultStorages` accept `options` with `onlyPurgeOnce` (#2044) Adds `onlyPurgeOnce` option to `purgeDefaultStorages` so that storages can be purged multiple times. For now, it preserves backwards-compatible behaviour, and all callers in crawlee explicitly set it to `true` --- .../src/internals/basic-crawler.ts | 2 +- packages/core/src/storages/dataset.ts | 2 +- packages/core/src/storages/key_value_store.ts | 2 +- packages/core/src/storages/request_list.ts | 2 +- packages/core/src/storages/request_queue.ts | 2 +- packages/core/src/storages/utils.ts | 39 ++++++++++++++++++- 6 files changed, 42 insertions(+), 7 deletions(-) diff --git a/packages/basic-crawler/src/internals/basic-crawler.ts b/packages/basic-crawler/src/internals/basic-crawler.ts index 90ff27c73e77..afc1f7ef7fac 100644 --- a/packages/basic-crawler/src/internals/basic-crawler.ts +++ b/packages/basic-crawler/src/internals/basic-crawler.ts @@ -775,7 +775,7 @@ export class BasicCrawler { options.config ??= Configuration.getGlobalConfig(); options.storageClient ??= options.config.getStorageClient(); - await purgeDefaultStorages(options.config, options.storageClient); + await purgeDefaultStorages({ onlyPurgeOnce: true, client: options.storageClient, config: options.config }); const manager = StorageManager.getManager>(this, options.config); diff --git a/packages/core/src/storages/key_value_store.ts b/packages/core/src/storages/key_value_store.ts index 47f28b3e8e4c..6d013a856ac4 100644 --- a/packages/core/src/storages/key_value_store.ts +++ b/packages/core/src/storages/key_value_store.ts @@ -417,7 +417,7 @@ export class KeyValueStore { options.config ??= Configuration.getGlobalConfig(); options.storageClient ??= options.config.getStorageClient(); - await purgeDefaultStorages(options.config, options.storageClient); + await 
purgeDefaultStorages({ onlyPurgeOnce: true, client: options.storageClient, config: options.config }); const manager = StorageManager.getManager(this, options.config); diff --git a/packages/core/src/storages/request_list.ts b/packages/core/src/storages/request_list.ts index 443bc777cb5c..fc4cf5b08ea6 100644 --- a/packages/core/src/storages/request_list.ts +++ b/packages/core/src/storages/request_list.ts @@ -345,7 +345,7 @@ export class RequestList { } this.isLoading = true; - await purgeDefaultStorages(); + await purgeDefaultStorages({ onlyPurgeOnce: true }); const [state, persistedRequests] = await this._loadStateAndPersistedRequests(); diff --git a/packages/core/src/storages/request_queue.ts b/packages/core/src/storages/request_queue.ts index 89c1c39b0803..36a2ee4b7a7d 100644 --- a/packages/core/src/storages/request_queue.ts +++ b/packages/core/src/storages/request_queue.ts @@ -954,7 +954,7 @@ export class RequestQueue { options.config ??= Configuration.getGlobalConfig(); options.storageClient ??= options.config.getStorageClient(); - await purgeDefaultStorages(options.config, options.storageClient); + await purgeDefaultStorages({ onlyPurgeOnce: true, client: options.storageClient, config: options.config }); const manager = StorageManager.getManager(this, options.config); const queue = await manager.openStorage(queueIdOrName, options.storageClient); diff --git a/packages/core/src/storages/utils.ts b/packages/core/src/storages/utils.ts index 651531da8e9e..7140b54c56b9 100644 --- a/packages/core/src/storages/utils.ts +++ b/packages/core/src/storages/utils.ts @@ -3,6 +3,12 @@ import type { Dictionary, StorageClient } from '@crawlee/types'; import { KeyValueStore } from './key_value_store'; import { Configuration } from '../configuration'; +interface PurgeDefaultStorageOptions { + onlyPurgeOnce?: boolean; + config?: Configuration; + client?: StorageClient; +} + /** * Cleans up the local storage folder (defaults to `./storage`) created when running code locally. 
 * Purging will remove all the files in all storages except for INPUT.json in the default KV store. @@ -16,10 +22,39 @@ import { Configuration } from '../configuration'; * this method will make sure the storage is purged only once for a given execution context, so it is safe to call * it multiple times. */ -export async function purgeDefaultStorages(config = Configuration.getGlobalConfig(), client: StorageClient = config.getStorageClient()) { +export async function purgeDefaultStorages(config?: Configuration, client?: StorageClient): Promise<void>; +/** + * Cleans up the local storage folder (defaults to `./storage`) created when running code locally. + * Purging will remove all the files in all storages except for INPUT.json in the default KV store. + * + * Purging of storages is happening automatically when we run our crawler (or when we open some storage + * explicitly, e.g. via `RequestList.open()`). We can disable that via `purgeOnStart` {@apilink Configuration} + * option or by setting `CRAWLEE_PURGE_ON_START` environment variable to `0` or `false`. + * + * This is a shortcut for running (optional) `purge` method on the StorageClient interface, in other words + * it will call the `purge` method of the underlying storage implementation we are currently using. In addition, + * this method will make sure the storage is purged only once for a given execution context, so it is safe to call + * it multiple times, unless you set `onlyPurgeOnce` to `false` in the `options` object. + */ +export async function purgeDefaultStorages(options?: PurgeDefaultStorageOptions): Promise<void>; +export async function purgeDefaultStorages( + configOrOptions?: Configuration | PurgeDefaultStorageOptions, + client?: StorageClient, +) { + const options: PurgeDefaultStorageOptions = configOrOptions instanceof Configuration ? { + client, + config: configOrOptions, + } : configOrOptions ?? 
{}; + const { + config = Configuration.getGlobalConfig(), + onlyPurgeOnce = true, + } = options; + ({ client = config.getStorageClient() } = options); + const casted = client as StorageClient & { __purged?: boolean }; - if (config.get('purgeOnStart') && !casted.__purged) { + // if `onlyPurgeOnce` is false, will purge anytime this function is called; otherwise - only once, and only when `purgeOnStart` is enabled + if (!onlyPurgeOnce || (config.get('purgeOnStart') && !casted.__purged)) { casted.__purged = true; await casted.purge?.(); }