From 52a545dae66ac0b288897dd449ddec9354474e81 Mon Sep 17 00:00:00 2001 From: cadlagtrader Date: Mon, 19 Aug 2024 18:03:56 +0400 Subject: [PATCH] fix item_count double incremented --- .../memory_storage_client/_creation_management.py | 3 ++- .../unit/memory_storage_client/test_dataset_client.py | 10 ++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/src/crawlee/memory_storage_client/_creation_management.py b/src/crawlee/memory_storage_client/_creation_management.py index e347cdc06..b02ca665b 100644 --- a/src/crawlee/memory_storage_client/_creation_management.py +++ b/src/crawlee/memory_storage_client/_creation_management.py @@ -181,6 +181,7 @@ def create_dataset_from_directory( from crawlee.memory_storage_client.dataset_client import DatasetClient item_count = 0 + has_seen_metadata_file = False created_at = datetime.now(timezone.utc) accessed_at = datetime.now(timezone.utc) modified_at = datetime.now(timezone.utc) @@ -189,6 +190,7 @@ def create_dataset_from_directory( metadata_filepath = os.path.join(storage_directory, METADATA_FILENAME) if os.path.exists(metadata_filepath): + has_seen_metadata_file = True with open(metadata_filepath, encoding='utf-8') as f: json_content = json.load(f) resource_info = DatasetMetadata(**json_content) @@ -202,7 +204,6 @@ def create_dataset_from_directory( # Load dataset entries entries: dict[str, dict] = {} - has_seen_metadata_file = False for entry in os.scandir(storage_directory): if entry.is_file(): diff --git a/tests/unit/memory_storage_client/test_dataset_client.py b/tests/unit/memory_storage_client/test_dataset_client.py index 02072f560..31818098d 100644 --- a/tests/unit/memory_storage_client/test_dataset_client.py +++ b/tests/unit/memory_storage_client/test_dataset_client.py @@ -138,3 +138,13 @@ async def test_iterate_items(dataset_client: DatasetClient) -> None: assert len(actual_items) == item_count assert actual_items[0]['id'] == 0 assert actual_items[99]['id'] == 99 + + +async def test_reuse_dataset(dataset_client: DatasetClient, memory_storage_client: MemoryStorageClient) -> None: + item_count = 10 + await dataset_client.push_items([{'id': i} for i in range(item_count)]) + + memory_storage_client.datasets_handled = [] # purge datasets loaded to test create_dataset_from_directory + datasets_client = memory_storage_client.datasets() + dataset_info = await datasets_client.get_or_create(name='test') + assert dataset_info.item_count == item_count