Skip to content

Commit

Permalink
Fix item_count being incremented twice
Browse files (browse the repository at this point in the history)
  • Loading branch information
cadlagtrader committed Aug 19, 2024
1 parent ffb39e8 commit 52a545d
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 1 deletion.
3 changes: 2 additions & 1 deletion src/crawlee/memory_storage_client/_creation_management.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,7 @@ def create_dataset_from_directory(
from crawlee.memory_storage_client.dataset_client import DatasetClient

item_count = 0
has_seen_metadata_file = False
created_at = datetime.now(timezone.utc)
accessed_at = datetime.now(timezone.utc)
modified_at = datetime.now(timezone.utc)
Expand All @@ -189,6 +190,7 @@ def create_dataset_from_directory(
metadata_filepath = os.path.join(storage_directory, METADATA_FILENAME)

if os.path.exists(metadata_filepath):
has_seen_metadata_file = True
with open(metadata_filepath, encoding='utf-8') as f:
json_content = json.load(f)
resource_info = DatasetMetadata(**json_content)
Expand All @@ -202,7 +204,6 @@ def create_dataset_from_directory(

# Load dataset entries
entries: dict[str, dict] = {}
has_seen_metadata_file = False

for entry in os.scandir(storage_directory):
if entry.is_file():
Expand Down
10 changes: 10 additions & 0 deletions tests/unit/memory_storage_client/test_dataset_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,3 +138,13 @@ async def test_iterate_items(dataset_client: DatasetClient) -> None:
assert len(actual_items) == item_count
assert actual_items[0]['id'] == 0
assert actual_items[99]['id'] == 99


async def test_reuse_dataset(dataset_client: DatasetClient, memory_storage_client: MemoryStorageClient) -> None:
    """Check that a dataset obtained again after purging the client's handled datasets keeps its item count.

    NOTE(review): presumably the ``dataset_client`` fixture is backed by the dataset named 'test' — confirm
    against the fixture definition.
    """
    expected_count = 10
    pushed_items = [{'id': index} for index in range(expected_count)]
    await dataset_client.push_items(pushed_items)

    # Forget the datasets the client already holds; the intent is to exercise
    # create_dataset_from_directory on the next lookup.
    memory_storage_client.datasets_handled = []

    reloaded_info = await memory_storage_client.datasets().get_or_create(name='test')
    assert reloaded_info.item_count == expected_count

0 comments on commit 52a545d

Please sign in to comment.