Skip to content

Commit

Permalink
fix(MemoryStorage): ignore invalid files for request queues (#2132)
Browse files Browse the repository at this point in the history
Closes #1985
  • Loading branch information
vladfrangu authored Oct 17, 2023
1 parent 86647ce commit fa58581
Show file tree
Hide file tree
Showing 2 changed files with 75 additions and 1 deletion.
18 changes: 17 additions & 1 deletion packages/memory-storage/src/cache-helpers.ts
Original file line number Diff line number Diff line change
Expand Up @@ -307,8 +307,24 @@ export async function findRequestQueueByPossibleId(client: MemoryStorage, entryN
break;
}
default: {
// Skip non-JSON and files that start with a dot
if (entry.name.startsWith('.') || !entry.name.endsWith('.json')) {
continue;
}

const entryName = entry.name.split('.')[0];
entries.add(entryName);

try {
// Try parsing the file to ensure it's even valid to begin with
const fileContent = await readFile(resolve(requestQueueDir, entry.name), 'utf8');
JSON.parse(fileContent);

entries.add(entryName);
} catch (err) {
memoryStorageLog.warning(
`Request queue entry "${entry.name}" for store ${entryNameOrId} has invalid JSON content and will be ignored from the store.`,
);
}
}
}
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import { randomUUID } from 'node:crypto';
import { rm, writeFile } from 'node:fs/promises';
import { resolve } from 'node:path';

import { MemoryStorage } from '@crawlee/memory-storage';
import type { InternalRequest } from '@crawlee/memory-storage/src/resource-clients/request-queue';
import type { RequestSchema } from '@crawlee/types';
import { ensureDir } from 'fs-extra';

/**
 * Filesystem-fallback suite for request queues.
 *
 * A "default" queue directory is seeded on disk with a metadata file, a single
 * well-formed request (`123.json`) and two files that are not request entries
 * (`.DS_Store`, `invalid.txt`). Listing the queue head must then report exactly
 * one request.
 */
describe('when falling back to fs, Request queue should ignore non-JSON files', () => {
    const tmpLocation = resolve(__dirname, './tmp/req-queue-ignore-non-json');
    const storage = new MemoryStorage({ localDataDirectory: tmpLocation });

    beforeAll(async () => {
        // Every fixture lives inside the "default" request queue directory.
        const queueDir = resolve(storage.requestQueuesDirectory, 'default');
        await ensureDir(queueDir);

        // Queue metadata; all three timestamps carry the same fixed date.
        const fixedDate = new Date(2022, 0, 1);
        const metadata = {
            id: randomUUID(),
            name: 'default',
            createdAt: fixedDate,
            accessedAt: fixedDate,
            modifiedAt: fixedDate,
        };
        await writeFile(resolve(queueDir, '__metadata__.json'), JSON.stringify(metadata));

        // The only valid request entry in the directory.
        const requestPayload = {
            uniqueKey: 'owo',
            url: 'http://example.com',
            id: '123',
        } satisfies RequestSchema;
        const storedRequest = {
            id: '123',
            orderNo: 1,
            url: 'http://example.com',
            uniqueKey: 'owo',
            method: 'GET',
            retryCount: 0,
            json: JSON.stringify(requestPayload),
        } satisfies InternalRequest;
        await writeFile(resolve(queueDir, '123.json'), JSON.stringify(storedRequest));

        // Faulty entries: a dot-file and a non-JSON file, written one after the other.
        for (const junkFileName of ['.DS_Store', 'invalid.txt']) {
            await writeFile(resolve(queueDir, junkFileName), 'owo');
        }
    });

    afterAll(async () => {
        // Remove the temporary storage directory and everything under it.
        await rm(tmpLocation, { recursive: true, force: true });
    });

    test('attempting to list "default" request queue should ignore non-JSON files', async () => {
        const queueInfo = await storage.requestQueues().getOrCreate('default');
        const queueClient = storage.requestQueue(queueInfo.id);

        expect(queueInfo.name).toEqual('default');

        // Only `123.json` should be reported as a request.
        const head = await queueClient.listHead();
        expect(head.items).toHaveLength(1);
    });
});

0 comments on commit fa58581

Please sign in to comment.