From 19059089b16de43920cc2a79b21dd835018edab7 Mon Sep 17 00:00:00 2001 From: Benny Rubanov <106097466+bennyrubanov@users.noreply.github.com> Date: Sun, 24 Mar 2024 13:38:35 +0800 Subject: [PATCH] reverting back to 9abf7b16f11e9a08052d6747663818f5ca406320, then adding in terminal prompts for batch size and file size limit, as well as error catch for missing files --- package.json | 3 +- src/zst_decompressor.ts | 68 ++++++++++++++++++++++++++++++++++++----- 2 files changed, 63 insertions(+), 8 deletions(-) diff --git a/package.json b/package.json index ecfe5b5..045da9f 100644 --- a/package.json +++ b/package.json @@ -26,6 +26,7 @@ "build": "tsc --build", "start": "tsc --build && node dist/src/index.js", "test": "tsc --build && jest ./dist", - "scratch": "tsc --build && node dist/src/scratch.js" + "scratch": "tsc --build && node dist/src/scratch.js", + "run-zst-decompressor": "tsc --build && node dist/src/zst_decompressor.js" } } diff --git a/src/zst_decompressor.ts b/src/zst_decompressor.ts index f8796b9..644ebac 100644 --- a/src/zst_decompressor.ts +++ b/src/zst_decompressor.ts @@ -1,5 +1,6 @@ import { randomUUID } from 'crypto'; import * as path from 'path'; +import * as readline from 'readline'; // TODO: This should use type checking const fs = require('fs'); @@ -9,7 +10,46 @@ const { spawn } = require('child_process'); // 30 games = 10*1024 bytes, 1 game = 350 bytes, 1000 games = 330KB, 100K games = 33MB // 10MB yields around 30k games, 5GB = around 15 million games // const SIZE_LIMIT = 30 * 1024 * 1024; // 30MB -const SIZE_LIMIT = 0.1 * 1024 * 1024; // 0.5MB, for testing +let SIZE_LIMIT = 10 * 1024 * 1024; // Default 10MB + +const rl = readline.createInterface({ + input: process.stdin, + output: process.stdout +}); + +// Function to prompt for SIZE_LIMIT +const promptForSizeLimit = () => { + return new Promise<void>((resolve) => { + rl.question('Enter the SIZE_LIMIT in MB (default is 10MB): ', (input) => { + const inputSizeMB = parseInt(input, 10); + if 
(!isNaN(inputSizeMB) && inputSizeMB > 0) { + SIZE_LIMIT = inputSizeMB * 1024 * 1024; // Convert MB to bytes + console.log(`Using SIZE_LIMIT of ${SIZE_LIMIT} bytes.`); + } else { + console.log(`Invalid input. Using default SIZE_LIMIT of ${SIZE_LIMIT} bytes.`); + } + resolve(); + }); + }); +}; + +let concurrentFilesLimit = 10; // How many files are analyzed at one time (batch size) + +// Function to prompt for concurrent files limit +const promptForConcurrentFilesLimit = () => { + return new Promise<void>((resolve) => { + rl.question('Enter the number of files to analyze concurrently (default is 10): ', (input) => { + const inputLimit = parseInt(input, 10); + if (!isNaN(inputLimit) && inputLimit > 0) { + concurrentFilesLimit = inputLimit; + console.log(`Using concurrent files limit of ${concurrentFilesLimit}.`); + } else { + console.log(`Invalid input. Using default concurrent files limit of ${concurrentFilesLimit}.`); + } + resolve(); + }); + }); +}; // set the total size limit of the combined decompressed files (this is how much space you need to have available on your PC prior to running node src/streaming_partial_decompresser.js) const decompressedSizeLimit = 500 * 1024 * 1024 * 1024; // 500 GB represented in bytes @@ -193,7 +233,7 @@ const decompressAndAnalyze = async (file, start = 0) => { result.on('end', async () => { // When all data is decompressed, run the analysis on the produced files concurrently - for (const file of Array.from(filesProduced).slice(0, 5)) { + for (const file of Array.from(filesProduced).slice(0, concurrentFilesLimit)) { // TODO: this won't work out of the box for a large number of files as there is no max concurrency. 
But the sample only produces 4 decompressed files // I'm slicing to test this with a smaller number of files @@ -230,7 +270,17 @@ const processFiles = async (files: string[]) => { console.log(`Initiating decompression and analysis of files: ${files}...`); console.time('Final Total Compressed File Analysis Execution Time'); for (const file of files) { - await decompressAndAnalyze(file); + try { + // Check if the file exists before proceeding + const filePath = path.resolve(__dirname, '..', '..', 'data', file); + if (!fs.existsSync(filePath)) { + throw new Error(`File does not exist: ${filePath}`); + } + await decompressAndAnalyze(file); + } catch (error) { + console.error(`Error processing file ${file}: ${error instanceof Error ? error.message : error}`); + // Optionally, continue with the next file or handle the error as needed + } } console.timeEnd('Final Total Compressed File Analysis Execution Time'); }; @@ -248,7 +298,11 @@ module.exports = processFiles; // run if main if (require.main === module) { - // List of all the database files you want to analyze (these need to be downloaded and in data folder) - const files = ['lichess_db_standard_rated_2013-02.pgn.zst' /*...*/]; - processFiles(files); -} + promptForSizeLimit().then(() => { + promptForConcurrentFilesLimit().then(() => { + rl.close(); // Close the readline interface after all prompts + const files = ['lichess_db_standard_rated_2013-01.pgn.zst' /*...*/]; + processFiles(files); + }); + }); +} \ No newline at end of file