diff --git a/llmfoundry/command_utils/data_prep/convert_text_to_mds.py b/llmfoundry/command_utils/data_prep/convert_text_to_mds.py index 9a1f8a912d..39b7178219 100644 --- a/llmfoundry/command_utils/data_prep/convert_text_to_mds.py +++ b/llmfoundry/command_utils/data_prep/convert_text_to_mds.py @@ -29,7 +29,6 @@ merge_shard_groups, ) from llmfoundry.utils.exceptions import ( - CannotUnicodeDecodeFile, DatasetTooSmallError, InputFolderMissingDataError, OutputFolderNotEmptyError, @@ -68,39 +67,36 @@ def __iter__(self) -> Iterable[dict[str, NDArray]]: buffer = [] for file in self.files: log.info(f'Processing file: {file}') - with open(file, 'r') as f: + with open(file, 'r', errors='replace') as f: buffer += self.bos_tokens first_chunk = True # Read the file in 1MB chunks to avoid memory issues - try: - for chunk in iter(partial(f.read, 1000000), ''): - # Tokenize the chunk - encoded = self.tokenizer( - chunk, - truncation=False, - padding=False, - ) - iids = encoded['input_ids'] - - # If this is not the first chunk, remove the BOS token - if not first_chunk: - if iids[0] == self.tokenizer.bos_token_id: - iids = iids[1:] - - # Add the tokens to the buffer - buffer += iids - while len(buffer) >= self.max_length: - concat_sample = buffer[:self.max_length] - buffer = buffer[self.max_length: - ] if self.should_wrap else [] - yield { - 'tokens': - np.asarray(concat_sample, dtype=np.int32), - } + for chunk in iter(partial(f.read, 1000000), ''): + # Tokenize the chunk + encoded = self.tokenizer( + chunk, + truncation=False, + padding=False, + ) + iids = encoded['input_ids'] + + # If this is not the first chunk, remove the BOS token + if not first_chunk: + if iids[0] == self.tokenizer.bos_token_id: + iids = iids[1:] + + # Add the tokens to the buffer + buffer += iids + while len(buffer) >= self.max_length: + concat_sample = buffer[:self.max_length] + buffer = buffer[self.max_length: + ] if self.should_wrap else [] + yield { + 'tokens': + np.asarray(concat_sample, dtype=np.int32), + } first_chunk = False - except UnicodeDecodeError: - raise CannotUnicodeDecodeFile(text_file=file) # Add the EOS token to the buffer to separate files. buffer += self.eos_tokens diff --git a/llmfoundry/utils/exceptions.py b/llmfoundry/utils/exceptions.py index 11895564f2..34c215f3ee 100644 --- a/llmfoundry/utils/exceptions.py +++ b/llmfoundry/utils/exceptions.py @@ -348,14 +348,6 @@ def __init__(self, input_folder: str) -> None: super().__init__(message, input_folder=input_folder) -class CannotUnicodeDecodeFile(UserError): - """Error thrown when the input folder is missing data.""" - - def __init__(self, text_file: str) -> None: - message = f'Text file {text_file} contains chars that cannot be utf-8 decoded. Please remove or replace these chars.' - super().__init__(message, text_file=text_file) - - class OutputFolderNotEmptyError(UserError): """Error thrown when the output folder is not empty.""" diff --git a/tests/a_scripts/data_prep/test_convert_text_to_mds.py b/tests/a_scripts/data_prep/test_convert_text_to_mds.py index d604565e59..302a540217 100644 --- a/tests/a_scripts/data_prep/test_convert_text_to_mds.py +++ b/tests/a_scripts/data_prep/test_convert_text_to_mds.py @@ -22,7 +22,6 @@ write_done_file, ) from llmfoundry.utils.exceptions import ( - CannotUnicodeDecodeFile, DatasetTooSmallError, InputFolderMissingDataError, OutputFolderNotEmptyError, @@ -291,28 +290,6 @@ def test_dataset_too_small(tmp_path: pathlib.Path): ) -def test_decode_invalid_unicode(tmp_path: pathlib.Path): - input_folder = tmp_path / 'input' - os.makedirs(input_folder, exist_ok=True) - with open(input_folder / 'test.txt', 'w', encoding='utf-16') as f: - f.write('HELLO WORLD') - with pytest.raises(CannotUnicodeDecodeFile): - convert_text_to_mds( - tokenizer_name='mosaicml/mpt-7b', - output_folder=str(tmp_path / 'output'), - input_folder=str(input_folder), - concat_tokens=1, - eos_text='', - bos_text='', - no_wrap=False, - compression='zstd', - processes=1, - args_str='Namespace()', - reprocess=False, - trust_remote_code=False, - ) - - def test_is_already_processed(tmp_path: pathlib.Path): tmp_path_str = str(tmp_path) args_str = 'Namespace(x = 5)'