Skip to content

Commit

Permalink
Replace undecodable characters when reading text files instead of raising CannotUnicodeDecodeFile
Browse files Browse the repository at this point in the history
  • Loading branch information
mattyding committed Sep 16, 2024
1 parent a862d6e commit 7ccd6f3
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 60 deletions.
54 changes: 25 additions & 29 deletions llmfoundry/command_utils/data_prep/convert_text_to_mds.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@
merge_shard_groups,
)
from llmfoundry.utils.exceptions import (
CannotUnicodeDecodeFile,
DatasetTooSmallError,
InputFolderMissingDataError,
OutputFolderNotEmptyError,
Expand Down Expand Up @@ -68,39 +67,36 @@ def __iter__(self) -> Iterable[dict[str, NDArray]]:
buffer = []
for file in self.files:
log.info(f'Processing file: {file}')
with open(file, 'r') as f:
with open(file, 'r', errors='replace') as f:
buffer += self.bos_tokens
first_chunk = True
# Read the file in 1MB chunks to avoid memory issues
try:
for chunk in iter(partial(f.read, 1000000), ''):
# Tokenize the chunk
encoded = self.tokenizer(
chunk,
truncation=False,
padding=False,
)
iids = encoded['input_ids']

# If this is not the first chunk, remove the BOS token
if not first_chunk:
if iids[0] == self.tokenizer.bos_token_id:
iids = iids[1:]

# Add the tokens to the buffer
buffer += iids
while len(buffer) >= self.max_length:
concat_sample = buffer[:self.max_length]
buffer = buffer[self.max_length:
] if self.should_wrap else []
yield {
'tokens':
np.asarray(concat_sample, dtype=np.int32),
}
for chunk in iter(partial(f.read, 1000000), ''):
# Tokenize the chunk
encoded = self.tokenizer(
chunk,
truncation=False,
padding=False,
)
iids = encoded['input_ids']

# If this is not the first chunk, remove the BOS token
if not first_chunk:
if iids[0] == self.tokenizer.bos_token_id:
iids = iids[1:]

# Add the tokens to the buffer
buffer += iids
while len(buffer) >= self.max_length:
concat_sample = buffer[:self.max_length]
buffer = buffer[self.max_length:
] if self.should_wrap else []
yield {
'tokens':
np.asarray(concat_sample, dtype=np.int32),
}

first_chunk = False
except UnicodeDecodeError:
raise CannotUnicodeDecodeFile(text_file=file)

# Add the EOS token to the buffer to separate files.
buffer += self.eos_tokens
Expand Down
8 changes: 0 additions & 8 deletions llmfoundry/utils/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,14 +348,6 @@ def __init__(self, input_folder: str) -> None:
super().__init__(message, input_folder=input_folder)


class CannotUnicodeDecodeFile(UserError):
    """Error thrown when a text file contains bytes that are not valid utf-8.

    The docstring previously said "input folder is missing data" — a
    copy-paste from InputFolderMissingDataError; corrected to describe
    this exception's actual condition.
    """

    def __init__(self, text_file: str) -> None:
        # text_file: path of the offending file, surfaced in the message
        # and forwarded as structured metadata to the UserError base.
        message = f'Text file {text_file} contains chars that cannot be utf-8 decoded. Please remove or replace these chars.'
        super().__init__(message, text_file=text_file)


class OutputFolderNotEmptyError(UserError):
"""Error thrown when the output folder is not empty."""

Expand Down
23 changes: 0 additions & 23 deletions tests/a_scripts/data_prep/test_convert_text_to_mds.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
write_done_file,
)
from llmfoundry.utils.exceptions import (
CannotUnicodeDecodeFile,
DatasetTooSmallError,
InputFolderMissingDataError,
OutputFolderNotEmptyError,
Expand Down Expand Up @@ -291,28 +290,6 @@ def test_dataset_too_small(tmp_path: pathlib.Path):
)


def test_decode_invalid_unicode(tmp_path: pathlib.Path):
    """A non-utf-8 input file must raise CannotUnicodeDecodeFile."""
    source_dir = tmp_path / 'input'
    os.makedirs(source_dir, exist_ok=True)

    # Write the file as utf-16 so that reading it back as utf-8 fails.
    with open(source_dir / 'test.txt', 'w', encoding='utf-16') as f:
        f.write('HELLO WORLD')

    conversion_kwargs = dict(
        tokenizer_name='mosaicml/mpt-7b',
        output_folder=str(tmp_path / 'output'),
        input_folder=str(source_dir),
        concat_tokens=1,
        eos_text='',
        bos_text='',
        no_wrap=False,
        compression='zstd',
        processes=1,
        args_str='Namespace()',
        reprocess=False,
        trust_remote_code=False,
    )
    with pytest.raises(CannotUnicodeDecodeFile):
        convert_text_to_mds(**conversion_kwargs)


def test_is_already_processed(tmp_path: pathlib.Path):
tmp_path_str = str(tmp_path)
args_str = 'Namespace(x = 5)'
Expand Down

0 comments on commit 7ccd6f3

Please sign in to comment.