Skip to content

Commit

Permalink
Replace undecodable characters when reading text files instead of raising CannotUnicodeDecodeFile
Browse files Browse the repository at this point in the history
  • Loading branch information
mattyding committed Sep 16, 2024
1 parent a862d6e commit 7ccd6f3
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 60 deletions.
54 changes: 25 additions & 29 deletions llmfoundry/command_utils/data_prep/convert_text_to_mds.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@
merge_shard_groups,
)
from llmfoundry.utils.exceptions import (
CannotUnicodeDecodeFile,
DatasetTooSmallError,
InputFolderMissingDataError,
OutputFolderNotEmptyError,
Expand Down Expand Up @@ -68,39 +67,36 @@ def __iter__(self) -> Iterable[dict[str, NDArray]]:
buffer = []
for file in self.files:
log.info(f'Processing file: {file}')
with open(file, 'r') as f:
with open(file, 'r', errors='replace') as f:
buffer += self.bos_tokens
first_chunk = True
# Read the file in 1MB chunks to avoid memory issues
try:
for chunk in iter(partial(f.read, 1000000), ''):
# Tokenize the chunk
encoded = self.tokenizer(
chunk,
truncation=False,
padding=False,
)
iids = encoded['input_ids']

# If this is not the first chunk, remove the BOS token
if not first_chunk:
if iids[0] == self.tokenizer.bos_token_id:
iids = iids[1:]

# Add the tokens to the buffer
buffer += iids
while len(buffer) >= self.max_length:
concat_sample = buffer[:self.max_length]
buffer = buffer[self.max_length:
] if self.should_wrap else []
yield {
'tokens':
np.asarray(concat_sample, dtype=np.int32),
}
for chunk in iter(partial(f.read, 1000000), ''):
# Tokenize the chunk
encoded = self.tokenizer(
chunk,
truncation=False,
padding=False,
)
iids = encoded['input_ids']

# If this is not the first chunk, remove the BOS token
if not first_chunk:
if iids[0] == self.tokenizer.bos_token_id:
iids = iids[1:]

# Add the tokens to the buffer
buffer += iids
while len(buffer) >= self.max_length:
concat_sample = buffer[:self.max_length]
buffer = buffer[self.max_length:
] if self.should_wrap else []
yield {
'tokens':
np.asarray(concat_sample, dtype=np.int32),
}

first_chunk = False
except UnicodeDecodeError:
raise CannotUnicodeDecodeFile(text_file=file)

# Add the EOS token to the buffer to separate files.
buffer += self.eos_tokens
Expand Down
8 changes: 0 additions & 8 deletions llmfoundry/utils/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,14 +348,6 @@ def __init__(self, input_folder: str) -> None:
super().__init__(message, input_folder=input_folder)


class CannotUnicodeDecodeFile(UserError):
    """Error thrown when a text file contains bytes that are not valid utf-8.

    The docstring previously said "input folder is missing data" — a
    copy-paste from InputFolderMissingDataError; corrected to describe
    this exception's actual condition.
    """

    def __init__(self, text_file: str) -> None:
        # text_file: path of the offending file, surfaced in the message
        # and forwarded as structured metadata to the UserError base.
        message = f'Text file {text_file} contains chars that cannot be utf-8 decoded. Please remove or replace these chars.'
        super().__init__(message, text_file=text_file)


class OutputFolderNotEmptyError(UserError):
"""Error thrown when the output folder is not empty."""

Expand Down
23 changes: 0 additions & 23 deletions tests/a_scripts/data_prep/test_convert_text_to_mds.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
write_done_file,
)
from llmfoundry.utils.exceptions import (
CannotUnicodeDecodeFile,
DatasetTooSmallError,
InputFolderMissingDataError,
OutputFolderNotEmptyError,
Expand Down Expand Up @@ -291,28 +290,6 @@ def test_dataset_too_small(tmp_path: pathlib.Path):
)


def test_decode_invalid_unicode(tmp_path: pathlib.Path):
    """A non-utf-8 input file must raise CannotUnicodeDecodeFile."""
    source_dir = tmp_path / 'input'
    os.makedirs(source_dir, exist_ok=True)

    # Write the file as utf-16 so that reading it back as utf-8 fails.
    with open(source_dir / 'test.txt', 'w', encoding='utf-16') as f:
        f.write('HELLO WORLD')

    conversion_kwargs = dict(
        tokenizer_name='mosaicml/mpt-7b',
        output_folder=str(tmp_path / 'output'),
        input_folder=str(source_dir),
        concat_tokens=1,
        eos_text='',
        bos_text='',
        no_wrap=False,
        compression='zstd',
        processes=1,
        args_str='Namespace()',
        reprocess=False,
        trust_remote_code=False,
    )
    with pytest.raises(CannotUnicodeDecodeFile):
        convert_text_to_mds(**conversion_kwargs)


def test_is_already_processed(tmp_path: pathlib.Path):
tmp_path_str = str(tmp_path)
args_str = 'Namespace(x = 5)'
Expand Down

0 comments on commit 7ccd6f3

Please sign in to comment.