Skip to content

Commit

Permalink
add back test
Browse files Browse the repository at this point in the history
  • Loading branch information
mattyding committed Sep 16, 2024
1 parent 7ccd6f3 commit 1977c0b
Showing 1 changed file with 24 additions and 0 deletions.
24 changes: 24 additions & 0 deletions tests/a_scripts/data_prep/test_convert_text_to_mds.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,30 @@ def test_dataset_too_small(tmp_path: pathlib.Path):
)


def test_decode_invalid_unicode(tmp_path: pathlib.Path):
    """Converting a UTF-16 encoded text file must not raise UnicodeDecodeError.

    Writes ``'HELLO WORLD'`` to ``<tmp_path>/input/test.txt`` using the
    ``utf-16`` codec, then runs ``convert_text_to_mds`` over that folder.
    The test fails explicitly if a ``UnicodeDecodeError`` reaches this
    function; any other exception propagates unchanged.

    Args:
        tmp_path: Per-test temporary directory supplied by pytest.
    """
    # Build the fixture: one non-UTF-8 text file inside the input folder.
    source_dir = tmp_path / 'input'
    source_dir.mkdir(parents=True, exist_ok=True)
    (source_dir / 'test.txt').write_text('HELLO WORLD', encoding='utf-16')

    conversion_kwargs = {
        'tokenizer_name': 'mosaicml/mpt-7b',
        'output_folder': str(tmp_path / 'output'),
        'input_folder': str(source_dir),
        'concat_tokens': 1,
        'eos_text': '',
        'bos_text': '',
        'no_wrap': False,
        'compression': 'zstd',
        'processes': 1,
        'args_str': 'Namespace()',
        'reprocess': False,
        'trust_remote_code': False,
    }

    # Only a decode error counts as a failure of this test.
    try:
        convert_text_to_mds(**conversion_kwargs)
    except UnicodeDecodeError:
        pytest.fail('UnicodeDecodeError raised')


def test_is_already_processed(tmp_path: pathlib.Path):
tmp_path_str = str(tmp_path)
args_str = 'Namespace(x = 5)'
Expand Down

0 comments on commit 1977c0b

Please sign in to comment.