catch misconfigured hf dataset #1123

Merged
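Summary: this PR makes the finetuning data-prep path catch Hugging Face's generic DatasetGenerationError and re-raise it as a new MisconfiguredHfDatasetError that names the offending dataset and split, and it adds a regression test that builds a dataloader against a malformatted example dataset.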
Changes from 7 commits
6 changes: 6 additions & 0 deletions llmfoundry/data/finetuning/tasks.py
@@ -42,6 +42,7 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]:
                    Tuple, Union, cast)

import datasets as hf_datasets
import datasets.exceptions as hf_exceptions
import huggingface_hub as hf_hub
import numpy as np
from composer.utils import dist
@@ -61,6 +62,7 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]:
                                         InvalidPromptTypeError,
                                         InvalidResponseTypeError,
                                         InvalidRoleError,
                                         MisconfiguredHfDatasetError,
                                         NotEnoughChatDataError,
                                         TooManyKeysInExampleError,
                                         UnableToProcessPromptResponseError,
@@ -838,6 +840,10 @@ def dataset_mapper(example: Dict):
        if dist.get_local_rank() == 0:
            os.remove(signal_file_path)

    if isinstance(error, hf_exceptions.DatasetGenerationError):
        log.error('Huggingface DatasetGenerationError during data prep.')
        raise MisconfiguredHfDatasetError(dataset_name=dataset_name,
                                          split=split)
    if error is not None:
        log.error('Error during data prep')
        raise error
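For readers skimming the hunk above, a standalone sketch of the pattern it adds may help (the helper name _translate_data_prep_error is hypothetical, not part of this PR): a generic DatasetGenerationError from the datasets library is translated into the targeted MisconfiguredHfDatasetError, while any other error is logged and re-raised unchanged.

import logging
from typing import Optional

import datasets.exceptions as hf_exceptions

from llmfoundry.utils.exceptions import MisconfiguredHfDatasetError

log = logging.getLogger(__name__)


def _translate_data_prep_error(error: Optional[Exception],
                               dataset_name: str, split: str) -> None:
    # A DatasetGenerationError usually means the rows do not match the
    # schema the builder expects, so surface a targeted, actionable error.
    if isinstance(error, hf_exceptions.DatasetGenerationError):
        log.error('Huggingface DatasetGenerationError during data prep.')
        raise MisconfiguredHfDatasetError(dataset_name=dataset_name,
                                          split=split)
    # Anything else is unexpected; log it and re-raise as-is.
    if error is not None:
        log.error('Error during data prep')
        raise error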
10 changes: 10 additions & 0 deletions llmfoundry/utils/exceptions.py
@@ -204,3 +204,13 @@ def __init__(self, output_folder: str) -> None:
        self.output_folder = output_folder
        message = f'{output_folder} is not empty. Please remove or empty it and retry.'
        super().__init__(message)


class MisconfiguredHfDatasetError(ValueError):
"""Error thrown when a HuggingFace dataset is misconfigured."""

def __init__(self, dataset_name: str, split: str) -> None:
self.dataset_name = dataset_name
self.split = split
message = f'Your dataset (name={dataset_name}, split={split}) is misconfigured. Please check your dataset config.'
        super().__init__(message)
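A quick usage sketch (illustrative only; the dataset name and split below are placeholders): because the new exception subclasses ValueError and stores its arguments as attributes, callers can catch it specifically and report exactly which dataset and split were misconfigured.

from llmfoundry.utils.exceptions import MisconfiguredHfDatasetError

try:
    # Placeholder values; in practice this is raised deep inside data prep.
    raise MisconfiguredHfDatasetError(dataset_name='org/some_dataset',
                                      split='train')
except MisconfiguredHfDatasetError as err:
    # It subclasses ValueError, so a broader `except ValueError` works too.
    print(err.dataset_name, err.split)
    print(err)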
42 changes: 42 additions & 0 deletions tests/data/test_dataloader.py
@@ -42,6 +42,7 @@
                                         InvalidPromptTypeError,
                                         InvalidResponseTypeError,
                                         InvalidRoleError,
                                         MisconfiguredHfDatasetError,
                                         NotEnoughDatasetSamplesError,
                                         TooManyKeysInExampleError,
                                         UnknownExampleTypeError)
@@ -268,6 +269,47 @@ def test_sequence_id_wrapper(eos_token_id: Optional[int],
        raise NotImplementedError()


def test_invalid_jsonl_data():
    max_seq_len = 2
    decoder_only_format = True
    packing_ratio = 'auto'
    allow_pad_trimming = False
    cfg = {
        'name': 'finetuning',
        'dataset': {
            'hf_name': 'iamroot/chat_malformatted_examples',
            'split': 'train',
            'max_seq_len': max_seq_len,
            'decoder_only_format': decoder_only_format,
            'allow_pad_trimming': allow_pad_trimming,
            'packing_ratio': packing_ratio,
            'shuffle': True,
        },
        'drop_last': False,
        'num_workers': 0,
        'pin_memory': False,
        'prefetch_factor': None,
        'persistent_workers': False,
        'timeout': 0
    }

    cfg = om.create(cfg)

    tokenizer = build_tokenizer(
        tokenizer_name='gpt2',
        tokenizer_kwargs={'model_max_length': max_seq_len})

    device_batch_size = 2

    expected_keys = ['input_ids', 'attention_mask', 'labels']
    if not decoder_only_format:
        expected_keys += ['decoder_attention_mask', 'decoder_input_ids']

    with pytest.raises(MisconfiguredHfDatasetError):
        build_finetuning_dataloader(cfg, tokenizer,
                                    device_batch_size).dataloader
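(Accessing .dataloader is what forces dataset preparation here, so the MisconfiguredHfDatasetError is expected to surface at that point; the expected_keys setup above would only matter if the build succeeded.)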


@pytest.mark.parametrize('use_chat_formatting', [True, False])
@pytest.mark.parametrize('decoder_only_format', [True, False])
@pytest.mark.parametrize('allow_pad_trimming', [True, False])