Skip to content

Commit

Permalink
catch misconfigured hf dataset
Browse files Browse the repository at this point in the history
  • Loading branch information
milocress committed Apr 19, 2024
1 parent 20cb40c commit ae807fd
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 0 deletions.
6 changes: 6 additions & 0 deletions llmfoundry/data/finetuning/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]:
Tuple, Union, cast)

import datasets as hf_datasets
import datasets.exceptions as hf_exceptions
import huggingface_hub as hf_hub
import numpy as np
from composer.utils import dist
Expand All @@ -51,6 +52,7 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]:
from llmfoundry.data.finetuning.collator import (_HF_IGNORE_INDEX,
stitch_turns_decoder_only,
stitch_turns_encoder_decoder)
from llmfoundry.utils import exceptions as foundry_exceptions
# yapf: disable
from llmfoundry.utils.exceptions import (ConsecutiveRepeatedChatRolesError,
IncorrectMessageKeyQuantityError,
Expand Down Expand Up @@ -838,6 +840,10 @@ def dataset_mapper(example: Dict):
if dist.get_local_rank() == 0:
os.remove(signal_file_path)

if isinstance(error, hf_exceptions.DatasetGenerationError):
log.error('Huggingface DatasetGenerationError during data prep')
raise foundry_exceptions.MisconfiguredHfDatasetError(
dataset_name=dataset_name, split=split)
if error is not None:
log.error('Error during data prep')
raise error
Expand Down
10 changes: 10 additions & 0 deletions llmfoundry/utils/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,3 +204,13 @@ def __init__(self, output_folder: str) -> None:
self.output_folder = output_folder
message = f'{output_folder} is not empty. Please remove or empty it and retry.'
super().__init__(message)


class MisconfiguredHfDatasetError(ValueError):
    """Error thrown when a HuggingFace dataset is misconfigured.

    Args:
        dataset_name (str): Name of the dataset that failed to load.
        split (str): Dataset split that was requested.

    Attributes:
        dataset_name (str): The value passed as ``dataset_name``.
        split (str): The value passed as ``split``.
    """

    def __init__(self, dataset_name: str, split: str) -> None:
        self.dataset_name = dataset_name
        self.split = split
        message = f'Your dataset (name={dataset_name}, split={split}) is misconfigured. Please check your dataset config.'
        super().__init__(message)

    def __reduce__(self) -> tuple:
        """Return pickling instructions that rebuild from the init arguments.

        ``super().__init__`` only receives the formatted message, so
        ``self.args`` is ``(message,)``. The default exception pickling
        re-creates the object as ``cls(*self.args)``, which would fail here
        with a ``TypeError`` because ``split`` would be missing. Rebuilding
        from ``(dataset_name, split)`` keeps the error picklable. The instance
        ``__dict__`` is passed along as state so that any extra attributes set
        on the exception survive the round trip.
        """
        return (type(self), (self.dataset_name, self.split), vars(self))

0 comments on commit ae807fd

Please sign in to comment.