Fix InvalidPromptResponseKeysError bug

mosaicml · Apr 23, 2024 · 724b668
1 parent 0c6bd75
commit 724b668
Show file tree

Hide file tree

Showing 3 changed files with 16 additions and 32 deletions.
diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py
@@ -64,7 +64,6 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]:
                                          InvalidRoleError,
                                          MisconfiguredHfDatasetError,
                                          NotEnoughChatDataError,
-                                         TooManyKeysInExampleError,
                                          UnableToProcessPromptResponseError,
                                          UnknownExampleTypeError)
 #  yapf: enable
@@ -108,14 +107,20 @@ def _get_example_type(example: Example) -> ExampleType:
     if not isinstance(example, Mapping):
         raise TypeError(
             f'Expected example to be a Mapping, but found {type(example)}')
-    if any(allowed_message_key in example
-           for allowed_message_key in _ALLOWED_MESSAGES_KEYS):
+    if (len(example.keys()) == 1 and
+            any(allowed_message_key in example
+                for allowed_message_key in _ALLOWED_MESSAGES_KEYS)):
         return 'chat'
-    elif any(p in example for p in _ALLOWED_PROMPT_KEYS) and any(
-            r in example for r in _ALLOWED_RESPONSE_KEYS):
+    elif (len(example.keys()) == 2 and
+          any(p in example for p in _ALLOWED_PROMPT_KEYS) and
+          any(r in example for r in _ALLOWED_RESPONSE_KEYS)):
         return 'prompt_response'
     else:
-        raise UnknownExampleTypeError(example)
+        raise UnknownExampleTypeError((
+            f'Found keys {example.keys()} in dataset. Unknown example type. For prompt and response '
+            f'finetuning, the valid prompt keys are {_ALLOWED_PROMPT_KEYS} and the valid response keys are '
+            f'{_ALLOWED_RESPONSE_KEYS}. For chat finetuning, the allowed keys are {_ALLOWED_MESSAGES_KEYS}'
+        ))
 
 
 def _is_empty_or_nonexistent(dirpath: str) -> bool:
@@ -136,8 +141,6 @@ def _get_key(dictionary: Mapping[str, Any], allowed_keys: set[str]):
             f'Expected dictionary to be a mapping, but found {type(dictionary)}'
         )
     desired_keys = allowed_keys.intersection(dictionary.keys())
-    if len(desired_keys) != 1:
-        raise TooManyKeysInExampleError(allowed_keys, desired_keys)
     return list(desired_keys)[0]
 
 
@@ -307,12 +310,6 @@ def _tokenize_prompt_response_formatted_example(
     prompt_keys = example_keys.intersection(_ALLOWED_PROMPT_KEYS)
     response_keys = example_keys.intersection(_ALLOWED_RESPONSE_KEYS)
 
-    if len(prompt_keys) != 1:
-        raise TooManyKeysInExampleError(_ALLOWED_PROMPT_KEYS, prompt_keys)
-
-    if len(response_keys) != 1:
-        raise TooManyKeysInExampleError(_ALLOWED_RESPONSE_KEYS, response_keys)
-
     prompt_key = prompt_keys.pop()
     response_key = response_keys.pop()
     prompt = example[prompt_key]
@@ -366,7 +363,7 @@ def tokenize_formatted_example(
         return _tokenize_prompt_response_formatted_example(
             prompt_response_example, tokenizer)
     else:
-        raise UnknownExampleTypeError(example)
+        raise NotImplementedError
 
 
 def is_valid_ift_example(max_seq_len: int, target_prompts: str,

diff --git a/llmfoundry/utils/exceptions.py b/llmfoundry/utils/exceptions.py
@@ -2,7 +2,6 @@
 # SPDX-License-Identifier: Apache-2.0
 
 """Custom exceptions for the LLMFoundry."""
-from collections.abc import Mapping
 from typing import Any, Dict, List
 
 
@@ -42,19 +41,7 @@ def __init__(self, dataset_name: str, split: str,
 class UnknownExampleTypeError(KeyError):
     """Error thrown when an unknown example type is used in a task."""
 
-    def __init__(self, example: Mapping) -> None:
-        self.example = example
-        message = f'Unknown example type {example=}'
-        super().__init__(message)
-
-
-class TooManyKeysInExampleError(ValueError):
-    """Error thrown when a data sample has too many keys."""
-
-    def __init__(self, desired_keys: set[str], keys: set[str]) -> None:
-        self.desired_keys = desired_keys
-        self.keys = keys
-        message = f'Data sample has {len(keys)} keys in `allowed_keys`: {desired_keys} Please specify exactly one. Provided keys: {keys}'
+    def __init__(self, message: str) -> None:
         super().__init__(message)
 
 

diff --git a/tests/data/test_dataloader.py b/tests/data/test_dataloader.py
@@ -790,10 +790,10 @@ def test_malformed_data(
                                       match='Expected response to be')
     if add_unknown_example_type:
         error_context = pytest.raises(UnknownExampleTypeError,
-                                      match='Unknown example type')
+                                      match=r'.*Unknown example type')
     if add_too_many_example_keys:
-        error_context = pytest.raises(TooManyKeysInExampleError,
-                                      match='Please specify exactly one.')
+        error_context = pytest.raises(UnknownExampleTypeError,
+                                      match=r'.*Unknown example type')
 
     with error_context:
         dl = build_finetuning_dataloader(cfg, tokenizer,