diff --git a/notebooks/validate_and_tokenize_data.ipynb b/notebooks/validate_and_tokenize_data.ipynb index 926b6a11aa..11bd6c0104 100644 --- a/notebooks/validate_and_tokenize_data.ipynb +++ b/notebooks/validate_and_tokenize_data.ipynb @@ -445,10 +445,11 @@ "- The dataset is expected to consist of dictionary-like objects (key-value pairs). A check is performed to validate this structure.\n", "Each example in the dataset is examined for its compliance with the expected format.\n", "4. Key Presence Validation:\n", - "- Two sets of allowed keys are defined: _ALLOWED_RESPONSE_KEYS and _ALLOWED_PROMPT_KEYS.\n", - "- The script checks for the presence of at least one prompt key and one response key in each example.\n", + "- The allowed prompt keys, response keys, and chat roles are defined in [llmfoundry](https://github.com/mosaicml/llm-foundry/blob/main/llmfoundry/data/finetuning/tasks.py) as _ALLOWED_PROMPT_KEYS, _ALLOWED_RESPONSE_KEYS, and _ALLOWED_ROLES.\n", + "- For a prompt-response dataset, the script checks for the presence of at least one prompt key and one response key in each example.\n", " - Prompt Validation: Each example is checked for the presence of keys defined in _ALLOWED_PROMPT_KEYS. If no valid prompt key is found, it is counted as a format error. \n", " - Response Validation: Similarly, each example is checked for the presence of keys defined in _ALLOWED_RESPONSE_KEYS. 
An absence of a valid response key is also counted as a format error.\n", + "- For a chat-formatted dataset, the script checks whether the message content is validly formatted by calling the [_validate_chat_formatted_example](https://github.com/mosaicml/llm-foundry/blob/cffd75e94e5c53b1b14c67cd17e0916fecfd0e16/llmfoundry/data/finetuning/tasks.py#L130) helper function.\n", "Error Reporting:\n", "\n", "If any format errors are found during the checks, they are reported.\n", @@ -935,7 +936,7 @@ "notebookMetadata": { "pythonIndentUnit": 2 }, - "notebookName": "validate_and_tokenize_data (1)", + "notebookName": "validate_and_tokenize_data", "widgets": {} }, "kernelspec": {