diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py
index a0003d0571..38c9673a14 100644
--- a/llmfoundry/data/finetuning/dataloader.py
+++ b/llmfoundry/data/finetuning/dataloader.py
@@ -552,6 +552,15 @@ def _build_collate_fn(
                                     1)],
                                 skip_special_tokens=False,
                                 clean_up_tokenization_spaces=True))
+                        context = torch.logical_and(
+                            batch['attention_mask'][j] == 1,
+                            batch['labels'][j] == _HF_IGNORE_INDEX)
+                        print(
+                            '\033[92m{}\033[00m\n'.format('CONTEXT: '),
+                            tokenizer.decode(batch['input_ids'][
+                                j, torch.logical_and(is_subseq, context)],
+                                skip_special_tokens=False,
+                                clean_up_tokenization_spaces=True))
                         print(
                             '\033[91m{}\033[00m\n'.format('TARGET: '),
                             tokenizer.decode(batch['input_ids'][
@@ -569,6 +578,14 @@ def _build_collate_fn(
                                            batch['attention_mask'][j] == 1],
                                         skip_special_tokens=False,
                                         clean_up_tokenization_spaces=True))
+                    context = torch.logical_and(
+                        batch['attention_mask'][j] == 1,
+                        batch['labels'][j] == _HF_IGNORE_INDEX)
+                    print(
+                        '\033[92m{}\033[00m\n'.format('CONTEXT: '),
+                        tokenizer.decode(batch['input_ids'][j, context],
+                                         skip_special_tokens=False,
+                                         clean_up_tokenization_spaces=True))
                     print(
                         '\033[91m{}\033[00m\n'.format('TARGET: '),
                         tokenizer.decode(batch['input_ids'][
diff --git a/scripts/train/train.py b/scripts/train/train.py
index 6ecbc55e38..44cfc053f4 100644
--- a/scripts/train/train.py
+++ b/scripts/train/train.py
@@ -55,7 +55,7 @@ def validate_config(cfg: DictConfig):
         loaders.append(eval_loader)
     for loader in loaders:
         if loader.name == 'text':
-            if cfg.model.name in ['hf_t5']:
+            if cfg.model.name == 'hf_t5':
                 raise ValueError(
                     f'Model type "{cfg.model.name}" is not supported when using the "text " ' +\
                     f'dataloader. Only finetuning is supported.')
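
For reference, the selection logic behind the new CONTEXT print: positions that are attended to (attention_mask == 1) but masked out of the loss (labels == _HF_IGNORE_INDEX) are the prompt/context, while attended positions with real labels are the target. A minimal, self-contained sketch of that masking, using toy tensors (the values below are illustrative only, not taken from the PR):

    import torch

    _HF_IGNORE_INDEX = -100  # label value the collator uses to mask out prompt tokens

    # Toy single example: four prompt tokens followed by two target tokens.
    input_ids = torch.tensor([101, 102, 103, 104, 201, 202])
    attention_mask = torch.tensor([1, 1, 1, 1, 1, 1])
    labels = torch.tensor([-100, -100, -100, -100, 201, 202])

    # Context = attended-to positions whose label is the ignore index.
    context = torch.logical_and(attention_mask == 1, labels == _HF_IGNORE_INDEX)
    # Target = attended-to positions that actually contribute to the loss.
    target = torch.logical_and(attention_mask == 1, labels != _HF_IGNORE_INDEX)

    print('CONTEXT token ids:', input_ids[context].tolist())  # [101, 102, 103, 104]
    print('TARGET token ids: ', input_ids[target].tolist())   # [201, 202]

In the dataloader itself these masks index into batch['input_ids'][j] (additionally intersected with is_subseq in the packed case) and the result is passed through tokenizer.decode, so the colored debug output shows the decoded prompt and target text rather than raw ids.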