From 84b6410205647b2011d657c2342f91783214e040 Mon Sep 17 00:00:00 2001
From: Daniel King <43149077+dakinggg@users.noreply.github.com>
Date: Thu, 18 Apr 2024 12:49:59 -0700
Subject: [PATCH] Update tests to not rely on mistral (#1117)

---
 tests/data/test_template_tokenization.py      |  2 +-
 tests/models/hf/test_fsdp_weight_tying.py     |  4 +--
 tests/models/hf/test_hf_peft_wrapping.py      | 18 +++++-----
 tests/models/layers/test_huggingface_flash.py | 33 +++----------------
 4 files changed, 17 insertions(+), 40 deletions(-)

diff --git a/tests/data/test_template_tokenization.py b/tests/data/test_template_tokenization.py
index 632a79dac9..702202b091 100644
--- a/tests/data/test_template_tokenization.py
+++ b/tests/data/test_template_tokenization.py
@@ -252,7 +252,7 @@ def test_multi_turn_chat_slicing(tokenizer_name: str, messages_format: bool):
 def test_tokenize_no_labels_bos_pr():
     # This tokenizer automatically adds bos tokens
     tokenizer = transformers.AutoTokenizer.from_pretrained(
-        'mistralai/Mixtral-8x7B-v0.1')
+        'ai21labs/Jamba-v0.1', add_bos_token=True)
 
     example = {'prompt': 'prompt', 'response': 'response'}
 
diff --git a/tests/models/hf/test_fsdp_weight_tying.py b/tests/models/hf/test_fsdp_weight_tying.py
index 6e7838e7ba..712e515653 100644
--- a/tests/models/hf/test_fsdp_weight_tying.py
+++ b/tests/models/hf/test_fsdp_weight_tying.py
@@ -33,7 +33,7 @@ def test_fsdp_weight_tying(peft_config: Optional[dict], tmp_path: pathlib.Path,
                            init_device: str):
     model_cfg = {
         'name': 'hf_causal_lm',
-        'pretrained_model_name_or_path': 'mistralai/Mistral-7B-v0.1',
+        'pretrained_model_name_or_path': 'codellama/CodeLlama-7b-hf',
         'config_overrides': {
             'num_hidden_layers': 2,
             'hidden_size': 32,
@@ -43,7 +43,7 @@ def test_fsdp_weight_tying(peft_config: Optional[dict], tmp_path: pathlib.Path,
         'pretrained': False,
         'init_device': init_device,
     }
-    tokenizer_name = 'mistralai/Mistral-7B-v0.1'
+    tokenizer_name = 'codellama/CodeLlama-7b-hf'
 
     assert model_cfg is not None
     assert tokenizer_name is not None
diff --git a/tests/models/hf/test_hf_peft_wrapping.py b/tests/models/hf/test_hf_peft_wrapping.py
index d8bea33dd4..7fe886ffe3 100644
--- a/tests/models/hf/test_hf_peft_wrapping.py
+++ b/tests/models/hf/test_hf_peft_wrapping.py
@@ -17,13 +17,15 @@
 
 
 def test_peft_wraps():
-    mistral_cfg = transformers.AutoConfig.from_pretrained(
-        'mistralai/Mistral-7B-v0.1', num_hidden_layers=2)
-    mistral = transformers.AutoModelForCausalLM.from_config(mistral_cfg)
-    mistral = get_peft_model(mistral, LoraConfig())
-    prepare_hf_model_for_fsdp(mistral, 'cpu')
+    mpt_cfg = transformers.AutoConfig.from_pretrained('mosaicml/mpt-7b',
+                                                      n_layers=2,
+                                                      trust_remote_code=True)
+    mpt = transformers.AutoModelForCausalLM.from_config(mpt_cfg,
+                                                        trust_remote_code=True)
+    mpt = get_peft_model(mpt, LoraConfig())
+    prepare_hf_model_for_fsdp(mpt, 'cpu')
 
-    for n, m in mistral.named_modules():
+    for n, m in mpt.named_modules():
         if 'lora' in n and 'default' in n:
             has_parameters = any(True for _ in m.parameters())
             has_buffers = any(True for _ in m.buffers())
@@ -51,7 +53,7 @@ def test_lora_mixed_init(peft_config: Optional[dict], tmp_path: pathlib.Path,
                         init_device: str):
     model_cfg = {
         'name': 'hf_causal_lm',
-        'pretrained_model_name_or_path': 'mistralai/Mistral-7B-v0.1',
+        'pretrained_model_name_or_path': 'codellama/CodeLlama-7b-hf',
         'config_overrides': {
             'num_hidden_layers': 2,
             'hidden_size': 32,
@@ -60,7 +62,7 @@
         'pretrained': False,
         'init_device': init_device,
     }
-    tokenizer_name = 'mistralai/Mistral-7B-v0.1'
+    tokenizer_name = 'codellama/CodeLlama-7b-hf'
 
     assert model_cfg is not None
     assert tokenizer_name is not None
diff --git a/tests/models/layers/test_huggingface_flash.py b/tests/models/layers/test_huggingface_flash.py
index 1e8ec2383d..08891d5199 100644
--- a/tests/models/layers/test_huggingface_flash.py
+++ b/tests/models/layers/test_huggingface_flash.py
@@ -2,7 +2,6 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import contextlib
-import os
 
 import pytest
 from composer.core.precision import get_precision_context
@@ -15,53 +14,29 @@
 
 @pytest.mark.gpu
 @pytest.mark.world_size(2)
-@pytest.mark.parametrize('model_name', ['llama2', 'mistral'])
+@pytest.mark.parametrize('model_name', ['codellama'])
 @pytest.mark.parametrize('use_flash_attention_2', [True, False])
 @pytest.mark.parametrize('init_device', ['cpu', 'mixed', 'meta'])
 def test_flash2(model_name: str, use_flash_attention_2: bool,
                 init_device: str):
-    if model_name == 'llama2':
-        if 'HUGGING_FACE_HUB_TOKEN' not in os.environ:
-            pytest.skip(
-                'The CI cluster does not have access to the Llama models, so skip this test.'
-            )
+    if model_name == 'codellama':
         model_cfg = {
             'name': 'hf_causal_lm',
-            'pretrained_model_name_or_path': 'meta-llama/Llama-2-7b-hf',
+            'pretrained_model_name_or_path': 'codellama/CodeLlama-7b-hf',
             'config_overrides': {
                 'num_hidden_layers': 2,
                 'intermediate_size': 64,
                 'hidden_size': 64,
             },
-            'use_auth_token': True,
             'pretrained': False,
             'init_device': init_device,
         }
 
-        tokenizer_name = 'meta-llama/Llama-2-7b-hf'
+        tokenizer_name = 'codellama/CodeLlama-7b-hf'
         from transformers.models.llama.modeling_llama import (
             LlamaAttention, LlamaFlashAttention2)
         flash_attn_class = LlamaFlashAttention2 if use_flash_attention_2 else LlamaAttention
         attention_layers_attr = 'model.model.layers'
         attention_attr = 'self_attn'
-    elif model_name == 'mistral':
-        model_cfg = {
-            'name': 'hf_causal_lm',
-            'pretrained_model_name_or_path': 'mistralai/Mistral-7B-v0.1',
-            'config_overrides': {
-                'num_hidden_layers': 2,
-                'intermediate_size': 64,
-                'hidden_size': 64,
-            },
-            'pretrained': False,
-            'init_device': 'cpu',
-        }
-
-        tokenizer_name = 'mistralai/Mistral-7B-v0.1'
-        from transformers.models.mistral.modeling_mistral import (
-            MistralAttention, MistralFlashAttention2)
-        flash_attn_class = MistralFlashAttention2 if use_flash_attention_2 else MistralAttention
-        attention_layers_attr = 'model.model.layers'
-        attention_attr = 'self_attn'
     else:
         raise ValueError(f'Unknown model: {model_name}')
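
Note: the reworked tests build tiny, randomly initialized models from publicly
accessible configs rather than downloading gated checkpoints. A minimal
standalone sketch of that pattern follows; it is not part of the patch. The
model name and the override values are taken from the hunks above, everything
else is illustrative.

    import transformers

    # Fetch only the public config and shrink it, mirroring the
    # 'config_overrides' used in the tests. No HUGGING_FACE_HUB_TOKEN or
    # gated weights are needed, which is the point of this change.
    cfg = transformers.AutoConfig.from_pretrained(
        'codellama/CodeLlama-7b-hf',
        num_hidden_layers=2,
        hidden_size=32,
        intermediate_size=64,
    )

    # Randomly initialized model, equivalent in spirit to 'pretrained': False.
    model = transformers.AutoModelForCausalLM.from_config(cfg)
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        'codellama/CodeLlama-7b-hf')
    print(model.num_parameters(), tokenizer.bos_token)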