From 84b6410205647b2011d657c2342f91783214e040 Mon Sep 17 00:00:00 2001
From: Daniel King <43149077+dakinggg@users.noreply.github.com>
Date: Thu, 18 Apr 2024 12:49:59 -0700
Subject: [PATCH] Update tests to not rely on mistral (#1117)

---
 tests/data/test_template_tokenization.py      |  2 +-
 tests/models/hf/test_fsdp_weight_tying.py     |  4 +--
 tests/models/hf/test_hf_peft_wrapping.py      | 18 +++++-----
 tests/models/layers/test_huggingface_flash.py | 33 +++----------------
 4 files changed, 17 insertions(+), 40 deletions(-)

diff --git a/tests/data/test_template_tokenization.py b/tests/data/test_template_tokenization.py
index 632a79dac9..702202b091 100644
--- a/tests/data/test_template_tokenization.py
+++ b/tests/data/test_template_tokenization.py
@@ -252,7 +252,7 @@ def test_multi_turn_chat_slicing(tokenizer_name: str, messages_format: bool):
 def test_tokenize_no_labels_bos_pr():
     # This tokenizer automatically adds bos tokens
     tokenizer = transformers.AutoTokenizer.from_pretrained(
-        'mistralai/Mixtral-8x7B-v0.1')
+        'ai21labs/Jamba-v0.1', add_bos_token=True)
 
     example = {'prompt': 'prompt', 'response': 'response'}
 
diff --git a/tests/models/hf/test_fsdp_weight_tying.py b/tests/models/hf/test_fsdp_weight_tying.py
index 6e7838e7ba..712e515653 100644
--- a/tests/models/hf/test_fsdp_weight_tying.py
+++ b/tests/models/hf/test_fsdp_weight_tying.py
@@ -33,7 +33,7 @@ def test_fsdp_weight_tying(peft_config: Optional[dict], tmp_path: pathlib.Path,
                            init_device: str):
     model_cfg = {
         'name': 'hf_causal_lm',
-        'pretrained_model_name_or_path': 'mistralai/Mistral-7B-v0.1',
+        'pretrained_model_name_or_path': 'codellama/CodeLlama-7b-hf',
         'config_overrides': {
             'num_hidden_layers': 2,
             'hidden_size': 32,
@@ -43,7 +43,7 @@ def test_fsdp_weight_tying(peft_config: Optional[dict], tmp_path: pathlib.Path,
         'pretrained': False,
         'init_device': init_device,
     }
-    tokenizer_name = 'mistralai/Mistral-7B-v0.1'
+    tokenizer_name = 'codellama/CodeLlama-7b-hf'
 
     assert model_cfg is not None
     assert tokenizer_name is not None
diff --git a/tests/models/hf/test_hf_peft_wrapping.py b/tests/models/hf/test_hf_peft_wrapping.py
index d8bea33dd4..7fe886ffe3 100644
--- a/tests/models/hf/test_hf_peft_wrapping.py
+++ b/tests/models/hf/test_hf_peft_wrapping.py
@@ -17,13 +17,15 @@
 
 
 def test_peft_wraps():
-    mistral_cfg = transformers.AutoConfig.from_pretrained(
-        'mistralai/Mistral-7B-v0.1', num_hidden_layers=2)
-    mistral = transformers.AutoModelForCausalLM.from_config(mistral_cfg)
-    mistral = get_peft_model(mistral, LoraConfig())
-    prepare_hf_model_for_fsdp(mistral, 'cpu')
+    mpt_cfg = transformers.AutoConfig.from_pretrained('mosaicml/mpt-7b',
+                                                      n_layers=2,
+                                                      trust_remote_code=True)
+    mpt = transformers.AutoModelForCausalLM.from_config(mpt_cfg,
+                                                        trust_remote_code=True)
+    mpt = get_peft_model(mpt, LoraConfig())
+    prepare_hf_model_for_fsdp(mpt, 'cpu')
 
-    for n, m in mistral.named_modules():
+    for n, m in mpt.named_modules():
         if 'lora' in n and 'default' in n:
             has_parameters = any(True for _ in m.parameters())
             has_buffers = any(True for _ in m.buffers())
@@ -51,7 +53,7 @@ def test_lora_mixed_init(peft_config: Optional[dict], tmp_path: pathlib.Path,
                         init_device: str):
     model_cfg = {
         'name': 'hf_causal_lm',
-        'pretrained_model_name_or_path': 'mistralai/Mistral-7B-v0.1',
+        'pretrained_model_name_or_path': 'codellama/CodeLlama-7b-hf',
         'config_overrides': {
             'num_hidden_layers': 2,
             'hidden_size': 32,
@@ -60,7 +62,7 @@
         'pretrained': False,
         'init_device': init_device,
     }
-    tokenizer_name = 'mistralai/Mistral-7B-v0.1'
+    tokenizer_name = 'codellama/CodeLlama-7b-hf'
 
     assert model_cfg is not None
     assert tokenizer_name is not None
diff --git a/tests/models/layers/test_huggingface_flash.py b/tests/models/layers/test_huggingface_flash.py
index 1e8ec2383d..08891d5199 100644
--- a/tests/models/layers/test_huggingface_flash.py
+++ b/tests/models/layers/test_huggingface_flash.py
@@ -2,7 +2,6 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import contextlib
-import os
 
 import pytest
 from composer.core.precision import get_precision_context
@@ -15,53 +14,29 @@
 
 @pytest.mark.gpu
 @pytest.mark.world_size(2)
-@pytest.mark.parametrize('model_name', ['llama2', 'mistral'])
+@pytest.mark.parametrize('model_name', ['codellama'])
 @pytest.mark.parametrize('use_flash_attention_2', [True, False])
 @pytest.mark.parametrize('init_device', ['cpu', 'mixed', 'meta'])
 def test_flash2(model_name: str, use_flash_attention_2: bool,
                 init_device: str):
-    if model_name == 'llama2':
-        if 'HUGGING_FACE_HUB_TOKEN' not in os.environ:
-            pytest.skip(
-                'The CI cluster does not have access to the Llama models, so skip this test.'
-            )
+    if model_name == 'codellama':
         model_cfg = {
             'name': 'hf_causal_lm',
-            'pretrained_model_name_or_path': 'meta-llama/Llama-2-7b-hf',
+            'pretrained_model_name_or_path': 'codellama/CodeLlama-7b-hf',
             'config_overrides': {
                 'num_hidden_layers': 2,
                 'intermediate_size': 64,
                 'hidden_size': 64,
             },
-            'use_auth_token': True,
             'pretrained': False,
             'init_device': init_device,
         }
 
-        tokenizer_name = 'meta-llama/Llama-2-7b-hf'
+        tokenizer_name = 'codellama/CodeLlama-7b-hf'
         from transformers.models.llama.modeling_llama import (
             LlamaAttention, LlamaFlashAttention2)
         flash_attn_class = LlamaFlashAttention2 if use_flash_attention_2 else LlamaAttention
         attention_layers_attr = 'model.model.layers'
         attention_attr = 'self_attn'
-    elif model_name == 'mistral':
-        model_cfg = {
-            'name': 'hf_causal_lm',
-            'pretrained_model_name_or_path': 'mistralai/Mistral-7B-v0.1',
-            'config_overrides': {
-                'num_hidden_layers': 2,
-                'intermediate_size': 64,
-                'hidden_size': 64,
-            },
-            'pretrained': False,
-            'init_device': 'cpu',
-        }
-
-        tokenizer_name = 'mistralai/Mistral-7B-v0.1'
-        from transformers.models.mistral.modeling_mistral import (
-            MistralAttention, MistralFlashAttention2)
-        flash_attn_class = MistralFlashAttention2 if use_flash_attention_2 else MistralAttention
-        attention_layers_attr = 'model.model.layers'
-        attention_attr = 'self_attn'
     else:
         raise ValueError(f'Unknown model: {model_name}')
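
Note: the reworked tests build tiny, randomly initialized models from publicly
accessible configs rather than downloading gated checkpoints. A minimal
standalone sketch of that pattern follows; it is not part of the patch. The
model name and the override values are taken from the hunks above, everything
else is illustrative.

    import transformers

    # Fetch only the public config and shrink it, mirroring the
    # 'config_overrides' used in the tests. No HUGGING_FACE_HUB_TOKEN or
    # gated weights are needed, which is the point of this change.
    cfg = transformers.AutoConfig.from_pretrained(
        'codellama/CodeLlama-7b-hf',
        num_hidden_layers=2,
        hidden_size=32,
        intermediate_size=64,
    )

    # Randomly initialized model, equivalent in spirit to 'pretrained': False.
    model = transformers.AutoModelForCausalLM.from_config(cfg)
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        'codellama/CodeLlama-7b-hf')
    print(model.num_parameters(), tokenizer.bos_token)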