diff --git a/tests/data/test_template_tokenization.py b/tests/data/test_template_tokenization.py
index 632a79dac9..702202b091 100644
--- a/tests/data/test_template_tokenization.py
+++ b/tests/data/test_template_tokenization.py
@@ -252,7 +252,7 @@ def test_multi_turn_chat_slicing(tokenizer_name: str, messages_format: bool):
 def test_tokenize_no_labels_bos_pr():
     # This tokenizer automatically adds bos tokens
     tokenizer = transformers.AutoTokenizer.from_pretrained(
-        'mistralai/Mixtral-8x7B-v0.1')
+        'ai21labs/Jamba-v0.1', add_bos_token=True)
 
     example = {'prompt': 'prompt', 'response': 'response'}
diff --git a/tests/models/hf/test_fsdp_weight_tying.py b/tests/models/hf/test_fsdp_weight_tying.py
index 6e7838e7ba..712e515653 100644
--- a/tests/models/hf/test_fsdp_weight_tying.py
+++ b/tests/models/hf/test_fsdp_weight_tying.py
@@ -33,7 +33,7 @@ def test_fsdp_weight_tying(peft_config: Optional[dict], tmp_path: pathlib.Path,
                            init_device: str):
     model_cfg = {
         'name': 'hf_causal_lm',
-        'pretrained_model_name_or_path': 'mistralai/Mistral-7B-v0.1',
+        'pretrained_model_name_or_path': 'codellama/CodeLlama-7b-hf',
         'config_overrides': {
             'num_hidden_layers': 2,
             'hidden_size': 32,
@@ -43,7 +43,7 @@ def test_fsdp_weight_tying(peft_config: Optional[dict], tmp_path: pathlib.Path,
         'pretrained': False,
         'init_device': init_device,
     }
-    tokenizer_name = 'mistralai/Mistral-7B-v0.1'
+    tokenizer_name = 'codellama/CodeLlama-7b-hf'
 
     assert model_cfg is not None
     assert tokenizer_name is not None
diff --git a/tests/models/hf/test_hf_peft_wrapping.py b/tests/models/hf/test_hf_peft_wrapping.py
index d8bea33dd4..a17840b4ca 100644
--- a/tests/models/hf/test_hf_peft_wrapping.py
+++ b/tests/models/hf/test_hf_peft_wrapping.py
@@ -17,8 +17,8 @@
 
 
 def test_peft_wraps():
-    mistral_cfg = transformers.AutoConfig.from_pretrained(
-        'mistralai/Mistral-7B-v0.1', num_hidden_layers=2)
+    mistral_cfg = transformers.AutoConfig.from_pretrained('mosaicml/mpt-7b',
+                                                          num_hidden_layers=2)
     mistral = transformers.AutoModelForCausalLM.from_config(mistral_cfg)
     mistral = get_peft_model(mistral, LoraConfig())
     prepare_hf_model_for_fsdp(mistral, 'cpu')
@@ -51,7 +51,7 @@ def test_lora_mixed_init(peft_config: Optional[dict], tmp_path: pathlib.Path,
                          init_device: str):
     model_cfg = {
         'name': 'hf_causal_lm',
-        'pretrained_model_name_or_path': 'mistralai/Mistral-7B-v0.1',
+        'pretrained_model_name_or_path': 'codellama/CodeLlama-7b-hf',
         'config_overrides': {
             'num_hidden_layers': 2,
             'hidden_size': 32,
@@ -60,7 +60,7 @@ def test_lora_mixed_init(peft_config: Optional[dict], tmp_path: pathlib.Path,
         'pretrained': False,
         'init_device': init_device,
     }
-    tokenizer_name = 'mistralai/Mistral-7B-v0.1'
+    tokenizer_name = 'codellama/CodeLlama-7b-hf'
 
     assert model_cfg is not None
     assert tokenizer_name is not None
diff --git a/tests/models/layers/test_huggingface_flash.py b/tests/models/layers/test_huggingface_flash.py
index 1e8ec2383d..2b1310f519 100644
--- a/tests/models/layers/test_huggingface_flash.py
+++ b/tests/models/layers/test_huggingface_flash.py
@@ -2,7 +2,6 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import contextlib
-import os
 
 import pytest
 from composer.core.precision import get_precision_context
@@ -15,18 +14,14 @@
 
 @pytest.mark.gpu
 @pytest.mark.world_size(2)
-@pytest.mark.parametrize('model_name', ['llama2', 'mistral'])
+@pytest.mark.parametrize('model_name', ['codellama', 'mistral'])
 @pytest.mark.parametrize('use_flash_attention_2', [True, False])
 @pytest.mark.parametrize('init_device', ['cpu', 'mixed', 'meta'])
 def test_flash2(model_name: str, use_flash_attention_2: bool, init_device: str):
-    if model_name == 'llama2':
-        if 'HUGGING_FACE_HUB_TOKEN' not in os.environ:
-            pytest.skip(
-                'The CI cluster does not have access to the Llama models, so skip this test.'
-            )
+    if model_name == 'codellama':
         model_cfg = {
             'name': 'hf_causal_lm',
-            'pretrained_model_name_or_path': 'meta-llama/Llama-2-7b-hf',
+            'pretrained_model_name_or_path': 'codellama/CodeLlama-7b-hf',
             'config_overrides': {
                 'num_hidden_layers': 2,
                 'intermediate_size': 64,
@@ -43,25 +38,6 @@ def test_flash2(model_name: str, use_flash_attention_2: bool, init_device: str):
         flash_attn_class = LlamaFlashAttention2 if use_flash_attention_2 else LlamaAttention
         attention_layers_attr = 'model.model.layers'
         attention_attr = 'self_attn'
-    elif model_name == 'mistral':
-        model_cfg = {
-            'name': 'hf_causal_lm',
-            'pretrained_model_name_or_path': 'mistralai/Mistral-7B-v0.1',
-            'config_overrides': {
-                'num_hidden_layers': 2,
-                'intermediate_size': 64,
-                'hidden_size': 64,
-            },
-            'pretrained': False,
-            'init_device': 'cpu',
-        }
-
-        tokenizer_name = 'mistralai/Mistral-7B-v0.1'
-        from transformers.models.mistral.modeling_mistral import (
-            MistralAttention, MistralFlashAttention2)
-        flash_attn_class = MistralFlashAttention2 if use_flash_attention_2 else MistralAttention
-        attention_layers_attr = 'model.model.layers'
-        attention_attr = 'self_attn'
     else:
         raise ValueError(f'Unknown model: {model_name}')
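
Note (illustrative, not part of the patch): the substitutions above all rely on the same Hugging Face pattern for keeping tests cheap and ungated: fetch only the config of a freely accessible repo, shrink it with overrides, and build a randomly initialized model from it, so no gated multi-billion-parameter checkpoint is ever downloaded. A minimal sketch of that pattern, using the codellama/CodeLlama-7b-hf repo this patch swaps in (the override values mirror the test configs and are otherwise arbitrary):

import transformers

# Fetch only config.json from an ungated repo; no model weights are downloaded.
config = transformers.AutoConfig.from_pretrained(
    'codellama/CodeLlama-7b-hf',
    num_hidden_layers=2,  # shrink the model so the test stays cheap
    hidden_size=32,
    intermediate_size=64,
)

# Build a tiny, randomly initialized model from the shrunken config
# (this is what 'pretrained': False means in the test configs above).
model = transformers.AutoModelForCausalLM.from_config(config)

# The matching tokenizer is also ungated.
tokenizer = transformers.AutoTokenizer.from_pretrained('codellama/CodeLlama-7b-hf')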