diff --git a/tests/models/layers/test_flash_attn.py b/tests/models/layers/test_flash_attn.py
index 7e282dbc9d..acefd2c42d 100644
--- a/tests/models/layers/test_flash_attn.py
+++ b/tests/models/layers/test_flash_attn.py
@@ -12,12 +12,13 @@


 @pytest.mark.gpu
+@pytest.mark.skipif(
+    not is_flash_v2_installed(),
+    reason='GQA natively only supported by Flash Attention after v2.')
 @pytest.mark.parametrize('kv_n_heads', [1, 4, 8])
 def test_gqa_kv_repetition(kv_n_heads: int):
     # Test that flash attention v2 with GQA (kv_n_heads < n_heads) works the same
     # whether we repeat the kv_n_heads explicitly or flash attention v2 handles it on its own.
-    if not is_flash_v2_installed():
-        pytest.skip('GQA natively only supported by Flash Attention after v2.')
     d = 128
     n_heads = 8
     seqlen_1 = 6
@@ -82,12 +83,13 @@ def test_gqa_kv_repetition(kv_n_heads: int):


 @pytest.mark.gpu
+@pytest.mark.skipif(
+    not is_flash_v2_installed(v2_version='v2.1.2'),
+    reason=
+    'Using sequence id with flash attention requires flash attention v2.1.2 or higher.'
+)
 def test_seq_id_masking_FA_v2():
     # Test that flash attention v2 with sequence id masking works correctly.
-    if not is_flash_v2_installed(v2_version='v2.1.2'):
-        pytest.skip(
-            'Using sequence id with flash attention requires flash attention v2.1.2 or higher.'
-        )
     d = 128
     n_heads = 4
     kv_n_heads = 4
@@ -167,13 +169,13 @@ def test_seq_id_masking_FA_v2():


 @pytest.mark.gpu
+@pytest.mark.skipif(
+    not is_flash_v2_installed(v2_version='v2.3.0'),
+    reason=
+    'Sliding window attention only supported by Flash Attention after v2.3.0.')
 @pytest.mark.parametrize('sliding_window_size', [1, 4, 8])
 def test_sliding_window(sliding_window_size: int):
     # Test that sliding window attention works as expected.
-    if not is_flash_v2_installed('v2.3.0'):
-        pytest.skip(
-            'Sliding window attention only supported by Flash Attention after v2.3.0.'
-        )
     dtype = torch.bfloat16
     device = 'cuda'
     d = 128
diff --git a/tests/models/test_model.py b/tests/models/test_model.py
index 9bac6b11b7..98a556f534 100644
--- a/tests/models/test_model.py
+++ b/tests/models/test_model.py
@@ -580,9 +580,7 @@ def test_mpt_creation(norm_type: str, no_bias: bool, tie_word_embeddings: bool):
         'factor': 1.0,
     },
 }])
-@pytest.mark.parametrize('tie_word_embeddings', [True, False])
-def test_sequence_id_based_masking(attention_impl: str, pos_emb_config: dict,
-                                   tie_word_embeddings: bool):
+def test_sequence_id_based_masking(attention_impl: str, pos_emb_config: dict):
     # Testing the output of concatenated sequence with sequence id masking vs individual sequences.
     alibi = pos_emb_config['alibi']
     if alibi and attention_impl == 'flash':
@@ -620,7 +618,6 @@ def test_sequence_id_based_masking(attention_impl: str, pos_emb_config: dict,
             'name': 'baseline_',
             'init_std': 0.02,
         },
-        tie_word_embeddings=tie_word_embeddings,
     )
     mpt = MPTForCausalLM(hf_config)
     mpt.eval()
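
For context on the pattern in the first file: the change replaces runtime `pytest.skip(...)` calls inside the test body with `@pytest.mark.skipif(...)` decorators, so the skip condition is evaluated once when the module is collected and the reason is attached to the test itself. Below is a minimal, self-contained sketch of the two styles; the `is_flash_v2_installed` stub is a hypothetical stand-in for the real helper these tests import, included only so the snippet runs on its own.

```python
import pytest


def is_flash_v2_installed(v2_version: str = 'v2.0.0') -> bool:
    """Hypothetical stand-in for the real helper.

    It only needs to return a bool so the skipif condition can be
    evaluated at collection time.
    """
    try:
        import flash_attn
        from packaging import version
        return version.parse(flash_attn.__version__) >= version.parse(
            v2_version.lstrip('v'))
    except ImportError:
        return False


# Old style: the test body starts executing, then bails out via pytest.skip.
def test_old_style():
    if not is_flash_v2_installed():
        pytest.skip('GQA natively only supported by Flash Attention after v2.')
    ...  # actual test logic


# New style: the condition and reason live on a decorator, so pytest marks
# the test as skipped during collection without entering the test body.
@pytest.mark.skipif(
    not is_flash_v2_installed(),
    reason='GQA natively only supported by Flash Attention after v2.')
def test_new_style():
    ...  # actual test logic
```

Both styles skip the test, but the decorator version avoids partially executing the body and keeps the skip reason visible next to the test signature and in collection output.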