diff --git a/llmfoundry/models/layers/attention.py b/llmfoundry/models/layers/attention.py index dd7f40cd19..86e49c315d 100644 --- a/llmfoundry/models/layers/attention.py +++ b/llmfoundry/models/layers/attention.py @@ -92,7 +92,6 @@ def scaled_multihead_dot_product_attention( multiquery: bool = False, ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor, torch.Tensor]]]: - if multiquery: warnings.warn( DeprecationWarning( @@ -219,6 +218,9 @@ def flash_attn_fn( training: bool = False, needs_weights: bool = False, multiquery: bool = False, + attention_mask_in_length: Optional[torch.Tensor] = None, + should_repeat_kv_for_gqa: Optional[bool] = True, + sliding_window_size: int = -1, ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor, torch.Tensor]]]: try: @@ -249,58 +251,65 @@ def flash_attn_fn( past_key_value = (key, value) - if attn_bias is not None: - # clamp to 0 necessary for torch 2.0 compile() - _s_q = max(0, attn_bias.size(2) - query.size(1)) - _s_k = max(0, attn_bias.size(3) - key.size(1)) - attn_bias = attn_bias[:, :, _s_q:, _s_k:] - if attn_bias is not None: raise NotImplementedError(f'attn_bias not implemented for flash attn.') batch_size, seqlen = query.shape[:2] - if key_padding_mask is None: - key_padding_mask = torch.ones_like(key[:, :, 0], dtype=torch.bool) - query_padding_mask = key_padding_mask[:, -query.size(1):] + if attention_mask_in_length is None: + if key_padding_mask is None: + key_padding_mask = torch.ones_like(key[:, :, 0], dtype=torch.bool) + query_padding_mask = key_padding_mask[:, -query.size(1):] + unpadding_function = bert_padding.unpad_input + else: + key_padding_mask = attention_mask_in_length + query_padding_mask = attention_mask_in_length + unpadding_function = bert_padding.unpad_input_for_concatenated_sequences - query_unpad, indices_q, cu_seqlens_q, max_seqlen_q = bert_padding.unpad_input( + query_unpad, indices_q, cu_seqlens_q, max_seqlen_q = unpadding_function( query, query_padding_mask) query_unpad = rearrange(query_unpad, 'nnz (h d) -> nnz h d', h=n_heads) - key_unpad, _, cu_seqlens_k, max_seqlen_k = bert_padding.unpad_input( + key_unpad, _, cu_seqlens_k, max_seqlen_k = unpadding_function( key, key_padding_mask) key_unpad = rearrange(key_unpad, 'nnz (h d) -> nnz h d', h=kv_n_heads) - value_unpad, _, _, _ = bert_padding.unpad_input(value, key_padding_mask) + value_unpad, _, _, _ = unpadding_function(value, key_padding_mask) value_unpad = rearrange(value_unpad, 'nnz (h d) -> nnz h d', h=kv_n_heads) - # multi-query case - if kv_n_heads == 1: - # Expanding a tensor does not allocate new memory, but only creates a new - # view on the existing tensor where a dimension of size one is expanded - # to a larger size by setting the stride to 0. 
- # - pytorch docs - # - # hopefully the kernels can utilize this and we're jot just wasting BW here - key_unpad = key_unpad.expand(key_unpad.size(0), n_heads, - key_unpad.size(-1)) - value_unpad = value_unpad.expand(value_unpad.size(0), n_heads, - value_unpad.size(-1)) - # grouped query case - elif kv_n_heads < n_heads: - # Each query belong to a group of kv heads of group size n_heads // kv_n_heads - # We repeat each kv head by the group size number to use the underlying MHA kernels - - # since repeat_kv_for_gqa expects input dims of (b, s, kv_n_heads, d) - # we use .view to modify {key, value}_unpad appropriately + if (kv_n_heads < n_heads) and (not is_flash_v2_installed()) and ( + not should_repeat_kv_for_gqa): + raise ValueError( + 'For Grouped Query Attention or Multi Query Attention, should_repeat_kv_for_gqa should be set to True if not using Flash Attention v2.' + ) - key_unpad = repeat_kv_for_gqa( - key_unpad.view(1, key_unpad.size(0), kv_n_heads, -1), - n_heads // kv_n_heads).view(key_unpad.size(0), n_heads, -1) - value_unpad = repeat_kv_for_gqa( - value_unpad.view(1, value_unpad.size(0), kv_n_heads, -1), - n_heads // kv_n_heads).view(value_unpad.size(0), n_heads, -1) + if should_repeat_kv_for_gqa: + # multi-query case + if kv_n_heads == 1: + # Expanding a tensor does not allocate new memory, but only creates a new + # view on the existing tensor where a dimension of size one is expanded + # to a larger size by setting the stride to 0. + # - pytorch docs + # + # hopefully the kernels can utilize this and we're jot just wasting BW here + key_unpad = key_unpad.expand(key_unpad.size(0), n_heads, + key_unpad.size(-1)) + value_unpad = value_unpad.expand(value_unpad.size(0), n_heads, + value_unpad.size(-1)) + # grouped query case + elif kv_n_heads < n_heads: + # Each query belong to a group of kv heads of group size n_heads // kv_n_heads + # We repeat each kv head by the group size number to use the underlying MHA kernels + + # since repeat_kv_for_gqa expects input dims of (b, s, kv_n_heads, d) + # we use .view to modify {key, value}_unpad appropriately + + key_unpad = repeat_kv_for_gqa( + key_unpad.view(1, key_unpad.size(0), kv_n_heads, -1), + n_heads // kv_n_heads).view(key_unpad.size(0), n_heads, -1) + value_unpad = repeat_kv_for_gqa( + value_unpad.view(1, value_unpad.size(0), kv_n_heads, -1), + n_heads // kv_n_heads).view(value_unpad.size(0), n_heads, -1) dropout_p = dropout_p if training else 0.0 @@ -331,7 +340,8 @@ def flash_attn_fn( dropout_p=dropout_p, softmax_scale=softmax_scale, causal=reset_is_causal, - return_attn_probs=needs_weights) + return_attn_probs=needs_weights, + window_size=(sliding_window_size, sliding_window_size)) else: raise RuntimeError( 'flash-attn==1.0.9 or flash-attn==2.3.2 is required.') @@ -490,6 +500,7 @@ def __init__( fc_type: str = 'torch', device: Optional[str] = None, bias: bool = True, + sliding_window_size: int = -1, ): super().__init__() @@ -500,6 +511,7 @@ def __init__( self.d_model = d_model self.n_heads = n_heads self.kv_n_heads = kv_n_heads + self.sliding_window_size = sliding_window_size self.head_dim = d_model // n_heads @@ -569,6 +581,7 @@ def forward( rotary_emb_w_meta_info: Optional[dict] = None, is_causal: bool = True, needs_weights: bool = False, + attention_mask_in_length: Optional[torch.Tensor] = None, ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[ torch.Tensor, torch.Tensor]]]: qkv = self.Wqkv(x) @@ -626,6 +639,14 @@ def forward( query = query.view(bsz, seqlen, self.d_model) key = key.view(bsz, seqlen, 
self.kv_n_heads * self.head_dim) + extra_attn_kwargs = {} + if self.attn_impl == 'flash': + extra_attn_kwargs = { + 'attention_mask_in_length': attention_mask_in_length, + 'should_repeat_kv_for_gqa': not is_flash_v2_installed(), + 'sliding_window_size': self.sliding_window_size, + } + context, attn_weights, past_key_value = self.attn_fn( query, key, @@ -640,6 +661,7 @@ def forward( dropout_p=self.attn_dropout_p, training=self.training, needs_weights=needs_weights, + **extra_attn_kwargs, ) return self.out_proj(context), attn_weights, past_key_value @@ -665,6 +687,7 @@ def __init__( fc_type: str = 'torch', device: Optional[str] = None, bias: bool = True, + sliding_window_size: int = -1, ): super().__init__( d_model=d_model, @@ -679,6 +702,7 @@ def __init__( fc_type=fc_type, device=device, bias=bias, + sliding_window_size=sliding_window_size, ) @@ -702,6 +726,7 @@ def __init__( fc_type: str = 'torch', device: Optional[str] = None, bias: bool = True, + sliding_window_size: int = -1, ): super().__init__( d_model=d_model, @@ -716,6 +741,7 @@ def __init__( fc_type=fc_type, device=device, bias=bias, + sliding_window_size=sliding_window_size, ) diff --git a/llmfoundry/models/layers/blocks.py b/llmfoundry/models/layers/blocks.py index 6605807c6b..6db9ff22ca 100644 --- a/llmfoundry/models/layers/blocks.py +++ b/llmfoundry/models/layers/blocks.py @@ -21,6 +21,7 @@ 'softmax_scale': None, 'prefix_lm': False, 'attn_uses_sequence_id': False, + 'sliding_window_size': -1, 'alibi': False, 'alibi_bias_max': 8, 'rope': False, @@ -113,6 +114,7 @@ def forward( attention_mask: Optional[torch.ByteTensor] = None, is_causal: bool = True, output_attentions: bool = False, + attention_mask_in_length: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[ torch.Tensor, torch.Tensor]]]: a = self.norm_1(x) @@ -124,6 +126,7 @@ def forward( attention_mask=attention_mask, is_causal=is_causal, needs_weights=output_attentions, + attention_mask_in_length=attention_mask_in_length, ) x = x + self.resid_attn_dropout(b) m = x diff --git a/llmfoundry/models/mpt/configuration_mpt.py b/llmfoundry/models/mpt/configuration_mpt.py index f8022808bf..47fd5ac9e5 100644 --- a/llmfoundry/models/mpt/configuration_mpt.py +++ b/llmfoundry/models/mpt/configuration_mpt.py @@ -91,6 +91,7 @@ def __init__( When the model is in `train` mode, this requires passing an extra `sequence_id` argument which indicates which sub-sequence each token belongs to. Defaults to ``False`` meaning any provided `sequence_id` will be ignored. + sliding_window_size (int): Window size for sliding window local attention. Defaults to -1, which means no sliding window. Query at position i will only attend to keys between [i + seqlen_k - seqlen_q - window_size, i + seqlen_k - seqlen_q + window_size] inclusive. Only works for flash attention v2.3.0 or higher. alibi (bool): Whether to use the alibi bias instead of position embeddings. alibi_bias_max (int): The maximum value of the alibi bias. rope (bool): Whether to use rotary positional embeddings. 
@@ -221,10 +222,12 @@ def _validate_config(self) -> None:
         ]:
             raise NotImplementedError(
                 'alibi only implemented with torch and triton attention.')
-        if self.attn_config['attn_uses_sequence_id'] and self.attn_config[
-                'attn_impl'] not in ['torch', 'triton']:
+        if self.attn_config['attn_uses_sequence_id'] and not (
+                self.attn_config['attn_impl'] in ['torch', 'triton'] or
+            (self.attn_config['attn_impl'] == 'flash' and
+                 is_flash_v2_installed(v2_version='v2.1.2'))):
             raise NotImplementedError(
-                'attn_uses_sequence_id only implemented with torch and triton attention.'
+                'attn_uses_sequence_id only implemented with torch, triton, and flash (v2.1.2 or higher) attention.'
             )
         if self.attn_config['rope'] and (self.attn_config['rope_impl']
                                          not in ['dail', 'hf']):
@@ -251,6 +254,12 @@ def _validate_config(self) -> None:
             raise ImportError(
                 'If using the dail implementation of rope, the flash_attn library v2.0.1 or higher must be installed. Please check the instructions at https://github.com/mosaicml/llm-foundry/blob/main/TUTORIAL.md#what-kinds-of-positional-embeddings-does-llm-foundry-support'
             )
+        if self.attn_config['sliding_window_size'] != -1 and not (
+                self.attn_config['attn_impl'] == 'flash' and
+                is_flash_v2_installed(v2_version='v2.3.0')):
+            raise NotImplementedError(
+                'sliding window only implemented with flash attention v2.3.0 or higher.'
+            )
         if self.embedding_fraction > 1 or self.embedding_fraction <= 0:
             raise ValueError(
                 'model.embedding_fraction must be between 0 (exclusive) and 1 (inclusive)!'
diff --git a/llmfoundry/models/mpt/modeling_mpt.py b/llmfoundry/models/mpt/modeling_mpt.py
index 34b8992d3e..e2d2ee6fbc 100644
--- a/llmfoundry/models/mpt/modeling_mpt.py
+++ b/llmfoundry/models/mpt/modeling_mpt.py
@@ -132,6 +132,114 @@ def gen_rotary_embedding(rope_head_dim: int, rope_impl: str, rope_theta: int,
         raise ValueError('rope_impl needs to be either dail or hf')


+def gen_attention_mask_in_length(sequence_id: Union[None, torch.Tensor], S: int,
+                                 attn_uses_sequence_id: bool, attn_impl: str,
+                                 attention_mask: Union[torch.Tensor, None]):
+    """Generates the attention mask used for sequence masking in FA v2.
+
+    Only supports sequence id based sparse attention for no attention masking or attention masking with right padding.
+    In case of left padding:
+        1. Training with left padding is not supported in MPT (see https://github.com/mosaicml/llm-foundry/blob/1eecd4cb8e734499f77f6a35f657b8b20c0adfcb/llmfoundry/models/mpt/modeling_mpt.py#L407).
+        2. For generation with left padding, we only have a single sequence id per sample, so we don't need sequence id based sparse attention.
+
+    Args:
+        sequence_id (Union[None, torch.Tensor]): Tensor containing the sequence id for each token. Shape (batch_size, seq_len).
+        S (int): Sequence length.
+        attn_uses_sequence_id (bool): Whether the attention uses sequence id based masking.
+        attn_impl (str): Attention implementation. This function only creates attention_mask_in_length for flash attention.
+        attention_mask (Union[torch.Tensor, None]): Attention mask tensor of shape (batch_size, seq_len)
+
+    Returns:
+        attention_mask_in_length: (batch, seqlen), int, a nonzero number (e.g., 1, 2, 3, etc.) means length of concatenated sequence in b-th batch, and 0 means none.
For example, if batch = 3 and seqlen = 6, the attention_mask_in_length is: + ``` + [ + [2, 3, 0, 0, 0, 0], + [3, 2, 0, 0, 0, 0], + [6, 0, 0, 0, 0, 0] + ] + ``` + , which refers to the 3D-attention mask: + ``` + [ + [ + [1, 0, 0, 0, 0, 0], + [1, 1, 0, 0, 0, 0], + [0, 0, 1, 0, 0, 0], + [0, 0, 1, 1, 0, 0], + [0, 0, 1, 1, 1, 0], + [0, 0, 0, 0, 0, 1] + ], + [ + [1, 0, 0, 0, 0, 0], + [1, 1, 0, 0, 0, 0], + [1, 1, 1, 0, 0, 0], + [0, 0, 0, 1, 0, 0], + [0, 0, 0, 1, 1, 0], + [0, 0, 0, 0, 0, 1] + ], + [ + [1, 0, 0, 0, 0, 0], + [1, 1, 0, 0, 0, 0], + [1, 1, 1, 0, 0, 0], + [1, 1, 1, 1, 0, 0], + [1, 1, 1, 1, 1, 0], + [1, 1, 1, 1, 1, 1] + ] + ] + ```. + (The description above is taken verbatim from https://github.com/Dao-AILab/flash-attention/blob/9356a1c0389660d7e231ff3163c1ac17d9e3824a/flash_attn/bert_padding.py#L125 .) + """ + attention_mask_in_length = None + if (sequence_id is not None) and attn_uses_sequence_id and (attn_impl + == 'flash'): + # Check if sequence has left padding. If yes, raise an error. + if (attention_mask is not None) and (attention_mask[:, 0].sum() != + attention_mask.shape[0]): + raise NotImplementedError( + 'Left padding is not supported with flash attention when attn_uses_sequence_id is set to True.' + ) + if S != sequence_id.shape[-1]: + raise ValueError( + f'Sequence length ({S}) does not match length of sequences in sequence_id ({sequence_id.shape[-1]}).' + ) + attention_mask_in_length = torch.nn.functional.one_hot(sequence_id) + if attention_mask is not None: + attention_mask_in_length = attention_mask_in_length.masked_fill( + ~attention_mask.unsqueeze(-1), 0) + attention_mask_in_length = attention_mask_in_length.sum(dim=1) + attention_mask_in_length = torch.nn.functional.pad( + attention_mask_in_length, + (0, S - attention_mask_in_length.shape[-1]), + mode='constant', + value=0) + + return attention_mask_in_length + + +def apply_sequence_id(attn_bias: torch.Tensor, sequence_id: torch.LongTensor, + max_seq_len: int) -> torch.Tensor: + seq_len = sequence_id.shape[-1] + if seq_len > max_seq_len: + raise ValueError( + f'sequence_id sequence length cannot exceed max_seq_len={max_seq_len}' + ) + + # select seq_len subset of attn mask + attn_bias = attn_bias[..., :seq_len, :seq_len] + + # Restrict attention to tokens that share the same value + # in sequence_id + cannot_attend = torch.logical_not( + torch.eq( + sequence_id.view(-1, seq_len, 1), + sequence_id.view(-1, 1, seq_len), + )).unsqueeze(1) + min_val = torch.finfo(attn_bias.dtype).min + attn_bias = attn_bias.masked_fill(cannot_attend, min_val) + + return attn_bias + + class MPTPreTrainedModel(PreTrainedModel): config_class = MPTConfig base_model_prefix = 'model' @@ -286,7 +394,8 @@ def _attn_bias( # If using torch or triton, we incorporate sequence_id (if appropriate) if self.attn_uses_sequence_id and sequence_id is not None: assert isinstance(attn_bias, torch.Tensor) # pyright - attn_bias = self._apply_sequence_id(attn_bias, sequence_id) + attn_bias = apply_sequence_id(attn_bias, sequence_id, + self.config.max_seq_len) # If using torch or triton, we incorporate attention_mask. 
This will output # None in place of attention_mask since it will not be further needed in the @@ -343,29 +452,6 @@ def _apply_prefix_mask(self, attn_bias: torch.Tensor, return attn_bias - def _apply_sequence_id(self, attn_bias: torch.Tensor, - sequence_id: torch.LongTensor) -> torch.Tensor: - seq_len = sequence_id.shape[-1] - if seq_len > self.config.max_seq_len: - raise ValueError( - f'sequence_id sequence length cannot exceed max_seq_len={self.config.max_seq_len}' - ) - - # select seq_len subset of attn mask - attn_bias = attn_bias[..., :seq_len, :seq_len] - - # Restrict attention to tokens that share the same value - # in sequence_id - cannot_attend = torch.logical_not( - torch.eq( - sequence_id.view(-1, seq_len, 1), - sequence_id.view(-1, 1, seq_len), - )).unsqueeze(1) - min_val = torch.finfo(attn_bias.dtype).min - attn_bias = attn_bias.masked_fill(cannot_attend, min_val) - - return attn_bias - def forward( self, input_ids: Optional[torch.LongTensor] = None, @@ -515,7 +601,12 @@ def forward( prefix_mask=prefix_mask, sequence_id=sequence_id, ) - + attention_mask_in_length = gen_attention_mask_in_length( + sequence_id=sequence_id, + S=S, + attn_uses_sequence_id=self.attn_uses_sequence_id, + attn_impl=self.attn_impl, + attention_mask=attention_mask) # initialize the past key values cache if it should be used presents = () if use_cache else None if use_cache and past_key_values is None: @@ -538,6 +629,7 @@ def forward( attention_mask=attention_mask, is_causal=self.is_causal, output_attentions=bool(output_attentions), + attention_mask_in_length=attention_mask_in_length, ) if presents is not None: presents += (present,) diff --git a/tests/models/layers/test_flash_attn.py b/tests/models/layers/test_flash_attn.py new file mode 100644 index 0000000000..acefd2c42d --- /dev/null +++ b/tests/models/layers/test_flash_attn.py @@ -0,0 +1,255 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +import math + +import pytest +import torch + +from llmfoundry.models.layers.attention import (flash_attn_fn, + is_flash_v2_installed, + triton_flash_attn_fn) + + +@pytest.mark.gpu +@pytest.mark.skipif( + not is_flash_v2_installed(), + reason='GQA natively only supported by Flash Attention after v2.') +@pytest.mark.parametrize('kv_n_heads', [1, 4, 8]) +def test_gqa_kv_repetition(kv_n_heads: int): + # Test that flash attention v2 with GQA (kv_n_heads < n_heads) works the same + # whether we repeat the kv_n_heads explicitly or flash attention v2 handles it on its own. 
+ d = 128 + n_heads = 8 + seqlen_1 = 6 + bsz = 2 + + query_1 = torch.randn(bsz, seqlen_1, n_heads * d).to(torch.bfloat16).cuda() + query_1.requires_grad = True + key_1 = torch.randn(bsz, seqlen_1, kv_n_heads * d).to(torch.bfloat16).cuda() + key_1.requires_grad = True + value_1 = torch.randn(bsz, seqlen_1, + kv_n_heads * d).to(torch.bfloat16).cuda() + value_1.requires_grad = True + + output_1, _, _ = flash_attn_fn(query=query_1, + key=key_1, + value=value_1, + n_heads=n_heads, + kv_n_heads=kv_n_heads, + past_key_value=None, + softmax_scale=1 / math.sqrt(d), + attn_bias=None, + key_padding_mask=None, + is_causal=True, + dropout_p=0.0, + training=False, + needs_weights=False, + multiquery=False, + attention_mask_in_length=None, + should_repeat_kv_for_gqa=True) + + output_1.sum().backward() + + query_2 = query_1.detach().clone() + query_2.requires_grad = True + key_2 = key_1.detach().clone() + key_2.requires_grad = True + value_2 = value_1.detach().clone() + value_2.requires_grad = True + + output_2, _, _ = flash_attn_fn(query=query_2, + key=key_2, + value=value_2, + n_heads=n_heads, + kv_n_heads=kv_n_heads, + past_key_value=None, + softmax_scale=1 / math.sqrt(d), + attn_bias=None, + key_padding_mask=None, + is_causal=True, + dropout_p=0.0, + training=False, + needs_weights=False, + multiquery=False, + attention_mask_in_length=None, + should_repeat_kv_for_gqa=False) + + output_2.sum().backward() + assert torch.allclose(output_1, output_2) + assert torch.allclose(query_1.grad, query_2.grad) # type: ignore + assert torch.allclose(key_1.grad, key_2.grad) # type: ignore + assert torch.allclose(value_1.grad, value_2.grad) # type: ignore + + +@pytest.mark.gpu +@pytest.mark.skipif( + not is_flash_v2_installed(v2_version='v2.1.2'), + reason= + 'Using sequence id with flash attention requires flash attention v2.1.2 or higher.' +) +def test_seq_id_masking_FA_v2(): + # Test that flash attention v2 with sequence id masking works correctly. + d = 128 + n_heads = 4 + kv_n_heads = 4 + seqlen_1 = 6 + bsz = 2 + + query_1 = torch.randn(bsz, seqlen_1, n_heads * d).to(torch.bfloat16).cuda() + query_1.requires_grad = True + key_1 = torch.randn(bsz, seqlen_1, kv_n_heads * d).to(torch.bfloat16).cuda() + key_1.requires_grad = True + value_1 = torch.randn(bsz, seqlen_1, + kv_n_heads * d).to(torch.bfloat16).cuda() + value_1.requires_grad = True + + seq_ranges = [ + (0, 3), (3, 5), (5, 6) + ] # Each batch has 3 sequences of length 3, 2, and 1 respectively. 
+ attention_mask_in_length_1 = torch.tensor([[3, 2, 1, 0, 0, 0], + [3, 2, 1, 0, 0, + 0]]).to(torch.int64).cuda() + + output_1, _, _ = flash_attn_fn( + query=query_1, + key=key_1, + value=value_1, + n_heads=n_heads, + kv_n_heads=kv_n_heads, + past_key_value=None, + softmax_scale=1 / math.sqrt(d), + attn_bias=None, + key_padding_mask=None, + is_causal=True, + dropout_p=0.0, + training=False, + needs_weights=False, + multiquery=False, + attention_mask_in_length=attention_mask_in_length_1) + + output_1.sum().backward() + + for seq_range in seq_ranges: + query_2 = query_1.detach().clone()[:, seq_range[0]:seq_range[1], :] + query_2.requires_grad = True + key_2 = key_1.detach().clone()[:, seq_range[0]:seq_range[1], :] + key_2.requires_grad = True + value_2 = value_1.detach().clone()[:, seq_range[0]:seq_range[1], :] + value_2.requires_grad = True + + output_2, _, _ = flash_attn_fn(query=query_2, + key=key_2, + value=value_2, + n_heads=n_heads, + kv_n_heads=kv_n_heads, + past_key_value=None, + softmax_scale=1 / math.sqrt(d), + attn_bias=None, + key_padding_mask=None, + is_causal=True, + dropout_p=0.0, + training=False, + needs_weights=False, + multiquery=False, + attention_mask_in_length=None) + + output_2.sum().backward() + assert torch.allclose(output_1[:, seq_range[0]:seq_range[1], :], + output_2) + assert torch.allclose( + query_1.grad[:, seq_range[0]:seq_range[1], :], # type: ignore + query_2.grad) # type: ignore + assert torch.allclose( + key_1.grad[:, seq_range[0]:seq_range[1], :], # type: ignore + key_2.grad) # type: ignore + assert torch.allclose( + value_1.grad[:, seq_range[0]:seq_range[1], :], # type: ignore + value_2.grad) # type: ignore + + +@pytest.mark.gpu +@pytest.mark.skipif( + not is_flash_v2_installed(v2_version='v2.3.0'), + reason= + 'Sliding window attention only supported by Flash Attention after v2.3.0.') +@pytest.mark.parametrize('sliding_window_size', [1, 4, 8]) +def test_sliding_window(sliding_window_size: int): + # Test that sliding window attention works as expected. 
+ dtype = torch.bfloat16 + device = 'cuda' + d = 128 + n_heads = 8 + seqlen_1 = 8 + bsz = 2 + + query_1 = torch.randn(bsz, seqlen_1, n_heads * d).to(dtype=dtype, + device=device) + query_1.requires_grad = True + key_1 = torch.randn(bsz, seqlen_1, n_heads * d).to(dtype=dtype, + device=device) + key_1.requires_grad = True + value_1 = torch.randn(bsz, seqlen_1, n_heads * d).to(dtype=dtype, + device=device) + value_1.requires_grad = True + + output_1, _, _ = flash_attn_fn(query=query_1, + key=key_1, + value=value_1, + n_heads=n_heads, + kv_n_heads=n_heads, + past_key_value=None, + softmax_scale=1 / math.sqrt(d), + attn_bias=None, + key_padding_mask=None, + is_causal=True, + dropout_p=0.0, + training=False, + needs_weights=False, + multiquery=False, + attention_mask_in_length=None, + should_repeat_kv_for_gqa=True, + sliding_window_size=sliding_window_size) + + output_1.sum().backward() + + query_2 = query_1.detach().clone() + query_2.requires_grad = True + key_2 = key_1.detach().clone() + key_2.requires_grad = True + value_2 = value_1.detach().clone() + value_2.requires_grad = True + + attn_bias_2 = torch.zeros(1, 1, seqlen_1, seqlen_1).to(dtype=dtype, + device=device) + + window_mask_2 = torch.tril( + torch.ones(seqlen_1, seqlen_1), diagonal=-(sliding_window_size + 1)).to( + dtype=dtype, device=device) * torch.finfo(attn_bias_2.dtype).min + attn_bias_2 = attn_bias_2 + window_mask_2 + output_2, _, _ = triton_flash_attn_fn( + query=query_2, + key=key_2, + value=value_2, + n_heads=n_heads, + kv_n_heads=n_heads, + past_key_value=None, + softmax_scale=1 / math.sqrt(d), + attn_bias=attn_bias_2, + key_padding_mask=None, + is_causal=True, + dropout_p=0.0, + training=False, + needs_weights=False, + multiquery=False, + ) + + output_2.sum().backward() + + assert torch.allclose(output_1, output_2) + assert torch.norm(query_2.grad - query_1.grad # type: ignore + ) <= 1e-2 + 1e-2 * torch.norm(query_2.grad) + assert torch.norm(key_2.grad - key_1.grad # type: ignore + ) <= 1e-2 + 1e-2 * torch.norm(key_2.grad) + assert torch.norm(value_2.grad - value_1.grad # type: ignore + ) <= 1e-2 + 1e-2 * torch.norm(value_2.grad) diff --git a/tests/models/layers/test_flash_triton_torch.py b/tests/models/layers/test_flash_triton_torch.py index e140f678bc..454fda311d 100644 --- a/tests/models/layers/test_flash_triton_torch.py +++ b/tests/models/layers/test_flash_triton_torch.py @@ -7,7 +7,9 @@ from llmfoundry.models.layers import attention from llmfoundry.models.layers.attention import is_flash_v2_installed -from llmfoundry.models.mpt.modeling_mpt import gen_rotary_embedding +from llmfoundry.models.mpt.modeling_mpt import (apply_sequence_id, + gen_attention_mask_in_length, + gen_rotary_embedding) def allclose_helper(t0: torch.Tensor, @@ -54,6 +56,7 @@ def allclose_helper(t0: torch.Tensor, @pytest.mark.parametrize( 'attn_type', ['multihead_attention', 'multiquery_attention', 'grouped_query_attention']) +@pytest.mark.parametrize('attn_uses_sequence_id', [True, False]) @pytest.mark.parametrize('pad_attention_mask', [True, False]) def test_attn_impl(attn_impl_0: str, attn_impl_1: str, @@ -61,6 +64,7 @@ def test_attn_impl(attn_impl_0: str, qk_ln: bool, pos_emb_config: dict, attn_type: str, + attn_uses_sequence_id: bool, pad_attention_mask: bool, device: str = 'cuda'): """Compare all attn impl with each other. 
@@ -77,6 +81,16 @@ def test_attn_impl(attn_impl_0: str, == 'dail') and (not is_flash_v2_installed()): pytest.skip('dail implementation of rope requires flash attention 2.') + if attn_uses_sequence_id and ( + attn_impl_0 == 'flash' or attn_impl_1 + == 'flash') and (not is_flash_v2_installed(v2_version='v2.1.2')): + pytest.skip( + 'Using sequence id with flash attention requires flash attention v2.1.2 or higher.' + ) + + if not (alibi or rope) and attn_uses_sequence_id: + pytest.skip('attn_uses_sequence_id requires alibi or rope.') + cfg = om.create({ 'attn_impl': 'flash', 'd_model': 64, @@ -91,6 +105,14 @@ def test_attn_impl(attn_impl_0: str, if attn_type == 'grouped_query_attention': cfg.kv_n_heads = 2 + sequence_id = None + if attn_uses_sequence_id: + assert n == 2 + assert s >= 4 + sequence_id = torch.LongTensor([[0] * 2 + [1] * (s - 2), + [0] * 4 + [1] * (s - 4) + ]).to(device=device) + cfg.attn_impl = attn_impl_0 attn0 = attention.ATTN_CLASS_REGISTRY[attn_type](**cfg).to(device) cfg.attn_impl = attn_impl_1 @@ -113,7 +135,7 @@ def gen_bias(attn_impl: str): s, alibi, prefix_lm=False, - use_sequence_id=False, + use_sequence_id=attn_uses_sequence_id, causal=causal) if bs is not None: attn_bias = torch.zeros(*bs, device=device) @@ -126,17 +148,35 @@ def gen_bias(attn_impl: str): alibi=alibi, alibi_bias_max=8, ) + if attn_impl != 'flash' and attn_uses_sequence_id and sequence_id is not None: + assert isinstance(attn_bias, torch.Tensor) # pyright + attn_bias = apply_sequence_id( + attn_bias, + sequence_id, # type: ignore + s) return attn_bias + attention_mask_in_length_0 = gen_attention_mask_in_length( + sequence_id=sequence_id, + S=s, + attn_uses_sequence_id=attn_uses_sequence_id, + attn_impl=attn_impl_0, + attention_mask=attention_mask) + attention_mask_in_length_1 = gen_attention_mask_in_length( + sequence_id=sequence_id, + S=s, + attn_uses_sequence_id=attn_uses_sequence_id, + attn_impl=attn_impl_1, + attention_mask=attention_mask) + x0 = torch.randn(n, s, f).to(device) x1 = x0.clone().detach() x0.requires_grad = True x1.requires_grad = True with torch.autocast(x0.device.type): - attn_bias = gen_bias(attn0.attn_impl) - + attn_bias_0 = gen_bias(attn_impl_0) rotary_emb_w_meta_info = None if rope: rotary_embedding = gen_rotary_embedding( @@ -165,17 +205,19 @@ def gen_bias(attn_impl: str): y0, _, _ = attn0(x0, past_key_value=None, - attn_bias=attn_bias, + attn_bias=attn_bias_0, attention_mask=attention_mask, rotary_emb_w_meta_info=rotary_emb_w_meta_info, - is_causal=True) - attn_bias = gen_bias(attn1.attn_impl) + is_causal=True, + attention_mask_in_length=attention_mask_in_length_0) + attn_bias_1 = gen_bias(attn_impl_1) y1, _, _ = attn1(x1, past_key_value=None, - attn_bias=attn_bias, + attn_bias=attn_bias_1, attention_mask=attention_mask, rotary_emb_w_meta_info=rotary_emb_w_meta_info, - is_causal=True) + is_causal=True, + attention_mask_in_length=attention_mask_in_length_1) y0 *= attention_mask.unsqueeze(-1) y1 *= attention_mask.unsqueeze(-1) diff --git a/tests/models/test_model.py b/tests/models/test_model.py index acb2074ae9..98a556f534 100644 --- a/tests/models/test_model.py +++ b/tests/models/test_model.py @@ -555,6 +555,116 @@ def test_mpt_creation(norm_type: str, no_bias: bool, tie_word_embeddings: bool): assert block.resid_ffn_dropout.p == 0.2 +@pytest.mark.gpu +@pytest.mark.parametrize('attention_impl', ['flash', 'triton', 'torch']) +@pytest.mark.parametrize('pos_emb_config', [{ + 'alibi': True, + 'rope': False +}, { + 'alibi': False, + 'rope': True, + 'rope_theta': 10000, + 
'rope_impl': 'dail', + 'rope_dail_config': { + 'type': 'original', + 'pos_idx_in_fp32': True, + 'xpos_scale_base': 512, + }, +}, { + 'alibi': False, + 'rope': True, + 'rope_theta': 10000, + 'rope_impl': 'hf', + 'rope_hf_config': { + 'type': 'no_scaling', + 'factor': 1.0, + }, +}]) +def test_sequence_id_based_masking(attention_impl: str, pos_emb_config: dict): + # Testing the output of concatenated sequence with sequence id masking vs individual sequences. + alibi = pos_emb_config['alibi'] + if alibi and attention_impl == 'flash': + pytest.skip(f'alibi only implemented with torch and triton attention.') + + rope = pos_emb_config['rope'] + if rope and pos_emb_config[ + 'rope_impl'] == 'dail' and not is_flash_v2_installed(): + pytest.skip( + f'dail implementation of rope requires gpu and flash attention 2.') + + if attention_impl == 'flash' and ( + not is_flash_v2_installed(v2_version='v2.1.2')): + pytest.skip( + 'Using sequence id with flash attention requires flash attention v2.1.2 or higher.' + ) + + composer_device = get_device(None) + + hf_config = MPTConfig( + init_device='cpu', + d_model=128, + n_heads=1, + n_layers=2, + expansion_ratio=2, + max_seq_len=2048, + emb_pdrop=0.1, + resid_pdrop=0.2, + attn_config={ + 'attn_impl': attention_impl, + 'attn_uses_sequence_id': True, + **pos_emb_config, + }, + init_config={ + 'name': 'baseline_', + 'init_std': 0.02, + }, + ) + mpt = MPTForCausalLM(hf_config) + mpt.eval() + mpt = composer_device.module_to_device(mpt) + + with get_precision_context('amp_bf16' if composer_device.name == + 'gpu' else 'fp32'): + # padding on the right side of the input + concatenated_seq_ids = torch.tensor([[11274, 16390, 11, 4332, 323, 423], + [2342, 12, 111, 123, 50256, 342]]) + concatenated_seq_ids = composer_device.tensor_to_device( + concatenated_seq_ids) + + sequence_id = torch.tensor([[0, 0, 0, 1, 2, 2], [0, 0, 0, 1, 2, 2]]) + sequence_id = composer_device.tensor_to_device(sequence_id) + + first_seq_ids = torch.tensor([[11274, 16390, 11], [2342, 12, 111]]) + first_seq_ids = composer_device.tensor_to_device(first_seq_ids) + + second_seq_ids = torch.tensor([[4332], [123]]) + second_seq_ids = composer_device.tensor_to_device(second_seq_ids) + + third_seq_ids = torch.tensor([[323, 423], [50256, 342]]) + third_seq_ids = composer_device.tensor_to_device(third_seq_ids) + + concatenated_seq_output = mpt(concatenated_seq_ids, + sequence_id=sequence_id).logits + first_seq_output = mpt(first_seq_ids).logits + second_seq_output = mpt(second_seq_ids).logits + third_seq_output = mpt(third_seq_ids).logits + + assert torch.allclose(concatenated_seq_output[:, :3], + first_seq_output, + atol=2e-6 if attention_impl == 'torch' else 1e-8) + assert torch.allclose(concatenated_seq_output[:, 3:4], + second_seq_output, + atol=2e-6 if attention_impl == 'torch' else 1e-8) + atol = 1e-8 + if attention_impl == 'torch': + atol = 2e-6 + elif pos_emb_config['rope']: + atol = 2e-2 + assert torch.allclose(concatenated_seq_output[:, 4:6], + third_seq_output, + atol=atol) + + @pytest.mark.parametrize('attention_impl', [ 'torch', pytest.param('flash', marks=pytest.mark.gpu),