Shashank/seq id flash attn #738

Merged

Commits (69 total, all by ShashankMosaicML; changes shown from 49 commits)
04dd334  Merge pull request #1 from mosaicml/main (Oct 9, 2023)
87b2fdc  Merge pull request #8 from mosaicml/main (Oct 27, 2023)
c9a42e4  Merge pull request #12 from mosaicml/main (Nov 6, 2023)
ddea9ee  Merge branch 'mosaicml:main' into main (Nov 6, 2023)
0bcd8ee  Merge pull request #13 from mosaicml/main (Nov 8, 2023)
f209b58  Merge pull request #14 from mosaicml/main (Nov 14, 2023)
879edb2  .. (Nov 14, 2023)
66d85f1  .. (Nov 14, 2023)
c1c2fbd  .. (Nov 15, 2023)
e8b9381  .. (Nov 15, 2023)
57502f4  .. (Nov 15, 2023)
3041bf6  .. (Nov 15, 2023)
9b305df  .. (Nov 15, 2023)
b5a3c1f  .. (Nov 15, 2023)
2e60014  .. (Nov 15, 2023)
bf946c7  .. (Nov 15, 2023)
e99780e  .. (Nov 15, 2023)
897edbc  .. (Nov 15, 2023)
88cf6d3  .. (Nov 15, 2023)
c0a4d97  .. (Nov 15, 2023)
ec4378d  Merge pull request #15 from mosaicml/main (Nov 15, 2023)
ad36ecf  merged and resolved conflicts (Nov 15, 2023)
c09d5c9  .. (Nov 15, 2023)
efc4f4e  .. (Nov 15, 2023)
4940e70  .. (Nov 15, 2023)
5e3ccf9  .. (Nov 16, 2023)
84fa710  Update llmfoundry/models/layers/attention.py (Nov 17, 2023)
a560f31  Update llmfoundry/models/mpt/modeling_mpt.py (Nov 17, 2023)
a70f05e  Update llmfoundry/models/mpt/modeling_mpt.py (Nov 17, 2023)
42a541d  .. (Nov 17, 2023)
3d1d022  .. (Nov 18, 2023)
c94e7fe  .. (Nov 18, 2023)
e96b234  .. (Nov 18, 2023)
f02034b  .. (Nov 18, 2023)
88c6808  .. (Nov 18, 2023)
511a405  Merge branch 'main' into shashank/seq_id_flash_attn (Nov 18, 2023)
6af9aba  .. (Nov 18, 2023)
5d7805d  .. (Nov 18, 2023)
55625ff  Merge branch 'main' into shashank/seq_id_flash_attn (Nov 21, 2023)
44148b1  Merge branch 'main' into shashank/seq_id_flash_attn (Nov 22, 2023)
a8f63d4  .. (Nov 22, 2023)
af6520a  Merge branch 'main' into shashank/seq_id_flash_attn (Nov 22, 2023)
538169c  .. (Nov 22, 2023)
20af30a  .. (Nov 22, 2023)
eeb9e1c  .. (Nov 22, 2023)
1b7d38d  .. (Nov 22, 2023)
b0a3c1b  .. (Nov 22, 2023)
6a4f73e  .. (Nov 22, 2023)
f05bfe6  Merge branch 'main' into shashank/seq_id_flash_attn (Nov 22, 2023)
c275365  .. (Nov 25, 2023)
e82c723  .. (Nov 25, 2023)
a964aea  .. (Nov 26, 2023)
5765724  Merge branch 'main' into shashank/seq_id_flash_attn (Nov 29, 2023)
4240245  .. (Nov 30, 2023)
4b25da2  .. (Nov 30, 2023)
67deef8  .. (Nov 30, 2023)
b855100  .. (Nov 30, 2023)
371e3a2  .. (Nov 30, 2023)
fa2a2ee  Merge branch 'main' into shashank/seq_id_flash_attn (Nov 30, 2023)
8339cd3  Merge branch 'main' into shashank/seq_id_flash_attn (Dec 1, 2023)
6c59dce  .. (Dec 1, 2023)
805313b  .. (Dec 1, 2023)
f1251c4  .. (Dec 1, 2023)
5fca723  Merge branch 'main' into shashank/seq_id_flash_attn (Dec 1, 2023)
14a2553  merging from main (Dec 2, 2023)
cdc220f  merging from main (Dec 2, 2023)
e25ed63  .. (Dec 2, 2023)
cb6864a  .. (Dec 2, 2023)
9bc7ce1  Merge branch 'main' into shashank/seq_id_flash_attn (Dec 2, 2023)
127 changes: 88 additions & 39 deletions llmfoundry/models/layers/attention.py
@@ -90,8 +90,13 @@ def scaled_multihead_dot_product_attention(
training: bool = False,
needs_weights: bool = False,
multiquery: bool = False,
query_attention_mask_in_length: Optional[torch.Tensor] = None,
key_attention_mask_in_length: Optional[torch.Tensor] = None,
should_repeat_kv_for_gqa: Optional[bool] = True,
sliding_window_size: int = -1,
) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor,
torch.Tensor]]]:
del query_attention_mask_in_length, key_attention_mask_in_length, should_repeat_kv_for_gqa, sliding_window_size

if multiquery:
warnings.warn(
@@ -219,6 +224,10 @@ def flash_attn_fn(
training: bool = False,
needs_weights: bool = False,
multiquery: bool = False,
key_attention_mask_in_length: Optional[torch.Tensor] = None,
query_attention_mask_in_length: Optional[torch.Tensor] = None,
should_repeat_kv_for_gqa: Optional[bool] = True,
sliding_window_size: int = -1,
) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor,
torch.Tensor]]]:
try:
@@ -260,47 +269,69 @@ def flash_attn_fn(

batch_size, seqlen = query.shape[:2]

if key_padding_mask is None:
key_padding_mask = torch.ones_like(key[:, :, 0], dtype=torch.bool)
query_padding_mask = key_padding_mask[:, -query.size(1):]
if query_attention_mask_in_length is None:
if key_padding_mask is None:
key_padding_mask = torch.ones_like(key[:, :, 0], dtype=torch.bool)
query_padding_mask = key_padding_mask[:, -query.size(1):]

query_unpad, indices_q, cu_seqlens_q, max_seqlen_q = bert_padding.unpad_input(
query, query_padding_mask)
query_unpad = rearrange(query_unpad, 'nnz (h d) -> nnz h d', h=n_heads)
query_unpad, indices_q, cu_seqlens_q, max_seqlen_q = bert_padding.unpad_input(
query, query_padding_mask)
query_unpad = rearrange(query_unpad, 'nnz (h d) -> nnz h d', h=n_heads)

key_unpad, _, cu_seqlens_k, max_seqlen_k = bert_padding.unpad_input(
key, key_padding_mask)
key_unpad = rearrange(key_unpad, 'nnz (h d) -> nnz h d', h=kv_n_heads)
key_unpad, _, cu_seqlens_k, max_seqlen_k = bert_padding.unpad_input(
key, key_padding_mask)
key_unpad = rearrange(key_unpad, 'nnz (h d) -> nnz h d', h=kv_n_heads)

value_unpad, _, _, _ = bert_padding.unpad_input(value, key_padding_mask)
value_unpad = rearrange(value_unpad, 'nnz (h d) -> nnz h d', h=kv_n_heads)

# multi-query case
if kv_n_heads == 1:
# Expanding a tensor does not allocate new memory, but only creates a new
# view on the existing tensor where a dimension of size one is expanded
# to a larger size by setting the stride to 0.
# - pytorch docs
#
# hopefully the kernels can utilize this and we're not just wasting BW here
key_unpad = key_unpad.expand(key_unpad.size(0), n_heads,
key_unpad.size(-1))
value_unpad = value_unpad.expand(value_unpad.size(0), n_heads,
value_unpad.size(-1))
# grouped query case
elif kv_n_heads < n_heads:
# Each query belongs to a group of kv heads of group size n_heads // kv_n_heads
# We repeat each kv head group-size times so we can use the underlying MHA kernels

# since repeat_kv_for_gqa expects input dims of (b, s, kv_n_heads, d)
# we use .view to modify {key, value}_unpad appropriately

key_unpad = repeat_kv_for_gqa(
key_unpad.view(1, key_unpad.size(0), kv_n_heads, -1),
n_heads // kv_n_heads).view(key_unpad.size(0), n_heads, -1)
value_unpad = repeat_kv_for_gqa(
value_unpad.view(1, value_unpad.size(0), kv_n_heads, -1),
n_heads // kv_n_heads).view(value_unpad.size(0), n_heads, -1)
value_unpad, _, _, _ = bert_padding.unpad_input(value, key_padding_mask)
value_unpad = rearrange(value_unpad,
'nnz (h d) -> nnz h d',
h=kv_n_heads)
else:
if key_attention_mask_in_length is None:
raise ValueError(
'key_attention_mask_in_length must not be None if query_attention_mask_in_length is not None.'
)
query_unpad, indices_q, cu_seqlens_q, max_seqlen_q = bert_padding.unpad_input_for_concatenated_sequences(
query, query_attention_mask_in_length)
query_unpad = rearrange(query_unpad, 'nnz (h d) -> nnz h d', h=n_heads)

key_unpad, _, cu_seqlens_k, max_seqlen_k = bert_padding.unpad_input_for_concatenated_sequences(
key, key_attention_mask_in_length)
key_unpad = rearrange(key_unpad, 'nnz (h d) -> nnz h d', h=kv_n_heads)

value_unpad, _, _, _ = bert_padding.unpad_input_for_concatenated_sequences(
value, key_attention_mask_in_length)
value_unpad = rearrange(value_unpad,
'nnz (h d) -> nnz h d',
h=kv_n_heads)

if should_repeat_kv_for_gqa:
# multi-query case
if kv_n_heads == 1:
# Expanding a tensor does not allocate new memory, but only creates a new
# view on the existing tensor where a dimension of size one is expanded
# to a larger size by setting the stride to 0.
# - pytorch docs
#
# hopefully the kernels can utilize this and we're not just wasting BW here
key_unpad = key_unpad.expand(key_unpad.size(0), n_heads,
key_unpad.size(-1))
value_unpad = value_unpad.expand(value_unpad.size(0), n_heads,
value_unpad.size(-1))
# grouped query case
elif kv_n_heads < n_heads:
# Each query belongs to a group of kv heads of group size n_heads // kv_n_heads
# We repeat each kv head group-size times so we can use the underlying MHA kernels

# since repeat_kv_for_gqa expects input dims of (b, s, kv_n_heads, d)
# we use .view to modify {key, value}_unpad appropriately

key_unpad = repeat_kv_for_gqa(
key_unpad.view(1, key_unpad.size(0), kv_n_heads, -1),
n_heads // kv_n_heads).view(key_unpad.size(0), n_heads, -1)
value_unpad = repeat_kv_for_gqa(
value_unpad.view(1, value_unpad.size(0), kv_n_heads, -1),
n_heads // kv_n_heads).view(value_unpad.size(0), n_heads, -1)

dropout_p = dropout_p if training else 0.0
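
Note on the new unpadding path: when query_attention_mask_in_length / key_attention_mask_in_length are provided, the hunk above routes through flash-attn's bert_padding.unpad_input_for_concatenated_sequences, whose mask-in-length layout is a (batch, seqlen) integer tensor where row b holds the lengths of the sub-sequences packed into row b, left-aligned and zero-padded. The sketch below shows one way such a tensor could be derived from a per-token sequence_id; it is only illustrative (the helper name and the padding handling are assumptions, not code from this PR).

# Illustrative sketch, not part of this diff: derive a mask-in-length tensor
# from sequence_id (batch, seqlen) with sub-sequence ids 0..k-1 and an
# attention_mask marking non-padded positions.
import torch

def attention_mask_in_length_from_sequence_id(
        sequence_id: torch.Tensor,     # (batch, seqlen), int64
        attention_mask: torch.Tensor,  # (batch, seqlen), bool
) -> torch.Tensor:
    seqlen = sequence_id.size(1)
    # One-hot over sub-sequence ids; clamp guards against -1 used for padding.
    one_hot = torch.nn.functional.one_hot(sequence_id.clamp(min=0),
                                          num_classes=seqlen)
    # Zero out padded positions, then count tokens per sub-sequence id.
    one_hot = one_hot * attention_mask.unsqueeze(-1)
    # Row b now reads e.g. [3, 5, 4, 0, 0, ...] for three packed sequences of
    # lengths 3, 5 and 4, the layout unpad_input_for_concatenated_sequences expects.
    return one_hot.sum(dim=1)
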

@@ -331,7 +362,8 @@ def flash_attn_fn(
dropout_p=dropout_p,
softmax_scale=softmax_scale,
causal=reset_is_causal,
return_attn_probs=needs_weights)
return_attn_probs=needs_weights,
window_size=(sliding_window_size, sliding_window_size))
else:
raise RuntimeError(
'flash-attn==1.0.9 or flash-attn==2.3.2 is required.')
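
The window_size tuple forwarded above maps onto flash-attn's sliding-window support, available from v2.3.0. A minimal fixed-length illustration follows, assuming flash-attn >= 2.3.0 and a CUDA device (shapes and values are made up for the example): with causal=True, window_size=(w, w) limits each query to the previous w keys, since the right half of the window is removed by the causal mask.

# Minimal sketch, assuming flash-attn >= 2.3.0 on a CUDA device.
import torch
from flash_attn import flash_attn_func

b, s, h, d, w = 2, 16, 4, 64, 4
q = torch.randn(b, s, h, d, device='cuda', dtype=torch.bfloat16)
k = torch.randn(b, s, h, d, device='cuda', dtype=torch.bfloat16)
v = torch.randn(b, s, h, d, device='cuda', dtype=torch.bfloat16)
# Each query attends to at most the w previous keys (plus itself).
out = flash_attn_func(q, k, v, causal=True, window_size=(w, w))  # (b, s, h, d)
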
@@ -357,8 +389,13 @@ def triton_flash_attn_fn(
training: bool = False,
needs_weights: bool = False,
multiquery: bool = False,
query_attention_mask_in_length: Optional[torch.Tensor] = None,
key_attention_mask_in_length: Optional[torch.Tensor] = None,
should_repeat_kv_for_gqa: Optional[bool] = True,
sliding_window_size: int = -1,
) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor,
torch.Tensor]]]:
del query_attention_mask_in_length, key_attention_mask_in_length, should_repeat_kv_for_gqa, sliding_window_size
try:
from llmfoundry.models.layers.flash_attn_triton import flash_attn_func
except:
@@ -490,6 +527,7 @@ def __init__(
fc_type: str = 'torch',
device: Optional[str] = None,
bias: bool = True,
sliding_window_size: int = -1,
):
super().__init__()

@@ -500,6 +538,7 @@ def __init__(
self.d_model = d_model
self.n_heads = n_heads
self.kv_n_heads = kv_n_heads
self.sliding_window_size = sliding_window_size

self.head_dim = d_model // n_heads

@@ -569,6 +608,8 @@ def forward(
rotary_emb_w_meta_info: Optional[dict] = None,
is_causal: bool = True,
needs_weights: bool = False,
query_attention_mask_in_length: Optional[torch.Tensor] = None,
key_attention_mask_in_length: Optional[torch.Tensor] = None,
) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[
torch.Tensor, torch.Tensor]]]:
qkv = self.Wqkv(x)
@@ -640,6 +681,10 @@ def forward(
dropout_p=self.attn_dropout_p,
training=self.training,
needs_weights=needs_weights,
query_attention_mask_in_length=query_attention_mask_in_length,
key_attention_mask_in_length=key_attention_mask_in_length,
should_repeat_kv_for_gqa=not is_flash_v2_installed(),
sliding_window_size=self.sliding_window_size,
)

return self.out_proj(context), attn_weights, past_key_value
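
On should_repeat_kv_for_gqa=not is_flash_v2_installed(): flash-attn v2 kernels accept k/v with fewer heads than q (MQA/GQA) natively, so the expand/repeat step in flash_attn_fn is only needed on the v1 path; the torch and triton functions simply del the flag. For reference, a standalone sketch that is functionally equivalent to the repetition done by repeat_kv_for_gqa (illustrative only, not the repo's implementation):

import torch

def repeat_kv_heads(hidden: torch.Tensor, n_rep: int) -> torch.Tensor:
    # hidden: (batch, seqlen, kv_n_heads, head_dim). Repeat each kv head
    # n_rep times along the head dimension so an MHA kernel sees
    # kv_n_heads * n_rep == n_heads key/value heads.
    if n_rep == 1:
        return hidden
    return hidden.repeat_interleave(n_rep, dim=2)
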
@@ -665,6 +710,7 @@ def __init__(
fc_type: str = 'torch',
device: Optional[str] = None,
bias: bool = True,
sliding_window_size: int = -1,
):
super().__init__(
d_model=d_model,
@@ -679,6 +725,7 @@ def __init__(
fc_type=fc_type,
device=device,
bias=bias,
sliding_window_size=sliding_window_size,
)


@@ -702,6 +749,7 @@ def __init__(
fc_type: str = 'torch',
device: Optional[str] = None,
bias: bool = True,
sliding_window_size: int = -1,
):
super().__init__(
d_model=d_model,
@@ -716,6 +764,7 @@ def __init__(
fc_type=fc_type,
device=device,
bias=bias,
sliding_window_size=sliding_window_size,
)


5 changes: 5 additions & 0 deletions llmfoundry/models/layers/blocks.py
@@ -21,6 +21,7 @@
'softmax_scale': None,
'prefix_lm': False,
'attn_uses_sequence_id': False,
'sliding_window_size': -1,
'alibi': False,
'alibi_bias_max': 8,
'rope': False,
@@ -113,6 +114,8 @@ def forward(
attention_mask: Optional[torch.ByteTensor] = None,
is_causal: bool = True,
output_attentions: bool = False,
query_attention_mask_in_length: Optional[torch.Tensor] = None,
key_attention_mask_in_length: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[
torch.Tensor, torch.Tensor]]]:
a = self.norm_1(x)
@@ -124,6 +127,8 @@
attention_mask=attention_mask,
is_causal=is_causal,
needs_weights=output_attentions,
query_attention_mask_in_length=query_attention_mask_in_length,
key_attention_mask_in_length=key_attention_mask_in_length,
)
x = x + self.resid_attn_dropout(b)
m = x
15 changes: 12 additions & 3 deletions llmfoundry/models/mpt/configuration_mpt.py
@@ -91,6 +91,7 @@ def __init__(
When the model is in `train` mode, this requires passing an extra `sequence_id` argument which indicates
which sub-sequence each token belongs to.
Defaults to ``False`` meaning any provided `sequence_id` will be ignored.
sliding_window_size (int): Window size for sliding window local attention. Defaults to -1, which means no sliding window. Query at position i will only attend to keys between [i + seqlen_k - seqlen_q - window_size, i + seqlen_k - seqlen_q + window_size] inclusive. Only works for flash attention v2.3.0 or higher.
alibi (bool): Whether to use the alibi bias instead of position embeddings.
alibi_bias_max (int): The maximum value of the alibi bias.
rope (bool): Whether to use rotary positional embeddings.
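
A quick reading of the windowing rule documented above: in the usual self-attention case seqlen_q == seqlen_k, so the bound reduces to [i - window_size, i + window_size], and causal masking then leaves [max(0, i - window_size), i]. The small check below is illustrative only and builds the equivalent boolean mask:

import torch

seqlen, window_size = 8, 2
i = torch.arange(seqlen).unsqueeze(1)  # query positions
j = torch.arange(seqlen).unsqueeze(0)  # key positions
mask = (j <= i) & (j >= i - window_size)  # True where attention is allowed
# e.g. row 5 is True only at columns 3, 4 and 5.
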
@@ -221,10 +222,12 @@ def _validate_config(self) -> None:
]:
raise NotImplementedError(
'alibi only implemented with torch and triton attention.')
if self.attn_config['attn_uses_sequence_id'] and self.attn_config[
'attn_impl'] not in ['torch', 'triton']:
if self.attn_config['attn_uses_sequence_id'] and not (
self.attn_config['attn_impl'] in ['torch', 'triton'] or
(self.attn_config['attn_impl'] == 'flash' and
is_flash_v2_installed(v2_version='v2.1.2'))):
raise NotImplementedError(
'attn_uses_sequence_id only implemented with torch and triton attention.'
'attn_uses_sequence_id only implemented with torch, triton, and flash (v2.1.2 or higher) attention.'
)
if self.attn_config['rope'] and (self.attn_config['rope_impl']
not in ['dail', 'hf']):
@@ -251,6 +254,12 @@ def _validate_config(self) -> None:
raise ImportError(
'If using the dail implementation of rope, the flash_attn library v2.0.1 or higher must be installed. Please check the instructions at https://github.com/mosaicml/llm-foundry/blob/main/TUTORIAL.md#what-kinds-of-positional-embeddings-does-llm-foundry-support'
)
if self.attn_config['sliding_window_size'] != -1 and not (
self.attn_config['attn_impl'] == 'flash' and
is_flash_v2_installed(v2_version='v2.3.0')):
raise NotImplementedError(
'sliding window only implemented with flash attention v2.3.0 or higher.'
)
if self.embedding_fraction > 1 or self.embedding_fraction <= 0:
raise ValueError(
'model.embedding_fraction must be between 0 (exclusive) and 1 (inclusive)!'
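
Putting the new options together, a configuration along the following lines should pass the updated _validate_config, assuming flash-attn >= 2.3.0 is installed; the surrounding model sizes are illustrative, and only the attn_config keys come from this diff.

from llmfoundry.models.mpt import MPTConfig

cfg = MPTConfig(
    d_model=768,
    n_heads=12,
    n_layers=12,
    attn_config={
        'attn_impl': 'flash',
        'attn_uses_sequence_id': True,   # needs flash-attn >= 2.1.2
        'sliding_window_size': 256,      # needs flash-attn >= 2.3.0
    },
)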