Shashank/seq id flash attn #738

Merged

Changes from 26 commits

Commits (69)
04dd334
Merge pull request #1 from mosaicml/main
ShashankMosaicML Oct 9, 2023
87b2fdc
Merge pull request #8 from mosaicml/main
ShashankMosaicML Oct 27, 2023
c9a42e4
Merge pull request #12 from mosaicml/main
ShashankMosaicML Nov 6, 2023
ddea9ee
Merge branch 'mosaicml:main' into main
ShashankMosaicML Nov 6, 2023
0bcd8ee
Merge pull request #13 from mosaicml/main
ShashankMosaicML Nov 8, 2023
f209b58
Merge pull request #14 from mosaicml/main
ShashankMosaicML Nov 14, 2023
879edb2
..
ShashankMosaicML Nov 14, 2023
66d85f1
..
ShashankMosaicML Nov 14, 2023
c1c2fbd
..
ShashankMosaicML Nov 15, 2023
e8b9381
..
ShashankMosaicML Nov 15, 2023
57502f4
..
ShashankMosaicML Nov 15, 2023
3041bf6
..
ShashankMosaicML Nov 15, 2023
9b305df
..
ShashankMosaicML Nov 15, 2023
b5a3c1f
..
ShashankMosaicML Nov 15, 2023
2e60014
..
ShashankMosaicML Nov 15, 2023
bf946c7
..
ShashankMosaicML Nov 15, 2023
e99780e
..
ShashankMosaicML Nov 15, 2023
897edbc
..
ShashankMosaicML Nov 15, 2023
88cf6d3
..
ShashankMosaicML Nov 15, 2023
c0a4d97
..
ShashankMosaicML Nov 15, 2023
ec4378d
Merge pull request #15 from mosaicml/main
ShashankMosaicML Nov 15, 2023
ad36ecf
merged and resolved conflicts
ShashankMosaicML Nov 15, 2023
c09d5c9
..
ShashankMosaicML Nov 15, 2023
efc4f4e
..
ShashankMosaicML Nov 15, 2023
4940e70
..
ShashankMosaicML Nov 15, 2023
5e3ccf9
..
ShashankMosaicML Nov 16, 2023
84fa710
Update llmfoundry/models/layers/attention.py
ShashankMosaicML Nov 17, 2023
a560f31
Update llmfoundry/models/mpt/modeling_mpt.py
ShashankMosaicML Nov 17, 2023
a70f05e
Update llmfoundry/models/mpt/modeling_mpt.py
ShashankMosaicML Nov 17, 2023
42a541d
..
ShashankMosaicML Nov 17, 2023
3d1d022
..
ShashankMosaicML Nov 18, 2023
c94e7fe
..
ShashankMosaicML Nov 18, 2023
e96b234
..
ShashankMosaicML Nov 18, 2023
f02034b
..
ShashankMosaicML Nov 18, 2023
88c6808
..
ShashankMosaicML Nov 18, 2023
511a405
Merge branch 'main' into shashank/seq_id_flash_attn
ShashankMosaicML Nov 18, 2023
6af9aba
..
ShashankMosaicML Nov 18, 2023
5d7805d
..
ShashankMosaicML Nov 18, 2023
55625ff
Merge branch 'main' into shashank/seq_id_flash_attn
ShashankMosaicML Nov 21, 2023
44148b1
Merge branch 'main' into shashank/seq_id_flash_attn
ShashankMosaicML Nov 22, 2023
a8f63d4
..
ShashankMosaicML Nov 22, 2023
af6520a
Merge branch 'main' into shashank/seq_id_flash_attn
ShashankMosaicML Nov 22, 2023
538169c
..
ShashankMosaicML Nov 22, 2023
20af30a
..
ShashankMosaicML Nov 22, 2023
eeb9e1c
..
ShashankMosaicML Nov 22, 2023
1b7d38d
..
ShashankMosaicML Nov 22, 2023
b0a3c1b
..
ShashankMosaicML Nov 22, 2023
6a4f73e
..
ShashankMosaicML Nov 22, 2023
f05bfe6
Merge branch 'main' into shashank/seq_id_flash_attn
ShashankMosaicML Nov 22, 2023
c275365
..
ShashankMosaicML Nov 25, 2023
e82c723
..
ShashankMosaicML Nov 25, 2023
a964aea
..
ShashankMosaicML Nov 26, 2023
5765724
Merge branch 'main' into shashank/seq_id_flash_attn
ShashankMosaicML Nov 29, 2023
4240245
..
ShashankMosaicML Nov 30, 2023
4b25da2
..
ShashankMosaicML Nov 30, 2023
67deef8
..
ShashankMosaicML Nov 30, 2023
b855100
..
ShashankMosaicML Nov 30, 2023
371e3a2
..
ShashankMosaicML Nov 30, 2023
fa2a2ee
Merge branch 'main' into shashank/seq_id_flash_attn
ShashankMosaicML Nov 30, 2023
8339cd3
Merge branch 'main' into shashank/seq_id_flash_attn
ShashankMosaicML Dec 1, 2023
6c59dce
..
ShashankMosaicML Dec 1, 2023
805313b
..
ShashankMosaicML Dec 1, 2023
f1251c4
..
ShashankMosaicML Dec 1, 2023
5fca723
Merge branch 'main' into shashank/seq_id_flash_attn
ShashankMosaicML Dec 1, 2023
14a2553
merging from main
ShashankMosaicML Dec 2, 2023
cdc220f
merging from main
ShashankMosaicML Dec 2, 2023
e25ed63
..
ShashankMosaicML Dec 2, 2023
cb6864a
..
ShashankMosaicML Dec 2, 2023
9bc7ce1
Merge branch 'main' into shashank/seq_id_flash_attn
ShashankMosaicML Dec 2, 2023
114 changes: 76 additions & 38 deletions llmfoundry/models/layers/attention.py
@@ -90,8 +90,12 @@ def scaled_multihead_dot_product_attention(
training: bool = False,
needs_weights: bool = False,
multiquery: bool = False,
query_attention_mask_in_length: Optional[torch.Tensor] = None,
key_attention_mask_in_length: Optional[torch.Tensor] = None,
should_repeat_kv_for_gqa: Optional[bool] = True,
) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor,
torch.Tensor]]]:
del query_attention_mask_in_length, key_attention_mask_in_length, should_repeat_kv_for_gqa

if multiquery:
warnings.warn(
@@ -219,6 +223,9 @@ def flash_attn_fn(
training: bool = False,
needs_weights: bool = False,
multiquery: bool = False,
key_attention_mask_in_length: Optional[torch.Tensor] = None,
query_attention_mask_in_length: Optional[torch.Tensor] = None,
should_repeat_kv_for_gqa: Optional[bool] = True,
) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor,
torch.Tensor]]]:
try:
@@ -260,47 +267,69 @@ def flash_attn_fn(

batch_size, seqlen = query.shape[:2]

if key_padding_mask is None:
key_padding_mask = torch.ones_like(key[:, :, 0], dtype=torch.bool)
query_padding_mask = key_padding_mask[:, -query.size(1):]
if query_attention_mask_in_length is None:
if key_padding_mask is None:
key_padding_mask = torch.ones_like(key[:, :, 0], dtype=torch.bool)
query_padding_mask = key_padding_mask[:, -query.size(1):]

query_unpad, indices_q, cu_seqlens_q, max_seqlen_q = bert_padding.unpad_input(
query, query_padding_mask)
query_unpad = rearrange(query_unpad, 'nnz (h d) -> nnz h d', h=n_heads)
query_unpad, indices_q, cu_seqlens_q, max_seqlen_q = bert_padding.unpad_input(
query, query_padding_mask)
query_unpad = rearrange(query_unpad, 'nnz (h d) -> nnz h d', h=n_heads)

key_unpad, _, cu_seqlens_k, max_seqlen_k = bert_padding.unpad_input(
key, key_padding_mask)
key_unpad = rearrange(key_unpad, 'nnz (h d) -> nnz h d', h=kv_n_heads)
key_unpad, _, cu_seqlens_k, max_seqlen_k = bert_padding.unpad_input(
key, key_padding_mask)
key_unpad = rearrange(key_unpad, 'nnz (h d) -> nnz h d', h=kv_n_heads)

value_unpad, _, _, _ = bert_padding.unpad_input(value, key_padding_mask)
value_unpad = rearrange(value_unpad, 'nnz (h d) -> nnz h d', h=kv_n_heads)

# multi-query case
if kv_n_heads == 1:
# Expanding a tensor does not allocate new memory, but only creates a new
# view on the existing tensor where a dimension of size one is expanded
# to a larger size by setting the stride to 0.
# - pytorch docs
#
# hopefully the kernels can utilize this and we're not just wasting BW here
key_unpad = key_unpad.expand(key_unpad.size(0), n_heads,
key_unpad.size(-1))
value_unpad = value_unpad.expand(value_unpad.size(0), n_heads,
value_unpad.size(-1))
# grouped query case
elif kv_n_heads < n_heads:
# Each query belongs to a group of kv heads of group size n_heads // kv_n_heads
# We repeat each kv head by the group size number to use the underlying MHA kernels

# since repeat_kv_for_gqa expects input dims of (b, s, kv_n_heads, d)
# we use .view to modify {key, value}_unpad appropriately

key_unpad = repeat_kv_for_gqa(
key_unpad.view(batch_size, seqlen, kv_n_heads, -1),
n_heads // kv_n_heads).view(batch_size * seqlen, n_heads, -1)
value_unpad = repeat_kv_for_gqa(
value_unpad.view(batch_size, seqlen, kv_n_heads, -1),
n_heads // kv_n_heads).view(batch_size * seqlen, n_heads, -1)
value_unpad, _, _, _ = bert_padding.unpad_input(value, key_padding_mask)
value_unpad = rearrange(value_unpad,
'nnz (h d) -> nnz h d',
h=kv_n_heads)
else:
if key_attention_mask_in_length is None:
raise ValueError(
'key_attention_mask_in_length must not be None if query_attention_mask_in_length is not None.'
)
query_unpad, indices_q, cu_seqlens_q, max_seqlen_q = bert_padding.unpad_input_for_concatenated_sequences(
query, query_attention_mask_in_length)
query_unpad = rearrange(query_unpad, 'nnz (h d) -> nnz h d', h=n_heads)

key_unpad, _, cu_seqlens_k, max_seqlen_k = bert_padding.unpad_input_for_concatenated_sequences(
key, key_attention_mask_in_length)
key_unpad = rearrange(key_unpad, 'nnz (h d) -> nnz h d', h=kv_n_heads)

value_unpad, _, _, _ = bert_padding.unpad_input_for_concatenated_sequences(
value, key_attention_mask_in_length)
value_unpad = rearrange(value_unpad,
'nnz (h d) -> nnz h d',
h=kv_n_heads)

if should_repeat_kv_for_gqa:
# multi-query case
if kv_n_heads == 1:
# Expanding a tensor does not allocate new memory, but only creates a new
# view on the existing tensor where a dimension of size one is expanded
# to a larger size by setting the stride to 0.
# - pytorch docs
#
# hopefully the kernels can utilize this and we're not just wasting BW here
key_unpad = key_unpad.expand(key_unpad.size(0), n_heads,
key_unpad.size(-1))
value_unpad = value_unpad.expand(value_unpad.size(0), n_heads,
value_unpad.size(-1))
# grouped query case
elif kv_n_heads < n_heads:
# Each query belongs to a group of kv heads of group size n_heads // kv_n_heads
# We repeat each kv head by the group size number to use the underlying MHA kernels

# since repeat_kv_for_gqa expects input dims of (b, s, kv_n_heads, d)
# we use .view to modify {key, value}_unpad appropriately

key_unpad = repeat_kv_for_gqa(
key_unpad.view(batch_size, seqlen, kv_n_heads, -1),
n_heads // kv_n_heads).view(batch_size * seqlen, n_heads, -1)
value_unpad = repeat_kv_for_gqa(
value_unpad.view(batch_size, seqlen, kv_n_heads, -1),
n_heads // kv_n_heads).view(batch_size * seqlen, n_heads, -1)

dropout_p = dropout_p if training else 0.0

@@ -357,8 +386,12 @@ def triton_flash_attn_fn(
training: bool = False,
needs_weights: bool = False,
multiquery: bool = False,
query_attention_mask_in_length: Optional[torch.Tensor] = None,
key_attention_mask_in_length: Optional[torch.Tensor] = None,
should_repeat_kv_for_gqa: Optional[bool] = True,
) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor,
torch.Tensor]]]:
del query_attention_mask_in_length, key_attention_mask_in_length, should_repeat_kv_for_gqa
try:
from llmfoundry.models.layers.flash_attn_triton import flash_attn_func
except:
@@ -569,6 +602,8 @@ def forward(
rotary_emb_w_meta_info: Optional[dict] = None,
is_causal: bool = True,
needs_weights: bool = False,
query_attention_mask_in_length: Optional[torch.Tensor] = None,
key_attention_mask_in_length: Optional[torch.Tensor] = None,
) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[
torch.Tensor, torch.Tensor]]]:
qkv = self.Wqkv(x)
@@ -640,6 +675,9 @@ def forward(
dropout_p=self.attn_dropout_p,
training=self.training,
needs_weights=needs_weights,
query_attention_mask_in_length=query_attention_mask_in_length,
key_attention_mask_in_length=key_attention_mask_in_length,
should_repeat_kv_for_gqa=not is_flash_v2_installed(),
)

return self.out_proj(context), attn_weights, past_key_value
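For reference, a minimal sketch of the head-expansion logic in flash_attn_fn above. This is an editor-added illustration, not part of the diff, and it assumes repeat_kv_for_gqa behaves like torch.repeat_interleave over the kv-head dimension:

import torch

b, s, d = 2, 4, 8
n_heads, kv_n_heads = 8, 2
key_unpad = torch.randn(b * s, kv_n_heads, d)  # unpadded keys, shape (nnz, kv_n_heads, head_dim)

if kv_n_heads == 1:
    # multi-query: expand returns a stride-0 view, so no extra memory is allocated
    key_unpad = key_unpad.expand(key_unpad.size(0), n_heads, key_unpad.size(-1))
elif kv_n_heads < n_heads:
    # grouped-query: repeat each kv head n_heads // kv_n_heads times so plain MHA kernels apply
    key_unpad = key_unpad.view(b, s, kv_n_heads, d).repeat_interleave(
        n_heads // kv_n_heads, dim=2).view(b * s, n_heads, d)

print(key_unpad.shape)  # torch.Size([8, 8, 8])

When flash-attn v2 is installed, the attention module passes should_repeat_kv_for_gqa=False (see the forward() change above), since the v2 kernels handle grouped-query heads natively and this repeat is skipped.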
4 changes: 4 additions & 0 deletions llmfoundry/models/layers/blocks.py
@@ -113,6 +113,8 @@ def forward(
attention_mask: Optional[torch.ByteTensor] = None,
is_causal: bool = True,
output_attentions: bool = False,
query_attention_mask_in_length: Optional[torch.Tensor] = None,
key_attention_mask_in_length: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[
torch.Tensor, torch.Tensor]]]:
a = self.norm_1(x)
@@ -124,6 +126,8 @@ def forward(
attention_mask=attention_mask,
is_causal=is_causal,
needs_weights=output_attentions,
query_attention_mask_in_length=query_attention_mask_in_length,
key_attention_mask_in_length=key_attention_mask_in_length,
)
x = x + self.resid_attn_dropout(b)
m = x
4 changes: 2 additions & 2 deletions llmfoundry/models/mpt/configuration_mpt.py
@@ -222,9 +222,9 @@ def _validate_config(self) -> None:
raise NotImplementedError(
'alibi only implemented with torch and triton attention.')
if self.attn_config['attn_uses_sequence_id'] and self.attn_config[
'attn_impl'] not in ['torch', 'triton']:
'attn_impl'] not in ['torch', 'triton'] and not (self.attn_config['attn_impl'] == 'flash' and is_flash_v2_installed(v2_version='v2.1.2')):
raise NotImplementedError(
'attn_uses_sequence_id only implemented with torch and triton attention.'
'attn_uses_sequence_id only implemented with torch, triton, and flash (v2.1.2 or higher) attention.'
)
if self.attn_config['rope'] and (self.attn_config['rope_impl']
not in ['dail', 'hf']):
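As an illustration of the relaxed check above, a config along the following lines should now pass validation when flash-attn v2.1.2 or newer is installed. This is a sketch under stated assumptions: the attn_config keys come from the diff, while the remaining keyword arguments are example values and assume MPTConfig fills in the other attn_config defaults.

from llmfoundry.models.mpt.configuration_mpt import MPTConfig

config = MPTConfig(
    d_model=128,
    n_heads=4,
    n_layers=2,
    max_seq_len=64,
    attn_config={
        'attn_impl': 'flash',           # previously incompatible with attn_uses_sequence_id
        'attn_uses_sequence_id': True,  # mask attention across packed (concatenated) sequences
    },
)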
67 changes: 42 additions & 25 deletions llmfoundry/models/mpt/modeling_mpt.py
@@ -131,6 +131,44 @@ def gen_rotary_embedding(rope_head_dim: int, rope_impl: str, rope_theta: int,
)
raise ValueError('rope_impl needs to be either dail or hf')

def gen_attention_mask_in_length(sequence_id: Union[None, torch.Tensor], S: int, attn_uses_sequence_id: bool, attn_impl: str):
# Generates the attention masks used for sequence masking in flash attention
query_attention_mask_in_length = None
key_attention_mask_in_length = None
if (sequence_id is not None) and attn_uses_sequence_id and (attn_impl == 'flash'):
query_attention_mask_in_length = torch.nn.functional.one_hot(sequence_id[:, -S:], num_classes=S).sum(dim=1)
# We use S as the number of classes while creating key_attention_mask_in_length instead of sequence_id.shape[-1]
# because in case of inference, sequence_id.shape[-1] can become very large. In that case, the one_hot vectors
# would've become very large as well.
key_attention_mask_in_length = torch.nn.functional.one_hot(sequence_id, num_classes=S).sum(dim=1)
# Since Flash Attention expects the masks to have the same shape as the keys, we pad them with zeros.
key_attention_mask_in_length = torch.nn.functional.pad(key_attention_mask_in_length, (0, sequence_id.shape[-1] - S), value=0)

return query_attention_mask_in_length, key_attention_mask_in_length
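To make the one-hot construction above concrete, here is a toy run (an editor illustration, not part of the diff). Each row of the result lists per-sequence lengths, which is the attention-mask-in-length format that flash-attn's bert_padding.unpad_input_for_concatenated_sequences consumes; the key mask is built the same way from the full sequence_id and then zero-padded to the key length.

import torch

# batch 0 packs three sequences of lengths 2, 3, 1; batch 1 holds a single sequence of length 6
sequence_id = torch.tensor([[0, 0, 1, 1, 1, 2],
                            [0, 0, 0, 0, 0, 0]])
S = sequence_id.shape[-1]

query_attention_mask_in_length = torch.nn.functional.one_hot(
    sequence_id[:, -S:], num_classes=S).sum(dim=1)
print(query_attention_mask_in_length)
# tensor([[2, 3, 1, 0, 0, 0],
#         [6, 0, 0, 0, 0, 0]])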

def apply_sequence_id(attn_bias: torch.Tensor,
sequence_id: torch.LongTensor,
max_seq_len: int) -> torch.Tensor:
seq_len = sequence_id.shape[-1]
if seq_len > max_seq_len:
raise ValueError(
f'sequence_id sequence length cannot exceed max_seq_len={max_seq_len}'
)

# select seq_len subset of attn mask
attn_bias = attn_bias[..., :seq_len, :seq_len]

# Restrict attention to tokens that share the same value
# in sequence_id
cannot_attend = torch.logical_not(
torch.eq(
sequence_id.view(-1, seq_len, 1),
sequence_id.view(-1, 1, seq_len),
)).unsqueeze(1)
min_val = torch.finfo(attn_bias.dtype).min
attn_bias = attn_bias.masked_fill(cannot_attend, min_val)

return attn_bias
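Similarly, a small sketch (again editor-added, not part of the diff) of what the bias masking in apply_sequence_id produces for a packed batch; positions that connect tokens from different sequences are filled with the dtype minimum, so they receive effectively zero weight after the softmax.

import torch

sequence_id = torch.tensor([[0, 0, 1, 1]])
seq_len = sequence_id.shape[-1]
attn_bias = torch.zeros(1, 1, seq_len, seq_len)

cannot_attend = torch.logical_not(
    torch.eq(sequence_id.view(-1, seq_len, 1),
             sequence_id.view(-1, 1, seq_len))).unsqueeze(1)
attn_bias = attn_bias.masked_fill(cannot_attend,
                                  torch.finfo(attn_bias.dtype).min)
print(attn_bias[0, 0])
# the 2x2 blocks on the diagonal stay 0; all cross-sequence entries become the minimum float value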

class MPTPreTrainedModel(PreTrainedModel):
config_class = MPTConfig
@@ -286,7 +324,7 @@ def _attn_bias(
# If using torch or triton, we incorporate sequence_id (if appropriate)
if self.attn_uses_sequence_id and sequence_id is not None:
assert isinstance(attn_bias, torch.Tensor) # pyright
attn_bias = self._apply_sequence_id(attn_bias, sequence_id)
attn_bias = apply_sequence_id(attn_bias, sequence_id, self.config.max_seq_len)

# If using torch or triton, we incorporate attention_mask. This will output
# None in place of attention_mask since it will not be further needed in the
@@ -343,29 +381,6 @@ def _apply_prefix_mask(self, attn_bias: torch.Tensor,

return attn_bias

def _apply_sequence_id(self, attn_bias: torch.Tensor,
sequence_id: torch.LongTensor) -> torch.Tensor:
seq_len = sequence_id.shape[-1]
if seq_len > self.config.max_seq_len:
raise ValueError(
f'sequence_id sequence length cannot exceed max_seq_len={self.config.max_seq_len}'
)

# select seq_len subset of attn mask
attn_bias = attn_bias[..., :seq_len, :seq_len]

# Restrict attention to tokens that share the same value
# in sequence_id
cannot_attend = torch.logical_not(
torch.eq(
sequence_id.view(-1, seq_len, 1),
sequence_id.view(-1, 1, seq_len),
)).unsqueeze(1)
min_val = torch.finfo(attn_bias.dtype).min
attn_bias = attn_bias.masked_fill(cannot_attend, min_val)

return attn_bias

def forward(
self,
input_ids: torch.LongTensor,
@@ -509,7 +524,7 @@ def forward(
prefix_mask=prefix_mask,
sequence_id=sequence_id,
)

query_attention_mask_in_length, key_attention_mask_in_length = gen_attention_mask_in_length(sequence_id=sequence_id, S=S, attn_uses_sequence_id=self.attn_uses_sequence_id, attn_impl=self.attn_impl)
# initialize the past key values cache if it should be used
presents = () if use_cache else None
if use_cache and past_key_values is None:
@@ -532,6 +547,8 @@ def forward(
attention_mask=attention_mask,
is_causal=self.is_causal,
output_attentions=bool(output_attentions),
query_attention_mask_in_length=query_attention_mask_in_length,
key_attention_mask_in_length=key_attention_mask_in_length,
)
if presents is not None:
presents += (present,)
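Finally, an end-to-end usage sketch under stated assumptions: the class and argument names mirror modeling_mpt.py and configuration_mpt.py, the specific dimensions are example values, and running with attn_impl='flash' requires a CUDA device plus flash-attn v2.1.2 or newer.

import torch
from llmfoundry.models.mpt.configuration_mpt import MPTConfig
from llmfoundry.models.mpt.modeling_mpt import MPTForCausalLM

config = MPTConfig(
    d_model=128, n_heads=4, n_layers=2, max_seq_len=32,
    attn_config={'attn_impl': 'flash', 'attn_uses_sequence_id': True},
)
model = MPTForCausalLM(config).to(device='cuda', dtype=torch.bfloat16)

input_ids = torch.randint(0, config.vocab_size, (1, 6), device='cuda')
sequence_id = torch.tensor([[0, 0, 1, 1, 1, 2]], device='cuda')  # three packed sequences
with torch.no_grad():
    out = model(input_ids=input_ids, sequence_id=sequence_id)
print(out.logits.shape)  # (1, 6, vocab_size)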