Enable flag to not pass PAD tokens in ffwd #775

Merged on Dec 11, 2023 (21 commits)
Changes from 11 commits
16 changes: 16 additions & 0 deletions llmfoundry/models/layers/blocks.py
@@ -12,6 +12,11 @@
from llmfoundry.models.layers.ffn import FFN_CLASS_REGISTRY, build_ffn
from llmfoundry.models.layers.norm import NORM_CLASS_REGISTRY

try:
from flash_attn.bert_padding import unpad_input, pad_input # type: ignore # yapf: disable # isort: skip
except:
unpad_input, pad_input = None, None

attn_config_defaults: Dict = {
'attn_type': 'multihead_attention',
'attn_pdrop': 0.0,
@@ -53,6 +58,7 @@ def __init__(
fc_type: str = 'torch',
device: Optional[str] = None,
no_bias: bool = False,
use_pad_tok_in_ffwd: bool = True,
**kwargs: Any,
):
if attn_config is None:
@@ -105,6 +111,8 @@ def __init__(
self.resid_attn_dropout = nn.Dropout(resid_pdrop)
self.resid_ffn_dropout = nn.Dropout(resid_pdrop)

self.use_pad_tok_in_ffwd = use_pad_tok_in_ffwd

def forward(
self,
x: torch.Tensor,
@@ -132,6 +140,14 @@ def forward(
m = x
if self.norm_2 is not None:
m = self.norm_2(x)
batch_size, seq_len = m.size()[:2]
if not self.use_pad_tok_in_ffwd:
if unpad_input is None:
raise RuntimeError(
'Please install flash-attn==1.0.9 or flash-attn==2.3.2')
m, indices, _, _ = unpad_input(m, attention_mask)
n = self.ffn(m)
if not self.use_pad_tok_in_ffwd:
n = pad_input(n, indices, batch_size, seq_len)
x = x + self.resid_ffn_dropout(n)
return x, attn_weights, past_key_value
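
For orientation, here is a minimal PyTorch sketch of what the unpad/pad roundtrip above accomplishes when use_pad_tok_in_ffwd is False: gather only the non-PAD rows, run the feedforward on them, and scatter the results back into the padded layout. This is an illustration of the idea, not the flash-attn implementation; the ffn_skip_pad helper and its signature are hypothetical.

import torch
import torch.nn as nn


def ffn_skip_pad(ffn: nn.Module, x: torch.Tensor,
                 attention_mask: torch.Tensor) -> torch.Tensor:
    """Illustrative stand-in for the unpad_input -> ffn -> pad_input path.

    x: (batch, seq_len, d_model); attention_mask: (batch, seq_len), 1 = real token.
    """
    batch_size, seq_len, d_model = x.shape
    flat = x.reshape(batch_size * seq_len, d_model)
    # Indices of the non-PAD positions in the flattened (batch * seq_len) layout.
    indices = attention_mask.reshape(-1).bool().nonzero(as_tuple=True)[0]
    unpadded = flat[indices]  # (num_real_tokens, d_model)
    out = ffn(unpadded)  # the FFN never sees PAD rows
    # Scatter the outputs back; PAD positions are filled with zeros, as pad_input does.
    restored = torch.zeros_like(flat)
    restored[indices] = out
    return restored.reshape(batch_size, seq_len, d_model)
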
3 changes: 3 additions & 0 deletions llmfoundry/models/mpt/configuration_mpt.py
@@ -60,6 +60,7 @@ def __init__(
init_config: Dict = init_config_defaults,
fc_type: str = 'torch',
tie_word_embeddings: bool = True,
use_pad_tok_in_ffwd: bool = True,
verbose: Optional[int] = None,
**kwargs: Any,
):
@@ -131,6 +132,7 @@ def __init__(
See llmfoundry.models.utils.param_init_fns.py for info on other param init config options
fc_type (str): choose fc layer implementation. Options: torch and te. te layers support fp8 when using H100 GPUs.
tie_word_embeddings (bool): Whether to tie the input embedding and output layers.
use_pad_tok_in_ffwd (bool): Whether to forward the pad token in the feedforward networks.
"""
self.d_model = d_model
self.n_heads = n_heads
@@ -151,6 +153,7 @@ def __init__(
self.use_cache = use_cache
self.init_config = init_config
self.fc_type = fc_type
self.use_pad_tok_in_ffwd = use_pad_tok_in_ffwd
if verbose is not None:
warnings.warn(
DeprecationWarning(
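
As a usage sketch (not code from this PR), the flag would be set like any other MPTConfig field; the import path and the small model sizes below are assumptions, and flash-attn must be installed for the unpad path to work.

from llmfoundry.models.mpt import MPTConfig, MPTForCausalLM

config = MPTConfig(
    d_model=128,
    n_heads=4,
    n_layers=2,
    use_pad_tok_in_ffwd=False,  # drop PAD tokens before each block's FFN
)
model = MPTForCausalLM(config)

With the flag disabled, each block unpads the hidden states with unpad_input before the feedforward and re-pads them with pad_input afterward, which is why _attn_bias now returns the attention_mask in the change below.
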
2 changes: 1 addition & 1 deletion llmfoundry/models/mpt/modeling_mpt.py
@@ -419,7 +419,7 @@ def _attn_bias(
attn_bias = attn_bias.masked_fill(
~attention_mask.view(-1, 1, 1, s_k), min_val)

return attn_bias, None
return attn_bias, attention_mask

def _apply_prefix_mask(self, attn_bias: torch.Tensor,
prefix_mask: torch.Tensor) -> torch.Tensor:
5 changes: 4 additions & 1 deletion tests/models/test_model.py
@@ -698,8 +698,10 @@ def test_sequence_id_based_masking(attention_impl: str, pos_emb_config: dict):
},
}])
@pytest.mark.parametrize('tie_word_embeddings', [True, False])
@pytest.mark.parametrize('use_pad_tok_in_ffwd', [True, False])
def test_forward_with_padding(attention_impl: str, pos_emb_config: dict,
tie_word_embeddings: bool):
tie_word_embeddings: bool,
use_pad_tok_in_ffwd: bool):
# Test that different placement of padding does not affect the output.
alibi = pos_emb_config['alibi']
if alibi and attention_impl == 'flash':
@@ -731,6 +733,7 @@ def test_forward_with_padding(attention_impl: str, pos_emb_config: dict,
'init_std': 0.02,
},
tie_word_embeddings=tie_word_embeddings,
use_pad_tok_in_ffwd=use_pad_tok_in_ffwd,
)
mpt = MPTForCausalLM(hf_config)
mpt.eval()
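
To make the intent of the new parametrization concrete, here is a hedged sketch of the property test_forward_with_padding exercises: outputs at real-token positions should not depend on where padding is placed. The helper, pad handling, and tolerances below are assumptions, not code from this PR.

import torch


def check_padding_placement(model, tokens: torch.Tensor, pad_id: int) -> None:
    # tokens: (1, n) real token ids with no padding.
    n = tokens.shape[1]
    pad = torch.full((1, 2), pad_id, dtype=tokens.dtype)
    right = torch.cat([tokens, pad], dim=1)
    right_mask = torch.cat([torch.ones(1, n), torch.zeros(1, 2)], dim=1).long()
    left = torch.cat([pad, tokens], dim=1)
    left_mask = torch.cat([torch.zeros(1, 2), torch.ones(1, n)], dim=1).long()
    with torch.no_grad():
        out_right = model(input_ids=right, attention_mask=right_mask).logits[:, :n]
        out_left = model(input_ids=left, attention_mask=left_mask).logits[:, 2:]
    # Real-token outputs should agree regardless of padding placement.
    torch.testing.assert_close(out_right, out_left, atol=1e-5, rtol=1e-5)
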