diff --git a/lib/bumblebee/text/m2m100.ex b/lib/bumblebee/text/m2m100.ex
index 084848f3..9d09ca2a 100644
--- a/lib/bumblebee/text/m2m100.ex
+++ b/lib/bumblebee/text/m2m100.ex
@@ -77,6 +77,101 @@ defmodule Bumblebee.Text.M2m100 do
   @moduledoc """
   M2M100 model family.
+
+  ## Architectures
+
+    * `:base` - plain M2M100 without any head on top
+
+    * `:for_conditional_generation` - M2M100 with a language modeling
+      head. The head returns logits for each token in the original
+      sequence
+
+  ## Inputs
+
+    * `"input_ids"` - `{batch_size, sequence_length}`
+
+      Indices of input sequence tokens in the vocabulary.
+
+    * `"attention_mask"` - `{batch_size, sequence_length}`
+
+      Mask indicating which tokens to attend to. This is used to ignore
+      padding tokens, which are added when processing a batch of sequences
+      with different lengths.
+
+    * `"position_ids"` - `{batch_size, sequence_length}`
+
+      Indices of positions of each input sequence token in the position
+      embeddings.
+
+    * `"attention_head_mask"` - `{encoder_num_blocks, encoder_num_attention_heads}`
+
+      Mask to nullify selected heads of the self-attention blocks in
+      the encoder.
+
+    * `"input_embeddings"` - `{batch_size, sequence_length, hidden_size}`
+
+      Embedded representation of `"input_ids"`, which can be specified
+      for more control over how `"input_ids"` are embedded than the
+      model's internal embedding lookup. If `"input_embeddings"` are present,
+      then `"input_ids"` will be ignored.
+
+    * `"decoder_input_ids"` - `{batch_size, target_sequence_length}`
+
+      Indices of decoder input sequence tokens in the vocabulary. If not
+      present and `"input_ids"` is, it will be generated by shifting
+      each token in `"input_ids"` to the right once.
+
+    * `"decoder_attention_mask"` - `{batch_size, target_sequence_length}`
+
+      Mask indicating which decoder tokens to attend to. This is used
+      to ignore padding tokens, which are added when processing a batch
+      of sequences with different lengths.
+
+    * `"decoder_position_ids"` - `{batch_size, target_sequence_length}`
+
+      Indices of positions of each decoder input sequence token in
+      the position embeddings.
+
+    * `"decoder_attention_head_mask"` - `{decoder_num_blocks, decoder_num_attention_heads}`
+
+      Mask to nullify selected heads of the self-attention blocks in
+      the decoder.
+
+    * `"decoder_input_embeddings"` - `{batch_size, sequence_length, hidden_size}`
+
+      Embedded representation of `"decoder_input_ids"`, which can be
+      specified for more control over how `"decoder_input_ids"` are
+      embedded than the model's internal embedding lookup. If
+      `"decoder_input_embeddings"` are present, then `"decoder_input_ids"`
+      will be ignored.
+
+    * `"encoder_hidden_state"` - `{batch_size, sequence_length, hidden_size}`
+
+      Last hidden state output from the encoder. This hidden state is
+      used in cross-attention blocks in the decoder. If specified, the
+      model will skip the encoding process and use this value directly
+      for cross-attentions in the decoder.
+
+    * `"cross_attention_head_mask"` - `{decoder_num_blocks, decoder_num_attention_heads}`
+
+      Mask to nullify selected heads of the cross-attention blocks in
+      the decoder.
+
+    * `"cache"`
+
+      A container with cached layer results used to speed up sequential
+      decoding (autoregression). With cache, certain hidden states are
+      taken from the cache, rather than recomputed on every decoding
+      pass. The cache should be treated as opaque and initialized with
+      `Bumblebee.Text.Generation.init_cache/4`.
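+
+  As an illustrative sketch (the checkpoint name below is only an example,
+  any M2M100 checkpoint works the same way), these inputs are typically
+  produced by a tokenizer and passed to the model in a single forward pass:
+
+      {:ok, model_info} = Bumblebee.load_model({:hf, "facebook/m2m100_418M"})
+      {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "facebook/m2m100_418M"})
+
+      # Produces "input_ids" and "attention_mask" as described above
+      inputs = Bumblebee.apply_tokenizer(tokenizer, "Hello world")
+
+      # "decoder_input_ids" is not given here, so it is derived by shifting
+      # "input_ids" to the right once
+      outputs = Axon.predict(model_info.model, model_info.params, inputs)
+      outputs.logits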
+
+  ## Global layer options
+
+  #{Shared.global_layer_options_doc([:output_hidden_states, :output_attentions])}
+
+  ## Configuration
+
+  #{Shared.options_doc(options)}
   """
 
   defstruct [architecture: :base] ++ Shared.option_defaults(options)
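For anyone reviewing this against a real checkpoint, a minimal end-to-end sketch of the `:for_conditional_generation` architecture via the high-level generation serving (the `facebook/m2m100_418M` checkpoint is only an example, and an actual translation run would additionally force the target-language token through the generation config):

```elixir
{:ok, model_info} = Bumblebee.load_model({:hf, "facebook/m2m100_418M"})
{:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "facebook/m2m100_418M"})
{:ok, generation_config} = Bumblebee.load_generation_config({:hf, "facebook/m2m100_418M"})

# The serving builds the encoder/decoder inputs and manages the decoding
# cache documented in the moduledoc above.
serving = Bumblebee.Text.generation(model_info, tokenizer, generation_config)
Nx.Serving.run(serving, "The quick brown fox jumps over the lazy dog")
```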