diff --git a/lib/bumblebee/text/m2m100.ex b/lib/bumblebee/text/m2m100.ex
index 084848f3..9d09ca2a 100644
--- a/lib/bumblebee/text/m2m100.ex
+++ b/lib/bumblebee/text/m2m100.ex
@@ -77,6 +77,101 @@ defmodule Bumblebee.Text.M2m100 do
   @moduledoc """
   M2M100 model family.
+
+  ## Architectures
+
+    * `:base` - plain M2M100 without any head on top
+
+    * `:for_conditional_generation` - M2M100 with a language modeling
+      head. The head returns logits for each token in the original
+      sequence
+
+  ## Inputs
+
+    * `"input_ids"` - `{batch_size, sequence_length}`
+
+      Indices of input sequence tokens in the vocabulary.
+
+    * `"attention_mask"` - `{batch_size, sequence_length}`
+
+      Mask indicating which tokens to attend to. This is used to ignore
+      padding tokens, which are added when processing a batch of sequences
+      with different lengths.
+
+    * `"position_ids"` - `{batch_size, sequence_length}`
+
+      Indices of positions of each input sequence token in the position
+      embeddings.
+
+    * `"attention_head_mask"` - `{encoder_num_blocks, encoder_num_attention_heads}`
+
+      Mask to nullify selected heads of the self-attention blocks in
+      the encoder.
+
+    * `"input_embeddings"` - `{batch_size, sequence_length, hidden_size}`
+
+      Embedded representation of `"input_ids"`, which can be specified
+      for more control over how `"input_ids"` are embedded than the
+      model's internal embedding lookup. If `"input_embeddings"` are present,
+      then `"input_ids"` will be ignored.
+
+    * `"decoder_input_ids"` - `{batch_size, target_sequence_length}`
+
+      Indices of decoder input sequence tokens in the vocabulary. If not
+      present and `"input_ids"` is, it will be generated by shifting
+      each token in `"input_ids"` to the right once.
+
+    * `"decoder_attention_mask"` - `{batch_size, target_sequence_length}`
+
+      Mask indicating which decoder tokens to attend to. This is used
+      to ignore padding tokens, which are added when processing a batch
+      of sequences with different lengths.
+
+    * `"decoder_position_ids"` - `{batch_size, target_sequence_length}`
+
+      Indices of positions of each decoder input sequence token in
+      the position embeddings.
+
+    * `"decoder_attention_head_mask"` - `{decoder_num_blocks, decoder_num_attention_heads}`
+
+      Mask to nullify selected heads of the self-attention blocks in
+      the decoder.
+
+    * `"decoder_input_embeddings"` - `{batch_size, sequence_length, hidden_size}`
+
+      Embedded representation of `"decoder_input_ids"`, which can be
+      specified for more control over how `"decoder_input_ids"` are
+      embedded than the model's internal embedding lookup. If
+      `"decoder_input_embeddings"` are present, then `"decoder_input_ids"`
+      will be ignored.
+
+    * `"encoder_hidden_state"` - `{batch_size, sequence_length, hidden_size}`
+
+      Last hidden state output from the encoder. This hidden state is
+      used in cross-attention blocks in the decoder. If specified, the
+      model will skip the encoding process and use this value directly
+      for cross-attentions in the decoder.
+
+    * `"cross_attention_head_mask"` - `{decoder_num_blocks, decoder_num_attention_heads}`
+
+      Mask to nullify selected heads of the cross-attention blocks in
+      the decoder.
+
+    * `"cache"`
+
+      A container with cached layer results used to speed up sequential
+      decoding (autoregression). With cache, certain hidden states are
+      taken from the cache, rather than recomputed on every decoding
+      pass. The cache should be treated as opaque and initialized with
+      `Bumblebee.Text.Generation.init_cache/4`.
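+
+  As an illustrative sketch (the checkpoint name below is only an example,
+  any M2M100 checkpoint works the same way), these inputs are typically
+  produced by a tokenizer and passed to the model in a single forward pass:
+
+      {:ok, model_info} = Bumblebee.load_model({:hf, "facebook/m2m100_418M"})
+      {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "facebook/m2m100_418M"})
+
+      # Produces "input_ids" and "attention_mask" as described above
+      inputs = Bumblebee.apply_tokenizer(tokenizer, "Hello world")
+
+      # "decoder_input_ids" is not given here, so it is derived by shifting
+      # "input_ids" to the right once
+      outputs = Axon.predict(model_info.model, model_info.params, inputs)
+      outputs.logits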
+
+  ## Global layer options
+
+  #{Shared.global_layer_options_doc([:output_hidden_states, :output_attentions])}
+
+  ## Configuration
+
+  #{Shared.options_doc(options)}
   """
 
   defstruct [architecture: :base] ++ Shared.option_defaults(options)
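For anyone reviewing this against a real checkpoint, a minimal end-to-end sketch of the `:for_conditional_generation` architecture via the high-level generation serving (the `facebook/m2m100_418M` checkpoint is only an example, and an actual translation run would additionally force the target-language token through the generation config):

```elixir
{:ok, model_info} = Bumblebee.load_model({:hf, "facebook/m2m100_418M"})
{:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "facebook/m2m100_418M"})
{:ok, generation_config} = Bumblebee.load_generation_config({:hf, "facebook/m2m100_418M"})

# The serving builds the encoder/decoder inputs and manages the decoding
# cache documented in the moduledoc above.
serving = Bumblebee.Text.generation(model_info, tokenizer, generation_config)
Nx.Serving.run(serving, "The quick brown fox jumps over the lazy dog")
```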