From 049b00f61c9bb17bd2b20a3b77d04cc4c0f20d86 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Mon, 30 Sep 2024 18:51:01 +0200 Subject: [PATCH] Add Transformers v4.45 support (#2023) * transformers v4.45 support * fix transformers v4.45 compatibility * update opset * update model * Add generation config saving * fix codegen * bump default opset m2m100 * fix codegen * fix bettertransformers * add warnign deprecation bettertransformer * bettertransformers fixes * disable transformers 4.45 for onnx export * update model ID --- Makefile | 4 +- optimum/bettertransformer/models/attention.py | 84 +++++++++++++++++-- .../models/decoder_models.py | 35 +++++++- optimum/bettertransformer/transformation.py | 4 + optimum/exporters/onnx/convert.py | 18 ++++ optimum/exporters/onnx/model_configs.py | 11 +-- optimum/modeling_base.py | 3 + optimum/onnxruntime/modeling_decoder.py | 58 ++++++++----- optimum/onnxruntime/modeling_ort.py | 3 - optimum/onnxruntime/modeling_seq2seq.py | 64 +++++++------- optimum/onnxruntime/optimization.py | 12 ++- setup.py | 9 +- tests/bettertransformer/testing_utils.py | 4 +- tests/onnxruntime/utils_onnxruntime_tests.py | 6 +- 14 files changed, 223 insertions(+), 92 deletions(-) diff --git a/Makefile b/Makefile index e2c2126303..824ef3d0cf 100644 --- a/Makefile +++ b/Makefile @@ -23,11 +23,11 @@ REAL_CLONE_URL = $(if $(CLONE_URL),$(CLONE_URL),$(DEFAULT_CLONE_URL)) # Run code quality checks style_check: black --check . - ruff . + ruff check . style: black . - ruff . --fix + ruff check . --fix # Run tests for the library test: diff --git a/optimum/bettertransformer/models/attention.py b/optimum/bettertransformer/models/attention.py index 9dfa57844d..22b8faf1c2 100644 --- a/optimum/bettertransformer/models/attention.py +++ b/optimum/bettertransformer/models/attention.py @@ -92,6 +92,71 @@ def gpt2_wrapped_scaled_dot_product( return sdpa_result, None +# Adapted from transformers.models.gptj.modeling_gptj.GPTJAttention._attn +def gptj_wrapped_scaled_dot_product( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, +): + raise_on_head_mask(head_mask) + batch_size = query.shape[0] + + mask_value = torch.finfo(value.dtype).min + mask_value = torch.full([], mask_value, dtype=value.dtype) + + # in gpt-neo-x and gpt-j the query and keys are always in fp32 + # thus we need to cast them to the value dtype + if self.downcast_qk: + query = query.to(value.dtype) + key = key.to(value.dtype) + + if batch_size == 1 and attention_mask is not None and attention_mask[0, 0, -1, -1] < -1: + raise ValueError("BetterTransformer does not support padding='max_length' with a batch size of 1.") + + dropout_p = self.dropout_prob_attn if self.training else 0.0 + if batch_size == 1 or self.training: + if query.shape[2] > 1: + sdpa_result = torch.nn.functional.scaled_dot_product_attention( + query, key, value, attn_mask=None, dropout_p=dropout_p, is_causal=True + ) + else: + sdpa_result = torch.nn.functional.scaled_dot_product_attention( + query, key, value, attn_mask=None, dropout_p=dropout_p, is_causal=False + ) + else: + query_length, key_length = query.size(-2), key.size(-2) + + # causal_mask is always [True, ..., True] otherwise, so executing this + # is unnecessary + if query_length > 1: + if not check_if_transformers_greater("4.44.99"): + causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length].to(torch.bool) + + 
causal_mask = torch.where(causal_mask, 0, mask_value) + + # torch.Tensor.expand does no memory copy + causal_mask = causal_mask.expand(batch_size, -1, -1, -1) + if attention_mask is not None: + attention_mask = causal_mask + attention_mask + + else: + attention_mask = attention_mask[:, :, :, : key.shape[-2]] + + sdpa_result = torch.nn.functional.scaled_dot_product_attention( + query, key, value, attn_mask=attention_mask, dropout_p=dropout_p, is_causal=False + ) + + # in gpt-neo-x and gpt-j the query and keys are always in fp32 + # thus we need to cast them to the value dtype + if self.downcast_qk: + sdpa_result = sdpa_result.to(value.dtype) + + return sdpa_result, None + + # Adapted from transformers.models.bark.modeling_bark.BarkSelfAttention._attn def bark_wrapped_scaled_dot_product( self, @@ -195,7 +260,7 @@ def codegen_wrapped_scaled_dot_product( query, key, value, attn_mask=None, dropout_p=dropout_p, is_causal=True ) else: - # in this case, which is the later decoding steps, the `causal_mask`` in + # in this case, which is the later decoding steps, the `causal_mask` in # https://github.com/huggingface/transformers/blob/ae54e3c3b18bac0832ad62ea9b896dfd52a09850/src/transformers/models/gpt2/modeling_gpt2.py#L195 # is [True, ..., True] so actually not causal sdpa_result = torch.nn.functional.scaled_dot_product_attention( @@ -207,15 +272,20 @@ def codegen_wrapped_scaled_dot_product( # causal_mask is always [True, ..., True] otherwise, so executing this # is unnecessary if query_length > 1: - causal_mask = self.causal_mask[:, :, key_length - query_length : key_length, :key_length].to(torch.bool) + if not check_if_transformers_greater("4.44.99"): + causal_mask = self.causal_mask[:, :, key_length - query_length : key_length, :key_length].to( + torch.bool + ) - causal_mask = torch.where(causal_mask, 0, mask_value) + causal_mask = torch.where(causal_mask, 0, mask_value) - # torch.Tensor.expand does no memory copy - causal_mask = causal_mask.expand(batch_size, -1, -1, -1) + # torch.Tensor.expand does no memory copy + causal_mask = causal_mask.expand(batch_size, -1, -1, -1) - # we use torch.min to avoid having tensor(-inf) - attention_mask = torch.min(causal_mask, attention_mask) + # we use torch.min to avoid having tensor(-inf) + attention_mask = torch.min(causal_mask, attention_mask) + else: + attention_mask = attention_mask[:, :, :, : key.shape[-2]] sdpa_result = torch.nn.functional.scaled_dot_product_attention( query, key, value, attn_mask=attention_mask, dropout_p=dropout_p, is_causal=False diff --git a/optimum/bettertransformer/models/decoder_models.py b/optimum/bettertransformer/models/decoder_models.py index b64b7f5a1e..52d28d076d 100644 --- a/optimum/bettertransformer/models/decoder_models.py +++ b/optimum/bettertransformer/models/decoder_models.py @@ -44,6 +44,7 @@ codegen_wrapped_scaled_dot_product, gpt2_wrapped_scaled_dot_product, gpt_neo_wrapped_scaled_dot_product, + gptj_wrapped_scaled_dot_product, opt_forward, t5_forward, ) @@ -82,7 +83,7 @@ def forward(self, *args, **kwargs): class GPTJAttentionLayerBetterTransformer(BetterTransformerBaseLayer, GPTJAttention, nn.Module): - _attn = gpt2_wrapped_scaled_dot_product + _attn = gptj_wrapped_scaled_dot_product def __init__(self, layer: "nn.Module", config: "PretrainedConfig"): super().__init__(config) @@ -96,14 +97,22 @@ def __init__(self, layer: "nn.Module", config: "PretrainedConfig"): "out_proj", "attn_dropout", "resid_dropout", - "bias", "scale_attn", - "masked_bias", ] # Attribute only for transformers>=4.28 if hasattr(layer, 
"embed_positions"): submodules.append("embed_positions") + # Attribute only for transformers<4.45 + if hasattr(layer, "bias"): + submodules.append("bias") + if hasattr(layer, "masked_bias"): + submodules.append("masked_bias") + + # Attribute only for transformers>=4.45 + if hasattr(layer, "layer_idx"): + submodules.append("layer_idx") + for attr in submodules: setattr(self, attr, getattr(layer, attr)) @@ -127,6 +136,11 @@ def __init__(self, layer: "nn.Module", config: "PretrainedConfig"): self.module_mapping = None submodules = ["rotary_emb", "query_key_value", "dense", "bias", "masked_bias", "norm_factor"] + + # Attribute only for transformers>=4.45 + if hasattr(layer, "layer_idx"): + submodules.append("layer_idx") + for attr in submodules: setattr(self, attr, getattr(layer, attr)) @@ -155,6 +169,11 @@ def __init__(self, layer: "nn.Module", config: "PretrainedConfig"): self.module_mapping = None submodules = ["attn_dropout", "resid_dropout", "k_proj", "v_proj", "q_proj", "out_proj", "bias", "masked_bias"] + + # Attribute only for transformers>=4.45 + if hasattr(layer, "layer_id"): + submodules.append("layer_id") + for attr in submodules: setattr(self, attr, getattr(layer, attr)) @@ -238,12 +257,20 @@ def __init__(self, layer: "nn.Module", config: "PretrainedConfig"): super(BetterTransformerBaseLayer, self).__init__(config) self.module_mapping = None - submodules = ["attn_dropout", "resid_dropout", "qkv_proj", "out_proj", "causal_mask", "scale_attn"] + submodules = ["attn_dropout", "resid_dropout", "qkv_proj", "out_proj", "scale_attn"] # Attribute only for transformers>=4.28 if hasattr(layer, "embed_positions"): submodules.append("embed_positions") + # Attribute only for transformers<4.45 + if hasattr(layer, "causal_mask"): + submodules.append("causal_mask") + + # Attribute only for transformers>=4.45 + if hasattr(layer, "layer_idx"): + submodules.append("layer_idx") + for attr in submodules: setattr(self, attr, getattr(layer, attr)) diff --git a/optimum/bettertransformer/transformation.py b/optimum/bettertransformer/transformation.py index 2105e19987..a101757b6f 100644 --- a/optimum/bettertransformer/transformation.py +++ b/optimum/bettertransformer/transformation.py @@ -206,6 +206,10 @@ def transform( The converted model if the conversion has been successful. """ + logger.warning( + "The class `optimum.bettertransformers.transformation.BetterTransformer` is deprecated and will be removed in a future release." 
+ ) + hf_config = model.config if hf_config.model_type in ["falcon", "gpt_bigcode", "llama", "whisper"]: raise ValueError( diff --git a/optimum/exporters/onnx/convert.py b/optimum/exporters/onnx/convert.py index 63a9067b90..f2bf95f3e3 100644 --- a/optimum/exporters/onnx/convert.py +++ b/optimum/exporters/onnx/convert.py @@ -26,6 +26,7 @@ import numpy as np import onnx +import transformers from transformers.modeling_utils import get_parameter_dtype from transformers.utils import is_tf_available, is_torch_available @@ -34,6 +35,7 @@ DEFAULT_DUMMY_SHAPES, ONNX_WEIGHTS_NAME, TORCH_MINIMUM_VERSION, + check_if_transformers_greater, is_diffusers_available, is_torch_onnx_support_available, logging, @@ -999,6 +1001,10 @@ def onnx_export_from_model( >>> onnx_export_from_model(model, output="gpt2_onnx/") ``` """ + if check_if_transformers_greater("4.44.99"): + raise ImportError( + f"ONNX conversion disabled for now for transformers version greater than v4.45, found {transformers.__version__}" + ) TasksManager.standardize_model_attributes(model) @@ -1120,6 +1126,18 @@ def onnx_export_from_model( if isinstance(atol, dict): atol = atol[task.replace("-with-past", "")] + if check_if_transformers_greater("4.44.99"): + misplaced_generation_parameters = model.config._get_non_default_generation_parameters() + if model.can_generate() and len(misplaced_generation_parameters) > 0: + logger.warning( + "Moving the following attributes in the config to the generation config: " + f"{misplaced_generation_parameters}. You are seeing this warning because you've set " + "generation parameters in the model config, as opposed to in the generation config.", + ) + for param_name, param_value in misplaced_generation_parameters.items(): + setattr(model.generation_config, param_name, param_value) + setattr(model.config, param_name, None) + # Saving the model config and preprocessor as this is needed sometimes. model.config.save_pretrained(output) generation_config = getattr(model, "generation_config", None) diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index d4b15b2968..36963a986d 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -119,7 +119,7 @@ def inputs(self) -> Dict[str, Dict[int, str]]: class AlbertOnnxConfig(BertOnnxConfig): - DEFAULT_ONNX_OPSET = 11 + DEFAULT_ONNX_OPSET = 14 # now uses F.scaled_dot_product_attention by default for torch>=2.1.1. class ConvBertOnnxConfig(BertOnnxConfig): @@ -171,11 +171,11 @@ class MPNetOnnxConfig(DistilBertOnnxConfig): class RobertaOnnxConfig(DistilBertOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 14 # now uses F.scaled_dot_product_attention by default for torch>=2.1.1. class CamembertOnnxConfig(DistilBertOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 14 # now uses F.scaled_dot_product_attention by default for torch>=2.1.1. class FlaubertOnnxConfig(BertOnnxConfig): @@ -187,7 +187,7 @@ class IBertOnnxConfig(DistilBertOnnxConfig): class XLMRobertaOnnxConfig(DistilBertOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 14 # now uses F.scaled_dot_product_attention by default for torch>=2.1.1. 
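The opset bumps in optimum/exporters/onnx/model_configs.py follow from transformers dispatching to torch.nn.functional.scaled_dot_product_attention by default for torch>=2.1.1, and that operator only gains an ONNX symbolic at opset 14. A minimal standalone sketch of the constraint (illustration only, not part of the patch; the tiny module and tensor shapes are arbitrary):

    import torch

    class TinySDPA(torch.nn.Module):
        def forward(self, q, k, v):
            # the attention primitive the bumped configs now assume by default
            return torch.nn.functional.scaled_dot_product_attention(q, k, v)

    q = k = v = torch.randn(1, 4, 8, 16)  # (batch, heads, seq_len, head_dim)
    # exporting with opset_version < 14 fails on aten::scaled_dot_product_attention;
    # opset 14, the new DEFAULT_ONNX_OPSET, is the minimum that exports it
    torch.onnx.export(TinySDPA(), (q, k, v), "sdpa.onnx", opset_version=14)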
class DebertaOnnxConfig(BertOnnxConfig): @@ -257,7 +257,7 @@ class ImageGPTOnnxConfig(GPT2OnnxConfig): class GPTNeoOnnxConfig(TextDecoderWithPositionIdsOnnxConfig): - DEFAULT_ONNX_OPSET = 13 + DEFAULT_ONNX_OPSET = 14 NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args(num_attention_heads="num_heads") @@ -564,6 +564,7 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int class M2M100OnnxConfig(TextSeq2SeqOnnxConfig): + DEFAULT_ONNX_OPSET = 14 # now uses F.scaled_dot_product_attention by default for torch>=2.1.1. NORMALIZED_CONFIG_CLASS = NormalizedSeq2SeqConfig.with_args( encoder_num_layers="encoder_layers", decoder_num_layers="decoder_layers", diff --git a/optimum/modeling_base.py b/optimum/modeling_base.py index 3da2d9d0d2..29521b7c0c 100644 --- a/optimum/modeling_base.py +++ b/optimum/modeling_base.py @@ -371,6 +371,9 @@ def from_pretrained( export = from_transformers if len(model_id.split("@")) == 2: + logger.warning( + f"Specifying the `revision` as @{model_id.split('@')[1]} is deprecated and will be removed in v1.23, please use the `revision` argument instead." + ) if revision is not None: logger.warning( f"The argument `revision` was set to {revision} but will be ignored for {model_id.split('@')[1]}" diff --git a/optimum/onnxruntime/modeling_decoder.py b/optimum/onnxruntime/modeling_decoder.py index f6d4b7e20a..bda3ec98d9 100644 --- a/optimum/onnxruntime/modeling_decoder.py +++ b/optimum/onnxruntime/modeling_decoder.py @@ -14,7 +14,6 @@ """Classes handling causal-lm related architectures in ONNX Runtime.""" import logging -import warnings from pathlib import Path from tempfile import TemporaryDirectory from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union @@ -149,6 +148,19 @@ def __init__( generation_config = GenerationConfig.from_model_config(config) self.generation_config = generation_config + + if check_if_transformers_greater("4.44.99"): + misplaced_generation_parameters = self.config._get_non_default_generation_parameters() + if len(misplaced_generation_parameters) > 0: + logger.warning( + "Moving the following attributes in the config to the generation config: " + f"{misplaced_generation_parameters}. You are seeing this warning because you've set " + "generation parameters in the model config, as opposed to in the generation config.", + ) + for param_name, param_value in misplaced_generation_parameters.items(): + setattr(self.generation_config, param_name, param_value) + setattr(self.config, param_name, None) + self.onnx_paths = [self.model_path] self.use_merged = "use_cache_branch" in self.input_names self.model_type = self.config.model_type @@ -393,7 +405,6 @@ def _from_pretrained( cls, model_id: Union[str, Path], config: "PretrainedConfig", - use_auth_token: Optional[Union[bool, str]] = None, token: Optional[Union[bool, str]] = None, revision: Optional[str] = None, force_download: bool = False, @@ -410,15 +421,7 @@ def _from_pretrained( model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, **kwargs, ) -> "ORTModelForCausalLM": - if use_auth_token is not None: - warnings.warn( - "The `use_auth_token` argument is deprecated and will be removed soon. 
Please use the `token` argument instead.", - FutureWarning, - ) - if token is not None: - raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.") - token = use_auth_token - + generation_config = kwargs.pop("generation_config", None) model_path = Path(model_id) # We do not implement the logic for use_cache=False, use_merged=True @@ -586,6 +589,22 @@ def _from_pretrained( else: init_cls = ORTModelForCausalLM + if generation_config is None: + try: + generation_config = GenerationConfig.from_pretrained( + model_id, + cache_dir=cache_dir, + force_download=force_download, + local_files_only=local_files_only, + token=token, + revision=revision, + subfolder=subfolder, + ) + except OSError: + logger.info( + "Generation config file not found, using a generation config created from the model config." + ) + return init_cls( model=model, config=config, @@ -593,6 +612,7 @@ def _from_pretrained( model_save_dir=model_save_dir, preprocessors=preprocessors, use_cache=use_cache, + generation_config=generation_config, ) @classmethod @@ -600,7 +620,6 @@ def _from_transformers( cls, model_id: str, config: "PretrainedConfig", - use_auth_token: Optional[Union[bool, str]] = None, token: Optional[Union[bool, str]] = None, revision: str = "main", force_download: bool = True, @@ -616,15 +635,6 @@ def _from_transformers( use_io_binding: Optional[bool] = None, task: Optional[str] = None, ) -> "ORTModelForCausalLM": - if use_auth_token is not None: - warnings.warn( - "The `use_auth_token` argument is deprecated and will be removed soon. Please use the `token` argument instead.", - FutureWarning, - ) - if token is not None: - raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.") - token = use_auth_token - file_name = ONNX_WEIGHTS_NAME if use_merged: @@ -655,8 +665,6 @@ def _from_transformers( force_download=force_download, trust_remote_code=trust_remote_code, ) - - config.save_pretrained(save_dir_path) maybe_save_preprocessors(model_id, save_dir_path, src_subfolder=subfolder) return cls._from_pretrained( @@ -712,6 +720,10 @@ def _reorder_cache(past: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor) -> for layer_past in past ) + def _save_pretrained(self, save_directory: Union[str, Path]): + super()._save_pretrained(save_directory) + self.generation_config.save_pretrained(save_directory) + class ORTGPTBigCodeForCausalLM(ORTModelForCausalLM): # Adapted from transformers.models.gpt_bigcode.modeling_gpt_bigcode.GPTBigCodeForCausalLM.prepare_inputs_for_generation diff --git a/optimum/onnxruntime/modeling_ort.py b/optimum/onnxruntime/modeling_ort.py index 9166f7c2cb..17bd3e2a4e 100644 --- a/optimum/onnxruntime/modeling_ort.py +++ b/optimum/onnxruntime/modeling_ort.py @@ -663,8 +663,6 @@ def _export( force_download=force_download, trust_remote_code=trust_remote_code, ) - - config.save_pretrained(save_dir_path) maybe_save_preprocessors(model_id, save_dir_path, src_subfolder=subfolder) return cls._from_pretrained( @@ -1171,7 +1169,6 @@ def _export( library_name="transformers", ) - config.save_pretrained(save_dir_path) maybe_save_preprocessors(model_id, save_dir_path, src_subfolder=subfolder) return cls._from_pretrained( diff --git a/optimum/onnxruntime/modeling_seq2seq.py b/optimum/onnxruntime/modeling_seq2seq.py index 3cecadafe3..fda3ca82bb 100644 --- a/optimum/onnxruntime/modeling_seq2seq.py +++ b/optimum/onnxruntime/modeling_seq2seq.py @@ -18,7 +18,6 @@ import logging import shutil -import warnings from abc import ABC, abstractmethod from 
pathlib import Path from tempfile import TemporaryDirectory @@ -706,6 +705,18 @@ def show_deprecated_argument(arg_name): generation_config = GenerationConfig.from_model_config(config) self.generation_config = generation_config + if check_if_transformers_greater("4.44.99"): + misplaced_generation_parameters = self.config._get_non_default_generation_parameters() + if len(misplaced_generation_parameters) > 0: + logger.warning( + "Moving the following attributes in the config to the generation config: " + f"{misplaced_generation_parameters}. You are seeing this warning because you've set " + "generation parameters in the model config, as opposed to in the generation config.", + ) + for param_name, param_value in misplaced_generation_parameters.items(): + setattr(self.generation_config, param_name, param_value) + setattr(self.config, param_name, None) + @abstractmethod def _initialize_encoder(self, session: ort.InferenceSession) -> ORTEncoder: pass @@ -780,7 +791,6 @@ def _from_pretrained( cls, model_id: Union[str, Path], config: "PretrainedConfig", - use_auth_token: Optional[Union[bool, str]] = None, token: Optional[Union[bool, str]] = None, revision: Optional[str] = None, force_download: bool = False, @@ -799,15 +809,7 @@ def _from_pretrained( model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, **kwargs, ): - if use_auth_token is not None: - warnings.warn( - "The `use_auth_token` argument is deprecated and will be removed soon. Please use the `token` argument instead.", - FutureWarning, - ) - if token is not None: - raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.") - token = use_auth_token - + generation_config = kwargs.pop("generation_config", None) model_path = Path(model_id) # We do not implement the logic for use_cache=False, use_merged=True @@ -996,19 +998,21 @@ def _from_pretrained( if model_save_dir is None: model_save_dir = new_model_save_dir - generation_config = None - try: - generation_config = GenerationConfig.from_pretrained( - model_id, - cache_dir=cache_dir, - force_download=force_download, - local_files_only=local_files_only, - token=token, - revision=revision, - subfolder=subfolder, - ) - except OSError: - logger.info("Generation config file not found, using a generation config created from the model config.") + if generation_config is None: + try: + generation_config = GenerationConfig.from_pretrained( + model_id, + cache_dir=cache_dir, + force_download=force_download, + local_files_only=local_files_only, + token=token, + revision=revision, + subfolder=subfolder, + ) + except OSError: + logger.info( + "Generation config file not found, using a generation config created from the model config." + ) onnx_paths = [encoder_path] if use_merged is False: @@ -1035,7 +1039,6 @@ def _from_transformers( cls, model_id: str, config: "PretrainedConfig", - use_auth_token: Optional[Union[bool, str]] = None, token: Optional[Union[bool, str]] = None, revision: str = "main", force_download: bool = True, @@ -1051,15 +1054,6 @@ def _from_transformers( use_io_binding: Optional[bool] = None, task: Optional[str] = None, ) -> "ORTModelForConditionalGeneration": - if use_auth_token is not None: - warnings.warn( - "The `use_auth_token` argument is deprecated and will be removed soon. 
Please use the `token` argument instead.", - FutureWarning, - ) - if token is not None: - raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.") - token = use_auth_token - if use_cache is False and use_merged is True: raise ValueError( "The incompatible arguments use_cache=False, use_merged=True were passed to" @@ -1091,8 +1085,6 @@ def _from_transformers( force_download=force_download, trust_remote_code=trust_remote_code, ) - - config.save_pretrained(save_dir_path) maybe_save_preprocessors(model_id, save_dir_path, src_subfolder=subfolder) return cls._from_pretrained( diff --git a/optimum/onnxruntime/optimization.py b/optimum/onnxruntime/optimization.py index 9e62a3f324..fd6958bba7 100644 --- a/optimum/onnxruntime/optimization.py +++ b/optimum/onnxruntime/optimization.py @@ -20,6 +20,7 @@ import onnx from onnx import load_model +from transformers import GenerationConfig from transformers.models.auto.configuration_auto import AutoConfig from onnxruntime.transformers.onnx_model_bert import BertOnnxModel @@ -152,10 +153,6 @@ def optimize( save_dir = Path(save_dir) save_dir.mkdir(parents=True, exist_ok=True) ORTConfigManager.check_optimization_supported_model(self.model_type, optimization_config) - - self.config.save_pretrained(save_dir) - maybe_save_preprocessors(self.onnx_model_path[0].parent, save_dir) - model_type = ORTConfigManager.get_model_ort_type(self.config.model_type) optimization_options = optimization_config.create_fusion_options(model_type) @@ -236,6 +233,13 @@ def optimize( # Save the model configuration self.config.save_pretrained(save_dir) ort_config.save_pretrained(save_dir) + maybe_save_preprocessors(self.onnx_model_path[0].parent, save_dir) + + try: + generation_config = GenerationConfig.from_pretrained(self.onnx_model_path[0].parent) + generation_config.save_pretrained(save_dir) + except Exception: + pass logger.info( f"Optimized model saved at: {save_dir} (external data format: " diff --git a/setup.py b/setup.py index ac5db71a74..24c1ae1cd4 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ REQUIRED_PKGS = [ "coloredlogs", "sympy", - "transformers[sentencepiece]>=4.29,<4.45.0", + "transformers[sentencepiece]>=4.29,<4.46.0", "torch>=1.11", "packaging", "numpy<2.0", # transformers requires numpy<2.0 https://github.com/huggingface/transformers/pull/31569 @@ -54,6 +54,7 @@ "datasets>=1.2.1", "evaluate", "protobuf>=3.20.1", + "transformers<4.45.0", ], "onnxruntime-gpu": [ "onnx", @@ -62,9 +63,10 @@ "evaluate", "protobuf>=3.20.1", "accelerate", # ORTTrainer requires it. 
+ "transformers<4.45.0", ], - "exporters": ["onnx", "onnxruntime", "timm"], - "exporters-gpu": ["onnx", "onnxruntime-gpu", "timm"], + "exporters": ["onnx", "onnxruntime", "timm", "transformers<4.45.0"], + "exporters-gpu": ["onnx", "onnxruntime-gpu", "timm", "transformers<4.45.0"], "exporters-tf": [ "tensorflow>=2.4,<=2.12.1", "tf2onnx", @@ -75,6 +77,7 @@ "numpy<1.24.0", "datasets<=2.16", "transformers[sentencepiece]>=4.26,<4.38", + "transformers<4.45.0", ], "diffusers": ["diffusers"], "intel": "optimum-intel>=1.18.0", diff --git a/tests/bettertransformer/testing_utils.py b/tests/bettertransformer/testing_utils.py index e9e2edd979..098882180a 100644 --- a/tests/bettertransformer/testing_utils.py +++ b/tests/bettertransformer/testing_utils.py @@ -59,12 +59,12 @@ # "llama": "fxmarty/tiny-llama-fast-tokenizer", # "llama-gqa": "noamwies/llama-test-gqa-with-better-transformer", "m2m_100": "hf-internal-testing/tiny-random-nllb", - "marian": "fxmarty/tiny-marian", # the other tiny ones have a too small max_position_embeddings + "marian": "optimum-internal-testing/tiny-random-marian", # the other tiny ones have a too small max_position_embeddings "markuplm": "hf-internal-testing/tiny-random-MarkupLMModel", "mbart": "hf-internal-testing/tiny-random-mbart", "opt": "hf-internal-testing/tiny-random-OPTModel", "pegasus": "hf-internal-testing/tiny-random-PegasusModel", - "prophetnet": "hirotasoshu/tiny-random-prophetnet", # the other tiny ones have a too small max_position_embeddings + "prophetnet": "optimum-internal-testing/tiny-random-prophetnet", # the other tiny ones have a too small max_position_embeddings "rembert": "hf-internal-testing/tiny-random-RemBertModel", "roberta": "hf-internal-testing/tiny-random-RobertaModel", "rocbert": "hf-internal-testing/tiny-random-RoCBertModel", diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py index 0790f6329d..17f3b391b0 100644 --- a/tests/onnxruntime/utils_onnxruntime_tests.py +++ b/tests/onnxruntime/utils_onnxruntime_tests.py @@ -112,9 +112,9 @@ "layoutlm": "hf-internal-testing/tiny-random-LayoutLMModel", "layoutlmv3": "hf-internal-testing/tiny-random-LayoutLMv3Model", "longt5": "hf-internal-testing/tiny-random-LongT5Model", - "llama": "fxmarty/tiny-llama-fast-tokenizer", + "llama": "optimum-internal-testing/tiny-random-llama", "m2m_100": "hf-internal-testing/tiny-random-m2m_100", - "marian": "sshleifer/tiny-marian-en-de", # hf-internal-testing ones are broken + "marian": "echarlaix/tiny-random-marian", "mbart": "hf-internal-testing/tiny-random-mbart", "mistral": "echarlaix/tiny-random-mistral", "mobilebert": "hf-internal-testing/tiny-random-MobileBertModel", @@ -152,7 +152,7 @@ "unispeech_sat": "hf-internal-testing/tiny-random-UnispeechSatModel", "vision-encoder-decoder": "hf-internal-testing/tiny-random-VisionEncoderDecoderModel-vit-gpt2", "vit": "hf-internal-testing/tiny-random-vit", - "whisper": "openai/whisper-tiny.en", # hf-internal-testing ones are broken + "whisper": "optimum-internal-testing/tiny-random-whisper", "wav2vec2": "hf-internal-testing/tiny-random-Wav2Vec2Model", "wav2vec2-conformer": "hf-internal-testing/tiny-random-wav2vec2-conformer", "wavlm": "hf-internal-testing/tiny-random-WavlmModel",