From a9ffe076057aea68f25bc11d4288c5e7a3c28c3f Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Mon, 17 Jul 2023 18:39:15 +0200 Subject: [PATCH 01/20] Enable SD XL ONNX export and ONNX Runtime inference (#1168) * add stable diffusion XL export * fix style * fix test model name * fix style * remove clip with projection from test * change model name * fix style * remove need create pretrainedconfig * fix style * fix dummy input generation * add saving second tokenzier when exporting a SD XL model * fix style * add SD XL pipeline * fix style * add test * add watermarker * fix style * add watermark * add test * set default height width stable diffusion pipeline * enable img2img task * fix style * enable to only have the second tokenizer and text encoder * add test * fix cli export * adapt test for batch size > 1 --- optimum/exporters/onnx/__main__.py | 76 +-- optimum/exporters/onnx/convert.py | 2 + optimum/exporters/onnx/model_configs.py | 51 +- optimum/exporters/onnx/utils.py | 38 +- optimum/exporters/tasks.py | 31 +- optimum/onnxruntime/__init__.py | 8 + optimum/onnxruntime/modeling_diffusion.py | 173 ++++-- optimum/onnxruntime/modeling_ort.py | 1 - .../diffusers/pipeline_stable_diffusion.py | 64 ++- .../pipeline_stable_diffusion_img2img.py | 35 +- .../pipeline_stable_diffusion_inpaint.py | 10 +- .../diffusers/pipeline_stable_diffusion_xl.py | 499 +++++++++++++++++ .../pipeline_stable_diffusion_xl_img2img.py | 506 ++++++++++++++++++ optimum/pipelines/diffusers/pipeline_utils.py | 49 ++ optimum/pipelines/diffusers/watermark.py | 27 + optimum/utils/__init__.py | 1 + optimum/utils/constant.py | 1 + optimum/utils/dummy_diffusers_objects.py | 22 + optimum/utils/import_utils.py | 2 +- optimum/utils/input_generators.py | 15 +- setup.py | 1 + tests/exporters/exporters_utils.py | 3 +- .../exporters/onnx/test_exporters_onnx_cli.py | 28 +- tests/exporters/onnx/test_onnx_export.py | 74 +-- .../test_stable_diffusion_pipeline.py | 114 +++- tests/onnxruntime/utils_onnxruntime_tests.py | 1 + 26 files changed, 1640 insertions(+), 192 deletions(-) create mode 100644 optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py create mode 100644 optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py create mode 100644 optimum/pipelines/diffusers/watermark.py diff --git a/optimum/exporters/onnx/__main__.py b/optimum/exporters/onnx/__main__.py index 7200711b53..696cb86823 100644 --- a/optimum/exporters/onnx/__main__.py +++ b/optimum/exporters/onnx/__main__.py @@ -23,15 +23,7 @@ from transformers.utils import is_torch_available from ...commands.export.onnx import parse_args_onnx -from ...utils import ( - DEFAULT_DUMMY_SHAPES, - DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER, - DIFFUSION_MODEL_UNET_SUBFOLDER, - DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER, - DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER, - ONNX_WEIGHTS_NAME, - logging, -) +from ...utils import DEFAULT_DUMMY_SHAPES, ONNX_WEIGHTS_NAME, logging from ...utils.save_utils import maybe_save_preprocessors from ..error_utils import AtolError, OutputMatchError, ShapeError from ..tasks import TasksManager @@ -71,8 +63,9 @@ def _get_submodels_and_onnx_configs( custom_architecture: bool, fn_get_submodels: Optional[Callable] = None, ): + is_stable_diffusion = "stable-diffusion" in task if not custom_architecture: - if task == "stable-diffusion": + if is_stable_diffusion: onnx_config = None models_and_onnx_configs = get_stable_diffusion_models_for_export(model) else: @@ -104,7 +97,7 @@ def _get_submodels_and_onnx_configs( if 
fn_get_submodels is not None: submodels_for_export = fn_get_submodels(model) else: - if task == "stable-diffusion": + if is_stable_diffusion: submodels_for_export = _get_submodels_for_export_stable_diffusion(model) elif ( model.config.is_encoder_decoder @@ -312,10 +305,19 @@ def main_export( ) custom_architecture = False - if task != "stable-diffusion" and model.config.model_type.replace( - "-", "_" - ) not in TasksManager.get_supported_model_type_for_task(task, exporter="onnx"): - custom_architecture = True + is_stable_diffusion = "stable-diffusion" in task + model_type = "stable-diffusion" if is_stable_diffusion else model.config.model_type.replace("_", "-") + + if not is_stable_diffusion: + if model_type in TasksManager._UNSUPPORTED_CLI_MODEL_TYPE: + raise ValueError( + f"{model_type} is not supported yet. Only {TasksManager._SUPPORTED_CLI_MODEL_TYPE} are supported. " + f"If you want to support {model_type} please propose a PR or open up an issue." + ) + if model.config.model_type.replace("-", "_") not in TasksManager.get_supported_model_type_for_task( + task, exporter="onnx" + ): + custom_architecture = True # TODO: support onnx_config.py in the model repo if custom_architecture and custom_onnx_configs is None: @@ -330,9 +332,8 @@ def main_export( if ( not custom_architecture - and task != "stable-diffusion" - and task + "-with-past" - in TasksManager.get_supported_tasks_for_model_type(model.config.model_type.replace("_", "-"), "onnx") + and not is_stable_diffusion + and task + "-with-past" in TasksManager.get_supported_tasks_for_model_type(model_type, "onnx") ): if original_task == "auto": # Make -with-past the default if --task was not explicitely specified task = task + "-with-past" @@ -367,7 +368,7 @@ def main_export( fn_get_submodels=fn_get_submodels, ) - if task != "stable-diffusion": + if not is_stable_diffusion: needs_pad_token_id = ( isinstance(onnx_config, OnnxConfigWithPast) and getattr(model.config, "pad_token_id", None) is None @@ -391,7 +392,7 @@ def main_export( if opset < onnx_config.DEFAULT_ONNX_OPSET: raise ValueError( - f"Opset {opset} is not sufficient to export {model.config.model_type}. " + f"Opset {opset} is not sufficient to export {model_type}. " f"At least {onnx_config.DEFAULT_ONNX_OPSET} is required." ) if atol is None: @@ -415,28 +416,31 @@ def main_export( onnx_files_subpaths = None else: - onnx_files_subpaths = [ - DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER, - DIFFUSION_MODEL_UNET_SUBFOLDER, - DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER, - DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER, - ] - # save the subcomponent configuration - for model_name, name_dir in zip(models_and_onnx_configs, onnx_files_subpaths): + for model_name in models_and_onnx_configs: subcomponent = models_and_onnx_configs[model_name][0] if hasattr(subcomponent, "save_config"): - subcomponent.save_config(output / name_dir) + subcomponent.save_config(output / model_name) elif hasattr(subcomponent, "config") and hasattr(subcomponent.config, "save_pretrained"): - subcomponent.config.save_pretrained(output / name_dir) + subcomponent.config.save_pretrained(output / model_name) - onnx_files_subpaths = [os.path.join(path, ONNX_WEIGHTS_NAME) for path in onnx_files_subpaths] + onnx_files_subpaths = [os.path.join(name_dir, ONNX_WEIGHTS_NAME) for name_dir in models_and_onnx_configs] # Saving the additional components needed to perform inference. 
- model.tokenizer.save_pretrained(output.joinpath("tokenizer")) model.scheduler.save_pretrained(output.joinpath("scheduler")) - if model.feature_extractor is not None: - model.feature_extractor.save_pretrained(output.joinpath("feature_extractor")) + + feature_extractor = getattr(model, "feature_extractor", None) + if feature_extractor is not None: + feature_extractor.save_pretrained(output.joinpath("feature_extractor")) + + tokenizer = getattr(model, "tokenizer", None) + if tokenizer is not None: + tokenizer.save_pretrained(output.joinpath("tokenizer")) + + tokenizer_2 = getattr(model, "tokenizer_2", None) + if tokenizer_2 is not None: + tokenizer_2.save_pretrained(output.joinpath("tokenizer_2")) + model.save_config(output) _, onnx_outputs = export_models( @@ -464,7 +468,7 @@ def main_export( # Optionally post process the obtained ONNX file(s), for example to merge the decoder / decoder with past if any # TODO: treating stable diffusion separately is quite ugly - if not no_post_process and task != "stable-diffusion": + if not no_post_process and not is_stable_diffusion: try: logger.info("Post-processing the exported models...") models_and_onnx_configs, onnx_files_subpaths = onnx_config.post_process_exported_models( @@ -475,7 +479,7 @@ def main_export( f"The post-processing of the ONNX export failed. The export can still be performed by passing the option --no-post-process. Detailed error: {e}" ) - if task == "stable-diffusion": + if is_stable_diffusion: use_subprocess = ( False # TODO: fix Can't pickle local object 'get_stable_diffusion_models_for_export..' ) diff --git a/optimum/exporters/onnx/convert.py b/optimum/exporters/onnx/convert.py index 907749227f..cad2fdcb0f 100644 --- a/optimum/exporters/onnx/convert.py +++ b/optimum/exporters/onnx/convert.py @@ -369,6 +369,8 @@ def _run_validation( if isinstance(value, (list, tuple)): value = config.flatten_output_collection_property(name, value) onnx_inputs.update({tensor_name: pt_tensor.cpu().numpy() for tensor_name, pt_tensor in value.items()}) + elif isinstance(value, dict): + onnx_inputs.update({tensor_name: pt_tensor.cpu().numpy() for tensor_name, pt_tensor in value.items()}) else: onnx_inputs[name] = value.cpu().numpy() diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index 20304e6d6c..e2a948b35d 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -658,7 +658,7 @@ def outputs(self) -> Dict[str, Dict[int, str]]: } -class CLIPTextOnnxConfig(TextEncoderOnnxConfig): +class CLIPTextWithProjectionOnnxConfig(TextEncoderOnnxConfig): ATOL_FOR_VALIDATION = 1e-3 # The ONNX export of this architecture needs the Trilu operator support, available since opset 14 DEFAULT_ONNX_OPSET = 14 @@ -666,6 +666,7 @@ class CLIPTextOnnxConfig(TextEncoderOnnxConfig): NORMALIZED_CONFIG_CLASS = NormalizedConfig.with_args( vocab_size="vocab_size", sequence_length="max_position_embeddings", + num_layers="num_hidden_layers", allow_new=True, ) @@ -677,13 +678,33 @@ def inputs(self) -> Dict[str, Dict[int, str]]: @property def outputs(self) -> Dict[str, Dict[int, str]]: - return { + common_outputs = { + "text_embeds": {0: "batch_size", 1: "sequence_length"}, + "last_hidden_state": {0: "batch_size", 1: "sequence_length"}, + } + if self._normalized_config.output_hidden_states: + for i in range(self._normalized_config.num_layers + 1): + common_outputs[f"hidden_states.{i}"] = {0: "batch_size", 1: "sequence_length"} + + return common_outputs + + +class 
CLIPTextOnnxConfig(CLIPTextWithProjectionOnnxConfig): + @property + def outputs(self) -> Dict[str, Dict[int, str]]: + common_outputs = { "last_hidden_state": {0: "batch_size", 1: "sequence_length"}, "pooler_output": {0: "batch_size"}, } + if self._normalized_config.output_hidden_states: + for i in range(self._normalized_config.num_layers + 1): + common_outputs[f"hidden_states.{i}"] = {0: "batch_size", 1: "sequence_length"} + + return common_outputs def generate_dummy_inputs(self, framework: str = "pt", **kwargs): dummy_inputs = super().generate_dummy_inputs(framework=framework, **kwargs) + if framework == "pt": import torch @@ -713,12 +734,19 @@ class UNetOnnxConfig(VisionOnnxConfig): @property def inputs(self) -> Dict[str, Dict[int, str]]: - return { + common_inputs = { "sample": {0: "batch_size", 1: "num_channels", 2: "height", 3: "width"}, "timestep": {0: "steps"}, "encoder_hidden_states": {0: "batch_size", 1: "sequence_length"}, } + # TODO : add text_image, image and image_embeds + if getattr(self._normalized_config, "addition_embed_type", None) == "text_time": + common_inputs["text_embeds"] = {0: "batch_size"} + common_inputs["time_ids"] = {0: "batch_size"} + + return common_inputs + @property def outputs(self) -> Dict[str, Dict[int, str]]: return { @@ -734,8 +762,25 @@ def torch_to_onnx_output_map(self) -> Dict[str, str]: def generate_dummy_inputs(self, framework: str = "pt", **kwargs): dummy_inputs = super().generate_dummy_inputs(framework=framework, **kwargs) dummy_inputs["encoder_hidden_states"] = dummy_inputs["encoder_hidden_states"][0] + + if getattr(self._normalized_config, "addition_embed_type", None) == "text_time": + dummy_inputs["added_cond_kwargs"] = { + "text_embeds": dummy_inputs.pop("text_embeds"), + "time_ids": dummy_inputs.pop("time_ids"), + } + return dummy_inputs + def ordered_inputs(self, model) -> Dict[str, Dict[int, str]]: + inputs = super().ordered_inputs(model=model) + # to fix mismatch between model forward signature and expected inputs + # a dictionnary of additional embeddings `added_cond_kwargs` is expected depending on config.addition_embed_type + if getattr(self._normalized_config, "addition_embed_type", None) == "text_time": + inputs["text_embeds"] = self.inputs["text_embeds"] + inputs["time_ids"] = self.inputs["time_ids"] + + return inputs + class VaeEncoderOnnxConfig(VisionOnnxConfig): ATOL_FOR_VALIDATION = 1e-2 diff --git a/optimum/exporters/onnx/utils.py b/optimum/exporters/onnx/utils.py index 150b99db4f..c1bee9a4da 100644 --- a/optimum/exporters/onnx/utils.py +++ b/optimum/exporters/onnx/utils.py @@ -100,14 +100,24 @@ def _get_submodels_for_export_stable_diffusion( """ Returns the components of a Stable Diffusion model. 
""" + from diffusers import StableDiffusionXLPipeline + models_for_export = {} + if isinstance(pipeline, StableDiffusionXLPipeline): + projection_dim = pipeline.text_encoder_2.config.projection_dim + else: + projection_dim = pipeline.text_encoder.config.projection_dim # Text encoder - models_for_export["text_encoder"] = pipeline.text_encoder + if pipeline.text_encoder is not None: + if isinstance(pipeline, StableDiffusionXLPipeline): + pipeline.text_encoder.config.output_hidden_states = True + models_for_export["text_encoder"] = pipeline.text_encoder # U-NET # PyTorch does not support the ONNX export of torch.nn.functional.scaled_dot_product_attention pipeline.unet.set_attn_processor(AttnProcessor()) + pipeline.unet.config.text_encoder_projection_dim = projection_dim models_for_export["unet"] = pipeline.unet # VAE Encoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L565 @@ -124,6 +134,11 @@ def _get_submodels_for_export_stable_diffusion( vae_decoder.forward = lambda latent_sample: vae_decoder.decode(z=latent_sample) models_for_export["vae_decoder"] = vae_decoder + text_encoder_2 = getattr(pipeline, "text_encoder_2", None) + if text_encoder_2 is not None: + text_encoder_2.config.output_hidden_states = True + models_for_export["text_encoder_2"] = text_encoder_2 + return models_for_export @@ -249,11 +264,12 @@ def get_stable_diffusion_models_for_export( models_for_export = _get_submodels_for_export_stable_diffusion(pipeline) # Text encoder - text_encoder_config_constructor = TasksManager.get_exporter_config_constructor( - model=pipeline.text_encoder, exporter="onnx", task="feature-extraction" - ) - text_encoder_onnx_config = text_encoder_config_constructor(pipeline.text_encoder.config) - models_for_export["text_encoder"] = (models_for_export["text_encoder"], text_encoder_onnx_config) + if "text_encoder" in models_for_export: + text_encoder_config_constructor = TasksManager.get_exporter_config_constructor( + model=pipeline.text_encoder, exporter="onnx", task="feature-extraction" + ) + text_encoder_onnx_config = text_encoder_config_constructor(pipeline.text_encoder.config) + models_for_export["text_encoder"] = (models_for_export["text_encoder"], text_encoder_onnx_config) # U-NET onnx_config_constructor = TasksManager.get_exporter_config_constructor( @@ -278,6 +294,16 @@ def get_stable_diffusion_models_for_export( vae_onnx_config = vae_config_constructor(vae_decoder.config) models_for_export["vae_decoder"] = (vae_decoder, vae_onnx_config) + if "text_encoder_2" in models_for_export: + onnx_config_constructor = TasksManager.get_exporter_config_constructor( + model=pipeline.text_encoder_2, + exporter="onnx", + task="feature-extraction", + model_type="clip-text-with-projection", + ) + onnx_config = onnx_config_constructor(pipeline.text_encoder_2.config) + models_for_export["text_encoder_2"] = (models_for_export["text_encoder_2"], onnx_config) + return models_for_export diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index b676ddb9b1..2f3c432968 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -171,6 +171,7 @@ class TasksManager: "audio-xvector": "AutoModelForAudioXVector", "image-to-text": "AutoModelForVision2Seq", "stable-diffusion": "StableDiffusionPipeline", + "stable-diffusion-xl": "StableDiffusionXLPipeline", "zero-shot-image-classification": "AutoModelForZeroShotImageClassification", "zero-shot-object-detection": "AutoModelForZeroShotObjectDetection", } @@ -267,6 +268,7 @@ class TasksManager: "image-to-text": 
"transformers", "sentence-similarity": "transformers", "stable-diffusion": "diffusers", + "stable-diffusion-xl": "diffusers", "summarization": "transformers", "visual-question-answering": "transformers", "zero-shot-classification": "transformers", @@ -390,6 +392,10 @@ class TasksManager: "feature-extraction", onnx="CLIPTextOnnxConfig", ), + "clip-text-with-projection": supported_tasks_mapping( + "feature-extraction", + onnx="CLIPTextWithProjectionOnnxConfig", + ), "codegen": supported_tasks_mapping( "feature-extraction", "feature-extraction-with-past", @@ -931,7 +937,14 @@ class TasksManager: onnx="YolosOnnxConfig", ), } - _UNSUPPORTED_CLI_MODEL_TYPE = {"unet", "vae-encoder", "vae-decoder", "clip-text-model", "trocr"} + _UNSUPPORTED_CLI_MODEL_TYPE = { + "unet", + "vae-encoder", + "vae-decoder", + "clip-text-model", + "clip-text-with-projection", + "trocr", + } _SUPPORTED_CLI_MODEL_TYPE = set(_SUPPORTED_MODEL_TYPE.keys()) - _UNSUPPORTED_CLI_MODEL_TYPE @classmethod @@ -1006,7 +1019,7 @@ def get_supported_tasks_for_model_type( if model_type not in TasksManager._SUPPORTED_MODEL_TYPE: raise KeyError( f"{model_type_and_model_name} is not supported yet. " - f"Only {TasksManager._SUPPORTED_CLI_MODEL_TYPE} are supported. " + f"Only {TasksManager._SUPPORTED_MODEL_TYPE} are supported. " f"If you want to support {model_type} please propose a PR or open up an issue." ) elif exporter not in TasksManager._SUPPORTED_MODEL_TYPE[model_type]: @@ -1271,7 +1284,7 @@ def _infer_task_from_model_or_model_class( ( target_name.startswith("Auto"), target_name.startswith("TFAuto"), - target_name == "StableDiffusionPipeline", + "StableDiffusion" in target_name, ) ): if target_name == auto_cls_name: @@ -1314,8 +1327,10 @@ def _infer_task_from_model_name_or_path( model_info = huggingface_hub.model_info(model_name_or_path, revision=revision) if model_info.library_name == "diffusers": # TODO : getattr(model_info, "model_index") defining auto_model_class_name currently set to None - if "stable-diffusion" in model_info.tags: - inferred_task_name = "stable-diffusion" + for task in ("stable-diffusion-xl", "stable-diffusion"): + if task in model_info.tags: + inferred_task_name = task + break else: pipeline_tag = getattr(model_info, "pipeline_tag", None) # conversational is not a supported task per se, just an alias that may map to @@ -1476,7 +1491,11 @@ def get_model_from_task( elif device is None: device = torch.device("cpu") - if version.parse(torch.__version__) >= version.parse("2.0"): + # TODO : fix EulerDiscreteScheduler loading to enable for SD models + if ( + version.parse(torch.__version__) >= version.parse("2.0") + and TasksManager._TASKS_TO_LIBRARY[task.replace("-with-past", "")] != "diffusers" + ): with device: # Initialize directly in the requested device, to save allocation time. Especially useful for large # models to initialize on cuda device. 
diff --git a/optimum/onnxruntime/__init__.py b/optimum/onnxruntime/__init__.py index e5904185c2..62e32cfe71 100644 --- a/optimum/onnxruntime/__init__.py +++ b/optimum/onnxruntime/__init__.py @@ -71,12 +71,16 @@ "ORTStableDiffusionPipeline", "ORTStableDiffusionImg2ImgPipeline", "ORTStableDiffusionInpaintPipeline", + "ORTStableDiffusionXLPipeline", + "ORTStableDiffusionXLImg2ImgPipeline", ] else: _import_structure["modeling_diffusion"] = [ "ORTStableDiffusionPipeline", "ORTStableDiffusionImg2ImgPipeline", "ORTStableDiffusionInpaintPipeline", + "ORTStableDiffusionXLPipeline", + "ORTStableDiffusionXLImg2ImgPipeline", ] @@ -124,12 +128,16 @@ ORTStableDiffusionImg2ImgPipeline, ORTStableDiffusionInpaintPipeline, ORTStableDiffusionPipeline, + ORTStableDiffusionXLImg2ImgPipeline, + ORTStableDiffusionXLPipeline, ) else: from .modeling_diffusion import ( ORTStableDiffusionImg2ImgPipeline, ORTStableDiffusionInpaintPipeline, ORTStableDiffusionPipeline, + ORTStableDiffusionXLImg2ImgPipeline, + ORTStableDiffusionXLPipeline, ) else: import sys diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py index 51e0a85a3f..3541ad9480 100644 --- a/optimum/onnxruntime/modeling_diffusion.py +++ b/optimum/onnxruntime/modeling_diffusion.py @@ -28,6 +28,7 @@ LMSDiscreteScheduler, PNDMScheduler, StableDiffusionPipeline, + StableDiffusionXLPipeline, ) from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME from diffusers.utils import CONFIG_NAME @@ -41,7 +42,10 @@ from ..pipelines.diffusers.pipeline_stable_diffusion import StableDiffusionPipelineMixin from ..pipelines.diffusers.pipeline_stable_diffusion_img2img import StableDiffusionImg2ImgPipelineMixin from ..pipelines.diffusers.pipeline_stable_diffusion_inpaint import StableDiffusionInpaintPipelineMixin +from ..pipelines.diffusers.pipeline_stable_diffusion_xl import StableDiffusionXLPipelineMixin +from ..pipelines.diffusers.pipeline_stable_diffusion_xl_img2img import StableDiffusionXLImg2ImgPipelineMixin from ..utils import ( + DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER, DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER, DIFFUSION_MODEL_UNET_SUBFOLDER, DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER, @@ -77,6 +81,8 @@ def __init__( scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler], feature_extractor: Optional[CLIPFeatureExtractor] = None, vae_encoder_session: Optional[ort.InferenceSession] = None, + text_encoder_2_session: Optional[ort.InferenceSession] = None, + tokenizer_2: Optional[CLIPTokenizer] = None, use_io_binding: Optional[bool] = None, model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, ): @@ -114,11 +120,16 @@ def __init__( self._internal_dict = config self.vae_decoder = ORTModelVaeDecoder(vae_decoder_session, self) self.vae_decoder_model_path = Path(vae_decoder_session._model_path) - self.text_encoder = ORTModelTextEncoder(text_encoder_session, self) - self.text_encoder_model_path = Path(text_encoder_session._model_path) self.unet = ORTModelUnet(unet_session, self) self.unet_model_path = Path(unet_session._model_path) + if text_encoder_session is not None: + self.text_encoder_model_path = Path(text_encoder_session._model_path) + self.text_encoder = ORTModelTextEncoder(text_encoder_session, self) + else: + self.text_encoder_model_path = None + self.text_encoder = None + if vae_encoder_session is not None: self.vae_encoder_model_path = Path(vae_encoder_session._model_path) self.vae_encoder = ORTModelVaeEncoder(vae_encoder_session, self) @@ -126,7 +137,15 @@ def __init__( 
self.vae_encoder_model_path = None self.vae_encoder = None + if text_encoder_2_session is not None: + self.text_encoder_2_model_path = Path(text_encoder_2_session._model_path) + self.text_encoder_2 = ORTModelTextEncoder(text_encoder_2_session, self) + else: + self.text_encoder_2_model_path = None + self.text_encoder_2 = None + self.tokenizer = tokenizer + self.tokenizer_2 = tokenizer_2 self.scheduler = scheduler self.feature_extractor = feature_extractor self.safety_checker = None @@ -136,6 +155,7 @@ def __init__( DIFFUSION_MODEL_UNET_SUBFOLDER: self.unet, DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER: self.vae_decoder, DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER: self.vae_encoder, + DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER: self.text_encoder_2, } # Modify config to keep the resulting model compatible with diffusers pipelines @@ -156,6 +176,7 @@ def load_model( text_encoder_path: Union[str, Path], unet_path: Union[str, Path], vae_encoder_path: Optional[Union[str, Path]] = None, + text_encoder_2_path: Optional[Union[str, Path]] = None, provider: str = "CPUExecutionProvider", session_options: Optional[ort.SessionOptions] = None, provider_options: Optional[Dict] = None, @@ -173,6 +194,8 @@ def load_model( The path to the U-NET ONNX model. vae_encoder_path (`Union[str, Path]`, defaults to `None`): The path to the VAE encoder ONNX model. + text_encoder_2_path (`Union[str, Path]`, defaults to `None`): + The path to the second text decoder ONNX model. provider (`str`, defaults to `"CPUExecutionProvider"`): ONNX Runtime provider to use for loading the model. See https://onnxruntime.ai/docs/execution-providers/ for possible providers. @@ -182,16 +205,22 @@ def load_model( Provider option dictionary corresponding to the provider used. See available options for each provider: https://onnxruntime.ai/docs/api/c/group___global.html . Defaults to `None`. 
""" - vae_decoder_session = ORTModel.load_model(vae_decoder_path, provider, session_options, provider_options) - text_encoder_session = ORTModel.load_model(text_encoder_path, provider, session_options, provider_options) - unet_session = ORTModel.load_model(unet_path, provider, session_options, provider_options) + vae_decoder = ORTModel.load_model(vae_decoder_path, provider, session_options, provider_options) + unet = ORTModel.load_model(unet_path, provider, session_options, provider_options) - if vae_encoder_path is not None: - vae_encoder_session = ORTModel.load_model(vae_encoder_path, provider, session_options, provider_options) - else: - vae_encoder_session = None + sessions = { + "vae_encoder": vae_encoder_path, + "text_encoder": text_encoder_path, + "text_encoder_2": text_encoder_2_path, + } + + for key, value in sessions.items(): + if value is not None and value.is_file(): + sessions[key] = ORTModel.load_model(value, provider, session_options, provider_options) + else: + sessions[key] = None - return vae_decoder_session, text_encoder_session, unet_session, vae_encoder_session + return vae_decoder, sessions["text_encoder"], unet, sessions["vae_encoder"], sessions["text_encoder_2"] def _save_pretrained(self, save_directory: Union[str, Path]): save_directory = Path(save_directory) @@ -201,10 +230,13 @@ def _save_pretrained(self, save_directory: Union[str, Path]): self.unet_model_path: save_directory / DIFFUSION_MODEL_UNET_SUBFOLDER / ONNX_WEIGHTS_NAME, } - if self.vae_encoder_model_path is not None: - src_to_dst_path[self.vae_encoder_model_path] = ( - save_directory / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / ONNX_WEIGHTS_NAME - ) + sub_models_to_save = { + self.vae_encoder_model_path: DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER, + self.text_encoder_2_model_path: DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER, + } + for path, subfolder in sub_models_to_save.items(): + if path is not None: + src_to_dst_path[path] = save_directory / subfolder / ONNX_WEIGHTS_NAME # TODO: Modify _get_external_data_paths to give dictionnary src_paths = list(src_to_dst_path.keys()) @@ -219,10 +251,14 @@ def _save_pretrained(self, save_directory: Union[str, Path]): if config_path.is_file(): shutil.copyfile(config_path, dst_path.parent / self.sub_component_config_name) - self.tokenizer.save_pretrained(save_directory / "tokenizer") self.scheduler.save_pretrained(save_directory / "scheduler") + if self.feature_extractor is not None: self.feature_extractor.save_pretrained(save_directory / "feature_extractor") + if self.tokenizer is not None: + self.tokenizer.save_pretrained(save_directory / "tokenizer") + if self.tokenizer_2 is not None: + self.tokenizer_2.save_pretrained(save_directory / "tokenizer_2") @classmethod def _from_pretrained( @@ -236,6 +272,7 @@ def _from_pretrained( text_encoder_file_name: str = ONNX_WEIGHTS_NAME, unet_file_name: str = ONNX_WEIGHTS_NAME, vae_encoder_file_name: str = ONNX_WEIGHTS_NAME, + text_encoder_2_file_name: str = ONNX_WEIGHTS_NAME, local_files_only: bool = False, provider: str = "CPUExecutionProvider", session_options: Optional[ort.SessionOptions] = None, @@ -248,12 +285,10 @@ def _from_pretrained( raise ValueError("The provider `'TensorrtExecutionProvider'` is not supported") model_id = str(model_id) - sub_models_to_load, _, _ = cls.extract_init_dict(config) - sub_models_names = set(sub_models_to_load.keys()).intersection({"feature_extractor", "tokenizer", "scheduler"}) - sub_models = {} + patterns = set(config.keys()) + sub_models_to_load = patterns.intersection({"feature_extractor", 
"tokenizer", "tokenizer_2", "scheduler"}) if not os.path.isdir(model_id): - patterns = set(config.keys()) patterns.update({"vae_encoder", "vae_decoder"}) allow_patterns = {os.path.join(k, "*") for k in patterns if not k.startswith("_")} allow_patterns.update( @@ -262,6 +297,7 @@ def _from_pretrained( text_encoder_file_name, unet_file_name, vae_encoder_file_name, + text_encoder_2_file_name, SCHEDULER_CONFIG_NAME, CONFIG_NAME, cls.config_name, @@ -279,8 +315,9 @@ def _from_pretrained( ) new_model_save_dir = Path(model_id) - for name in sub_models_names: - library_name, library_classes = sub_models_to_load[name] + sub_models = {} + for name in sub_models_to_load: + library_name, library_classes = config[name] if library_classes is not None: library = importlib.import_module(library_name) class_obj = getattr(library, library_classes) @@ -291,18 +328,14 @@ def _from_pretrained( else: sub_models[name] = load_method(new_model_save_dir) - vae_encoder_path = new_model_save_dir / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / vae_encoder_file_name - - if not vae_encoder_path.is_file(): - logger.warning( - f"VAE encoder not found in {model_id} and will not be loaded for inference. This component is needed for some tasks." - ) - - inference_sessions = cls.load_model( + vae_decoder, text_encoder, unet, vae_encoder, text_encoder_2 = cls.load_model( vae_decoder_path=new_model_save_dir / DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER / vae_decoder_file_name, text_encoder_path=new_model_save_dir / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER / text_encoder_file_name, unet_path=new_model_save_dir / DIFFUSION_MODEL_UNET_SUBFOLDER / unet_file_name, - vae_encoder_path=vae_encoder_path if vae_encoder_path.is_file() else None, + vae_encoder_path=new_model_save_dir / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / vae_encoder_file_name, + text_encoder_2_path=new_model_save_dir + / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER + / text_encoder_2_file_name, provider=provider, session_options=session_options, provider_options=provider_options, @@ -317,12 +350,16 @@ def _from_pretrained( ) return cls( - *inference_sessions[:-1], + vae_decoder_session=vae_decoder, + text_encoder_session=text_encoder, + unet_session=unet, config=config, - tokenizer=sub_models["tokenizer"], - scheduler=sub_models["scheduler"], - feature_extractor=sub_models.pop("feature_extractor", None), - vae_encoder_session=inference_sessions[-1], + tokenizer=sub_models.get("tokenizer", None), + scheduler=sub_models.get("scheduler"), + feature_extractor=sub_models.get("feature_extractor", None), + tokenizer_2=sub_models.get("tokenizer_2", None), + vae_encoder_session=vae_encoder, + text_encoder_2_session=text_encoder_2, use_io_binding=use_io_binding, model_save_dir=model_save_dir, ) @@ -426,6 +463,7 @@ def __init__(self, session: ort.InferenceSession, parent_model: ORTModel): self.output_names = {output_key.name: idx for idx, output_key in enumerate(self.session.get_outputs())} config_path = Path(session._model_path).parent / self.CONFIG_NAME self.config = self.parent_model._dict_from_json_file(config_path) if config_path.is_file() else {} + self.input_dtype = {inputs.name: _ORT_TO_NP_TYPE[inputs.type] for inputs in self.session.get_inputs()} @property def device(self): @@ -451,14 +489,26 @@ def forward(self, input_ids: np.ndarray): class ORTModelUnet(_ORTDiffusionModelPart): def __init__(self, session: ort.InferenceSession, parent_model: ORTModel): super().__init__(session, parent_model) - self.input_dtype = {inputs.name: _ORT_TO_NP_TYPE[inputs.type] for inputs in 
self.session.get_inputs()} - def forward(self, sample: np.ndarray, timestep: np.ndarray, encoder_hidden_states: np.ndarray): + def forward( + self, + sample: np.ndarray, + timestep: np.ndarray, + encoder_hidden_states: np.ndarray, + text_embeds: Optional[np.ndarray] = None, + time_ids: Optional[np.ndarray] = None, + ): onnx_inputs = { "sample": sample, "timestep": timestep, "encoder_hidden_states": encoder_hidden_states, } + + if text_embeds is not None: + onnx_inputs["text_embeds"] = text_embeds + if time_ids is not None: + onnx_inputs["time_ids"] = time_ids + outputs = self.session.run(None, onnx_inputs) return outputs @@ -494,3 +544,52 @@ def __call__(self, *args, **kwargs): class ORTStableDiffusionInpaintPipeline(ORTStableDiffusionPipelineBase, StableDiffusionInpaintPipelineMixin): def __call__(self, *args, **kwargs): return StableDiffusionInpaintPipelineMixin.__call__(self, *args, **kwargs) + + +class ORTStableDiffusionXLPipelineBase(ORTStableDiffusionPipelineBase): + auto_model_class = StableDiffusionXLPipeline + + def __init__( + self, + vae_decoder_session: ort.InferenceSession, + text_encoder_session: ort.InferenceSession, + unet_session: ort.InferenceSession, + config: Dict[str, Any], + tokenizer: CLIPTokenizer, + scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler], + feature_extractor: Optional[CLIPFeatureExtractor] = None, + vae_encoder_session: Optional[ort.InferenceSession] = None, + text_encoder_2_session: Optional[ort.InferenceSession] = None, + tokenizer_2: Optional[CLIPTokenizer] = None, + use_io_binding: Optional[bool] = None, + model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, + ): + super().__init__( + vae_decoder_session=vae_decoder_session, + text_encoder_session=text_encoder_session, + unet_session=unet_session, + config=config, + tokenizer=tokenizer, + scheduler=scheduler, + feature_extractor=feature_extractor, + vae_encoder_session=vae_encoder_session, + text_encoder_2_session=text_encoder_2_session, + tokenizer_2=tokenizer_2, + use_io_binding=use_io_binding, + model_save_dir=model_save_dir, + ) + + # additional invisible-watermark dependency for SD XL + from ..pipelines.diffusers.watermark import StableDiffusionXLWatermarker + + self.watermark = StableDiffusionXLWatermarker() + + +class ORTStableDiffusionXLPipeline(ORTStableDiffusionXLPipelineBase, StableDiffusionXLPipelineMixin): + def __call__(self, *args, **kwargs): + return StableDiffusionXLPipelineMixin.__call__(self, *args, **kwargs) + + +class ORTStableDiffusionXLImg2ImgPipeline(ORTStableDiffusionXLPipelineBase, StableDiffusionXLImg2ImgPipelineMixin): + def __call__(self, *args, **kwargs): + return StableDiffusionXLImg2ImgPipelineMixin.__call__(self, *args, **kwargs) diff --git a/optimum/onnxruntime/modeling_ort.py b/optimum/onnxruntime/modeling_ort.py index 54c3143cc1..1784766c6a 100644 --- a/optimum/onnxruntime/modeling_ort.py +++ b/optimum/onnxruntime/modeling_ort.py @@ -170,7 +170,6 @@ class ORTModel(OptimizedModel): @classproperty def export_feature(cls): logger.warning(f"{cls.__name__}.export_feature is deprecated, and will be removed in optimum 2.0.") - try: feature = TasksManager.infer_task_from_model(cls.auto_model_class) except ValueError: diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion.py index 5d46668ec1..c133f8c6d2 100644 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion.py +++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion.py @@ -20,7 +20,7 @@ import torch 
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput -from .pipeline_utils import DiffusionPipelineMixin +from .pipeline_utils import DiffusionPipelineMixin, rescale_noise_cfg logger = logging.getLogger(__name__) @@ -179,12 +179,31 @@ def check_inputs( f" {negative_prompt_embeds.shape}." ) + # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = generator.randn(*shape).astype(dtype) + elif latents.shape != shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * np.float64(self.scheduler.init_noise_sigma) + + return latents + # Adapted from https://github.com/huggingface/diffusers/blob/v0.17.1/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py#L264 def __call__( self, prompt: Optional[Union[str, List[str]]] = None, - height: int = 512, - width: int = 512, + height: Optional[int] = None, + width: Optional[int] = None, num_inference_steps: int = 50, guidance_scale: float = 7.5, negative_prompt: Optional[Union[str, List[str]]] = None, @@ -198,6 +217,7 @@ def __call__( return_dict: bool = True, callback: Optional[Callable[[int, int, np.ndarray], None]] = None, callback_steps: int = 1, + guidance_rescale: float = 0.0, ): r""" Function invoked when calling the pipeline for generation. @@ -206,9 +226,9 @@ def __call__( prompt (`Optional[Union[str, List[str]]]`, defaults to None): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. - height (`int`, defaults to 512): + height (`Optional[int]`, defaults to None): The height in pixels of the generated image. - width (`int`, defaults to 512): + width (`Optional[int]`, defaults to None): The width in pixels of the generated image. num_inference_steps (`int`, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the @@ -253,6 +273,11 @@ def __call__( callback_steps (`int`, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. + guidance_rescale (`float`, defaults to 0.0): + Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are + Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of + [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). + Guidance rescale factor should fix overexposure when using zero terminal SNR. Returns: [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: @@ -261,6 +286,8 @@ def __call__( list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) content, according to the `safety_checker`. 
""" + height = height or self.unet.config["sample_size"] * self.vae_scale_factor + width = width or self.unet.config["sample_size"] * self.vae_scale_factor # check inputs. Raise error if not correct self.check_inputs( @@ -292,25 +319,19 @@ def __call__( negative_prompt_embeds=negative_prompt_embeds, ) - num_unet_in_channels = self.unet.config.get("in_channels", 4) - # get the initial random noise unless the user supplied it - latents_dtype = prompt_embeds.dtype - latents_shape = ( - batch_size * num_images_per_prompt, - num_unet_in_channels, - height // self.vae_scale_factor, - width // self.vae_scale_factor, - ) - if latents is None: - latents = generator.randn(*latents_shape).astype(latents_dtype) - elif latents.shape != latents_shape: - raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}") - # set timesteps self.scheduler.set_timesteps(num_inference_steps) timesteps = self.scheduler.timesteps - latents = latents * np.float64(self.scheduler.init_noise_sigma) + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + self.unet.config.get("in_channels", 4), + height, + width, + prompt_embeds.dtype, + generator, + latents, + ) # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. @@ -340,6 +361,9 @@ def __call__( if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2) noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + if guidance_rescale > 0.0: + # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf + noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale) # compute the previous noisy sample x_t -> x_t-1 scheduler_output = self.scheduler.step( diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py index ca99ed0469..d2c23b2b04 100644 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py +++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py @@ -14,50 +14,21 @@ import inspect import logging -import warnings from typing import Callable, List, Optional, Union import numpy as np import PIL import torch from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput -from diffusers.utils import PIL_INTERPOLATION, deprecate +from diffusers.utils import deprecate from .pipeline_stable_diffusion import StableDiffusionPipelineMixin +from .pipeline_utils import preprocess logger = logging.getLogger(__name__) -# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess with 8->64 -def preprocess(image): - warnings.warn( - ( - "The preprocess method is deprecated and will be removed in a future version. 
Please" - " use VaeImageProcessor.preprocess instead" - ), - FutureWarning, - ) - if isinstance(image, torch.Tensor): - return image - elif isinstance(image, PIL.Image.Image): - image = [image] - - if isinstance(image[0], PIL.Image.Image): - w, h = image[0].size - w, h = (x - x % 64 for x in (w, h)) # resize to integer multiple of 64 - - image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image] - image = np.concatenate(image, axis=0) - image = np.array(image).astype(np.float32) / 255.0 - image = image.transpose(0, 3, 1, 2) - image = 2.0 * image - 1.0 - image = torch.from_numpy(image) - elif isinstance(image[0], torch.Tensor): - image = torch.cat(image, dim=0) - return image - - class StableDiffusionImg2ImgPipelineMixin(StableDiffusionPipelineMixin): # Copied from diffusers.pipelines.stable_diffusion.pipeline_onnx_stable_diffusion.OnnxStableDiffusionImg2ImgPipeline.check_inputs def check_inputs( @@ -207,7 +178,7 @@ def __call__( # set timesteps self.scheduler.set_timesteps(num_inference_steps) - image = preprocess(image).cpu().numpy() + image = preprocess(image) # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py index 6a5c3accdc..07a808acab 100644 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py +++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py @@ -105,8 +105,8 @@ def __call__( prompt: Union[str, List[str]], image: PIL.Image.Image, mask_image: PIL.Image.Image, - height: int = 512, - width: int = 512, + height: Optional[int] = None, + width: Optional[int] = None, num_inference_steps: int = 50, guidance_scale: float = 7.5, negative_prompt: Optional[Union[str, List[str]]] = None, @@ -132,9 +132,9 @@ def __call__( `Image`, or tensor representing an image batch which will be upscaled. mask_image (`PIL.Image.Image`): `Image`, or tensor representing a masked image batch which will be upscaled. - height (`int`, defaults to 512): + height (`Optional[int]`, defaults to None): The height in pixels of the generated image. - width (`int`, defaults to 512): + width (`Optional[int]`, defaults to None): The width in pixels of the generated image. num_inference_steps (`int`, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the @@ -187,6 +187,8 @@ def __call__( list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) content, according to the `safety_checker`. """ + height = height or self.unet.config["sample_size"] * self.vae_scale_factor + width = width or self.unet.config["sample_size"] * self.vae_scale_factor # check inputs. Raise error if not correct self.check_inputs( diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py new file mode 100644 index 0000000000..4c8c015fed --- /dev/null +++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py @@ -0,0 +1,499 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +import logging +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import numpy as np +import torch +from diffusers.pipelines.stable_diffusion_xl import StableDiffusionXLPipelineOutput + +from .pipeline_utils import DiffusionPipelineMixin, rescale_noise_cfg + + +logger = logging.getLogger(__name__) + + +class StableDiffusionXLPipelineMixin(DiffusionPipelineMixin): + # Adapted from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt + def _encode_prompt( + self, + prompt: Union[str, List[str]], + num_images_per_prompt: int, + do_classifier_free_guidance: bool, + negative_prompt: Optional[Union[str, list]], + prompt_embeds: Optional[np.ndarray] = None, + negative_prompt_embeds: Optional[np.ndarray] = None, + pooled_prompt_embeds: Optional[np.ndarray] = None, + negative_pooled_prompt_embeds: Optional[np.ndarray] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`Union[str, List[str]]`): + prompt to be encoded + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`Optional[Union[str, list]]`): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `guidance_scale` is less than `1`). + prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + pooled_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): + Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. + If not provided, pooled text embeddings will be generated from `prompt` input argument. + negative_pooled_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): + Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` + input argument. 
+ """ + if isinstance(prompt, str): + batch_size = 1 + elif isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + # Define tokenizers and text encoders + tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2] + text_encoders = ( + [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2] + ) + + if prompt_embeds is None: + prompt_embeds_list = [] + for tokenizer, text_encoder in zip(tokenizers, text_encoders): + # get prompt text embeddings + text_inputs = tokenizer( + prompt, + padding="max_length", + max_length=tokenizer.model_max_length, + truncation=True, + return_tensors="np", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="np").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not np.array_equal( + text_input_ids, untruncated_ids + ): + removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {tokenizer.model_max_length} tokens: {removed_text}" + ) + + prompt_embeds = text_encoder( + input_ids=text_input_ids.astype(text_encoder.input_dtype.get("input_ids", np.int32)) + ) + pooled_prompt_embeds = prompt_embeds[0] + prompt_embeds = prompt_embeds[-2] + prompt_embeds = np.repeat(prompt_embeds, num_images_per_prompt, axis=0) + prompt_embeds_list.append(prompt_embeds) + + prompt_embeds = np.concatenate(prompt_embeds_list, axis=-1) + + # get unconditional embeddings for classifier free guidance + zero_out_negative_prompt = negative_prompt is None and self.config["force_zeros_for_empty_prompt"] + if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt: + negative_prompt_embeds = np.zeros_like(prompt_embeds) + negative_pooled_prompt_embeds = np.zeros_like(pooled_prompt_embeds) + elif do_classifier_free_guidance and negative_prompt_embeds is None: + negative_prompt = negative_prompt or "" + if prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." 
+ ) + else: + uncond_tokens = negative_prompt + + negative_prompt_embeds_list = [] + for tokenizer, text_encoder in zip(tokenizers, text_encoders): + max_length = prompt_embeds.shape[1] + uncond_input = tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="np", + ) + negative_prompt_embeds = text_encoder( + input_ids=uncond_input.input_ids.astype(text_encoder.input_dtype.get("input_ids", np.int32)) + ) + negative_pooled_prompt_embeds = negative_prompt_embeds[0] + negative_prompt_embeds = negative_prompt_embeds[-2] + + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + negative_prompt_embeds = np.repeat(negative_prompt_embeds, num_images_per_prompt, axis=0) + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + negative_prompt_embeds_list.append(negative_prompt_embeds) + negative_prompt_embeds = np.concatenate(negative_prompt_embeds, axis=-1) + + pooled_prompt_embeds = np.repeat(pooled_prompt_embeds, num_images_per_prompt, axis=0) + negative_pooled_prompt_embeds = np.repeat(negative_pooled_prompt_embeds, num_images_per_prompt, axis=0) + + return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.check_inputs + def check_inputs( + self, + prompt: Union[str, List[str]], + height: Optional[int], + width: Optional[int], + callback_steps: int, + negative_prompt: Optional[str] = None, + prompt_embeds: Optional[np.ndarray] = None, + negative_prompt_embeds: Optional[np.ndarray] = None, + pooled_prompt_embeds: Optional[np.ndarray] = None, + negative_pooled_prompt_embeds: Optional[np.ndarray] = None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." 
+ ) + + if prompt_embeds is not None and pooled_prompt_embeds is None: + raise ValueError( + "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`." + ) + + if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None: + raise ValueError( + "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`." + ) + + # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = generator.randn(*shape).astype(dtype) + elif latents.shape != shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * np.float64(self.scheduler.init_noise_sigma) + + return latents + + # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + extra_step_kwargs = {} + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_eta: + extra_step_kwargs["eta"] = eta + + return extra_step_kwargs + + # Adapted from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.__call__ + def __call__( + self, + prompt: Optional[Union[str, List[str]]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 5.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: int = 1, + eta: float = 0.0, + generator: Optional[np.random.RandomState] = None, + latents: Optional[np.ndarray] = None, + prompt_embeds: Optional[np.ndarray] = None, + negative_prompt_embeds: Optional[np.ndarray] = None, + pooled_prompt_embeds: Optional[np.ndarray] = None, + negative_pooled_prompt_embeds: Optional[np.ndarray] = None, + output_type: str = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, np.ndarray], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + guidance_rescale: float = 0.0, + original_size: Optional[Tuple[int, int]] = None, + crops_coords_top_left: Tuple[int, int] = (0, 0), + target_size: Optional[Tuple[int, int]] = None, + ): + r""" + Function invoked when calling the pipeline for generation. 
+ + Args: + prompt (`Optional[Union[str, List[str]]]`, defaults to None): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + height (`Optional[int]`, defaults to None): + The height in pixels of the generated image. + width (`Optional[int]`, defaults to None): + The width in pixels of the generated image. + num_inference_steps (`int`, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, defaults to 5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`Optional[Union[str, list]]`): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` + is less than `1`). + num_images_per_prompt (`int`, defaults to 1): + The number of images to generate per prompt. + eta (`float`, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`Optional[np.random.RandomState]`, defaults to `None`):: + A np.random.RandomState to make generation deterministic. + latents (`Optional[np.ndarray]`, defaults to `None`): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + output_type (`str`, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] instead of a + plain tuple. + callback (Optional[Callable], defaults to `None`): + A function that will be called every `callback_steps` steps during inference. The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. 
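As a reference for the `guidance_scale` description above, the classifier-free guidance update it refers to (with `w` in the Imagen notation) can be written as:

```latex
\hat{\epsilon}_\theta(x_t, c) = \epsilon_\theta(x_t, \varnothing)
  + w \,\big(\epsilon_\theta(x_t, c) - \epsilon_\theta(x_t, \varnothing)\big),
  \qquad w = \text{guidance\_scale}
```

Setting `w = 1` collapses to the plain conditional prediction, which is why the pipeline only enables guidance when `guidance_scale > 1`.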
+ guidance_rescale (`float`, defaults to 0.7): + Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are + Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of + [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). + Guidance rescale factor should fix overexposure when using zero terminal SNR. + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a `tuple. + When returning a tuple, the first element is a list with the generated images, and the second element is a + list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" + (nsfw) content, according to the `safety_checker`. + """ + + # 0. Default height and width to unet + height = height or self.unet.config["sample_size"] * self.vae_scale_factor + width = width or self.unet.config["sample_size"] * self.vae_scale_factor + + original_size = original_size or (height, width) + target_size = target_size or (height, width) + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + height, + width, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + pooled_prompt_embeds, + negative_pooled_prompt_embeds, + ) + + # 2. Define call parameters + if isinstance(prompt, str): + batch_size = 1 + elif isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if generator is None: + generator = np.random + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + ( + prompt_embeds, + negative_prompt_embeds, + pooled_prompt_embeds, + negative_pooled_prompt_embeds, + ) = self._encode_prompt( + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + pooled_prompt_embeds=pooled_prompt_embeds, + negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, + ) + + # 4. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps) + timesteps = self.scheduler.timesteps + + # 5. Prepare latent variables + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + self.unet.config.get("in_channels", 4), + height, + width, + prompt_embeds.dtype, + generator, + latents, + ) + + # 6. Prepare extra step kwargs + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7. 
Prepare added time ids & embeddings + add_text_embeds = pooled_prompt_embeds + add_time_ids = (original_size + crops_coords_top_left + target_size,) + add_time_ids = np.array(add_time_ids, dtype=prompt_embeds.dtype) + + if do_classifier_free_guidance: + prompt_embeds = np.concatenate((negative_prompt_embeds, prompt_embeds), axis=0) + add_text_embeds = np.concatenate((negative_pooled_prompt_embeds, add_text_embeds), axis=0) + add_time_ids = np.concatenate((add_time_ids, add_time_ids), axis=0) + add_time_ids = np.repeat(add_time_ids, batch_size * num_images_per_prompt, axis=0) + + # Adapted from diffusers to extend it for other runtimes than ORT + timestep_dtype = self.unet.input_dtype.get("timestep", np.float32) + + # 8. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + for i, t in enumerate(self.progress_bar(timesteps)): + # expand the latents if we are doing classifier free guidance + latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(torch.from_numpy(latent_model_input), t) + latent_model_input = latent_model_input.cpu().numpy() + + # predict the noise residual + timestep = np.array([t], dtype=timestep_dtype) + noise_pred = self.unet( + sample=latent_model_input, + timestep=timestep, + encoder_hidden_states=prompt_embeds, + text_embeds=add_text_embeds, + time_ids=add_time_ids, + ) + noise_pred = noise_pred[0] + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + if guidance_rescale > 0.0: + # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf + noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale) + + # compute the previous noisy sample x_t -> x_t-1 + scheduler_output = self.scheduler.step( + torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs + ) + latents = scheduler_output.prev_sample.numpy() + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + if callback is not None and i % callback_steps == 0: + callback(i, t, latents) + + if output_type == "latent": + image = latents + else: + latents = latents / self.vae_decoder.config.get("scaling_factor", 0.18215) + # it seems likes there is a strange result for using half-precision vae decoder if batchsize>1 + image = np.concatenate( + [self.vae_decoder(latent_sample=latents[i : i + 1])[0] for i in range(latents.shape[0])] + ) + image = self.watermark.apply_watermark(image) + + # TODO: add image_processor + image = np.clip(image / 2 + 0.5, 0, 1).transpose((0, 2, 3, 1)) + + if output_type == "pil": + image = self.numpy_to_pil(image) + + if not return_dict: + return (image,) + + return StableDiffusionXLPipelineOutput(images=image) diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py new file mode 100644 index 0000000000..7be02dc5cb --- /dev/null +++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py @@ -0,0 +1,506 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
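The `add_time_ids` built in step 7 of the `__call__` above pack SDXL's size and crop micro-conditioning into a flat six-value row per sample, duplicated for the unconditional half of the batch under classifier-free guidance. A small sketch with illustrative sizes:

```python
import numpy as np

original_size = (1024, 1024)        # illustrative values
crops_coords_top_left = (0, 0)
target_size = (1024, 1024)

# One row of [orig_h, orig_w, crop_top, crop_left, target_h, target_w].
add_time_ids = np.array([original_size + crops_coords_top_left + target_size], dtype=np.float32)
print(add_time_ids.shape)  # (1, 6)

# With classifier-free guidance the row is duplicated so it lines up with the
# concatenated (negative, positive) prompt embeddings.
add_time_ids = np.concatenate((add_time_ids, add_time_ids), axis=0)  # (2, 6)
```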
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +import logging +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import numpy as np +import PIL +import torch +from diffusers.pipelines.stable_diffusion_xl import StableDiffusionXLPipelineOutput + +from .pipeline_utils import DiffusionPipelineMixin, preprocess, rescale_noise_cfg + + +logger = logging.getLogger(__name__) + + +class StableDiffusionXLImg2ImgPipelineMixin(DiffusionPipelineMixin): + # Adapted from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt + def _encode_prompt( + self, + prompt: Union[str, List[str]], + num_images_per_prompt: int, + do_classifier_free_guidance: bool, + negative_prompt: Optional[Union[str, list]], + prompt_embeds: Optional[np.ndarray] = None, + negative_prompt_embeds: Optional[np.ndarray] = None, + pooled_prompt_embeds: Optional[np.ndarray] = None, + negative_pooled_prompt_embeds: Optional[np.ndarray] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`Union[str, List[str]]`): + prompt to be encoded + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`Optional[Union[str, list]]`): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `guidance_scale` is less than `1`). + prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + pooled_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): + Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. + If not provided, pooled text embeddings will be generated from `prompt` input argument. + negative_pooled_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): + Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` + input argument. 
+ """ + if isinstance(prompt, str): + batch_size = 1 + elif isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + # Define tokenizers and text encoders + tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2] + text_encoders = ( + [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2] + ) + + if prompt_embeds is None: + prompt_embeds_list = [] + for tokenizer, text_encoder in zip(tokenizers, text_encoders): + # get prompt text embeddings + text_inputs = tokenizer( + prompt, + padding="max_length", + max_length=tokenizer.model_max_length, + truncation=True, + return_tensors="np", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="np").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not np.array_equal( + text_input_ids, untruncated_ids + ): + removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {tokenizer.model_max_length} tokens: {removed_text}" + ) + + prompt_embeds = text_encoder( + input_ids=text_input_ids.astype(text_encoder.input_dtype.get("input_ids", np.int32)) + ) + pooled_prompt_embeds = prompt_embeds[0] + prompt_embeds = prompt_embeds[-2] + prompt_embeds = np.repeat(prompt_embeds, num_images_per_prompt, axis=0) + prompt_embeds_list.append(prompt_embeds) + + prompt_embeds = np.concatenate(prompt_embeds_list, axis=-1) + + # get unconditional embeddings for classifier free guidance + zero_out_negative_prompt = negative_prompt is None and self.config["force_zeros_for_empty_prompt"] + if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt: + negative_prompt_embeds = np.zeros_like(prompt_embeds) + negative_pooled_prompt_embeds = np.zeros_like(pooled_prompt_embeds) + elif do_classifier_free_guidance and negative_prompt_embeds is None: + negative_prompt = negative_prompt or "" + if prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." 
+ ) + else: + uncond_tokens = negative_prompt + + negative_prompt_embeds_list = [] + for tokenizer, text_encoder in zip(tokenizers, text_encoders): + max_length = prompt_embeds.shape[1] + uncond_input = tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="np", + ) + + negative_prompt_embeds = text_encoder( + input_ids=uncond_input.input_ids.astype(text_encoder.input_dtype.get("input_ids", np.int32)) + ) + negative_pooled_prompt_embeds = negative_prompt_embeds[0] + negative_prompt_embeds = negative_prompt_embeds[-2] + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + negative_prompt_embeds = np.repeat(negative_prompt_embeds, num_images_per_prompt, axis=0) + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + negative_prompt_embeds_list.append(negative_prompt_embeds) + negative_prompt_embeds = np.concatenate(negative_prompt_embeds, axis=-1) + + pooled_prompt_embeds = np.repeat(pooled_prompt_embeds, num_images_per_prompt, axis=0) + negative_pooled_prompt_embeds = np.repeat(negative_pooled_prompt_embeds, num_images_per_prompt, axis=0) + + return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img.StableDiffusionXLImg2ImgPipeline.check_inputs + def check_inputs( + self, + prompt: Union[str, List[str]], + strength: float, + callback_steps: int, + negative_prompt: Optional[str] = None, + prompt_embeds: Optional[np.ndarray] = None, + negative_prompt_embeds: Optional[np.ndarray] = None, + ): + if strength < 0 or strength > 1: + raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") + + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." 
+ ) + + def get_timesteps(self, num_inference_steps, strength): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :].numpy() + + return timesteps, num_inference_steps - t_start + + # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents + def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, generator=None): + batch_size = batch_size * num_images_per_prompt + + if image.shape[1] == 4: + init_latents = image + else: + init_latents = self.vae_encoder(sample=image)[0] * self.vae_decoder.config.get("scaling_factor", 0.18215) + + if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: + # expand init_latents for batch_size + additional_image_per_prompt = batch_size // init_latents.shape[0] + init_latents = np.concatenate([init_latents] * additional_image_per_prompt, axis=0) + elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: + raise ValueError( + f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." + ) + else: + init_latents = np.concatenate([init_latents], axis=0) + + # add noise to latents using the timesteps + noise = generator.randn(*init_latents.shape).astype(dtype) + init_latents = self.scheduler.add_noise( + torch.from_numpy(init_latents), torch.from_numpy(noise), torch.from_numpy(timestep) + ) + return init_latents.numpy() + + def _get_add_time_ids( + self, original_size, crops_coords_top_left, target_size, aesthetic_score, negative_aesthetic_score, dtype + ): + if self.config.get("requires_aesthetics_score"): + add_time_ids = (original_size + crops_coords_top_left + (aesthetic_score,),) + add_neg_time_ids = (original_size + crops_coords_top_left + (negative_aesthetic_score,),) + else: + add_time_ids = (original_size + crops_coords_top_left + target_size,) + add_neg_time_ids = (original_size + crops_coords_top_left + target_size,) + + add_time_ids = np.array(add_time_ids, dtype=dtype) + add_neg_time_ids = np.array(add_neg_time_ids, dtype=dtype) + + return add_time_ids, add_neg_time_ids + + # Adapted from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.__call__ + def __call__( + self, + prompt: Optional[Union[str, List[str]]] = None, + image: Union[np.ndarray, PIL.Image.Image] = None, + strength: float = 0.3, + num_inference_steps: int = 50, + guidance_scale: float = 5.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: int = 1, + eta: float = 0.0, + generator: Optional[np.random.RandomState] = None, + latents: Optional[np.ndarray] = None, + prompt_embeds: Optional[np.ndarray] = None, + negative_prompt_embeds: Optional[np.ndarray] = None, + pooled_prompt_embeds: Optional[np.ndarray] = None, + negative_pooled_prompt_embeds: Optional[np.ndarray] = None, + output_type: str = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, np.ndarray], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + guidance_rescale: float = 0.0, + original_size: Optional[Tuple[int, int]] = None, + crops_coords_top_left: Tuple[int, int] = (0, 0), + target_size: Optional[Tuple[int, int]] = None, + aesthetic_score: float = 6.0, + negative_aesthetic_score: float = 2.5, 
+ ): + r""" + Function invoked when calling the pipeline for generation. + + Args: + prompt (`Optional[Union[str, List[str]]]`, defaults to None): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + image (`Union[np.ndarray, PIL.Image.Image]`): + `Image`, or tensor representing an image batch which will be upscaled. + strength (`float`, defaults to 0.8): + Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` + will be used as a starting point, adding more noise to it the larger the `strength`. The number of + denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will + be maximum and the denoising process will run for the full number of iterations specified in + `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. + num_inference_steps (`int`, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, defaults to 5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`Optional[Union[str, list]]`): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` + is less than `1`). + num_images_per_prompt (`int`, defaults to 1): + The number of images to generate per prompt. + eta (`float`, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`Optional[np.random.RandomState]`, defaults to `None`):: + A np.random.RandomState to make generation deterministic. + latents (`Optional[np.ndarray]`, defaults to `None`): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + output_type (`str`, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] instead of a + plain tuple. 
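The `strength` argument documented above directly determines how many denoising steps actually run: `get_timesteps` keeps only the tail of the scheduler's schedule. A quick sketch of that arithmetic, using the signature default of 0.3:

```python
num_inference_steps = 50
strength = 0.3           # signature default in this pipeline
scheduler_order = 1      # assumption: 1 for most schedulers

init_timestep = min(int(num_inference_steps * strength), num_inference_steps)  # 15
t_start = max(num_inference_steps - init_timestep, 0)                          # 35

# Only the last `init_timestep` entries of scheduler.timesteps are kept
# (sliced from index t_start * scheduler_order), so strength=0.3 runs 15 of the
# 50 requested steps, and strength=1.0 would run all 50.
steps_actually_run = num_inference_steps - t_start
print(steps_actually_run)  # 15
```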
+ callback (Optional[Callable], defaults to `None`): + A function that will be called every `callback_steps` steps during inference. The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. + guidance_rescale (`float`, defaults to 0.7): + Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are + Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of + [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). + Guidance rescale factor should fix overexposure when using zero terminal SNR. + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a `tuple. + When returning a tuple, the first element is a list with the generated images, and the second element is a + list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" + (nsfw) content, according to the `safety_checker`. + """ + # 0. Check inputs. Raise error if not correct + self.check_inputs(prompt, strength, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds) + + # 1. Define call parameters + if isinstance(prompt, str): + batch_size = 1 + elif isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if generator is None: + generator = np.random + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 2. Encode input prompt + ( + prompt_embeds, + negative_prompt_embeds, + pooled_prompt_embeds, + negative_pooled_prompt_embeds, + ) = self._encode_prompt( + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + pooled_prompt_embeds=pooled_prompt_embeds, + negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, + ) + + # 3. Preprocess image + image = preprocess(image) + + # 4. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps) + + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) + latent_timestep = np.repeat(timesteps[:1], batch_size * num_images_per_prompt, axis=0) + timestep_dtype = self.unet.input_dtype.get("timestep", np.float32) + + latents_dtype = prompt_embeds.dtype + image = image.astype(latents_dtype) + + # 5. Prepare latent variables + latents = self.prepare_latents( + image, latent_timestep, batch_size, num_images_per_prompt, latents_dtype, generator + ) + + # 6. Prepare extra step kwargs + extra_step_kwargs = {} + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_eta: + extra_step_kwargs["eta"] = eta + + height, width = latents.shape[-2:] + height = height * self.vae_scale_factor + width = width * self.vae_scale_factor + original_size = original_size or (height, width) + target_size = target_size or (height, width) + + # 8. 
Prepare added time ids & embeddings + add_text_embeds = pooled_prompt_embeds + add_time_ids, add_neg_time_ids = self._get_add_time_ids( + original_size, + crops_coords_top_left, + target_size, + aesthetic_score, + negative_aesthetic_score, + dtype=prompt_embeds.dtype, + ) + + if do_classifier_free_guidance: + prompt_embeds = np.concatenate((negative_prompt_embeds, prompt_embeds), axis=0) + add_text_embeds = np.concatenate((negative_pooled_prompt_embeds, add_text_embeds), axis=0) + add_time_ids = np.concatenate((add_time_ids, add_time_ids), axis=0) + add_time_ids = np.repeat(add_time_ids, batch_size * num_images_per_prompt, axis=0) + + # 8. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + for i, t in enumerate(self.progress_bar(timesteps)): + # expand the latents if we are doing classifier free guidance + latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(torch.from_numpy(latent_model_input), t) + latent_model_input = latent_model_input.cpu().numpy() + + # predict the noise residual + timestep = np.array([t], dtype=timestep_dtype) + noise_pred = self.unet( + sample=latent_model_input, + timestep=timestep, + encoder_hidden_states=prompt_embeds, + text_embeds=add_text_embeds, + time_ids=add_time_ids, + ) + noise_pred = noise_pred[0] + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + if guidance_rescale > 0.0: + # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf + noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale) + + # compute the previous noisy sample x_t -> x_t-1 + scheduler_output = self.scheduler.step( + torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs + ) + latents = scheduler_output.prev_sample.numpy() + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + if callback is not None and i % callback_steps == 0: + callback(i, t, latents) + + if output_type == "latent": + image = latents + else: + latents = latents / self.vae_decoder.config.get("scaling_factor", 0.18215) + # it seems likes there is a strange result for using half-precision vae decoder if batchsize>1 + image = np.concatenate( + [self.vae_decoder(latent_sample=latents[i : i + 1])[0] for i in range(latents.shape[0])] + ) + image = self.watermark.apply_watermark(image) + + # TODO: add image_processor + image = np.clip(image / 2 + 0.5, 0, 1).transpose((0, 2, 3, 1)) + + if output_type == "pil": + image = self.numpy_to_pil(image) + + if not return_dict: + return (image,) + + return StableDiffusionXLPipelineOutput(images=image) diff --git a/optimum/pipelines/diffusers/pipeline_utils.py b/optimum/pipelines/diffusers/pipeline_utils.py index 7092003875..27cc684cb3 100644 --- a/optimum/pipelines/diffusers/pipeline_utils.py +++ b/optimum/pipelines/diffusers/pipeline_utils.py @@ -13,7 +13,13 @@ # limitations under the License. 
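A sketch of what `_get_add_time_ids` in the refiner path above produces: with `requires_aesthetics_score` the conditioning row is size + crop + aesthetic score (five values), otherwise size + crop + target size (six values), which is also why the exported UNet's `time_ids` input width differs between the two variants:

```python
import numpy as np

original_size, crops_coords_top_left, target_size = (1024, 1024), (0, 0), (1024, 1024)
aesthetic_score, negative_aesthetic_score = 6.0, 2.5   # the pipeline defaults above

# Refiner-style conditioning (requires_aesthetics_score=True): 2 + 2 + 1 = 5 values per row.
add_time_ids = np.array([original_size + crops_coords_top_left + (aesthetic_score,)], dtype=np.float32)
add_neg_time_ids = np.array([original_size + crops_coords_top_left + (negative_aesthetic_score,)], dtype=np.float32)
print(add_time_ids.shape)   # (1, 5)

# Base-style conditioning (requires_aesthetics_score=False): 2 + 2 + 2 = 6 values per row.
base_time_ids = np.array([original_size + crops_coords_top_left + target_size], dtype=np.float32)
print(base_time_ids.shape)  # (1, 6)
```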
+import warnings + +import numpy as np +import PIL +import torch from diffusers import ConfigMixin +from diffusers.utils import PIL_INTERPOLATION from PIL import Image from tqdm.auto import tqdm @@ -51,3 +57,46 @@ def progress_bar(self, iterable=None, total=None): return tqdm(total=total, **self._progress_bar_config) else: raise ValueError("Either `total` or `iterable` has to be defined.") + + +# Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess with 8->64 +def preprocess(image): + warnings.warn( + ( + "The preprocess method is deprecated and will be removed in a future version. Please" + " use VaeImageProcessor.preprocess instead" + ), + FutureWarning, + ) + if isinstance(image, torch.Tensor): + return image.cpu().numpy() + elif isinstance(image, PIL.Image.Image): + image = [image] + + if isinstance(image[0], PIL.Image.Image): + w, h = image[0].size + w, h = (x - x % 64 for x in (w, h)) # resize to integer multiple of 64 + + image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image] + image = np.concatenate(image, axis=0) + image = np.array(image).astype(np.float32) / 255.0 + image = image.transpose(0, 3, 1, 2) + image = 2.0 * image - 1.0 + elif isinstance(image[0], torch.Tensor): + image = torch.cat(image, dim=0).cpu().numpy() + return image + + +# Adapted from https://github.com/huggingface/diffusers/blob/v0.18.1/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py#L58 +def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): + """ + Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4 + """ + std_text = np.std(noise_pred_text, axis=tuple(range(1, noise_pred_text.ndim)), keepdims=True) + std_cfg = np.std(noise_cfg, axis=tuple(range(1, noise_cfg.ndim)), keepdims=True) + # rescale the results from guidance (fixes overexposure) + noise_pred_rescaled = noise_cfg * (std_text / std_cfg) + # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images + noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg + return noise_cfg diff --git a/optimum/pipelines/diffusers/watermark.py b/optimum/pipelines/diffusers/watermark.py new file mode 100644 index 0000000000..e07b4829c6 --- /dev/null +++ b/optimum/pipelines/diffusers/watermark.py @@ -0,0 +1,27 @@ +import numpy as np +from imwatermark import WatermarkEncoder + + +WATERMARK_MESSAGE = 0b101100111110110010010000011110111011000110011110 +WATERMARK_BITS = [int(bit) for bit in bin(WATERMARK_MESSAGE)[2:]] + + +# Adapted from https://github.com/huggingface/diffusers/blob/v0.18.1/src/diffusers/pipelines/stable_diffusion_xl/watermark.py#L12 +class StableDiffusionXLWatermarker: + def __init__(self): + self.watermark = WATERMARK_BITS + self.encoder = WatermarkEncoder() + self.encoder.set_watermark("bits", self.watermark) + + def apply_watermark(self, images: np.array): + # can't encode images that are smaller than 256 + if images.shape[-1] < 256: + return images + + images = (255 * (images / 2 + 0.5)).transpose((0, 2, 3, 1)) + + images = np.array([self.encoder.encode(image, "dwtDct") for image in images]).transpose((0, 3, 1, 2)) + + np.clip(2 * (images / 255 - 0.5), -1.0, 1.0, out=images) + + return images diff --git a/optimum/utils/__init__.py b/optimum/utils/__init__.py index 3042721938..df0db3f39a 100644 --- 
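A small self-contained check of what the `rescale_noise_cfg` helper above does: it matches the standard deviation of the guided prediction back to that of the text prediction, then blends by `guidance_rescale` (random data, purely illustrative):

```python
import numpy as np

rng = np.random.RandomState(0)
noise_pred_text = rng.randn(1, 4, 8, 8).astype(np.float32)
noise_cfg = 3.0 * noise_pred_text  # pretend guidance inflated the variance by 3x

std_text = np.std(noise_pred_text, axis=(1, 2, 3), keepdims=True)
std_cfg = np.std(noise_cfg, axis=(1, 2, 3), keepdims=True)
rescaled = noise_cfg * (std_text / std_cfg)  # variance now matches the text prediction

guidance_rescale = 0.7
mixed = guidance_rescale * rescaled + (1 - guidance_rescale) * noise_cfg

print(round(float(np.std(noise_cfg) / np.std(noise_pred_text)), 2))  # 3.0 before rescaling
print(round(float(np.std(mixed) / np.std(noise_pred_text)), 2))      # 1.6 after partial rescaling
```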
a/optimum/utils/__init__.py +++ b/optimum/utils/__init__.py @@ -15,6 +15,7 @@ from .constant import ( CONFIG_NAME, + DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER, DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER, DIFFUSION_MODEL_UNET_SUBFOLDER, DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER, diff --git a/optimum/utils/constant.py b/optimum/utils/constant.py index 2750d1190d..4497b5246d 100644 --- a/optimum/utils/constant.py +++ b/optimum/utils/constant.py @@ -18,4 +18,5 @@ DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER = "text_encoder" DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER = "vae_decoder" DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER = "vae_encoder" +DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER = "text_encoder_2" ONNX_WEIGHTS_NAME = "model.onnx" diff --git a/optimum/utils/dummy_diffusers_objects.py b/optimum/utils/dummy_diffusers_objects.py index a6171d5317..f85a0987d4 100644 --- a/optimum/utils/dummy_diffusers_objects.py +++ b/optimum/utils/dummy_diffusers_objects.py @@ -46,3 +46,25 @@ def __init__(self, *args, **kwargs): @classmethod def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["diffusers"]) + + +class ORTStableDiffusionXLPipeline(metaclass=DummyObject): + _backends = ["diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["diffusers"]) + + +class ORTStableDiffusionXLImg2ImgPipeline(metaclass=DummyObject): + _backends = ["diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["diffusers"]) diff --git a/optimum/utils/import_utils.py b/optimum/utils/import_utils.py index 9d78eccd82..5e6049bd41 100644 --- a/optimum/utils/import_utils.py +++ b/optimum/utils/import_utils.py @@ -34,7 +34,7 @@ TORCH_MINIMUM_VERSION = packaging.version.parse("1.11.0") TRANSFORMERS_MINIMUM_VERSION = packaging.version.parse("4.25.0") -DIFFUSERS_MINIMUM_VERSION = packaging.version.parse("0.17.0") +DIFFUSERS_MINIMUM_VERSION = packaging.version.parse("0.18.0") # This is the minimal required version to support some ONNX Runtime features diff --git a/optimum/utils/input_generators.py b/optimum/utils/input_generators.py index 30c79052e6..d88f21fd2b 100644 --- a/optimum/utils/input_generators.py +++ b/optimum/utils/input_generators.py @@ -605,7 +605,11 @@ class DummyTimestepInputGenerator(DummyInputGenerator): Generates dummy time step inputs. 
""" - SUPPORTED_INPUT_NAMES = ("timestep",) + SUPPORTED_INPUT_NAMES = ( + "timestep", + "text_embeds", + "time_ids", + ) def __init__( self, @@ -617,7 +621,7 @@ def __init__( ): self.task = task self.vocab_size = normalized_config.vocab_size - + self.text_encoder_projection_dim = normalized_config.text_encoder_projection_dim if random_batch_size_range: low, high = random_batch_size_range self.batch_size = random.randint(low, high) @@ -626,7 +630,12 @@ def __init__( def generate(self, input_name: str, framework: str = "pt"): shape = [self.batch_size] - return self.random_int_tensor(shape, max_value=self.vocab_size, framework=framework) + + if input_name == "timestep": + return self.random_int_tensor(shape, max_value=self.vocab_size, framework=framework) + + shape.append(self.text_encoder_projection_dim if input_name == "text_embeds" else 6) + return self.random_float_tensor(shape, max_value=self.vocab_size, framework=framework) class DummyLabelsGenerator(DummyInputGenerator): diff --git a/setup.py b/setup.py index b310fff0ae..7da8e334da 100644 --- a/setup.py +++ b/setup.py @@ -34,6 +34,7 @@ "diffusers>=0.17.0", "torchaudio", "einops", + "invisible-watermark", ] QUALITY_REQUIRE = ["black~=23.1", "ruff>=0.0.241,<=0.0.259"] diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py index c28613c793..423875ca28 100644 --- a/tests/exporters/exporters_utils.py +++ b/tests/exporters/exporters_utils.py @@ -237,5 +237,6 @@ } PYTORCH_STABLE_DIFFUSION_MODEL = { - ("hf-internal-testing/tiny-stable-diffusion-torch"), + "stable-diffusion": "hf-internal-testing/tiny-stable-diffusion-torch", + "stable-diffusion-xl": "echarlaix/tiny-random-stable-diffusion-xl", } diff --git a/tests/exporters/onnx/test_exporters_onnx_cli.py b/tests/exporters/onnx/test_exporters_onnx_cli.py index 39342cb4d5..a92a5d1881 100644 --- a/tests/exporters/onnx/test_exporters_onnx_cli.py +++ b/tests/exporters/onnx/test_exporters_onnx_cli.py @@ -27,12 +27,13 @@ from optimum.exporters.error_utils import MinimumVersionError from optimum.exporters.onnx.__main__ import main_export from optimum.onnxruntime import ONNX_DECODER_NAME, ONNX_DECODER_WITH_PAST_NAME, ONNX_ENCODER_NAME +from optimum.utils.testing_utils import require_diffusers if is_torch_available(): from optimum.exporters.tasks import TasksManager -from ..exporters_utils import PYTORCH_EXPORT_MODELS_TINY +from ..exporters_utils import PYTORCH_EXPORT_MODELS_TINY, PYTORCH_STABLE_DIFFUSION_MODEL def _get_models_to_test(export_models_dict: Dict): @@ -134,6 +135,31 @@ def test_all_models_tested(self): if len(missing_models_set) > 0: self.fail(f"Not testing all models. 
Missing models: {missing_models_set}") + @parameterized.expand(PYTORCH_STABLE_DIFFUSION_MODEL.items()) + @require_torch + @require_vision + @require_diffusers + def test_exporters_cli_pytorch_cpu_stable_diffusion(self, model_type: str, model_name: str): + self._onnx_export(model_name, model_type) + + @parameterized.expand(PYTORCH_STABLE_DIFFUSION_MODEL.items()) + @require_torch_gpu + @require_vision + @require_diffusers + @slow + @pytest.mark.run_slow + def test_exporters_cli_pytorch_gpu_stable_diffusion(self, model_type: str, model_name: str): + self._onnx_export(model_name, model_type, device="cuda") + + @parameterized.expand(PYTORCH_STABLE_DIFFUSION_MODEL.items()) + @require_torch_gpu + @require_vision + @require_diffusers + @slow + @pytest.mark.run_slow + def test_exporters_cli_fp16_stable_diffusion(self, model_type: str, model_name: str): + self._onnx_export(model_name, model_type, device="cuda", fp16=True) + @parameterized.expand(_get_models_to_test(PYTORCH_EXPORT_MODELS_TINY)) @require_torch @require_vision diff --git a/tests/exporters/onnx/test_onnx_export.py b/tests/exporters/onnx/test_onnx_export.py index 15d4bec7c6..c97c9ff58c 100644 --- a/tests/exporters/onnx/test_onnx_export.py +++ b/tests/exporters/onnx/test_onnx_export.py @@ -23,7 +23,7 @@ import onnx import pytest from parameterized import parameterized -from transformers import AutoConfig, is_tf_available, is_torch_available, set_seed +from transformers import AutoConfig, is_tf_available, is_torch_available from transformers.testing_utils import require_onnx, require_tf, require_torch, require_torch_gpu, require_vision, slow from optimum.exporters.error_utils import AtolError @@ -40,7 +40,7 @@ from optimum.exporters.onnx.base import ConfigBehavior from optimum.exporters.onnx.config import TextDecoderOnnxConfig from optimum.exporters.onnx.model_configs import WhisperOnnxConfig -from optimum.utils import DummyPastKeyValuesGenerator, NormalizedTextConfig, is_diffusers_available +from optimum.utils import ONNX_WEIGHTS_NAME, DummyPastKeyValuesGenerator, NormalizedTextConfig from optimum.utils.testing_utils import grid_parameters, require_diffusers from ..exporters_utils import ( @@ -54,9 +54,6 @@ if is_torch_available() or is_tf_available(): from optimum.exporters.tasks import TasksManager -if is_diffusers_available(): - from diffusers import StableDiffusionPipeline - SEED = 42 @@ -314,6 +311,30 @@ def _onnx_export( gc.collect() + def _onnx_export_sd(self, model_type: str, model_name: str, device="cpu"): + pipeline = TasksManager.get_model_from_task(model_type, model_name, device=device) + models_and_onnx_configs = get_stable_diffusion_models_for_export(pipeline) + output_names = [os.path.join(name_dir, ONNX_WEIGHTS_NAME) for name_dir in models_and_onnx_configs] + model, _ = models_and_onnx_configs["vae_encoder"] + model.forward = lambda sample: {"latent_sample": model.encode(x=sample)["latent_dist"].parameters} + + with TemporaryDirectory() as tmpdirname: + _, onnx_outputs = export_models( + models_and_onnx_configs=models_and_onnx_configs, + opset=14, + output_dir=Path(tmpdirname), + output_names=output_names, + device=device, + ) + validate_models_outputs( + models_and_onnx_configs=models_and_onnx_configs, + onnx_named_outputs=onnx_outputs, + output_dir=Path(tmpdirname), + atol=1e-3, + onnx_files_subpaths=output_names, + use_subprocess=False, + ) + def test_all_models_tested(self): # make sure we test all models missing_models_set = TasksManager._SUPPORTED_CLI_MODEL_TYPE - set(PYTORCH_EXPORT_MODELS_TINY.keys()) @@ -383,40 
+404,23 @@ def test_tensorflow_export(self, test_name, name, model_name, task, onnx_config_ self._onnx_export(test_name, name, model_name, task, onnx_config_class_constructor, monolith=monolith) - @parameterized.expand(PYTORCH_STABLE_DIFFUSION_MODEL) + @parameterized.expand(PYTORCH_STABLE_DIFFUSION_MODEL.items()) @require_torch @require_vision @require_diffusers - def test_pytorch_export_for_stable_diffusion_models(self, model_name): - set_seed(SEED) - - pipeline = StableDiffusionPipeline.from_pretrained(model_name) - output_names = [ - "text_encoder/model.onnx", - "unet/model.onnx", - "vae_encoder/model.onnx", - "vae_decoder/model.onnx", - ] - models_and_onnx_configs = get_stable_diffusion_models_for_export(pipeline) - model, _ = models_and_onnx_configs["vae_encoder"] - model.forward = lambda sample: {"latent_sample": model.encode(x=sample)["latent_dist"].parameters} + def test_pytorch_export_for_stable_diffusion_models(self, model_type, model_name): + self._onnx_export_sd(model_type, model_name) - with TemporaryDirectory() as tmpdirname: - _, onnx_outputs = export_models( - models_and_onnx_configs=models_and_onnx_configs, - opset=14, - output_dir=Path(tmpdirname), - output_names=output_names, - device="cpu", # TODO: Add GPU test - ) - validate_models_outputs( - models_and_onnx_configs=models_and_onnx_configs, - onnx_named_outputs=onnx_outputs, - output_dir=Path(tmpdirname), - atol=1e-3, - onnx_files_subpaths=output_names, - use_subprocess=False, - ) + @parameterized.expand(PYTORCH_STABLE_DIFFUSION_MODEL.items()) + @require_torch + @require_vision + @require_diffusers + @require_torch_gpu + @slow + @pytest.mark.run_slow + @pytest.mark.gpu_test + def test_pytorch_export_for_stable_diffusion_models_cuda(self, model_type, model_name): + self._onnx_export_sd(model_type, model_name, device="cuda") class CustomWhisperOnnxConfig(WhisperOnnxConfig): diff --git a/tests/onnxruntime/test_stable_diffusion_pipeline.py b/tests/onnxruntime/test_stable_diffusion_pipeline.py index aba1df44c5..e7b3bc5ec6 100644 --- a/tests/onnxruntime/test_stable_diffusion_pipeline.py +++ b/tests/onnxruntime/test_stable_diffusion_pipeline.py @@ -22,6 +22,7 @@ from diffusers import ( OnnxStableDiffusionImg2ImgPipeline, StableDiffusionPipeline, + StableDiffusionXLPipeline, ) from diffusers.utils import floats_tensor, load_image from parameterized import parameterized @@ -36,6 +37,8 @@ ORTModelVaeEncoder, ORTStableDiffusionImg2ImgPipeline, ORTStableDiffusionInpaintPipeline, + ORTStableDiffusionXLImg2ImgPipeline, + ORTStableDiffusionXLPipeline, ) from optimum.utils import logging from optimum.utils.testing_utils import grid_parameters, require_diffusers @@ -179,20 +182,24 @@ def test_compare_to_diffusers(self, model_arch: str): pipeline = StableDiffusionPipeline.from_pretrained(MODEL_NAMES[model_arch]) pipeline.safety_checker = None - num_images_per_prompt, height, width = 1, 64, 64 - latents_shape = ( - num_images_per_prompt, + batch_size, num_images_per_prompt, height, width = 1, 2, 64, 64 + + latents = ort_pipeline.prepare_latents( + batch_size * num_images_per_prompt, ort_pipeline.unet.config["in_channels"], - height // ort_pipeline.vae_scale_factor, - width // ort_pipeline.vae_scale_factor, + height, + width, + dtype=np.float32, + generator=np.random.RandomState(0), ) - latents = np.random.randn(*latents_shape).astype(np.float32) + kwargs = { "prompt": "sailing ship in storm by Leonardo da Vinci", "num_inference_steps": 1, "num_images_per_prompt": num_images_per_prompt, "height": height, "width": width, + 
"guidance_rescale": 0.1, } for output_type in ["latent", "np"]: @@ -222,6 +229,71 @@ def test_image_reproducibility(self, model_arch: str): self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0])) +class ORTStableDiffusionXLPipelineTest(ORTModelTestMixin): + SUPPORTED_ARCHITECTURES = [ + "stable-diffusion-xl", + ] + ORTMODEL_CLASS = ORTStableDiffusionXLPipeline + TASK = "stable-diffusion-xl" + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + @require_diffusers + def test_compare_to_diffusers(self, model_arch: str): + ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True) + self.assertIsInstance(ort_pipeline.text_encoder, ORTModelTextEncoder) + self.assertIsInstance(ort_pipeline.text_encoder_2, ORTModelTextEncoder) + self.assertIsInstance(ort_pipeline.vae_decoder, ORTModelVaeDecoder) + self.assertIsInstance(ort_pipeline.vae_encoder, ORTModelVaeEncoder) + self.assertIsInstance(ort_pipeline.unet, ORTModelUnet) + self.assertIsInstance(ort_pipeline.config, Dict) + + pipeline = StableDiffusionXLPipeline.from_pretrained(MODEL_NAMES[model_arch]) + batch_size, num_images_per_prompt, height, width = 2, 2, 64, 64 + latents = ort_pipeline.prepare_latents( + batch_size * num_images_per_prompt, + ort_pipeline.unet.config["in_channels"], + height, + width, + dtype=np.float32, + generator=np.random.RandomState(0), + ) + + kwargs = { + "prompt": ["sailing ship in storm by Leonardo da Vinci"] * batch_size, + "num_inference_steps": 1, + "num_images_per_prompt": num_images_per_prompt, + "height": height, + "width": width, + "guidance_rescale": 0.1, + } + + for output_type in ["latent", "np"]: + ort_outputs = ort_pipeline(latents=latents, output_type=output_type, **kwargs).images + self.assertIsInstance(ort_outputs, np.ndarray) + with torch.no_grad(): + outputs = pipeline(latents=torch.from_numpy(latents), output_type=output_type, **kwargs).images + + # Compare model outputs + self.assertTrue(np.allclose(ort_outputs, outputs, atol=1e-4)) + # Compare model devices + self.assertEqual(pipeline.device, ort_pipeline.device) + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + @require_diffusers + def test_image_reproducibility(self, model_arch: str): + pipeline = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True) + inputs = _generate_inputs() + height = 64 + width = 64 + np.random.seed(0) + ort_outputs_1 = pipeline(**inputs, height=height, width=width) + np.random.seed(0) + ort_outputs_2 = pipeline(**inputs, height=height, width=width) + ort_outputs_3 = pipeline(**inputs, height=height, width=width) + self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0])) + self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0])) + + class ORTStableDiffusionInpaintPipelineTest(ORTStableDiffusionPipelineBase): SUPPORTED_ARCHITECTURES = [ "stable-diffusion", @@ -262,3 +334,33 @@ def generate_inputs(self, height=128, width=128): ).resize((64, 64)) return inputs + + +class ORTStableDiffusionXLImg2ImgPipelineTest(ORTModelTestMixin): + SUPPORTED_ARCHITECTURES = [ + "stable-diffusion-xl", + ] + ORTMODEL_CLASS = ORTStableDiffusionXLImg2ImgPipeline + TASK = "stable-diffusion-xl" + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + @require_diffusers + def test_inference(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + inputs = self.generate_inputs() + 
output = pipeline(**inputs, generator=np.random.RandomState(0)).images[0, -3:, -3:, -1] + expected_slice = np.array([0.6515, 0.5405, 0.4858, 0.5632, 0.5174, 0.5681, 0.4948, 0.4253, 0.5080]) + + self.assertTrue(np.allclose(output.flatten(), expected_slice, atol=1e-1)) + + def generate_inputs(self, height=128, width=128): + inputs = _generate_inputs() + inputs["image"] = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + "/in_paint/overture-creations-5sI6fQgYIuo.png" + ).resize((height, width)) + + inputs["strength"] = 0.75 + return inputs diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py index 88a43d5590..f83acd91e6 100644 --- a/tests/onnxruntime/utils_onnxruntime_tests.py +++ b/tests/onnxruntime/utils_onnxruntime_tests.py @@ -79,6 +79,7 @@ "segformer": "hf-internal-testing/tiny-random-SegformerModel", "squeezebert": "hf-internal-testing/tiny-random-SqueezeBertModel", "stable-diffusion": "hf-internal-testing/tiny-stable-diffusion-torch", + "stable-diffusion-xl": "echarlaix/tiny-random-stable-diffusion-xl", "swin": "hf-internal-testing/tiny-random-SwinModel", "t5": "hf-internal-testing/tiny-random-t5", "vit": "hf-internal-testing/tiny-random-vit", From f26e5b18bfdc02e30830cf7270a244e1d644e167 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Tue, 18 Jul 2023 14:06:44 +0200 Subject: [PATCH 02/20] fix SD XL ONNX export for img2img task (#1194) * fix SD XL ONNX export for img2img task * fix pipeline --- optimum/exporters/onnx/utils.py | 9 ++++++--- optimum/exporters/tasks.py | 2 +- optimum/onnxruntime/modeling_diffusion.py | 4 ++-- .../diffusers/pipeline_stable_diffusion_xl_img2img.py | 2 +- optimum/utils/input_generators.py | 3 ++- 5 files changed, 12 insertions(+), 8 deletions(-) diff --git a/optimum/exporters/onnx/utils.py b/optimum/exporters/onnx/utils.py index c1bee9a4da..24a809a977 100644 --- a/optimum/exporters/onnx/utils.py +++ b/optimum/exporters/onnx/utils.py @@ -100,17 +100,17 @@ def _get_submodels_for_export_stable_diffusion( """ Returns the components of a Stable Diffusion model. 
""" - from diffusers import StableDiffusionXLPipeline + from diffusers import StableDiffusionXLImg2ImgPipeline models_for_export = {} - if isinstance(pipeline, StableDiffusionXLPipeline): + if isinstance(pipeline, StableDiffusionXLImg2ImgPipeline): projection_dim = pipeline.text_encoder_2.config.projection_dim else: projection_dim = pipeline.text_encoder.config.projection_dim # Text encoder if pipeline.text_encoder is not None: - if isinstance(pipeline, StableDiffusionXLPipeline): + if isinstance(pipeline, StableDiffusionXLImg2ImgPipeline): pipeline.text_encoder.config.output_hidden_states = True models_for_export["text_encoder"] = pipeline.text_encoder @@ -118,6 +118,9 @@ def _get_submodels_for_export_stable_diffusion( # PyTorch does not support the ONNX export of torch.nn.functional.scaled_dot_product_attention pipeline.unet.set_attn_processor(AttnProcessor()) pipeline.unet.config.text_encoder_projection_dim = projection_dim + # The U-NET time_ids inputs shapes depends on the value of `requires_aesthetics_score` + # https://github.com/huggingface/diffusers/blob/v0.18.2/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py#L571 + pipeline.unet.config.requires_aesthetics_score = getattr(pipeline.config, "requires_aesthetics_score", False) models_for_export["unet"] = pipeline.unet # VAE Encoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L565 diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index 2f3c432968..2f1654bbbd 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -171,7 +171,7 @@ class TasksManager: "audio-xvector": "AutoModelForAudioXVector", "image-to-text": "AutoModelForVision2Seq", "stable-diffusion": "StableDiffusionPipeline", - "stable-diffusion-xl": "StableDiffusionXLPipeline", + "stable-diffusion-xl": "StableDiffusionXLImg2ImgPipeline", "zero-shot-image-classification": "AutoModelForZeroShotImageClassification", "zero-shot-object-detection": "AutoModelForZeroShotObjectDetection", } diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py index 3541ad9480..8a7b686f53 100644 --- a/optimum/onnxruntime/modeling_diffusion.py +++ b/optimum/onnxruntime/modeling_diffusion.py @@ -28,7 +28,7 @@ LMSDiscreteScheduler, PNDMScheduler, StableDiffusionPipeline, - StableDiffusionXLPipeline, + StableDiffusionXLImg2ImgPipeline, ) from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME from diffusers.utils import CONFIG_NAME @@ -547,7 +547,7 @@ def __call__(self, *args, **kwargs): class ORTStableDiffusionXLPipelineBase(ORTStableDiffusionPipelineBase): - auto_model_class = StableDiffusionXLPipeline + auto_model_class = StableDiffusionXLImg2ImgPipeline def __init__( self, diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py index 7be02dc5cb..4a2b48d38e 100644 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py +++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py @@ -159,7 +159,7 @@ def _encode_prompt( # Here we concatenate the unconditional and text embeddings into a single batch # to avoid doing two forward passes negative_prompt_embeds_list.append(negative_prompt_embeds) - negative_prompt_embeds = np.concatenate(negative_prompt_embeds, axis=-1) + negative_prompt_embeds = np.concatenate(negative_prompt_embeds_list, axis=-1) pooled_prompt_embeds = np.repeat(pooled_prompt_embeds, 
num_images_per_prompt, axis=0) negative_pooled_prompt_embeds = np.repeat(negative_pooled_prompt_embeds, num_images_per_prompt, axis=0) diff --git a/optimum/utils/input_generators.py b/optimum/utils/input_generators.py index d88f21fd2b..d062a29d7e 100644 --- a/optimum/utils/input_generators.py +++ b/optimum/utils/input_generators.py @@ -622,6 +622,7 @@ def __init__( self.task = task self.vocab_size = normalized_config.vocab_size self.text_encoder_projection_dim = normalized_config.text_encoder_projection_dim + self.time_ids = 5 if normalized_config.requires_aesthetics_score else 6 if random_batch_size_range: low, high = random_batch_size_range self.batch_size = random.randint(low, high) @@ -634,7 +635,7 @@ def generate(self, input_name: str, framework: str = "pt"): if input_name == "timestep": return self.random_int_tensor(shape, max_value=self.vocab_size, framework=framework) - shape.append(self.text_encoder_projection_dim if input_name == "text_embeds" else 6) + shape.append(self.text_encoder_projection_dim if input_name == "text_embeds" else self.time_ids) return self.random_float_tensor(shape, max_value=self.vocab_size, framework=framework) From 583d5ab087b9f2f265587fdba7dbf15ae6e2dae1 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Tue, 18 Jul 2023 14:07:00 +0200 Subject: [PATCH 03/20] Add SD XL documentation (#1193) --- .../package_reference/modeling_ort.mdx | 9 ++++ .../onnxruntime/usage_guides/models.mdx | 51 +++++++++++++++++-- 2 files changed, 57 insertions(+), 3 deletions(-) diff --git a/docs/source/onnxruntime/package_reference/modeling_ort.mdx b/docs/source/onnxruntime/package_reference/modeling_ort.mdx index bd6899f574..ebbfa1736e 100644 --- a/docs/source/onnxruntime/package_reference/modeling_ort.mdx +++ b/docs/source/onnxruntime/package_reference/modeling_ort.mdx @@ -121,3 +121,12 @@ The following ORT classes are available for the following custom tasks. #### ORTStableDiffusionInpaintPipeline [[autodoc]] onnxruntime.ORTStableDiffusionInpaintPipeline + + +#### ORTStableDiffusionXLPipeline + +[[autodoc]] onnxruntime.ORTStableDiffusionXLPipeline + +#### ORTStableDiffusionXLImg2ImgPipeline + +[[autodoc]] onnxruntime.ORTStableDiffusionXLImg2ImgPipeline \ No newline at end of file diff --git a/docs/source/onnxruntime/usage_guides/models.mdx b/docs/source/onnxruntime/usage_guides/models.mdx index 1098f063c5..634c88fc0b 100644 --- a/docs/source/onnxruntime/usage_guides/models.mdx +++ b/docs/source/onnxruntime/usage_guides/models.mdx @@ -64,7 +64,7 @@ It is also possible, just as with regular [`~transformers.PreTrainedModel`]s, to ... ) ``` -## Export and inference of sequence-to-sequence models +## Sequence-to-sequence models Sequence-to-sequence (Seq2Seq) models can also be used when running inference with ONNX Runtime. When Seq2Seq models are exported to the ONNX format, they are decomposed into three parts that are later combined during inference: @@ -92,7 +92,7 @@ Here is an example of how you can load a T5 model to the ONNX format and run inf >>> # [{'translation_text': "Il n'est jamais sorti sans un livre sous son bras, et il est souvent revenu avec deux."}] ``` -## Export and inference of Stable Diffusion models +## Stable Diffusion Stable Diffusion models can also be used when running inference with ONNX Runtime. 
When Stable Diffusion models are exported to the ONNX format, they are split into four components that are later combined during inference: @@ -104,7 +104,7 @@ are exported to the ONNX format, they are split into four components that are la Make sure you have 🤗 Diffusers installed. To install `diffusers`: -``` +```bash pip install diffusers ``` @@ -183,3 +183,48 @@ mask_image = download_image(mask_url).resize((512, 512)) prompt = "Face of a yellow cat, high resolution, sitting on a park bench" image = pipeline(prompt=prompt, image=init_image, mask_image=mask_image).images[0] ``` + + +## Stable Diffusion XL + +Before using `ORTStableDiffusionXLPipeline` make sure to have `diffusers` and `invisible_watermark` installed. You can install the libraries as follows: + +```bash +pip install diffusers +pip install invisible-watermark>=2.0 +``` + +### Text-to-Image + +Here is an example of how you can load a PyTorch SD XL model, convert it to ONNX on-the-fly and run inference using ONNX Runtime: + +```python +from optimum.onnxruntime import ORTStableDiffusionXLPipeline + +model_id = "stabilityai/stable-diffusion-xl-base-0.9" +pipeline = ORTStableDiffusionXLPipeline.from_pretrained(model_id, export=True) +prompt = "sailing ship in storm by Leonardo da Vinci" +image = pipeline(prompt).images[0] + +# Don't forget to save the ONNX model +save_directory = "a_local_path" +pipeline.save_pretrained(save_directory) + +``` + +### Image-to-Image + +The image can be refined by making use of a model like [stabilityai/stable-diffusion-xl-refiner-0.9](https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-0.9). In this case, you only have to output the latents from the base model. + + +```python +from optimum.onnxruntime import ORTStableDiffusionXLImg2ImgPipeline + +use_refiner = True +model_id = "stabilityai/stable-diffusion-xl-refiner-0.9" +refiner = ORTStableDiffusionXLImg2ImgPipeline.from_pretrained(model_id, export=True) + +image = pipeline(prompt=prompt, output_type="latent" if use_refiner else "pil").images[0] +image = refiner(prompt=prompt, image=image[None, :]).images[0] +image.save("sailing_ship.png") +``` From 7bf37f50d52e2e8cbfb333006c164400917f5baa Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Tue, 18 Jul 2023 14:26:52 +0200 Subject: [PATCH 04/20] Remove SD XL documentation (#1197) Revert "Add SD XL documentation (#1193)" This reverts commit 583d5ab087b9f2f265587fdba7dbf15ae6e2dae1. --- .../package_reference/modeling_ort.mdx | 9 ---- .../onnxruntime/usage_guides/models.mdx | 51 ++----------------- 2 files changed, 3 insertions(+), 57 deletions(-) diff --git a/docs/source/onnxruntime/package_reference/modeling_ort.mdx b/docs/source/onnxruntime/package_reference/modeling_ort.mdx index ebbfa1736e..bd6899f574 100644 --- a/docs/source/onnxruntime/package_reference/modeling_ort.mdx +++ b/docs/source/onnxruntime/package_reference/modeling_ort.mdx @@ -121,12 +121,3 @@ The following ORT classes are available for the following custom tasks. 
#### ORTStableDiffusionInpaintPipeline [[autodoc]] onnxruntime.ORTStableDiffusionInpaintPipeline - - -#### ORTStableDiffusionXLPipeline - -[[autodoc]] onnxruntime.ORTStableDiffusionXLPipeline - -#### ORTStableDiffusionXLImg2ImgPipeline - -[[autodoc]] onnxruntime.ORTStableDiffusionXLImg2ImgPipeline \ No newline at end of file diff --git a/docs/source/onnxruntime/usage_guides/models.mdx b/docs/source/onnxruntime/usage_guides/models.mdx index 634c88fc0b..1098f063c5 100644 --- a/docs/source/onnxruntime/usage_guides/models.mdx +++ b/docs/source/onnxruntime/usage_guides/models.mdx @@ -64,7 +64,7 @@ It is also possible, just as with regular [`~transformers.PreTrainedModel`]s, to ... ) ``` -## Sequence-to-sequence models +## Export and inference of sequence-to-sequence models Sequence-to-sequence (Seq2Seq) models can also be used when running inference with ONNX Runtime. When Seq2Seq models are exported to the ONNX format, they are decomposed into three parts that are later combined during inference: @@ -92,7 +92,7 @@ Here is an example of how you can load a T5 model to the ONNX format and run inf >>> # [{'translation_text': "Il n'est jamais sorti sans un livre sous son bras, et il est souvent revenu avec deux."}] ``` -## Stable Diffusion +## Export and inference of Stable Diffusion models Stable Diffusion models can also be used when running inference with ONNX Runtime. When Stable Diffusion models are exported to the ONNX format, they are split into four components that are later combined during inference: @@ -104,7 +104,7 @@ are exported to the ONNX format, they are split into four components that are la Make sure you have 🤗 Diffusers installed. To install `diffusers`: -```bash +``` pip install diffusers ``` @@ -183,48 +183,3 @@ mask_image = download_image(mask_url).resize((512, 512)) prompt = "Face of a yellow cat, high resolution, sitting on a park bench" image = pipeline(prompt=prompt, image=init_image, mask_image=mask_image).images[0] ``` - - -## Stable Diffusion XL - -Before using `ORTStableDiffusionXLPipeline` make sure to have `diffusers` and `invisible_watermark` installed. You can install the libraries as follows: - -```bash -pip install diffusers -pip install invisible-watermark>=2.0 -``` - -### Text-to-Image - -Here is an example of how you can load a PyTorch SD XL model, convert it to ONNX on-the-fly and run inference using ONNX Runtime: - -```python -from optimum.onnxruntime import ORTStableDiffusionXLPipeline - -model_id = "stabilityai/stable-diffusion-xl-base-0.9" -pipeline = ORTStableDiffusionXLPipeline.from_pretrained(model_id, export=True) -prompt = "sailing ship in storm by Leonardo da Vinci" -image = pipeline(prompt).images[0] - -# Don't forget to save the ONNX model -save_directory = "a_local_path" -pipeline.save_pretrained(save_directory) - -``` - -### Image-to-Image - -The image can be refined by making use of a model like [stabilityai/stable-diffusion-xl-refiner-0.9](https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-0.9). In this case, you only have to output the latents from the base model. 
- - -```python -from optimum.onnxruntime import ORTStableDiffusionXLImg2ImgPipeline - -use_refiner = True -model_id = "stabilityai/stable-diffusion-xl-refiner-0.9" -refiner = ORTStableDiffusionXLImg2ImgPipeline.from_pretrained(model_id, export=True) - -image = pipeline(prompt=prompt, output_type="latent" if use_refiner else "pil").images[0] -image = refiner(prompt=prompt, image=image[None, :]).images[0] -image.save("sailing_ship.png") -``` From 958fa661c930b3f23ff709407309863875878a8c Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Tue, 18 Jul 2023 16:33:25 +0200 Subject: [PATCH 05/20] Remove graphcore from documentation quickstart (#1201) --- docs/source/installation.mdx | 1 - docs/source/quicktour.mdx | 87 ++++++++++-------------------------- 2 files changed, 24 insertions(+), 64 deletions(-) diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx index 895ead2566..1ed7fa609a 100644 --- a/docs/source/installation.mdx +++ b/docs/source/installation.mdx @@ -25,7 +25,6 @@ If you'd like to use the accelerator-specific features of 🤗 Optimum, you can | [ONNX runtime](https://onnxruntime.ai/docs/) | `python -m pip install optimum[onnxruntime]` | | [Intel Neural Compressor (INC)](https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html) | `python -m pip install optimum[neural-compressor]`| | [Intel OpenVINO](https://docs.openvino.ai/latest/index.html) | `python -m pip install optimum[openvino,nncf]` | -| [Graphcore IPU](https://www.graphcore.ai/products/ipu) | `python -m pip install optimum[graphcore]` | | [Habana Gaudi Processor (HPU)](https://habana.ai/training/) | `python -m pip install optimum[habana]` | diff --git a/docs/source/quicktour.mdx b/docs/source/quicktour.mdx index 507aee155e..11b0bb1c18 100644 --- a/docs/source/quicktour.mdx +++ b/docs/source/quicktour.mdx @@ -16,6 +16,30 @@ This quick tour is intended for developers who are ready to dive into the code a ## Accelerated inference +#### OpenVINO + +To load a model and run inference with OpenVINO Runtime, you can just replace your `AutoModelForXxx` class with the corresponding `OVModelForXxx` class. +If you want to load a PyTorch checkpoint, set `export=True` to convert your model to the OpenVINO IR (Intermediate Representation). + +```diff +- from transformers import AutoModelForSequenceClassification ++ from optimum.intel.openvino import OVModelForSequenceClassification + from transformers import AutoTokenizer, pipeline + + # Download a tokenizer and model from the Hub and convert to OpenVINO format + tokenizer = AutoTokenizer.from_pretrained(model_id) + model_id = "distilbert-base-uncased-finetuned-sst-2-english" +- model = AutoModelForSequenceClassification.from_pretrained(model_id) ++ model = OVModelForSequenceClassification.from_pretrained(model_id, export=True) + + # Run inference! + classifier = pipeline("text-classification", model=model, tokenizer=tokenizer) + results = classifier("He's a dreadful magician.") +``` + +You can find more examples in the [documentation](https://huggingface.co/docs/optimum/intel/inference) and in the [examples](https://github.com/huggingface/optimum-intel/tree/main/examples/openvino). + + #### ONNX Runtime To accelerate inference with ONNX Runtime, 🤗 Optimum uses _configuration objects_ to define parameters for graph optimization and quantization. These objects are then used to instantiate dedicated _optimizers_ and _quantizers_. 
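The quicktour hunk above closes on the point that ONNX Runtime acceleration in 🤗 Optimum is driven by configuration objects that are handed to dedicated optimizers and quantizers. As a minimal sketch of that flow, assuming the `ORTModelForSequenceClassification`, `ORTQuantizer` and `AutoQuantizationConfig` APIs shipped in this version of Optimum, dynamic quantization of an exported model looks roughly like this:

```python
from optimum.onnxruntime import ORTModelForSequenceClassification, ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig

model_id = "distilbert-base-uncased-finetuned-sst-2-english"

# Export the PyTorch checkpoint to ONNX on the fly and save the result locally
ort_model = ORTModelForSequenceClassification.from_pretrained(model_id, export=True)
ort_model.save_pretrained("onnx_model")

# The configuration object defines the quantization parameters (dynamic int8 targeting AVX512-VNNI here)
qconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=False)

# The quantizer consumes the configuration and writes the quantized model to `save_dir`
quantizer = ORTQuantizer.from_pretrained("onnx_model")
quantizer.quantize(save_dir="quantized_model", quantization_config=qconfig)
```

Graph optimization follows the same pattern, with an `OptimizationConfig` passed to an `ORTOptimizer`.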
@@ -67,30 +91,6 @@ In this example, we've quantized a model from the Hugging Face Hub, in the same You can find more examples in the [documentation](https://huggingface.co/docs/optimum/onnxruntime/quickstart) and in the [examples](https://github.com/huggingface/optimum/tree/main/examples/onnxruntime). -#### Intel - -To load a model and run inference with OpenVINO Runtime, you can just replace your `AutoModelForXxx` class with the corresponding `OVModelForXxx` class. -If you want to load a PyTorch checkpoint, set `export=True` to convert your model to the OpenVINO IR (Intermediate Representation). - -```diff -- from transformers import AutoModelForSequenceClassification -+ from optimum.intel.openvino import OVModelForSequenceClassification - from transformers import AutoTokenizer, pipeline - - # Download a tokenizer and model from the Hub and convert to OpenVINO format - tokenizer = AutoTokenizer.from_pretrained(model_id) - model_id = "distilbert-base-uncased-finetuned-sst-2-english" -- model = AutoModelForSequenceClassification.from_pretrained(model_id) -+ model = OVModelForSequenceClassification.from_pretrained(model_id, export=True) - - # Run inference! - classifier = pipeline("text-classification", model=model, tokenizer=tokenizer) - results = classifier("He's a dreadful magician.") -``` - -You can find more examples in the [documentation](https://huggingface.co/docs/optimum/intel/inference) and in the [examples](https://github.com/huggingface/optimum-intel/tree/main/examples/openvino). - - ## Accelerated training #### Habana @@ -130,45 +130,6 @@ To train transformers on Habana's Gaudi processors, 🤗 Optimum provides a `Gau You can find more examples in the [documentation](https://huggingface.co/docs/optimum/habana/quickstart) and in the [examples](https://github.com/huggingface/optimum-habana/tree/main/examples). -#### Graphcore - -To train transformers on Graphcore's IPUs, 🤗 Optimum provides a `IPUTrainer` that is very similar to the 🤗 Transformers [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer). Here is a simple example: - -```diff -- from transformers import Trainer, TrainingArguments -+ from optimum.graphcore import IPUConfig, IPUTrainer, IPUTrainingArguments - - # Download a pretrained model from the Hub - model = AutoModelForXxx.from_pretrained("bert-base-uncased") - - # Define the training arguments -- training_args = TrainingArguments( -+ training_args = IPUTrainingArguments( - output_dir="path/to/save/folder/", -+ ipu_config_name="Graphcore/bert-base-ipu", # Any IPUConfig on the Hub or stored locally - ... - ) - - # Define the configuration to compile and put the model on the IPU -+ ipu_config = IPUConfig.from_pretrained(training_args.ipu_config_name) - - # Initialize the trainer -- trainer = Trainer( -+ trainer = IPUTrainer( - model=model, -+ ipu_config=ipu_config - args=training_args, - train_dataset=train_dataset - ... - ) - - # Use Graphcore IPU for training! - trainer.train() -``` - -You can find more examples in the [documentation](https://huggingface.co/docs/optimum/graphcore/quickstart) and in the [examples](https://github.com/huggingface/optimum-graphcore/tree/main/examples). - - #### ONNX Runtime To train transformers with ONNX Runtime's acceleration features, 🤗 Optimum provides a `ORTTrainer` that is very similar to the 🤗 Transformers [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer). 
Here is a simple example: From 77c690bdcc431435f98c7374498b5fb21d767453 Mon Sep 17 00:00:00 2001 From: fxmarty <9808326+fxmarty@users.noreply.github.com> Date: Wed, 19 Jul 2023 17:10:16 +0200 Subject: [PATCH 06/20] Unpin tensorflow (#1211) * unpin * unpin in the CI as well --- .github/workflows/test_onnx.yml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test_onnx.yml b/.github/workflows/test_onnx.yml index 0cd5810507..965e11d1f9 100644 --- a/.github/workflows/test_onnx.yml +++ b/.github/workflows/test_onnx.yml @@ -27,7 +27,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - pip install .[tests,onnxruntime] "tensorflow<2.12.0" tf2onnx + pip install .[tests,onnxruntime] tensorflow tf2onnx - name: Test with unittest working-directory: tests run: | diff --git a/setup.py b/setup.py index 7da8e334da..df3adc155e 100644 --- a/setup.py +++ b/setup.py @@ -58,7 +58,7 @@ ], "exporters": ["onnx", "onnxruntime", "timm"], "exporters-gpu": ["onnx", "onnxruntime-gpu", "timm"], - "exporters-tf": ["tensorflow>=2.4,<2.11", "tf2onnx", "onnx", "onnxruntime", "timm", "h5py", "numpy<1.24.0"], + "exporters-tf": ["tensorflow>=2.4", "tf2onnx", "onnx", "onnxruntime", "timm", "h5py", "numpy<1.24.0"], "diffusers": ["diffusers"], "intel": "optimum-intel", "openvino": "optimum-intel[openvino]", From 21943aa305c64c6db79af440b4a1152b64cad38b Mon Sep 17 00:00:00 2001 From: fxmarty <9808326+fxmarty@users.noreply.github.com> Date: Wed, 19 Jul 2023 17:24:30 +0200 Subject: [PATCH 07/20] Fix ORT test for unknown architecture for task (#1212) * fix * fix style * git old test --- optimum/exporters/onnx/__main__.py | 2 +- tests/exporters/onnx/test_onnx_export.py | 2 +- tests/onnxruntime/test_modeling.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/optimum/exporters/onnx/__main__.py b/optimum/exporters/onnx/__main__.py index 696cb86823..6cefc7c571 100644 --- a/optimum/exporters/onnx/__main__.py +++ b/optimum/exporters/onnx/__main__.py @@ -322,7 +322,7 @@ def main_export( # TODO: support onnx_config.py in the model repo if custom_architecture and custom_onnx_configs is None: raise ValueError( - "Trying to export a model with a custom architecture, but no custom onnx configuration was passed as `custom_onnx_configs`. Please refer to https://huggingface.co/docs/optimum/main/en/exporters/onnx/usage_guides/export_a_model#custom-export-of-transformers-models for an example on how to export custom models." + f"Trying to export a {model.config.model_type.replace('-', '_')} model, that is a custom or unsupported architecture for the task {task}, but no custom onnx configuration was passed as `custom_onnx_configs`. Please refer to https://huggingface.co/docs/optimum/main/en/exporters/onnx/usage_guides/export_a_model#custom-export-of-transformers-models for an example on how to export custom models. For the task {task}, the Optimum ONNX exporter supports natively the architectures: {TasksManager.get_supported_model_type_for_task(task, exporter='onnx')}." 
) if custom_architecture and original_task == "auto": diff --git a/tests/exporters/onnx/test_onnx_export.py b/tests/exporters/onnx/test_onnx_export.py index c97c9ff58c..9a96d13e47 100644 --- a/tests/exporters/onnx/test_onnx_export.py +++ b/tests/exporters/onnx/test_onnx_export.py @@ -587,4 +587,4 @@ def test_custom_export_trust_remote_error(self): no_post_process=True, ) - self.assertIn("export a model with a custom architecture, but no custom onnx", str(context.exception)) + self.assertIn("custom or unsupported architecture", str(context.exception)) diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index b7dc9ca381..0ba847a0d6 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -1089,7 +1089,7 @@ def test_load_vanilla_transformers_which_is_not_supported(self): with self.assertRaises(Exception) as context: _ = ORTModelForQuestionAnswering.from_pretrained(MODEL_NAMES["t5"], export=True) - self.assertIn("Unrecognized configuration class", str(context.exception)) + self.assertIn("custom or unsupported architecture", str(context.exception)) @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_transformers(self, model_arch): From f0228ad2590f5b5d5359472e322cce676ac48d66 Mon Sep 17 00:00:00 2001 From: Prathik Rao Date: Thu, 20 Jul 2023 02:18:38 -0700 Subject: [PATCH 08/20] add ort + stable diffusion documentation (#1205) Co-authored-by: Prathik Rao --- .../onnxruntime/usage_guides/trainer.mdx | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/docs/source/onnxruntime/usage_guides/trainer.mdx b/docs/source/onnxruntime/usage_guides/trainer.mdx index 6b466b7257..50c6b4d77a 100644 --- a/docs/source/onnxruntime/usage_guides/trainer.mdx +++ b/docs/source/onnxruntime/usage_guides/trainer.mdx @@ -236,6 +236,47 @@ in the Optimum repository. +## ORTModule+StableDiffusion + +Optimum supports accelerating Hugging Face Diffusers with ONNX Runtime in [this example](https://github.com/huggingface/optimum/tree/main/examples/onnxruntime/training/stable-diffusion/text-to-image). +The core changes required to enable ONNX Runtime Training are summarized below: + +```diff +import torch +from diffusers import AutoencoderKL, UNet2DConditionModel +from transformers import CLIPTextModel + ++from onnxruntime.training.ortmodule import ORTModule ++from onnxruntime.training.optim.fp16_optimizer import FP16_Optimizer as ORT_FP16_Optimizer + +unet = UNet2DConditionModel.from_pretrained( + "CompVis/stable-diffusion-v1-4", + subfolder="unet", + ... +) +text_encoder = CLIPTextModel.from_pretrained( + "CompVis/stable-diffusion-v1-4", + subfolder="text_encoder", + ... +) +vae = AutoencoderKL.from_pretrained( + "CompVis/stable-diffusion-v1-4", + subfolder="vae", + ... +) + +optimizer = torch.optim.AdamW( + unet.parameters(), + ... 
+) + ++vae = ORTModule(vae) ++text_encoder = ORTModule(text_encoder) ++unet = ORTModule(unet) + ++optimizer = ORT_FP16_Optimizer(optimizer) +``` + ## Other Resources * Blog posts From d7d10990a262e273e53d2eb32b47d299f1320f66 Mon Sep 17 00:00:00 2001 From: fxmarty <9808326+fxmarty@users.noreply.github.com> Date: Thu, 20 Jul 2023 13:12:23 +0200 Subject: [PATCH 09/20] Fix vision encoder decoder that may not cache cross-attention (#1210) * fix vision encoder decoder * add test --- optimum/onnxruntime/base.py | 10 ++++- optimum/onnxruntime/modeling_seq2seq.py | 5 +++ tests/onnxruntime/test_modeling.py | 54 +++++++++++++++++++------ 3 files changed, 54 insertions(+), 15 deletions(-) diff --git a/optimum/onnxruntime/base.py b/optimum/onnxruntime/base.py index 4f3b9c895f..59a21f944d 100644 --- a/optimum/onnxruntime/base.py +++ b/optimum/onnxruntime/base.py @@ -433,7 +433,13 @@ def __init__( ): super().__init__(session, parent_model) - if self.parent_model.use_merged is False and self.use_past is True: + # We may use ORTDecoderForSeq2Seq for vision-encoder-decoder models, where models as gpt2 + # can be used but do not support KV caching for the cross-attention key/values, see: + # https://github.com/huggingface/transformers/blob/v4.31.0/src/transformers/models/gpt2/modeling_gpt2.py#L302-L311 + # This attribute is used to avoid returning cross-attention KV-cache in this case. + self.no_cross_attention_cache = getattr(self.parent_model, "no_cross_attention_cache", False) + + if (not self.parent_model.use_merged and self.use_past) or self.no_cross_attention_cache: self.num_pkv = 2 else: # When using a merged model, we always have the same number of output whether we use past key values or not, @@ -688,7 +694,7 @@ def forward( # Tuple of tuple of length `n_layers`, with each tuple of length equal to: # * 4 for the decoder without cache (k/v of self-attention + k/v of cross-attention) # * 2 for the decoder with cache (k/v of self-attention as cross-attention cache is constant) - if self.use_past is False or use_merged_no_cache: + if not self.use_past or use_merged_no_cache or self.no_cross_attention_cache: out_past_key_values = tuple( out_past_key_values[i : i + self.num_pkv] for i in range(0, len(out_past_key_values), self.num_pkv) ) diff --git a/optimum/onnxruntime/modeling_seq2seq.py b/optimum/onnxruntime/modeling_seq2seq.py index cdd8d1b6cd..ee09713390 100644 --- a/optimum/onnxruntime/modeling_seq2seq.py +++ b/optimum/onnxruntime/modeling_seq2seq.py @@ -1244,6 +1244,11 @@ def __init__( generation_config: Optional[GenerationConfig] = None, **kwargs, ): + # There are probably other archs that do not support cross attention KV cache, but only + # this one seem popular on the Hub. 
+ if config.decoder.model_type == "gpt2": + self.no_cross_attention_cache = True + super().__init__( encoder_session, decoder_session, diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index 0ba847a0d6..6ffbbb7732 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -4023,28 +4023,56 @@ def test_compare_to_transformers(self, test_name: str, model_arch: str, use_cach feature_extractor, tokenizer = self._get_preprocessors(model_id) data = self._get_sample_image() - features = feature_extractor(data, return_tensors="pt") start_token = "" decoder_start_token_id = tokenizer.encode(start_token)[0] - decoder_inputs = {"decoder_input_ids": torch.ones((1, 1), dtype=torch.long) * decoder_start_token_id} - with torch.no_grad(): - transformers_outputs = transformers_model(**features, **decoder_inputs) + extra_inputs = [{}, {}] - for input_type in ["pt", "np"]: - features = feature_extractor(data, return_tensors=input_type) + if use_cache and False: + # TODO: the dims will fail with other models + fake_pkv = tuple((torch.rand(1, 4, 1, 8), torch.rand(1, 4, 1, 8)) for _ in range(5)) + extra_inputs[1]["past_key_values"] = fake_pkv - if input_type == "np": - decoder_inputs = {"decoder_input_ids": np.ones((1, 1), dtype=np.int64) * decoder_start_token_id} + for extra_inps in extra_inputs: + features = feature_extractor(data, return_tensors="pt") + decoder_inputs = {"decoder_input_ids": torch.ones((1, 1), dtype=torch.long) * decoder_start_token_id} - onnx_outputs = onnx_model(**features, **decoder_inputs) + with torch.no_grad(): + transformers_outputs = transformers_model(**features, **decoder_inputs, **extra_inps) + for input_type in ["pt", "np"]: + features = feature_extractor(data, return_tensors=input_type) - self.assertTrue("logits" in onnx_outputs) - self.assertIsInstance(onnx_outputs.logits, self.TENSOR_ALIAS_TO_TYPE[input_type]) + if input_type == "np": + decoder_inputs = {"decoder_input_ids": np.ones((1, 1), dtype=np.int64) * decoder_start_token_id} - # Compare tensor outputs - self.assertTrue(torch.allclose(torch.Tensor(onnx_outputs.logits), transformers_outputs.logits, atol=1e-3)) + if "past_key_values" in extra_inps: + del extra_inps["past_key_values"] # test only with pytorch + + onnx_outputs = onnx_model(**features, **decoder_inputs, **extra_inps) + + self.assertTrue("logits" in onnx_outputs) + self.assertIsInstance(onnx_outputs.logits, self.TENSOR_ALIAS_TO_TYPE[input_type]) + + if use_cache: + self.assertEqual( + len(onnx_outputs["past_key_values"]), len(transformers_outputs["past_key_values"]) + ) + self.assertEqual( + len(onnx_outputs["past_key_values"][0]), len(transformers_outputs["past_key_values"][0]) + ) + for i, _ in enumerate(onnx_outputs["past_key_values"]): + for j, ort_pkv in enumerate(onnx_outputs["past_key_values"][i]): + trfs_pkv = transformers_outputs["past_key_values"][i][j] + self.assertTrue( + torch.allclose(ort_pkv, trfs_pkv, atol=1e-3), + f" Maxdiff: {torch.abs(ort_pkv - trfs_pkv).max()}", + ) + + # Compare tensor outputs + self.assertTrue( + torch.allclose(torch.Tensor(onnx_outputs.logits), transformers_outputs.logits, atol=1e-3) + ) gc.collect() From e72901cc3937d6bb647e8a63648368da9b560759 Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Fri, 21 Jul 2023 11:44:22 +0200 Subject: [PATCH 10/20] Add documentation for Optimum Furiosa (#1165) * Add documentation for Optimum Furiosa * Add furiosa-libnux installation * Update main doc build * Refinement * Refinements 
* Revert section rename * Fix --- .../workflows/build_main_documentation.yml | 30 ++++++++++++++++++- .github/workflows/build_pr_documentation.yml | 30 ++++++++++++++++++- docs/combine_docs.py | 5 ++-- docs/source/index.mdx | 8 ++--- 4 files changed, 64 insertions(+), 9 deletions(-) diff --git a/.github/workflows/build_main_documentation.yml b/.github/workflows/build_main_documentation.yml index b35a7a74a6..e82043be98 100644 --- a/.github/workflows/build_main_documentation.yml +++ b/.github/workflows/build_main_documentation.yml @@ -44,6 +44,11 @@ jobs: repository: 'huggingface/optimum-intel' path: optimum-intel + - uses: actions/checkout@v2 + with: + repository: 'huggingface/optimum-furiosa' + path: optimum-furiosa + - name: Set environment variables run: | cd optimum @@ -76,6 +81,7 @@ jobs: - name: Make Habana documentation run: | + sudo docker system prune -a -f cd optimum-habana make doc BUILD_DIR=habana-doc-build VERSION=${{ env.VERSION }} sudo mv habana-doc-build ../optimum @@ -83,11 +89,33 @@ jobs: - name: Make Intel documentation run: | + sudo docker system prune -a -f cd optimum-intel make doc BUILD_DIR=intel-doc-build VERSION=${{ env.VERSION }} sudo mv intel-doc-build ../optimum cd .. + - name: Make Furiosa documentation + run: | + cd optimum-furiosa + pip install . + sudo apt update + sudo apt install -y ca-certificates apt-transport-https gnupg + sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-key 5F03AFA423A751913F249259814F888B20B09A7E + sudo tee -a /etc/apt/auth.conf.d/furiosa.conf > /dev/null < /dev/null <
[docs/source/index.mdx hunk: the HTML card markup is not recoverable. Card text, in order:
AWS Trainium/Inferentia: Accelerate your training and inference workflows with AWS Trainium and AWS Inferentia
FuriosaAI: Fast and efficient inference on FuriosaAI WARBOY
ONNX Runtime: Apply quantization and graph optimization to accelerate Transformers models training and inference with ONNX Runtime
Exporters: Export your PyTorch or TensorFlow model to different formats such as ONNX and TFLite
BetterTransformer: A one-liner integration to use PyTorch's BetterTransformer with Transformers models
The FuriosaAI card is added by this patch; the Exporters and BetterTransformer entries sit on removed lines.]
From c146b756a301b7baa24715fd743c4219e33c9f83 Mon Sep 17 00:00:00 2001 From: fxmarty <9808326+fxmarty@users.noreply.github.com> Date: Fri, 21 Jul 2023 13:51:18 +0200 Subject: [PATCH 11/20] Add BLIP-2 to BetterTransformer documentation (#1218) fix --- docs/source/bettertransformer/overview.mdx | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/bettertransformer/overview.mdx b/docs/source/bettertransformer/overview.mdx index 13c6c66626..80b47285ee 100644 --- a/docs/source/bettertransformer/overview.mdx +++ b/docs/source/bettertransformer/overview.mdx @@ -32,6 +32,7 @@ The list of supported model below: - [BART](https://arxiv.org/abs/1910.13461) - [BERT](https://arxiv.org/abs/1810.04805) - [BERT-generation](https://arxiv.org/abs/1907.12461) +- [BLIP-2](https://arxiv.org/abs/2301.12597) - [CamemBERT](https://arxiv.org/abs/1911.03894) - [CLIP](https://arxiv.org/abs/2103.00020) - [CodeGen](https://arxiv.org/abs/2203.13474) From 29675d53d6715ecae6020bd46159aad008166865 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Mon, 24 Jul 2023 15:57:22 +0200 Subject: [PATCH 12/20] Set default value to unet config sample size (#1223) --- optimum/pipelines/diffusers/pipeline_stable_diffusion.py | 4 ++-- .../pipelines/diffusers/pipeline_stable_diffusion_inpaint.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion.py index c133f8c6d2..0f5b3c3b33 100644 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion.py +++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion.py @@ -286,8 +286,8 @@ def __call__( list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) content, according to the `safety_checker`. """ - height = height or self.unet.config["sample_size"] * self.vae_scale_factor - width = width or self.unet.config["sample_size"] * self.vae_scale_factor + height = height or self.unet.config.get("sample_size", 64) * self.vae_scale_factor + width = width or self.unet.config.get("sample_size", 64) * self.vae_scale_factor # check inputs. Raise error if not correct self.check_inputs( diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py index 07a808acab..e2a7ac7c9e 100644 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py +++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py @@ -187,8 +187,8 @@ def __call__( list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) content, according to the `safety_checker`. """ - height = height or self.unet.config["sample_size"] * self.vae_scale_factor - width = width or self.unet.config["sample_size"] * self.vae_scale_factor + height = height or self.unet.config.get("sample_size", 64) * self.vae_scale_factor + width = width or self.unet.config.get("sample_size", 64) * self.vae_scale_factor # check inputs. 
Raise error if not correct self.check_inputs( From d5484d503b9a338bee0df33643cb253cbcd6a8bf Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Mon, 24 Jul 2023 16:20:20 +0200 Subject: [PATCH 13/20] Fix broken link in doc (#1222) --- docs/source/onnxruntime/usage_guides/quantization.mdx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/onnxruntime/usage_guides/quantization.mdx b/docs/source/onnxruntime/usage_guides/quantization.mdx index 27ba00184e..8ffe16f3d6 100644 --- a/docs/source/onnxruntime/usage_guides/quantization.mdx +++ b/docs/source/onnxruntime/usage_guides/quantization.mdx @@ -22,7 +22,7 @@ while the latter effectively handles quantization. -You can read the [conceptual guide on quantization](/concept_guides/quantization) to learn about quantization. It +You can read the [conceptual guide on quantization](../../concept_guides/quantization) to learn about quantization. It explains the main concepts that you will be using when performing quantization with the [`~optimum.onnxruntime.ORTQuantizer`]. @@ -63,7 +63,7 @@ Quantizing an ONNX model can be done as follows: optimum-cli onnxruntime quantize --onnx_model onnx_model_location/ --avx512 -o quantized_model/ ``` -This quantize all the ONNX files in `onnx_model_location` with the AVX-512 instructions. +This quantize all the ONNX files in `onnx_model_location` with the AVX-512 instructions. ## Creating an `ORTQuantizer` From 9a183ba8249ff4b8871216eafbeed4a78fb5fee0 Mon Sep 17 00:00:00 2001 From: fxmarty <9808326+fxmarty@users.noreply.github.com> Date: Mon, 24 Jul 2023 16:39:04 +0200 Subject: [PATCH 14/20] Fix BT test (#1224) * fix? * hopefully pass --- optimum/bettertransformer/models/attention.py | 3 --- tests/bettertransformer/test_decoder.py | 6 ------ 2 files changed, 9 deletions(-) diff --git a/optimum/bettertransformer/models/attention.py b/optimum/bettertransformer/models/attention.py index 2d81cbb2a1..574636bd25 100644 --- a/optimum/bettertransformer/models/attention.py +++ b/optimum/bettertransformer/models/attention.py @@ -589,9 +589,6 @@ def llama_forward( f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" ) - # This line is necessary for numerical equivalence, although I'm not sure it is useful in any way. 
- attention_mask = torch.max(attention_mask, torch.tensor(torch.finfo(attention_mask.dtype).min)) - attn_output = torch.nn.functional.scaled_dot_product_attention( query_states, key_states, value_states, attn_mask=attention_mask, dropout_p=0.0, is_causal=False ) diff --git a/tests/bettertransformer/test_decoder.py b/tests/bettertransformer/test_decoder.py index d446a08385..3fb92ab126 100644 --- a/tests/bettertransformer/test_decoder.py +++ b/tests/bettertransformer/test_decoder.py @@ -49,9 +49,6 @@ def prepare_inputs_for_class(self, model_id: str, model_type: str, batch_size: i texts = ["a dummy input yeah!"] + ["and two"] * (batch_size - 1) inputs = tokenizer(texts, return_tensors="pt", padding=padding, max_length=20, **preprocessor_kwargs) - if model_type == "llama": - del inputs["token_type_ids"] - return inputs @parameterized.expand( @@ -158,9 +155,6 @@ def test_generation(self, test_name: str, model_type: str, batch_size: int, padd text.append("Please continue this my dear me") inp = tokenizer(text, return_tensors="pt", padding=padding, max_length=30) - if model_type == "llama": - del inp["token_type_ids"] - length = 50 result_vanilla = model.generate(**inp, num_beams=1, min_length=length, max_length=length) From d56ccf8136718f725ce5606dae68efa46758bb85 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Mon, 24 Jul 2023 18:27:30 +0200 Subject: [PATCH 15/20] Add SD XL documentation (#1198) --- .../package_reference/modeling_ort.mdx | 9 ++++ .../onnxruntime/usage_guides/models.mdx | 51 +++++++++++++++++-- 2 files changed, 57 insertions(+), 3 deletions(-) diff --git a/docs/source/onnxruntime/package_reference/modeling_ort.mdx b/docs/source/onnxruntime/package_reference/modeling_ort.mdx index bd6899f574..ebbfa1736e 100644 --- a/docs/source/onnxruntime/package_reference/modeling_ort.mdx +++ b/docs/source/onnxruntime/package_reference/modeling_ort.mdx @@ -121,3 +121,12 @@ The following ORT classes are available for the following custom tasks. #### ORTStableDiffusionInpaintPipeline [[autodoc]] onnxruntime.ORTStableDiffusionInpaintPipeline + + +#### ORTStableDiffusionXLPipeline + +[[autodoc]] onnxruntime.ORTStableDiffusionXLPipeline + +#### ORTStableDiffusionXLImg2ImgPipeline + +[[autodoc]] onnxruntime.ORTStableDiffusionXLImg2ImgPipeline \ No newline at end of file diff --git a/docs/source/onnxruntime/usage_guides/models.mdx b/docs/source/onnxruntime/usage_guides/models.mdx index 1098f063c5..634c88fc0b 100644 --- a/docs/source/onnxruntime/usage_guides/models.mdx +++ b/docs/source/onnxruntime/usage_guides/models.mdx @@ -64,7 +64,7 @@ It is also possible, just as with regular [`~transformers.PreTrainedModel`]s, to ... ) ``` -## Export and inference of sequence-to-sequence models +## Sequence-to-sequence models Sequence-to-sequence (Seq2Seq) models can also be used when running inference with ONNX Runtime. When Seq2Seq models are exported to the ONNX format, they are decomposed into three parts that are later combined during inference: @@ -92,7 +92,7 @@ Here is an example of how you can load a T5 model to the ONNX format and run inf >>> # [{'translation_text': "Il n'est jamais sorti sans un livre sous son bras, et il est souvent revenu avec deux."}] ``` -## Export and inference of Stable Diffusion models +## Stable Diffusion Stable Diffusion models can also be used when running inference with ONNX Runtime. 
When Stable Diffusion models are exported to the ONNX format, they are split into four components that are later combined during inference: @@ -104,7 +104,7 @@ are exported to the ONNX format, they are split into four components that are la Make sure you have 🤗 Diffusers installed. To install `diffusers`: -``` +```bash pip install diffusers ``` @@ -183,3 +183,48 @@ mask_image = download_image(mask_url).resize((512, 512)) prompt = "Face of a yellow cat, high resolution, sitting on a park bench" image = pipeline(prompt=prompt, image=init_image, mask_image=mask_image).images[0] ``` + + +## Stable Diffusion XL + +Before using `ORTStableDiffusionXLPipeline` make sure to have `diffusers` and `invisible_watermark` installed. You can install the libraries as follows: + +```bash +pip install diffusers +pip install invisible-watermark>=2.0 +``` + +### Text-to-Image + +Here is an example of how you can load a PyTorch SD XL model, convert it to ONNX on-the-fly and run inference using ONNX Runtime: + +```python +from optimum.onnxruntime import ORTStableDiffusionXLPipeline + +model_id = "stabilityai/stable-diffusion-xl-base-0.9" +pipeline = ORTStableDiffusionXLPipeline.from_pretrained(model_id, export=True) +prompt = "sailing ship in storm by Leonardo da Vinci" +image = pipeline(prompt).images[0] + +# Don't forget to save the ONNX model +save_directory = "a_local_path" +pipeline.save_pretrained(save_directory) + +``` + +### Image-to-Image + +The image can be refined by making use of a model like [stabilityai/stable-diffusion-xl-refiner-0.9](https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-0.9). In this case, you only have to output the latents from the base model. + + +```python +from optimum.onnxruntime import ORTStableDiffusionXLImg2ImgPipeline + +use_refiner = True +model_id = "stabilityai/stable-diffusion-xl-refiner-0.9" +refiner = ORTStableDiffusionXLImg2ImgPipeline.from_pretrained(model_id, export=True) + +image = pipeline(prompt=prompt, output_type="latent" if use_refiner else "pil").images[0] +image = refiner(prompt=prompt, image=image[None, :]).images[0] +image.save("sailing_ship.png") +``` From 8b034d2043359e0a84aa20c0862b1a90a0233ea6 Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Tue, 25 Jul 2023 18:45:10 +0530 Subject: [PATCH 16/20] Update setup.py to add optimum-furiosa extras (#1226) add_optimum_furiosa_extra Co-authored-by: Mohit Sharma --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index df3adc155e..90b9ab89f1 100644 --- a/setup.py +++ b/setup.py @@ -68,6 +68,7 @@ "habana": ["transformers<4.29.0", "optimum-habana"], "neuron": "optimum-neuron[neuron]", "neuronx": "optimum-neuron[neuronx]", + "furiosa": "optimum-furiosa", "dev": TESTS_REQUIRE + QUALITY_REQUIRE, "tests": TESTS_REQUIRE, "quality": QUALITY_REQUIRE, From 43e8004032523b5e6e06b929c862d23550e2a73e Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Tue, 25 Jul 2023 18:12:22 +0200 Subject: [PATCH 17/20] Dev version --- optimum/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/version.py b/optimum/version.py index 35a66faff1..78887e5a1c 100644 --- a/optimum/version.py +++ b/optimum/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "1.9.2.dev0" +__version__ = "1.10.1.dev0" From d31cfdeef571161add11fec4b6e1cb7d1dd1ea76 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Wed, 26 Jul 2023 10:33:06 +0200 Subject: [PATCH 18/20] Add upgrade strategy installation instruction in doc (#1228) * add upgrade strategy * fix type * add furiosa --- README.md | 21 ++++++++++++--------- docs/source/installation.mdx | 14 ++++++++------ 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index a297e46850..3181e3308a 100644 --- a/README.md +++ b/README.md @@ -16,10 +16,13 @@ If you'd like to use the accelerator-specific features of 🤗 Optimum, you can | Accelerator | Installation | |:-----------------------------------------------------------------------------------------------------------------------|:--------------------------------------------------| -| [ONNX Runtime](https://onnxruntime.ai/docs/) | `python -m pip install optimum[onnxruntime]` | -| [Intel Neural Compressor](https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html) | `python -m pip install optimum[neural-compressor]`| -| [OpenVINO](https://docs.openvino.ai/latest/index.html) | `python -m pip install optimum[openvino,nncf]` | -| [Habana Gaudi Processor (HPU)](https://habana.ai/training/) | `python -m pip install optimum[habana]` | +| [ONNX Runtime](https://onnxruntime.ai/docs/) | `pip install --upgrade-strategy eager optimum[onnxruntime]` | +| [Intel Neural Compressor](https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html) | `pip install --upgrade-strategy eager optimum[neural-compressor]`| +| [OpenVINO](https://docs.openvino.ai/latest/index.html) | `pip install --upgrade-strategy eager optimum[openvino,nncf]` | +| [Habana Gaudi Processor (HPU)](https://habana.ai/training/) | `pip install --upgrade-strategy eager optimum[habana]` | +| [FuriosaAI](https://www.furiosa.ai/) | `pip install --upgrade-strategy eager optimum[furiosa]` | + ++The `--upgrade-strategy eager` option is needed to ensure the different packages are upgraded to the latest possible version. To install from source: @@ -27,10 +30,10 @@ To install from source: python -m pip install git+https://github.com/huggingface/optimum.git ``` -For the accelerator-specific features, append `#egg=optimum[accelerator_type]` to the above command: +For the accelerator-specific features, append `optimum[accelerator_type]` to the above command: ```bash -python -m pip install git+https://github.com/huggingface/optimum.git#egg=optimum[onnxruntime] +python -m pip install optimum[onnxruntime]@git+https://github.com/huggingface/optimum.git ``` ## Accelerated Inference @@ -59,7 +62,7 @@ The [export](https://huggingface.co/docs/optimum/exporters/overview) and optimiz ### OpenVINO -This requires to install the OpenVINO extra by doing `pip install optimum[openvino,nncf]` +This requires to install the OpenVINO extra by doing `pip install --upgrade-strategy eager optimum[openvino,nncf]` To load a model and run inference with OpenVINO Runtime, you can just replace your `AutoModelForXxx` class with the corresponding `OVModelForXxx` class. To load a PyTorch checkpoint and convert it to the OpenVINO format on-the-fly, you can set `export=True` when loading your model. 
@@ -82,7 +85,7 @@ You can find more examples in the [documentation](https://huggingface.co/docs/op ### Neural Compressor -This requires to install the Neural Compressor extra by doing `pip install optimum[neural-compressor]` +This requires to install the Neural Compressor extra by doing `pip install --upgrade-strategy eager optimum[neural-compressor]` Dynamic quantization can be applied on your model: @@ -167,7 +170,7 @@ We support many providers: ### Habana -This requires to install the Habana extra by doing `pip install optimum[habana]` +This requires to install the Habana extra by doing `pip install --upgrade-strategy eager optimum[habana]` ```diff - from transformers import Trainer, TrainingArguments diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx index 1ed7fa609a..5313839dfd 100644 --- a/docs/source/installation.mdx +++ b/docs/source/installation.mdx @@ -22,11 +22,13 @@ If you'd like to use the accelerator-specific features of 🤗 Optimum, you can | Accelerator | Installation | |:-----------------------------------------------------------------------------------------------------------------------|:--------------------------------------------------| -| [ONNX runtime](https://onnxruntime.ai/docs/) | `python -m pip install optimum[onnxruntime]` | -| [Intel Neural Compressor (INC)](https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html) | `python -m pip install optimum[neural-compressor]`| -| [Intel OpenVINO](https://docs.openvino.ai/latest/index.html) | `python -m pip install optimum[openvino,nncf]` | -| [Habana Gaudi Processor (HPU)](https://habana.ai/training/) | `python -m pip install optimum[habana]` | +| [ONNX runtime](https://onnxruntime.ai/docs/) | `pip install --upgrade-strategy eager install optimum[onnxruntime]`| +| [Intel Neural Compressor (INC)](https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html) | `pip install --upgrade-strategy eager optimum[neural-compressor]` | +| [Intel OpenVINO](https://docs.openvino.ai/latest/index.html) | `pip install --upgrade-strategy eager optimum[openvino,nncf]` | +| [Habana Gaudi Processor (HPU)](https://habana.ai/training/) | `pip install --upgrade-strategy eager optimum[habana]` | +| [FuriosaAI](https://www.furiosa.ai/) | `pip install --upgrade-strategy eager optimum[furiosa]` | +The `--upgrade-strategy eager` option is needed to ensure the different packages are upgraded to the latest possible version. If you'd like to play with the examples or need the bleeding edge of the code and can't wait for a new release, you can install the base library from source as follows: @@ -34,8 +36,8 @@ If you'd like to play with the examples or need the bleeding edge of the code an python -m pip install git+https://github.com/huggingface/optimum.git ``` -For the accelerator-specific features, you can install them by appending `#egg=optimum[accelerator_type]` to the `pip` command, e.g. +For the accelerator-specific features, you can install them by appending `optimum[accelerator_type]` to the `pip` command, e.g. 
```bash -python -m pip install git+https://github.com/huggingface/optimum.git#egg=optimum[onnxruntime] +python -m pip install optimum[onnxruntime]@git+https://github.com/huggingface/optimum.git ``` From a266d5cc2fdbe34a4e90fedf7f69d5394a3eaf5b Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Wed, 26 Jul 2023 10:35:49 +0200 Subject: [PATCH 19/20] fix typo README (#1230) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3181e3308a..df71905d70 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ If you'd like to use the accelerator-specific features of 🤗 Optimum, you can | [Habana Gaudi Processor (HPU)](https://habana.ai/training/) | `pip install --upgrade-strategy eager optimum[habana]` | | [FuriosaAI](https://www.furiosa.ai/) | `pip install --upgrade-strategy eager optimum[furiosa]` | -+The `--upgrade-strategy eager` option is needed to ensure the different packages are upgraded to the latest possible version. +The `--upgrade-strategy eager` option is needed to ensure the different packages are upgraded to the latest possible version. To install from source: From 38061a66848571a37dce5ef71fc4695736e01a8b Mon Sep 17 00:00:00 2001 From: fxmarty <9808326+fxmarty@users.noreply.github.com> Date: Wed, 26 Jul 2023 19:03:06 +0200 Subject: [PATCH 20/20] BetterTransformer support training & autocast for all archs (#1225) * support training * encoders and encoder+decoder all work * warning about training decoders with padding * leave to an other PR the backward for some archs * nit * fix tests * hopefully tests pass * fix --- optimum/bettertransformer/models/__init__.py | 17 +- optimum/bettertransformer/models/base.py | 11 - .../models/decoder_models.py | 25 - .../models/encoder_models.py | 1196 ++++++++++------- optimum/bettertransformer/transformation.py | 28 +- optimum/utils/testing_utils.py | 6 - tests/bettertransformer/test_audio.py | 26 +- tests/bettertransformer/test_common.py | 13 +- tests/bettertransformer/test_decoder.py | 37 +- tests/bettertransformer/test_encoder.py | 96 +- .../bettertransformer/test_encoder_decoder.py | 33 +- tests/bettertransformer/test_vision.py | 17 +- tests/bettertransformer/testing_utils.py | 130 +- 13 files changed, 938 insertions(+), 697 deletions(-) diff --git a/optimum/bettertransformer/models/__init__.py b/optimum/bettertransformer/models/__init__.py index 6029b0e31d..56f907125b 100644 --- a/optimum/bettertransformer/models/__init__.py +++ b/optimum/bettertransformer/models/__init__.py @@ -148,18 +148,14 @@ class BetterTransformerManager: "t5", } - REQUIRES_TORCH_20 = { + DO_NOT_SUPPORT_PADDED_TRAINING = { "blenderbot", - "bart", "codegen", "gpt2", "gptj", "gpt_neo", "gpt_neox", "llama", - "m2m_100", - "marian", - "mbart", "opt", "pegasus", "t5", @@ -209,17 +205,6 @@ def requires_strict_validation(model_type: str) -> bool: """ return model_type not in BetterTransformerManager.NOT_REQUIRES_STRICT_VALIDATION - @staticmethod - def requires_torch_20(model_type: str) -> bool: - """ - Returns True if the architecture requires PyTorch 2.0 to be used with BetterTransformer. - - Args: - model_type (`str`): - The model type to check. 
- """ - return model_type in BetterTransformerManager.REQUIRES_TORCH_20 - class warn_uncompatible_save(object): def __init__(self, callback): diff --git a/optimum/bettertransformer/models/base.py b/optimum/bettertransformer/models/base.py index d2e5bb4bba..fdd7fc5eb9 100644 --- a/optimum/bettertransformer/models/base.py +++ b/optimum/bettertransformer/models/base.py @@ -55,7 +55,6 @@ def __init__( self.num_layers = None self.original_layers_mapping = {} self.module_mapping = None - self.supports_training = False # Some models does not have some attributes thus needs to be ignored # e.g. whisper does not have self_attn.k_proj.bias but has self_attn.v_proj.bias & self_attn.q_proj.bias self.keys_to_ignore = [] @@ -127,16 +126,6 @@ def validate_bettertransformer(self): f" Number of heads must be even." ) - def forward_checker(self, *args, **kwargs): - if torch.is_autocast_enabled() or torch.is_autocast_cpu_enabled(): - raise ValueError("Autocast is not supported for `BetterTransformer` integration.") - - if self.training and not self.supports_training: - raise ValueError( - "Training is not supported for `BetterTransformer` integration.", - " Please use `model.eval()` before running the model.", - ) - def _revert(self, module: torch.nn.Module) -> torch.nn.Module: if self.module_mapping is not None: if "" in self.module_mapping.values(): diff --git a/optimum/bettertransformer/models/decoder_models.py b/optimum/bettertransformer/models/decoder_models.py index bb8f890227..bfb45ff317 100644 --- a/optimum/bettertransformer/models/decoder_models.py +++ b/optimum/bettertransformer/models/decoder_models.py @@ -65,16 +65,13 @@ def __init__(self, layer: "nn.Module", config: "PretrainedConfig"): setattr(self, "q_attn", getattr(layer, "q_attn")) self.original_layers_mapping["q_attn"] = "q_attn" - self.supports_training = True self.downcast_qk = False self.dropout_prob_attn = config.attn_pdrop def forward(self, *args, **kwargs): - super().forward_checker() return super().forward(*args, **kwargs) -# TODO: validate class GPTJAttentionLayerBetterTransformer(BetterTransformerBaseLayer, GPTJAttention, nn.Module): _attn = gpt2_wrapped_scaled_dot_product @@ -105,11 +102,9 @@ def __init__(self, layer: "nn.Module", config: "PretrainedConfig"): self.original_layers_mapping = {submodule: submodule for submodule in submodules} self.downcast_qk = True - self.supports_training = True self.dropout_prob_attn = config.attn_pdrop def forward(self, *args, **kwargs): - super().forward_checker() return super().forward(*args, **kwargs) @@ -129,11 +124,9 @@ def __init__(self, layer: "nn.Module", config: "PretrainedConfig"): self.original_layers_mapping = {submodule: submodule for submodule in submodules} self.downcast_qk = True - self.supports_training = True self.dropout_prob_attn = 0.0 # no dropout for gpt-neox def forward(self, *args, **kwargs): - super().forward_checker() return super().forward(*args, **kwargs) @@ -159,11 +152,9 @@ def __init__(self, layer: "nn.Module", config: "PretrainedConfig"): self.original_layers_mapping = {submodule: submodule for submodule in submodules} self.scale = torch.sqrt(torch.tensor(layer.head_dim, dtype=torch.float32)).to(torch.get_default_dtype()) - self.supports_training = True self.dropout_prob_attn = float(config.attention_dropout) def forward(self, *args, **kwargs): - super().forward_checker() return super().forward(*args, **kwargs) @@ -188,11 +179,9 @@ def __init__(self, layer: "nn.Module", config: "PretrainedConfig"): self.original_layers_mapping = {submodule: submodule for submodule 
in submodules} - self.supports_training = True self.dropout_prob_attn = config.attn_pdrop def forward(self, *args, **kwargs): - super().forward_checker() return super().forward(*args, **kwargs) @@ -218,10 +207,7 @@ def __init__(self, layer: "nn.Module", config: "PretrainedConfig"): self.original_layers_mapping = {submodule: submodule for submodule in submodules} - self.supports_training = True - def forward(self, *args, **kwargs): - super().forward_checker() return opt_forward(self, *args, **kwargs) @@ -249,11 +235,9 @@ def __init__(self, layer: "nn.Module", config: "PretrainedConfig"): self.module_mapping = None - self.supports_training = True self.is_decoder = layer.is_decoder def forward(self, *args, **kwargs): - super().forward_checker() return t5_forward(self, *args, **kwargs) @@ -274,7 +258,6 @@ def bart_bettertransformer_init(self, layer: "nn.Module", config: "PretrainedCon self.original_layers_mapping = {submodule: submodule for submodule in submodules} - self.supports_training = True self.is_decoder = layer.is_decoder @@ -284,7 +267,6 @@ def __init__(self, layer: "nn.Module", config: "PretrainedConfig"): bart_bettertransformer_init(self, layer, config) def forward(self, *args, **kwargs): - super().forward_checker() return bart_forward(self, *args, **kwargs) @@ -294,7 +276,6 @@ def __init__(self, layer: "nn.Module", config: "PretrainedConfig"): bart_bettertransformer_init(self, layer, config) def forward(self, *args, **kwargs): - super().forward_checker() return bart_forward(self, *args, **kwargs) @@ -304,7 +285,6 @@ def __init__(self, layer: "nn.Module", config: "PretrainedConfig"): bart_bettertransformer_init(self, layer, config) def forward(self, *args, **kwargs): - super().forward_checker() return bart_forward(self, *args, **kwargs) @@ -314,7 +294,6 @@ def __init__(self, layer: "nn.Module", config: "PretrainedConfig"): bart_bettertransformer_init(self, layer, config) def forward(self, *args, **kwargs): - super().forward_checker() return bart_forward(self, *args, **kwargs) @@ -323,7 +302,6 @@ def __init__(self, layer: "nn.Module", config: "PretrainedConfig"): bart_bettertransformer_init(self, layer, config) def forward(self, *args, **kwargs): - super().forward_checker() return bart_forward(self, *args, **kwargs) @@ -339,8 +317,5 @@ def __init__(self, layer: "nn.Module", config: "PretrainedConfig"): self.original_layers_mapping = {submodule: submodule for submodule in submodules} - self.supports_training = True - def forward(self, *args, **kwargs): - super().forward_checker() return llama_forward(self, *args, **kwargs) diff --git a/optimum/bettertransformer/models/encoder_models.py b/optimum/bettertransformer/models/encoder_models.py index 3913830640..8d9c88a3ea 100644 --- a/optimum/bettertransformer/models/encoder_models.py +++ b/optimum/bettertransformer/models/encoder_models.py @@ -15,6 +15,8 @@ import torch import torch.nn as nn +import torch.nn.functional as F +from transformers.activations import ACT2FN from .base import BetterTransformerBaseLayer @@ -99,50 +101,100 @@ def __init__(self, albert_layer, config): "norm2_weight": "full_layer_layer_norm.weight", "norm2_bias": "full_layer_layer_norm.bias", } + self.attention_head_size = config.hidden_size // config.num_attention_heads + self.attention_probs_dropout_prob = config.attention_probs_dropout_prob + self.hidden_dropout_prob = config.hidden_dropout_prob + self.act_fn_callable = ACT2FN[self.act_fn] self.validate_bettertransformer() def forward(self, hidden_states, attention_mask, *_): - r""" - This is just a wrapper 
around the forward function proposed in: - https://github.com/huggingface/transformers/pull/19553 - """ - super().forward_checker() + if not self.training and not torch.is_autocast_enabled() and not torch.is_autocast_cpu_enabled(): + if hidden_states.is_nested: + attention_mask = None + + if attention_mask is not None: + # attention mask comes in with values 0 and -inf. we convert to torch.nn.TransformerEncoder style bool mask + # 0->false->keep this token -inf->true->mask this token + attention_mask = attention_mask.bool() + attention_mask = torch.reshape(attention_mask, (attention_mask.shape[0], attention_mask.shape[-1])) + hidden_states = torch._nested_tensor_from_mask(hidden_states, ~attention_mask) + attention_mask = None + + hidden_states = torch._transformer_encoder_layer_fwd( + hidden_states, + self.embed_dim, + self.num_heads, + self.in_proj_weight, + self.in_proj_bias, + self.out_proj_weight, + self.out_proj_bias, + self.use_gelu, + self.norm_first, + self.norm1_eps, + self.norm1_weight, + self.norm1_bias, + self.norm2_weight, + self.norm2_bias, + self.linear1_weight, + self.linear1_bias, + self.linear2_weight, + self.linear2_bias, + attention_mask, + ) + if hidden_states.is_nested and self.is_last_layer: + hidden_states = hidden_states.to_padded_tensor(0.0) + else: + qkv = F.linear(hidden_states, weight=self.in_proj_weight, bias=self.in_proj_bias) + + qkv = qkv.view(qkv.size()[:-1] + (3, self.num_heads, self.attention_head_size)).permute(2, 0, 3, 1, 4) + query, key, value = qkv[0], qkv[1], qkv[2] + + # NOTE: In PyTorch 2.0, passing an attention_mask will automatically dispatch + # to the "math" path and will NOT use flash attention / memory-efficient attention. + # We should support xformers / Hazy-flash / rocm-flash directly and stop relying on PyTorch to do the work. + attention_out = F.scaled_dot_product_attention( + query, + key, + value, + attn_mask=attention_mask, + is_causal=False, + dropout_p=self.attention_probs_dropout_prob if self.training else 0.0, + ) - if hidden_states.is_nested: - attention_mask = None + attention_out = attention_out.permute(0, 2, 1, 3).contiguous() + new_attention_out_shape = attention_out.size()[:-2] + (self.num_heads * self.attention_head_size,) + attention_out = attention_out.view(new_attention_out_shape) + + # BertSelfOutput + attention_out = F.layer_norm( + F.dropout( + F.linear(attention_out, self.out_proj_weight, self.out_proj_bias), + p=self.hidden_dropout_prob, + training=self.training, + ) + + hidden_states, + normalized_shape=self.norm1_weight.shape, + weight=self.norm1_weight, + bias=self.norm1_bias, + ) - if attention_mask is not None: - # attention mask comes in with values 0 and -inf. 
we convert to torch.nn.TransformerEncoder style bool mask - # 0->false->keep this token -inf->true->mask this token - attention_mask = attention_mask.bool() - attention_mask = torch.reshape(attention_mask, (attention_mask.shape[0], attention_mask.shape[-1])) - hidden_states = torch._nested_tensor_from_mask(hidden_states, ~attention_mask) - attention_mask = None + # BertIntermediate + hidden_states = self.act_fn_callable(F.linear(attention_out, self.linear1_weight, self.linear1_bias)) + + # BertOutput + hidden_states = F.layer_norm( + attention_out + + F.dropout( + F.linear(hidden_states, self.linear2_weight, self.linear2_bias), + p=self.hidden_dropout_prob, + training=self.training, + ), + normalized_shape=self.norm2_weight.shape, + weight=self.norm2_weight, + bias=self.norm2_bias, + ) - hidden_states = torch._transformer_encoder_layer_fwd( - hidden_states, - self.embed_dim, - self.num_heads, - self.in_proj_weight, - self.in_proj_bias, - self.out_proj_weight, - self.out_proj_bias, - self.use_gelu, - self.norm_first, - self.norm1_eps, - self.norm1_weight, - self.norm1_bias, - self.norm2_weight, - self.norm2_bias, - self.linear1_weight, - self.linear1_bias, - self.linear2_weight, - self.linear2_bias, - attention_mask, - ) - if hidden_states.is_nested and self.is_last_layer: - hidden_states = hidden_states.to_padded_tensor(0.0) return (hidden_states,) @@ -226,50 +278,100 @@ def __init__(self, bert_layer, config): "norm2_weight": "output.LayerNorm.weight", "norm2_bias": "output.LayerNorm.bias", } + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.hidden_dropout_prob = config.hidden_dropout_prob + self.attention_probs_dropout_prob = config.attention_probs_dropout_prob + self.act_fn_callable = ACT2FN[self.act_fn] self.validate_bettertransformer() def forward(self, hidden_states, attention_mask, *_): - r""" - This is just a wrapper around the forward function proposed in: - https://github.com/huggingface/transformers/pull/19553 - """ - super().forward_checker() + if not self.training and not torch.is_autocast_enabled() and not torch.is_autocast_cpu_enabled(): + if hidden_states.is_nested: + attention_mask = None + + if attention_mask is not None: + # attention mask comes in with values 0 and -inf. 
we convert to torch.nn.TransformerEncoder style bool mask + # 0->false->keep this token -inf->true->mask this token + attention_mask = attention_mask.bool() + attention_mask = torch.reshape(attention_mask, (attention_mask.shape[0], attention_mask.shape[-1])) + hidden_states = torch._nested_tensor_from_mask(hidden_states, ~attention_mask) + attention_mask = None + + hidden_states = torch._transformer_encoder_layer_fwd( + hidden_states, + self.embed_dim, + self.num_heads, + self.in_proj_weight, + self.in_proj_bias, + self.out_proj_weight, + self.out_proj_bias, + self.use_gelu, + self.norm_first, + self.norm1_eps, + self.norm1_weight, + self.norm1_bias, + self.norm2_weight, + self.norm2_bias, + self.linear1_weight, + self.linear1_bias, + self.linear2_weight, + self.linear2_bias, + attention_mask, + ) + if hidden_states.is_nested and self.is_last_layer: + hidden_states = hidden_states.to_padded_tensor(0.0) + else: + qkv = F.linear(hidden_states, weight=self.in_proj_weight, bias=self.in_proj_bias) + + qkv = qkv.view(qkv.size()[:-1] + (3, self.num_heads, self.attention_head_size)).permute(2, 0, 3, 1, 4) + query, key, value = qkv[0], qkv[1], qkv[2] + + # NOTE: In PyTorch 2.0, passing an attention_mask will automatically dispatch + # to the "math" path and will NOT use flash attention / memory-efficient attention. + # We should support xformers / Hazy-flash / rocm-flash directly and stop relying on PyTorch to do the work. + attention_out = F.scaled_dot_product_attention( + query, + key, + value, + attn_mask=attention_mask, + is_causal=False, + dropout_p=self.attention_probs_dropout_prob if self.training else 0.0, + ) - if hidden_states.is_nested: - attention_mask = None + attention_out = attention_out.permute(0, 2, 1, 3).contiguous() + new_attention_out_shape = attention_out.size()[:-2] + (self.num_heads * self.attention_head_size,) + attention_out = attention_out.view(new_attention_out_shape) + + # BertSelfOutput + attention_out = F.layer_norm( + F.dropout( + F.linear(attention_out, self.out_proj_weight, self.out_proj_bias), + p=self.hidden_dropout_prob, + training=self.training, + ) + + hidden_states, + normalized_shape=self.norm1_weight.shape, + weight=self.norm1_weight, + bias=self.norm1_bias, + ) - if attention_mask is not None: - # attention mask comes in with values 0 and -inf. 
we convert to torch.nn.TransformerEncoder style bool mask - # 0->false->keep this token -inf->true->mask this token - attention_mask = attention_mask.bool() - attention_mask = torch.reshape(attention_mask, (attention_mask.shape[0], attention_mask.shape[-1])) - hidden_states = torch._nested_tensor_from_mask(hidden_states, ~attention_mask) - attention_mask = None + # BertIntermediate + hidden_states = self.act_fn_callable(F.linear(attention_out, self.linear1_weight, self.linear1_bias)) + + # BertOutput + hidden_states = F.layer_norm( + attention_out + + F.dropout( + F.linear(hidden_states, self.linear2_weight, self.linear2_bias), + p=self.hidden_dropout_prob, + training=self.training, + ), + normalized_shape=self.norm2_weight.shape, + weight=self.norm2_weight, + bias=self.norm2_bias, + ) - hidden_states = torch._transformer_encoder_layer_fwd( - hidden_states, - self.embed_dim, - self.num_heads, - self.in_proj_weight, - self.in_proj_bias, - self.out_proj_weight, - self.out_proj_bias, - self.use_gelu, - self.norm_first, - self.norm1_eps, - self.norm1_weight, - self.norm1_bias, - self.norm2_weight, - self.norm2_bias, - self.linear1_weight, - self.linear1_bias, - self.linear2_weight, - self.linear2_bias, - attention_mask, - ) - if hidden_states.is_nested and self.is_last_layer: - hidden_states = hidden_states.to_padded_tensor(0.0) return (hidden_states,) @@ -350,60 +452,112 @@ def __init__(self, bart_layer, config): "norm2_weight": "final_layer_norm.weight", "norm2_bias": "final_layer_norm.bias", } + self.dropout = config.attention_dropout + self.activation_dropout = config.activation_dropout + self.attention_head_size = config.d_model // config.encoder_attention_heads + self.act_fn_callable = ACT2FN[self.act_fn] self.validate_bettertransformer() def forward(self, hidden_states, attention_mask, position_bias=None, *_, **__): - r""" - This is just a wrapper around the forward function proposed in: - https://github.com/huggingface/transformers/pull/19553 - """ - super().forward_checker() + if not self.training and not torch.is_autocast_enabled() and not torch.is_autocast_cpu_enabled(): + if not hasattr(hidden_states, "original_shape"): + original_shape = hidden_states.shape + else: + original_shape = hidden_states.original_shape + + if hidden_states.is_nested: + attention_mask = None + + if attention_mask is not None: + # attention mask comes in with values 0 and -inf. 
we convert to torch.nn.TransformerEncoder style bool mask + # 0->false->keep this token -inf->true->mask this token + if len(attention_mask.shape) == 4: + attention_mask = attention_mask.squeeze(1)[:, 0] + attention_mask = attention_mask.bool() + attention_mask = torch.reshape(attention_mask, (attention_mask.shape[0], attention_mask.shape[-1])) + hidden_states = torch._nested_tensor_from_mask(hidden_states, ~attention_mask) + attention_mask = None + + hidden_states = torch._transformer_encoder_layer_fwd( + hidden_states, + self.embed_dim, + self.num_heads, + self.in_proj_weight, + self.in_proj_bias, + self.out_proj_weight, + self.out_proj_bias, + self.use_gelu, + self.norm_first, + self.norm1_eps, + self.norm1_weight, + self.norm1_bias, + self.norm2_weight, + self.norm2_bias, + self.linear1_weight, + self.linear1_bias, + self.linear2_weight, + self.linear2_bias, + attention_mask, + ) - if not hasattr(hidden_states, "original_shape"): - original_shape = hidden_states.shape + if not self.is_last_layer: + hidden_states.original_shape = original_shape + elif hidden_states.is_nested and self.is_last_layer: + hidden_states = hidden_states.to_padded_tensor(0.0, original_shape) else: - original_shape = hidden_states.original_shape - - if hidden_states.is_nested: - attention_mask = None + qkv = F.linear(hidden_states, weight=self.in_proj_weight, bias=self.in_proj_bias) + + qkv = qkv.view(qkv.size()[:-1] + (3, self.num_heads, self.attention_head_size)).permute(2, 0, 3, 1, 4) + query, key, value = qkv[0], qkv[1], qkv[2] + + # NOTE: In PyTorch 2.0, passing an attention_mask will automatically dispatch + # to the "math" path and will NOT use flash attention / memory-efficient attention. + # We should support xformers / Hazy-flash / rocm-flash directly and stop relying on PyTorch to do the work. + attention_out = F.scaled_dot_product_attention( + query, + key, + value, + attn_mask=attention_mask, + is_causal=False, + dropout_p=self.dropout if self.training else 0.0, + ) - if attention_mask is not None: - # attention mask comes in with values 0 and -inf. 
we convert to torch.nn.TransformerEncoder style bool mask - # 0->false->keep this token -inf->true->mask this token - if len(attention_mask.shape) == 4: - attention_mask = attention_mask.squeeze(1)[:, 0] - attention_mask = attention_mask.bool() - attention_mask = torch.reshape(attention_mask, (attention_mask.shape[0], attention_mask.shape[-1])) - hidden_states = torch._nested_tensor_from_mask(hidden_states, ~attention_mask) - attention_mask = None + attention_out = attention_out.permute(0, 2, 1, 3).contiguous() + new_attention_out_shape = attention_out.size()[:-2] + (self.num_heads * self.attention_head_size,) + attention_out = attention_out.view(new_attention_out_shape) + + # BertSelfOutput + attention_out = F.layer_norm( + F.dropout( + F.linear(attention_out, self.out_proj_weight, self.out_proj_bias), + p=self.dropout, + training=self.training, + ) + + hidden_states, + normalized_shape=self.norm1_weight.shape, + weight=self.norm1_weight, + bias=self.norm1_bias, + ) - hidden_states = torch._transformer_encoder_layer_fwd( - hidden_states, - self.embed_dim, - self.num_heads, - self.in_proj_weight, - self.in_proj_bias, - self.out_proj_weight, - self.out_proj_bias, - self.use_gelu, - self.norm_first, - self.norm1_eps, - self.norm1_weight, - self.norm1_bias, - self.norm2_weight, - self.norm2_bias, - self.linear1_weight, - self.linear1_bias, - self.linear2_weight, - self.linear2_bias, - attention_mask, - ) + # One additional dropout compared to bert + hidden_states = F.dropout( + self.act_fn_callable(F.linear(attention_out, self.linear1_weight, self.linear1_bias)), + p=self.activation_dropout, + training=self.training, + ) - if not self.is_last_layer: - hidden_states.original_shape = original_shape - elif hidden_states.is_nested and self.is_last_layer: - hidden_states = hidden_states.to_padded_tensor(0.0, original_shape) + hidden_states = F.layer_norm( + attention_out + + F.dropout( + F.linear(hidden_states, self.linear2_weight, self.linear2_bias), + p=self.dropout, + training=self.training, + ), + normalized_shape=self.norm2_weight.shape, + weight=self.norm2_weight, + bias=self.norm2_bias, + ) return (hidden_states,) @@ -488,60 +642,114 @@ def __init__(self, mbart_layer, config): "norm2_bias": "final_layer_norm.bias", "norm2_eps": "final_layer_norm.eps", } + self.dropout = config.attention_dropout + self.activation_dropout = config.activation_dropout + self.attention_head_size = config.d_model // config.encoder_attention_heads + self.act_fn_callable = ACT2FN[self.act_fn] self.validate_bettertransformer() def forward(self, hidden_states, attention_mask, position_bias=None, *_, **__): - r""" - This is just a wrapper around the forward function proposed in: - https://github.com/huggingface/transformers/pull/19553 - """ - super().forward_checker() + if not self.training and not torch.is_autocast_enabled() and not torch.is_autocast_cpu_enabled(): + if not hasattr(hidden_states, "original_shape"): + original_shape = hidden_states.shape + else: + original_shape = hidden_states.original_shape + + if hidden_states.is_nested: + attention_mask = None + + if attention_mask is not None: + # attention mask comes in with values 0 and -inf. 
we convert to torch.nn.TransformerEncoder style bool mask + # 0->false->keep this token -inf->true->mask this token + if len(attention_mask.shape) == 4: + attention_mask = attention_mask.squeeze(1)[:, 0] + attention_mask = attention_mask.bool() + attention_mask = torch.reshape(attention_mask, (attention_mask.shape[0], attention_mask.shape[-1])) + hidden_states = torch._nested_tensor_from_mask(hidden_states, ~attention_mask) + attention_mask = None + + hidden_states = torch._transformer_encoder_layer_fwd( + hidden_states, + self.embed_dim, + self.num_heads, + self.in_proj_weight, + self.in_proj_bias, + self.out_proj_weight, + self.out_proj_bias, + self.use_gelu, + self.norm_first, + self.norm1_eps, + self.norm1_weight, + self.norm1_bias, + self.norm2_weight, + self.norm2_bias, + self.linear1_weight, + self.linear1_bias, + self.linear2_weight, + self.linear2_bias, + attention_mask, + ) - if not hasattr(hidden_states, "original_shape"): - original_shape = hidden_states.shape + if not self.is_last_layer: + hidden_states.original_shape = original_shape + elif hidden_states.is_nested and self.is_last_layer: + hidden_states = hidden_states.to_padded_tensor(0.0, original_shape) else: - original_shape = hidden_states.original_shape + residual = hidden_states + hidden_states = F.layer_norm( + hidden_states, + normalized_shape=self.norm1_weight.shape, + weight=self.norm1_weight, + bias=self.norm1_bias, + ) - if hidden_states.is_nested: - attention_mask = None + qkv = F.linear(hidden_states, weight=self.in_proj_weight, bias=self.in_proj_bias) + qkv = qkv.view(qkv.size()[:-1] + (3, self.num_heads, self.attention_head_size)).permute(2, 0, 3, 1, 4) + query, key, value = qkv[0], qkv[1], qkv[2] + + # NOTE: In PyTorch 2.0, passing an attention_mask will automatically dispatch + # to the "math" path and will NOT use flash attention / memory-efficient attention. + # We should support xformers / Hazy-flash / rocm-flash directly and stop relying on PyTorch to do the work. + attention_out = F.scaled_dot_product_attention( + query, + key, + value, + attn_mask=attention_mask, + is_causal=False, + dropout_p=self.dropout if self.training else 0.0, + ) - if attention_mask is not None: - # attention mask comes in with values 0 and -inf. 
we convert to torch.nn.TransformerEncoder style bool mask - # 0->false->keep this token -inf->true->mask this token - if len(attention_mask.shape) == 4: - attention_mask = attention_mask.squeeze(1)[:, 0] - attention_mask = attention_mask.bool() - attention_mask = torch.reshape(attention_mask, (attention_mask.shape[0], attention_mask.shape[-1])) - hidden_states = torch._nested_tensor_from_mask(hidden_states, ~attention_mask) - attention_mask = None + attention_out = attention_out.permute(0, 2, 1, 3).contiguous() + new_attention_out_shape = attention_out.size()[:-2] + (self.num_heads * self.attention_head_size,) + attention_out = attention_out.view(new_attention_out_shape) - hidden_states = torch._transformer_encoder_layer_fwd( - hidden_states, - self.embed_dim, - self.num_heads, - self.in_proj_weight, - self.in_proj_bias, - self.out_proj_weight, - self.out_proj_bias, - self.use_gelu, - self.norm_first, - self.norm1_eps, - self.norm1_weight, - self.norm1_bias, - self.norm2_weight, - self.norm2_bias, - self.linear1_weight, - self.linear1_bias, - self.linear2_weight, - self.linear2_bias, - attention_mask, - ) + hidden_states = residual + F.dropout( + F.linear(attention_out, self.out_proj_weight, self.out_proj_bias), + p=self.dropout, + training=self.training, + ) + residual = hidden_states + hidden_states = F.layer_norm( + hidden_states, + normalized_shape=self.norm2_weight.shape, + weight=self.norm2_weight, + bias=self.norm2_bias, + ) + + # One additional dropout compared to bert + hidden_states = F.dropout( + self.act_fn_callable(F.linear(hidden_states, self.linear1_weight, self.linear1_bias)), + p=self.activation_dropout, + training=self.training, + ) + + hidden_states = residual + F.dropout( + F.linear(hidden_states, self.linear2_weight, self.linear2_bias), + p=self.dropout, + training=self.training, + ) - if not self.is_last_layer: - hidden_states.original_shape = original_shape - elif hidden_states.is_nested and self.is_last_layer: - hidden_states = hidden_states.to_padded_tensor(0.0, original_shape) return (hidden_states,) @@ -619,54 +827,107 @@ def __init__(self, bert_layer, config): "norm2_weight": "output_layer_norm.weight", "norm2_bias": "output_layer_norm.bias", } + self.attention_dropout = config.attention_dropout + self.dropout = config.dropout + self.attention_head_size = config.dim // config.n_heads + self.act_fn_callable = ACT2FN[self.act_fn] self.validate_bettertransformer() - def forward(self, x, attn_mask, head_mask=None, output_attentions=None, *_): - r""" - This is just a wrapper around the forward function proposed in: - https://github.com/huggingface/transformers/pull/19553 - """ - super().forward_checker() - - if x.is_nested: - attn_mask = None - - if attn_mask is not None: - # attention mask comes in with values 0 and -inf. 
we convert to torch.nn.TransformerEncoder style bool mask - # 0->false->keep this token -inf->true->mask this token - attn_mask = attn_mask.bool() - attn_mask = torch.reshape(attn_mask, (attn_mask.shape[0], attn_mask.shape[-1])) - seqlen = attn_mask.shape[1] - lengths = torch.sum(~attn_mask, 1) - if not all(l == seqlen for l in lengths): - x = torch._nested_tensor_from_mask(x, attn_mask) - attn_mask = None - - x = torch._transformer_encoder_layer_fwd( - x, - self.embed_dim, - self.num_heads, - self.in_proj_weight, - self.in_proj_bias, - self.out_proj_weight, - self.out_proj_bias, - self.use_gelu, - self.norm_first, - self.norm1_eps, - self.norm1_weight, - self.norm1_bias, - self.norm2_weight, - self.norm2_bias, - self.linear1_weight, - self.linear1_bias, - self.linear2_weight, - self.linear2_bias, - attn_mask, - ) - if x.is_nested and self.is_last_layer: - x = x.to_padded_tensor(0.0) - return (x,) + def forward(self, hidden_states, attn_mask, head_mask=None, output_attentions=None, *_): + if not self.training and not torch.is_autocast_enabled() and not torch.is_autocast_cpu_enabled(): + if hidden_states.is_nested: + attn_mask = None + + if attn_mask is not None: + # attention mask comes in with values 0 and -inf. we convert to torch.nn.TransformerEncoder style bool mask + # 0->false->keep this token -inf->true->mask this token + attn_mask = attn_mask.bool() + attn_mask = torch.reshape(attn_mask, (attn_mask.shape[0], attn_mask.shape[-1])) + seqlen = attn_mask.shape[1] + lengths = torch.sum(~attn_mask, 1) + if not all(l == seqlen for l in lengths): + hidden_states = torch._nested_tensor_from_mask(hidden_states, attn_mask) + attn_mask = None + + hidden_states = torch._transformer_encoder_layer_fwd( + hidden_states, + self.embed_dim, + self.num_heads, + self.in_proj_weight, + self.in_proj_bias, + self.out_proj_weight, + self.out_proj_bias, + self.use_gelu, + self.norm_first, + self.norm1_eps, + self.norm1_weight, + self.norm1_bias, + self.norm2_weight, + self.norm2_bias, + self.linear1_weight, + self.linear1_bias, + self.linear2_weight, + self.linear2_bias, + attn_mask, + ) + if hidden_states.is_nested and self.is_last_layer: + hidden_states = hidden_states.to_padded_tensor(0.0) + else: + qkv = F.linear(hidden_states, weight=self.in_proj_weight, bias=self.in_proj_bias) + + qkv = qkv.view(qkv.size()[:-1] + (3, self.num_heads, self.attention_head_size)).permute(2, 0, 3, 1, 4) + query, key, value = qkv[0], qkv[1], qkv[2] + + # TODO: Kind of stupid to do that at each layer, should be fixed in transformers + attn_mask = attn_mask.unsqueeze(1).unsqueeze(2).to(dtype=query.dtype) + attn_mask = (1.0 - attn_mask) * torch.finfo(query.dtype).min + + # NOTE: In PyTorch 2.0, passing an attention_mask will automatically dispatch + # to the "math" path and will NOT use flash attention / memory-efficient attention. + # We should support xformers / Hazy-flash / rocm-flash directly and stop relying on PyTorch to do the work. 
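+            # The additive float mask built above uses torch.finfo(query.dtype).min in place of -inf:
+            # padded positions receive a large negative bias and end up with ~zero attention weight
+            # after the softmax performed inside scaled_dot_product_attention.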
+ attention_out = F.scaled_dot_product_attention( + query, + key, + value, + attn_mask=attn_mask, + is_causal=False, + dropout_p=self.attention_dropout if self.training else 0.0, + ) + + attention_out = attention_out.permute(0, 2, 1, 3).contiguous() + new_attention_out_shape = attention_out.size()[:-2] + (self.num_heads * self.attention_head_size,) + attention_out = attention_out.view(new_attention_out_shape) + + # BertSelfOutput + attention_out = F.layer_norm( + F.dropout( + F.linear(attention_out, self.out_proj_weight, self.out_proj_bias), + p=self.dropout, + training=self.training, + ) + + hidden_states, + normalized_shape=self.norm1_weight.shape, + weight=self.norm1_weight, + bias=self.norm1_bias, + ) + + # BertIntermediate + hidden_states = self.act_fn_callable(F.linear(attention_out, self.linear1_weight, self.linear1_bias)) + + # BertOutput + hidden_states = F.layer_norm( + attention_out + + F.dropout( + F.linear(hidden_states, self.linear2_weight, self.linear2_bias), + p=self.dropout, + training=self.training, + ), + normalized_shape=self.norm2_weight.shape, + weight=self.norm2_weight, + bias=self.norm2_bias, + ) + return (hidden_states,) class WhisperEncoderLayerBetterTransformer(BetterTransformerBaseLayer, nn.Module): @@ -749,36 +1010,36 @@ def __init__(self, whisper_layer, config): self.validate_bettertransformer() def forward(self, hidden_states, attention_mask, *_, **__): - r""" - This is just a wrapper around the forward function proposed in: - https://github.com/huggingface/transformers/pull/19553 - """ - super().forward_checker() - attention_mask = None # attention mask seems to be always None: https://github.com/huggingface/transformers/blob/94b3f544a1f5e04b78d87a2ae32a7ac252e22e31/src/transformers/models/whisper/modeling_whisper.py#L690 - - hidden_states = torch._transformer_encoder_layer_fwd( - hidden_states, - self.embed_dim, - self.num_heads, - self.in_proj_weight, - self.in_proj_bias, - self.out_proj_weight, - self.out_proj_bias, - self.use_gelu, - self.norm_first, - self.norm1_eps, - self.norm1_weight, - self.norm1_bias, - self.norm2_weight, - self.norm2_bias, - self.linear1_weight, - self.linear1_bias, - self.linear2_weight, - self.linear2_bias, - attention_mask, - ) - if hidden_states.is_nested and self.is_last_layer: - hidden_states = hidden_states.to_padded_tensor(0.0) + if not self.training and not torch.is_autocast_enabled() and not torch.is_autocast_cpu_enabled(): + attention_mask = None # attention mask seems to be always None: https://github.com/huggingface/transformers/blob/94b3f544a1f5e04b78d87a2ae32a7ac252e22e31/src/transformers/models/whisper/modeling_whisper.py#L690 + + hidden_states = torch._transformer_encoder_layer_fwd( + hidden_states, + self.embed_dim, + self.num_heads, + self.in_proj_weight, + self.in_proj_bias, + self.out_proj_weight, + self.out_proj_bias, + self.use_gelu, + self.norm_first, + self.norm1_eps, + self.norm1_weight, + self.norm1_bias, + self.norm2_weight, + self.norm2_bias, + self.linear1_weight, + self.linear1_bias, + self.linear2_weight, + self.linear2_bias, + attention_mask, + ) + if hidden_states.is_nested and self.is_last_layer: + hidden_states = hidden_states.to_padded_tensor(0.0) + else: + raise NotImplementedError( + "Training and Autocast are not implemented for BetterTransformer + Whisper. Please open an issue." 
+ ) return (hidden_states,) @@ -869,36 +1130,36 @@ def __init__(self, vit_layer, config): self.validate_bettertransformer() def forward(self, hidden_states, *_, **__): - r""" - This is just a wrapper around the forward function proposed in: - https://github.com/huggingface/transformers/pull/19553 - """ - super().forward_checker() - attention_mask = None - - hidden_states = torch._transformer_encoder_layer_fwd( - hidden_states, - self.embed_dim, - self.num_heads, - self.in_proj_weight, - self.in_proj_bias, - self.out_proj_weight, - self.out_proj_bias, - self.use_gelu, - self.norm_first, - self.norm1_eps, - self.norm1_weight, - self.norm1_bias, - self.norm2_weight, - self.norm2_bias, - self.linear1_weight, - self.linear1_bias, - self.linear2_weight, - self.linear2_bias, - attention_mask, - ) - if hidden_states.is_nested and self.is_last_layer: - hidden_states = hidden_states.to_padded_tensor(0.0) + if not self.training and not torch.is_autocast_enabled() and not torch.is_autocast_cpu_enabled(): + attention_mask = None + + hidden_states = torch._transformer_encoder_layer_fwd( + hidden_states, + self.embed_dim, + self.num_heads, + self.in_proj_weight, + self.in_proj_bias, + self.out_proj_weight, + self.out_proj_bias, + self.use_gelu, + self.norm_first, + self.norm1_eps, + self.norm1_weight, + self.norm1_bias, + self.norm2_weight, + self.norm2_bias, + self.linear1_weight, + self.linear1_bias, + self.linear2_weight, + self.linear2_bias, + attention_mask, + ) + if hidden_states.is_nested and self.is_last_layer: + hidden_states = hidden_states.to_padded_tensor(0.0) + else: + raise NotImplementedError( + "Training and Autocast are not implemented for BetterTransformer + ViT. Please open an issue." + ) return (hidden_states,) @@ -989,36 +1250,36 @@ def __init__(self, vilt_layer, config): self.validate_bettertransformer() def forward(self, hidden_states, *_, **__): - r""" - This is just a wrapper around the forward function proposed in: - https://github.com/huggingface/transformers/pull/19553 - """ - super().forward_checker() - attention_mask = None - - hidden_states = torch._transformer_encoder_layer_fwd( - hidden_states, - self.embed_dim, - self.num_heads, - self.in_proj_weight, - self.in_proj_bias, - self.out_proj_weight, - self.out_proj_bias, - self.use_gelu, - self.norm_first, - self.norm1_eps, - self.norm1_weight, - self.norm1_bias, - self.norm2_weight, - self.norm2_bias, - self.linear1_weight, - self.linear1_bias, - self.linear2_weight, - self.linear2_bias, - attention_mask, - ) - if hidden_states.is_nested and self.is_last_layer: - hidden_states = hidden_states.to_padded_tensor(0.0) + if not self.training and not torch.is_autocast_enabled() and not torch.is_autocast_cpu_enabled(): + attention_mask = None + + hidden_states = torch._transformer_encoder_layer_fwd( + hidden_states, + self.embed_dim, + self.num_heads, + self.in_proj_weight, + self.in_proj_bias, + self.out_proj_weight, + self.out_proj_bias, + self.use_gelu, + self.norm_first, + self.norm1_eps, + self.norm1_weight, + self.norm1_bias, + self.norm2_weight, + self.norm2_bias, + self.linear1_weight, + self.linear1_bias, + self.linear2_weight, + self.linear2_bias, + attention_mask, + ) + if hidden_states.is_nested and self.is_last_layer: + hidden_states = hidden_states.to_padded_tensor(0.0) + else: + raise NotImplementedError( + "Training and Autocast are not implemented for BetterTransformer + Vilt. Please open an issue." 
+ ) return (hidden_states,) @@ -1105,47 +1366,47 @@ def __init__(self, wav2vec2_layer, config): self.validate_bettertransformer() def forward(self, hidden_states, attention_mask, **__): - r""" - This is just a wrapper around the forward function proposed in: - https://github.com/huggingface/transformers/pull/19553 - """ - super().forward_checker() - if hidden_states.is_nested: - attention_mask = None - - if attention_mask is not None: - # attention mask comes in with values 0 and -inf. we convert to torch.nn.TransformerEncoder style bool mask - # 0->false->keep this token -inf->true->mask this token - attention_mask = attention_mask.bool() - if len(attention_mask.shape) == 4: - attention_mask = attention_mask.squeeze(1)[:, 0] - attention_mask = torch.reshape(attention_mask, (attention_mask.shape[0], attention_mask.shape[-1])) - hidden_states = torch._nested_tensor_from_mask(hidden_states, ~attention_mask) - attention_mask = None - - hidden_states = torch._transformer_encoder_layer_fwd( - hidden_states, - self.embed_dim, - self.num_heads, - self.in_proj_weight, - self.in_proj_bias, - self.out_proj_weight, - self.out_proj_bias, - self.use_gelu, - self.norm_first, - self.norm1_eps, - self.norm1_weight, - self.norm1_bias, - self.norm2_weight, - self.norm2_bias, - self.linear1_weight, - self.linear1_bias, - self.linear2_weight, - self.linear2_bias, - attention_mask, - ) - if hidden_states.is_nested and self.is_last_layer: - hidden_states = hidden_states.to_padded_tensor(0.0) + if not self.training and not torch.is_autocast_enabled() and not torch.is_autocast_cpu_enabled(): + if hidden_states.is_nested: + attention_mask = None + + if attention_mask is not None: + # attention mask comes in with values 0 and -inf. we convert to torch.nn.TransformerEncoder style bool mask + # 0->false->keep this token -inf->true->mask this token + attention_mask = attention_mask.bool() + if len(attention_mask.shape) == 4: + attention_mask = attention_mask.squeeze(1)[:, 0] + attention_mask = torch.reshape(attention_mask, (attention_mask.shape[0], attention_mask.shape[-1])) + hidden_states = torch._nested_tensor_from_mask(hidden_states, ~attention_mask) + attention_mask = None + + hidden_states = torch._transformer_encoder_layer_fwd( + hidden_states, + self.embed_dim, + self.num_heads, + self.in_proj_weight, + self.in_proj_bias, + self.out_proj_weight, + self.out_proj_bias, + self.use_gelu, + self.norm_first, + self.norm1_eps, + self.norm1_weight, + self.norm1_bias, + self.norm2_weight, + self.norm2_bias, + self.linear1_weight, + self.linear1_bias, + self.linear2_weight, + self.linear2_bias, + attention_mask, + ) + if hidden_states.is_nested and self.is_last_layer: + hidden_states = hidden_states.to_padded_tensor(0.0) + else: + raise NotImplementedError( + "Training and Autocast are not implemented for BetterTransformer + Wav2Vec2. Please open an issue." + ) return (hidden_states,) @@ -1227,61 +1488,61 @@ def __init__(self, fsmt_layer, config): self.validate_bettertransformer() def forward(self, hidden_states, attention_mask, position_bias=None, *_, **__): - r""" - This is just a wrapper around the forward function proposed in: - https://github.com/huggingface/transformers/pull/19553 - """ - super().forward_checker() - - if not hasattr(hidden_states, "original_shape"): - original_shape = hidden_states.shape - else: - original_shape = hidden_states.original_shape - - if hidden_states.is_nested: - attention_mask = None - - if attention_mask is not None: - # attention mask comes in with values 0 and -inf. 
we convert to torch.nn.TransformerEncoder style bool mask - # 0->false->keep this token -inf->true->mask this token - attention_mask = attention_mask.bool() - attention_mask = torch.reshape(attention_mask, (attention_mask.shape[0], attention_mask.shape[-1])) - - # FSMT swaps the first two axis before calling the encoder stack - # Reference: https://github.com/huggingface/transformers/blob/699e90437f984d69ad3c9b891dd2e9d0fc2cffe4/src/transformers/models/fsmt/modeling_fsmt.py#L508 - if hidden_states.shape[0] != attention_mask.shape[0]: - hidden_states = hidden_states.transpose(1, 0) + if not self.training and not torch.is_autocast_enabled() and not torch.is_autocast_cpu_enabled(): + if not hasattr(hidden_states, "original_shape"): original_shape = hidden_states.shape + else: + original_shape = hidden_states.original_shape + + if hidden_states.is_nested: + attention_mask = None + + if attention_mask is not None: + # attention mask comes in with values 0 and -inf. we convert to torch.nn.TransformerEncoder style bool mask + # 0->false->keep this token -inf->true->mask this token + attention_mask = attention_mask.bool() + attention_mask = torch.reshape(attention_mask, (attention_mask.shape[0], attention_mask.shape[-1])) + + # FSMT swaps the first two axis before calling the encoder stack + # Reference: https://github.com/huggingface/transformers/blob/699e90437f984d69ad3c9b891dd2e9d0fc2cffe4/src/transformers/models/fsmt/modeling_fsmt.py#L508 + if hidden_states.shape[0] != attention_mask.shape[0]: + hidden_states = hidden_states.transpose(1, 0) + original_shape = hidden_states.shape + + hidden_states = torch._nested_tensor_from_mask(hidden_states, ~attention_mask) + attention_mask = None + + hidden_states = torch._transformer_encoder_layer_fwd( + hidden_states, + self.embed_dim, + self.num_heads, + self.in_proj_weight, + self.in_proj_bias, + self.out_proj_weight, + self.out_proj_bias, + self.use_gelu, + self.norm_first, + self.norm1_eps, + self.norm1_weight, + self.norm1_bias, + self.norm2_weight, + self.norm2_bias, + self.linear1_weight, + self.linear1_bias, + self.linear2_weight, + self.linear2_bias, + attention_mask, + ) - hidden_states = torch._nested_tensor_from_mask(hidden_states, ~attention_mask) - attention_mask = None - - hidden_states = torch._transformer_encoder_layer_fwd( - hidden_states, - self.embed_dim, - self.num_heads, - self.in_proj_weight, - self.in_proj_bias, - self.out_proj_weight, - self.out_proj_bias, - self.use_gelu, - self.norm_first, - self.norm1_eps, - self.norm1_weight, - self.norm1_bias, - self.norm2_weight, - self.norm2_bias, - self.linear1_weight, - self.linear1_bias, - self.linear2_weight, - self.linear2_bias, - attention_mask, - ) + if not self.is_last_layer: + hidden_states.original_shape = original_shape + elif hidden_states.is_nested and self.is_last_layer: + hidden_states = hidden_states.to_padded_tensor(0.0, original_shape) + else: + raise NotImplementedError( + "Training and Autocast are not implemented for BetterTransformer + FSMT. Please open an issue." 
+ ) - if not self.is_last_layer: - hidden_states.original_shape = original_shape - elif hidden_states.is_nested and self.is_last_layer: - hidden_states = hidden_states.to_padded_tensor(0.0, original_shape) return (hidden_states, attention_mask) @@ -1368,54 +1629,54 @@ def __init__(self, prophetnet_layer, config): self.validate_bettertransformer() def forward(self, hidden_states, attention_mask, *_, **__): - r""" - This is just a wrapper around the forward function proposed in: - https://github.com/huggingface/transformers/pull/19553 - """ - super().forward_checker() - - if not hasattr(hidden_states, "original_shape"): - original_shape = hidden_states.shape - else: - original_shape = hidden_states.original_shape + if not self.training and not torch.is_autocast_enabled() and not torch.is_autocast_cpu_enabled(): + if not hasattr(hidden_states, "original_shape"): + original_shape = hidden_states.shape + else: + original_shape = hidden_states.original_shape - if hidden_states.is_nested: - attention_mask = None + if hidden_states.is_nested: + attention_mask = None - if attention_mask is not None: - # attention mask comes in with values 0 and -inf. we convert to torch.nn.TransformerEncoder style bool mask - # 0->false->keep this token -inf->true->mask this token - attention_mask = attention_mask.squeeze(1)[:, 0] - attention_mask = attention_mask.bool() - attention_mask = torch.reshape(attention_mask, (attention_mask.shape[0], attention_mask.shape[-1])) - hidden_states = torch._nested_tensor_from_mask(hidden_states, ~attention_mask) - attention_mask = None + if attention_mask is not None: + # attention mask comes in with values 0 and -inf. we convert to torch.nn.TransformerEncoder style bool mask + # 0->false->keep this token -inf->true->mask this token + attention_mask = attention_mask.squeeze(1)[:, 0] + attention_mask = attention_mask.bool() + attention_mask = torch.reshape(attention_mask, (attention_mask.shape[0], attention_mask.shape[-1])) + hidden_states = torch._nested_tensor_from_mask(hidden_states, ~attention_mask) + attention_mask = None + + hidden_states = torch._transformer_encoder_layer_fwd( + hidden_states, + self.embed_dim, + self.num_heads, + self.in_proj_weight, + self.in_proj_bias, + self.out_proj_weight, + self.out_proj_bias, + self.use_gelu, + self.norm_first, + self.norm1_eps, + self.norm1_weight, + self.norm1_bias, + self.norm2_weight, + self.norm2_bias, + self.linear1_weight, + self.linear1_bias, + self.linear2_weight, + self.linear2_bias, + attention_mask, + ) + if not self.is_last_layer: + hidden_states.original_shape = original_shape + elif hidden_states.is_nested and self.is_last_layer: + hidden_states = hidden_states.to_padded_tensor(0.0, original_shape) + else: + raise ValueError( + "Training and Autocast are not implemented for BetterTransformer + ProphetNet. Please open an issue." 
+ ) - hidden_states = torch._transformer_encoder_layer_fwd( - hidden_states, - self.embed_dim, - self.num_heads, - self.in_proj_weight, - self.in_proj_bias, - self.out_proj_weight, - self.out_proj_bias, - self.use_gelu, - self.norm_first, - self.norm1_eps, - self.norm1_weight, - self.norm1_bias, - self.norm2_weight, - self.norm2_bias, - self.linear1_weight, - self.linear1_bias, - self.linear2_weight, - self.linear2_bias, - attention_mask, - ) - if not self.is_last_layer: - hidden_states.original_shape = original_shape - elif hidden_states.is_nested and self.is_last_layer: - hidden_states = hidden_states.to_padded_tensor(0.0, original_shape) return (hidden_states,) @@ -1502,39 +1763,38 @@ def __init__(self, layer, config): self.validate_bettertransformer() def forward(self, hidden_states, attention_mask, *_, **__): - r""" - This is just a wrapper around the forward function proposed in: - https://github.com/huggingface/transformers/pull/19553 - """ - super().forward_checker() - - # we expect attention_mask to be None in the vision model - if attention_mask is not None: - raise ValueError( - "Please do not use attention masks when using `BetterTransformer` converted vision models" + if not self.training and not torch.is_autocast_enabled() and not torch.is_autocast_cpu_enabled(): + # we expect attention_mask to be None in the vision model + if attention_mask is not None: + raise ValueError( + "Please do not use attention masks when using `BetterTransformer` converted vision models" + ) + + hidden_states = torch._transformer_encoder_layer_fwd( + hidden_states, + self.embed_dim, + self.num_heads, + self.in_proj_weight, + self.in_proj_bias, + self.out_proj_weight, + self.out_proj_bias, + self.use_gelu, + self.norm_first, + self.norm1_eps, + self.norm1_weight, + self.norm1_bias, + self.norm2_weight, + self.norm2_bias, + self.linear1_weight, + self.linear1_bias, + self.linear2_weight, + self.linear2_bias, + attention_mask, + ) + else: + NotImplementedError( + "Training and Autocast are not implemented for BetterTransformer + CLIP. Please open an issue." ) - - hidden_states = torch._transformer_encoder_layer_fwd( - hidden_states, - self.embed_dim, - self.num_heads, - self.in_proj_weight, - self.in_proj_bias, - self.out_proj_weight, - self.out_proj_bias, - self.use_gelu, - self.norm_first, - self.norm1_eps, - self.norm1_weight, - self.norm1_bias, - self.norm2_weight, - self.norm2_bias, - self.linear1_weight, - self.linear1_bias, - self.linear2_weight, - self.linear2_bias, - attention_mask, - ) return (hidden_states,) diff --git a/optimum/bettertransformer/transformation.py b/optimum/bettertransformer/transformation.py index c0d6c734aa..feba07a172 100644 --- a/optimum/bettertransformer/transformation.py +++ b/optimum/bettertransformer/transformation.py @@ -231,12 +231,9 @@ def transform( f" Currently supported models are: {BetterTransformerManager.MODEL_MAPPING.keys()}." ) - # check on 1.14 in case there is any more patch release on 1.13 - if BetterTransformerManager.requires_torch_20(model.config.model_type) and parse(torch.__version__) <= parse( - "1.14" - ): + if parse(torch.__version__) <= parse("1.14"): raise ValueError( - f"BetterTransformer for {model.config.model_type} requires torch>=2.0 but {torch.__version__} is installed. Please upgrade PyTorch." + f"BetterTransformer requires torch>=2.0 but {torch.__version__} is installed. Please upgrade PyTorch." 
) hf_config = model.config @@ -245,6 +242,8 @@ def transform( # Remove the hooks from the original model to avoid weights being on `meta` device. remove_hook_from_module(model, recurse=True) + training_mode = model.training + if keep_original_model: try: if not check_if_pytorch_greater(2.0, "Please upgrade PyTorch to >=2.0 to use training mode"): @@ -258,9 +257,9 @@ def transform( " `keep_original_model=False` and create a new copy of the original" " model somewhere else." ) - model_fast = replace_to_bettertransformer(model_fast, hf_config).eval() + model_fast = replace_to_bettertransformer(model_fast, hf_config) else: - model_fast = replace_to_bettertransformer(model, hf_config).eval() + model_fast = replace_to_bettertransformer(model, hf_config) model = None if BetterTransformerManager.requires_nested_tensor(model_fast.config.model_type): @@ -290,11 +289,11 @@ def transform( model = dispatch_model(model, hf_device_map, offload_dir=offload_dir) # See: https://github.com/pytorch/pytorch/issues/96099 - if BetterTransformerManager.requires_torch_20(model_fast.config.model_type): - logging.warning( - f"For training, the BetterTransformer implementation for {model_fast.config.model_type} " - " architecture currently does not support padding as fused kernels do not support custom" - " attention masks. Beware that passing padded batched training data may result in unexpected outputs." + if model_fast.config.model_type in BetterTransformerManager.DO_NOT_SUPPORT_PADDED_TRAINING: + logger.warning( + f"For decoder models (here {model_fast.config.model_type}), the BetterTransformer implementation" + " does not support padding during training, as the fused kernels do not support" + " attention masks. Beware that passing padded batched data during training may result in unexpected outputs." ) # Overwrite the `save_pretrained` method @@ -306,6 +305,11 @@ def transform( model_fast.save_pretrained = raise_save_or_push_incompatible model_fast.push_to_hub = raise_save_or_push_incompatible + if training_mode: + model_fast = model_fast.train() + else: + model_fast = model_fast.eval() + return model_fast def reverse(bt_model: "PreTrainedModel") -> "PreTrainedModel": diff --git a/optimum/utils/testing_utils.py b/optimum/utils/testing_utils.py index 1d1177ae72..e48a128051 100644 --- a/optimum/utils/testing_utils.py +++ b/optimum/utils/testing_utils.py @@ -23,7 +23,6 @@ from typing import Any, Callable, Dict, Iterable, Optional, Tuple import torch -from packaging.version import parse from . import is_accelerate_available, is_diffusers_available @@ -63,11 +62,6 @@ def require_torch_gpu(test_case): return unittest.skipUnless(torch_device == "cuda", "test requires CUDA")(test_case) -def require_torch_20(test_case): - """Decorator marking a test that requires torch>=2.0.""" - return unittest.skipUnless(parse(torch.__version__) > parse("1.14"), "test requires torch>=2.0")(test_case) - - def require_hf_token(test_case): """ Decorator marking a test that requires huggingface hub token. 
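With the transformation.py change above, transform() now preserves the training/eval mode of the incoming model instead of forcing eval(). A minimal usage sketch of that behavior, assuming torch>=2.0 and the gpt2 checkpoint (one of the decoder architectures whose training path is enabled by this patch); training should be run on unpadded batches, since the fused kernels do not support custom attention masks:

    import torch
    from transformers import AutoModelForCausalLM
    from optimum.bettertransformer import BetterTransformer

    model = AutoModelForCausalLM.from_pretrained("gpt2")
    model.train()  # the mode set here is now preserved by transform()

    bt_model = BetterTransformer.transform(model, keep_original_model=False)
    assert bt_model.training

    # unpadded dummy batch: all sequences share the same length, so no attention mask is needed
    input_ids = torch.randint(0, bt_model.config.vocab_size, (2, 16))
    loss = bt_model(input_ids=input_ids, labels=input_ids).loss
    loss.backward()
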
diff --git a/tests/bettertransformer/test_audio.py b/tests/bettertransformer/test_audio.py index 95f74de491..595bf6c5a4 100644 --- a/tests/bettertransformer/test_audio.py +++ b/tests/bettertransformer/test_audio.py @@ -21,7 +21,7 @@ from transformers import AutoFeatureExtractor, AutoModel, AutoProcessor from optimum.bettertransformer import BetterTransformer -from optimum.utils.testing_utils import grid_parameters, require_torch_20 +from optimum.utils.testing_utils import grid_parameters ALL_AUDIO_MODELS_TO_TEST = [ @@ -64,21 +64,16 @@ def prepare_inputs_for_class(self, model_id, model_type): return input_dict @parameterized.expand(grid_parameters(FULL_GRID)) - @require_torch_20 def test_invert_modules(self, test_name: str, model_type: str, keep_original_model=False): - self._skip_on_torch_version(model_type) model_id = MODELS_DICT[model_type] self._test_invert_modules(model_id=model_id, keep_original_model=keep_original_model) @parameterized.expand(grid_parameters(FULL_GRID)) - @require_torch_20 def test_save_load_invertible(self, test_name: str, model_type: str, keep_original_model=False): - self._skip_on_torch_version(model_type) model_id = MODELS_DICT[model_type] self._test_save_load_invertible(model_id=model_id, keep_original_model=keep_original_model) @parameterized.expand(grid_parameters(FULL_GRID)) - @require_torch_20 def test_invert_model_logits(self, test_name: str, model_type: str, keep_original_model=False): model_id = MODELS_DICT[model_type] self._test_invert_model_logits( @@ -165,24 +160,7 @@ def test_logits(self, model_type: str): ), ) - @parameterized.expand(SUPPORTED_ARCH) - def test_raise_autocast(self, model_type: str): - model_ids = ( - MODELS_DICT[model_type] if isinstance(MODELS_DICT[model_type], tuple) else (MODELS_DICT[model_type],) - ) - for model_id in model_ids: - self._test_raise_autocast(model_id, model_type=model_type) - - @parameterized.expand(SUPPORTED_ARCH) - def test_raise_train(self, model_type: str): - model_ids = ( - MODELS_DICT[model_type] if isinstance(MODELS_DICT[model_type], tuple) else (MODELS_DICT[model_type],) - ) - for model_id in model_ids: - self._test_raise_train(model_id, model_type=model_type) - @parameterized.expand(grid_parameters(FULL_GRID)) - @require_torch_20 def test_invert_modules(self, test_name: str, model_type: str, keep_original_model=False): if model_type in ["hubert", "wav2vec2"] and keep_original_model is True: self.skipTest(f"{model_type} does not support keep_original_model=True") @@ -194,7 +172,6 @@ def test_invert_modules(self, test_name: str, model_type: str, keep_original_mod self._test_invert_modules(model_id=model_id, keep_original_model=keep_original_model) @parameterized.expand(grid_parameters(FULL_GRID)) - @require_torch_20 def test_save_load_invertible(self, test_name: str, model_type: str, keep_original_model=False): if model_type in ["hubert", "wav2vec2"] and keep_original_model is True: self.skipTest(f"{model_type} does not support keep_original_model=True") @@ -206,7 +183,6 @@ def test_save_load_invertible(self, test_name: str, model_type: str, keep_origin self._test_save_load_invertible(model_id=model_id, keep_original_model=keep_original_model) @parameterized.expand(grid_parameters(FULL_GRID)) - @require_torch_20 def test_invert_model_logits(self, test_name: str, model_type: str, keep_original_model=False): if model_type == "hubert" and keep_original_model is True: self.skipTest("hubert does not support keep_original_model=True") diff --git a/tests/bettertransformer/test_common.py 
b/tests/bettertransformer/test_common.py index ffcd413f4a..e34923730c 100644 --- a/tests/bettertransformer/test_common.py +++ b/tests/bettertransformer/test_common.py @@ -16,23 +16,17 @@ import unittest from unittest.mock import patch -import torch import transformers -from packaging.version import parse from parameterized import parameterized from testing_utils import MODELS_DICT from transformers import AutoModel from optimum.bettertransformer import BetterTransformer, BetterTransformerManager from optimum.pipelines import pipeline -from optimum.utils.testing_utils import grid_parameters, require_torch_20 +from optimum.utils.testing_utils import grid_parameters class BetterTransformerIntegrationTests(unittest.TestCase): - def _skip_on_torch_version(self, model_type: str): - if BetterTransformerManager.requires_torch_20(model_type) and parse(torch.__version__) < parse("1.14"): - self.skipTest(f"The model type {model_type} require PyTorch 2.0 for BetterTransformer") - def test_raise_error_on_double_transform_call(self): model = AutoModel.from_pretrained("hf-internal-testing/tiny-random-BertModel") @@ -60,7 +54,6 @@ def test_raise_on_save(self, model_type: str): r""" Test if the conversion properly raises an error if someone tries to save the model using `save_pretrained`. """ - self._skip_on_torch_version(model_type) model_ids = ( MODELS_DICT[model_type] if isinstance(MODELS_DICT[model_type], tuple) else (MODELS_DICT[model_type],) ) @@ -76,7 +69,6 @@ def test_conversion(self, model_type: str): This tests if the conversion of a slow model to its BetterTransformer version using fastpath has been successful. """ - self._skip_on_torch_version(model_type) model_ids = ( MODELS_DICT[model_type] if isinstance(MODELS_DICT[model_type], tuple) else (MODELS_DICT[model_type],) ) @@ -93,13 +85,11 @@ def test_conversion(self, model_type: str): self.assertTrue(hasattr(converted_model, "generate")) @parameterized.expand(grid_parameters({"model_type": MODELS_DICT.keys(), "keep_original_model": [True, False]})) - @require_torch_20 def test_raise_save_pretrained_error(self, test_name: str, model_type: str, keep_original_model: bool): r""" Test if the converted model raises an error when calling `save_pretrained` but not when the model is reverted """ - self._skip_on_torch_version(model_type) if model_type in ["wav2vec2", "hubert"] and keep_original_model is True: self.skipTest("These architectures do not support deepcopy") @@ -125,7 +115,6 @@ def test_raise_activation_fun(self, model_type: str): A tests that checks if the conversion raises an error if the model contains an activation function that is not supported by `BetterTransformer`. 
Here we need to loop over the config files """ - self._skip_on_torch_version(model_type) if BetterTransformerManager.requires_strict_validation(model_type) is False: self.skipTest("The architecture does not require a specific activation function") diff --git a/tests/bettertransformer/test_decoder.py b/tests/bettertransformer/test_decoder.py index 3fb92ab126..d3dedbe7e5 100644 --- a/tests/bettertransformer/test_decoder.py +++ b/tests/bettertransformer/test_decoder.py @@ -23,7 +23,7 @@ from optimum.bettertransformer import BetterTransformer from optimum.utils import DummyPastKeyValuesGenerator, NormalizedConfigManager -from optimum.utils.testing_utils import grid_parameters, require_accelerate, require_torch_20, require_torch_gpu +from optimum.utils.testing_utils import grid_parameters, require_accelerate, require_torch_gpu class BetterTransformersDecoderTest(BetterTransformersTestMixin, unittest.TestCase): @@ -34,7 +34,9 @@ class BetterTransformersDecoderTest(BetterTransformersTestMixin, unittest.TestCa "keep_original_model": [True, False], } - def prepare_inputs_for_class(self, model_id: str, model_type: str, batch_size: int = 2, **preprocessor_kwargs): + def prepare_inputs_for_class( + self, model_id: str, model_type: str, batch_size: int = 2, no_padding: bool = False, **preprocessor_kwargs + ): tokenizer = AutoTokenizer.from_pretrained(model_id) if not hasattr(tokenizer, "pad_token") or tokenizer.pad_token is None: if tokenizer.eos_token != "": @@ -45,6 +47,8 @@ def prepare_inputs_for_class(self, model_id: str, model_type: str, batch_size: i padding = preprocessor_kwargs.pop("padding", True) if batch_size == 1: texts = ["a dummy input yeah!"] + elif no_padding: + texts = ["a dummy input yeah!"] * batch_size else: texts = ["a dummy input yeah!"] + ["and two"] * (batch_size - 1) inputs = tokenizer(texts, return_tensors="pt", padding=padding, max_length=20, **preprocessor_kwargs) @@ -61,13 +65,24 @@ def prepare_inputs_for_class(self, model_id: str, model_type: str, batch_size: i ) ) def test_logits_without_cache(self, test_name: str, model_type: str, padding, batch_size: int): - self._skip_on_torch_version(model_type) if batch_size == 1 and padding == "max_length": self.skipTest("batch_size=1 + padding='max_length' is unsupported") model_id = MODELS_DICT[model_type] self._test_logits(model_id, model_type=model_type, padding=padding, batch_size=batch_size) + @parameterized.expand( + grid_parameters( + { + "model_type": SUPPORTED_ARCH, + "batch_size": [1, 3], + } + ) + ) + def test_logits_backward(self, test_name: str, model_type: str, batch_size: int): + model_id = MODELS_DICT[model_type] + self._test_logits_backward(model_id, model_type=model_type, no_padding=True, batch_size=batch_size) + @parameterized.expand( grid_parameters( { @@ -81,8 +96,6 @@ def test_logits_without_cache(self, test_name: str, model_type: str, padding, ba @require_torch_gpu @pytest.mark.gpu_test def test_fp16_inference(self, test_name: str, model_type: str, use_to_operator: bool, batch_size: int): - self._skip_on_torch_version(model_type) - model_id = MODELS_DICT[model_type] self._test_fp16_inference( model_id, @@ -101,7 +114,6 @@ def test_fp16_inference(self, test_name: str, model_type: str, use_to_operator: ) ) def test_logits_with_cache(self, test_name: str, model_type: str, batch_size: int): - self._skip_on_torch_version(model_type) input_ids = torch.randint(low=1, high=10, size=(batch_size, 1)) seq_length = 12 attention_mask = torch.ones(batch_size, seq_length + 1, dtype=torch.int32) @@ -135,7 +147,6 @@ def 
test_logits_with_cache(self, test_name: str, model_type: str, batch_size: in grid_parameters({"model_type": SUPPORTED_ARCH, "batch_size": [1, 3], "padding": [True, "max_length"]}) ) def test_generation(self, test_name: str, model_type: str, batch_size: int, padding: str): - self._skip_on_torch_version(model_type) if batch_size == 1 and padding == "max_length": self.skipTest("batch_size=1 + padding='max_length' is unsupported") @@ -167,35 +178,23 @@ def test_generation(self, test_name: str, model_type: str, batch_size: int, padd f" Maxdiff: {(result_vanilla - result_bettertransformer).abs().max()}", ) - @parameterized.expand(SUPPORTED_ARCH) - def test_raise_autocast(self, model_type: str): - self._skip_on_torch_version(model_type) - model_id = MODELS_DICT[model_type] - self._test_raise_autocast(model_id, model_type=model_type) - @parameterized.expand(SUPPORTED_ARCH) @pytest.mark.training def test_train(self, model_type: str): - self._skip_on_torch_version(model_type) model_id = MODELS_DICT[model_type] self._test_train_decoder(model_id, model_type=model_type) @parameterized.expand(grid_parameters(FULL_GRID)) - @require_torch_20 def test_invert_modules(self, test_name: str, model_type: str, keep_original_model=False): - self._skip_on_torch_version(model_type) model_id = MODELS_DICT[model_type] self._test_invert_modules(model_id=model_id, keep_original_model=keep_original_model) @parameterized.expand(grid_parameters(FULL_GRID)) - @require_torch_20 def test_save_load_invertible(self, test_name: str, model_type: str, keep_original_model=False): - self._skip_on_torch_version(model_type) model_id = MODELS_DICT[model_type] self._test_save_load_invertible(model_id=model_id, keep_original_model=keep_original_model) @parameterized.expand(grid_parameters(FULL_GRID)) - @require_torch_20 def test_invert_model_logits(self, test_name: str, model_type: str, keep_original_model=False): model_id = MODELS_DICT[model_type] self._test_invert_model_logits( diff --git a/tests/bettertransformer/test_encoder.py b/tests/bettertransformer/test_encoder.py index b6859450cd..1a152c7f6e 100644 --- a/tests/bettertransformer/test_encoder.py +++ b/tests/bettertransformer/test_encoder.py @@ -20,10 +20,11 @@ import transformers from parameterized import parameterized from testing_utils import MODELS_DICT, BetterTransformersTestMixin -from transformers import AutoModel +from transformers import AutoModel, AutoProcessor, AutoTokenizer from optimum.bettertransformer import BetterTransformer -from optimum.utils.testing_utils import grid_parameters, require_accelerate, require_torch_20, require_torch_gpu +from optimum.pipelines import pipeline +from optimum.utils.testing_utils import grid_parameters, require_accelerate, require_torch_gpu class BetterTransformersEncoderTest(BetterTransformersTestMixin): @@ -64,12 +65,27 @@ class BetterTransformersEncoderTest(BetterTransformersTestMixin): def tearDown(self): gc.collect() - def prepare_inputs_for_class(self, model_id, model_type): - input_dict = { - "input_ids": torch.LongTensor([[1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]), - "attention_mask": torch.LongTensor([[1, 1, 1, 1, 1, 1], [1, 1, 1, 0, 0, 0]]), - } - return input_dict + def prepare_inputs_for_class(self, model_id: str, model_type: str, batch_size: int = 2, **preprocessor_kwargs): + # TODO: remove the need for tokenizer + if model_type == "markuplm": + preprocessor = AutoProcessor.from_pretrained(model_id) + else: + preprocessor = AutoTokenizer.from_pretrained(model_id) + if batch_size == 1: + texts = ["a dummy input yeah yeah!"] 
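+            # a single sequence keeps the batch unpadded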
+ else: + texts = ["a dummy input yeah yeah!"] + ["and two"] * (batch_size - 1) + + padding = preprocessor_kwargs.pop("padding", True) + if padding == "max_length": + max_length = 25 + else: + max_length = None + + inputs = preprocessor( + texts, return_tensors="pt", padding=padding, max_length=max_length, **preprocessor_kwargs + ) + return inputs def test_raise_pos_emb(self): r""" @@ -129,8 +145,6 @@ def test_pipeline_on_cpu(self): r""" This test runs pipeline together with Better Transformers converted models using optimum `pipeline`. """ - from optimum.pipelines import pipeline - model_name = "distilbert-base-uncased" unmasker = pipeline("fill-mask", model_name, accelerator="bettertransformer") @@ -145,8 +159,6 @@ def test_pipeline_on_gpu(self): r""" This test runs pipeline together with Better Transformers converted models using optimum `pipeline`. """ - from optimum.pipelines import pipeline - model_name = "distilbert-base-uncased" unmasker = pipeline("fill-mask", model_name, accelerator="bettertransformer", device="cuda:0") @@ -197,21 +209,6 @@ def check_accelerate_compatibility_cpu_gpu(self, keep_original_model=True, max_m self.assertTrue(torch.allclose(output_bt[0][1, 3:], torch.zeros_like(output_bt[0][1, 3:]))) gc.collect() - @parameterized.expand(SUPPORTED_ARCH) - def test_raise_autocast(self, model_type: str): - if model_type == "rocbert": - self.skipTest( - "unrelated issue with torch.amp.autocast with rocbert (expected scalar type BFloat16 but found Float)" - ) - - model_id = MODELS_DICT[model_type] - self._test_raise_autocast(model_id, model_type) - - @parameterized.expand(SUPPORTED_ARCH) - def test_raise_train(self, model_type: str): - model_id = MODELS_DICT[model_type] - self._test_raise_train(model_id, model_type) - @pytest.mark.gpu_test @pytest.mark.accelerate_test def test_accelerate_compatibility_cpu_gpu(self): @@ -250,21 +247,60 @@ def test_accelerate_compatibility_single_gpu_without_keeping(self): max_memory = {0: "2GB"} self.check_accelerate_compatibility_cpu_gpu(keep_original_model=False, max_memory=max_memory) + @parameterized.expand( + grid_parameters( + { + "model_type": SUPPORTED_ARCH, + "batch_size": [1, 3], + } + ) + ) + def test_logits(self, test_name: str, model_type: str, batch_size: int): + # TODO: enable those tests + if model_type in ["rocbert", "splinter", "markuplm", "bert-generation"]: + self.skipTest(f"tiny tokenizers are broken on the Hub {model_type}") + if model_type in ["tapas"]: + self.skipTest(f"{model_type} requires dataframe") + + model_id = MODELS_DICT[model_type] + self._test_logits(model_id=model_id, model_type=model_type, batch_size=batch_size) + + @parameterized.expand( + grid_parameters( + { + "model_type": SUPPORTED_ARCH, + "batch_size": [1, 3], + } + ) + ) + def test_logits_backward(self, test_name: str, model_type: str, batch_size: int): + # TODO: enable those tests + if model_type in ["rocbert", "splinter", "markuplm", "bert-generation"]: + self.skipTest(f"tiny tokenizer is broken on the Hub for {model_type}") + if model_type in ["tapas"]: + self.skipTest(f"{model_type} requires dataframe") + + model_id = MODELS_DICT[model_type] + self._test_logits_backward(model_id=model_id, model_type=model_type, batch_size=batch_size) + @parameterized.expand(grid_parameters(FULL_GRID)) - @require_torch_20 def test_invert_modules(self, test_name: str, model_type: str, keep_original_model=False): model_id = MODELS_DICT[model_type] self._test_invert_modules(model_id=model_id, keep_original_model=keep_original_model) 
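
# A rough, illustrative sketch of how grid_parameters (imported above from
# optimum.utils.testing_utils) is assumed to expand a dict of parameter lists into the
# (test_name, *values) cases consumed by parameterized.expand in the tests above. The
# exact output format of the real helper is an assumption; this is only a sketch.
from itertools import product


def grid_parameters_sketch(grid):
    # Cartesian product over the value lists, yielding a readable test name first.
    for values in product(*grid.values()):
        test_name = "_".join(str(v) for v in values)
        yield [test_name, *values]


# e.g. {"model_type": ["bert", "roberta"], "batch_size": [1, 3]} yields four cases:
# ['bert_1', 'bert', 1], ['bert_3', 'bert', 3], ['roberta_1', 'roberta', 1], ['roberta_3', 'roberta', 3]
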
@parameterized.expand(grid_parameters(FULL_GRID)) - @require_torch_20 def test_save_load_invertible(self, test_name: str, model_type: str, keep_original_model=False): model_id = MODELS_DICT[model_type] self._test_save_load_invertible(model_id=model_id, keep_original_model=keep_original_model) @parameterized.expand(grid_parameters(FULL_GRID)) - @require_torch_20 def test_invert_model_logits(self, test_name: str, model_type: str, keep_original_model=False): + # TODO: reenable those tests + if model_type in ["rocbert", "splinter", "markuplm", "bert-generation"]: + self.skipTest(f"tiny tokenizers are broken on the Hub {model_type}") + if model_type in ["tapas"]: + self.skipTest(f"{model_type} requires dataframe") + model_id = MODELS_DICT[model_type] self._test_invert_model_logits( model_id=model_id, model_type=model_type, keep_original_model=keep_original_model diff --git a/tests/bettertransformer/test_encoder_decoder.py b/tests/bettertransformer/test_encoder_decoder.py index 44173e9267..df74ed03d2 100644 --- a/tests/bettertransformer/test_encoder_decoder.py +++ b/tests/bettertransformer/test_encoder_decoder.py @@ -22,7 +22,7 @@ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer from optimum.bettertransformer import BetterTransformer -from optimum.utils.testing_utils import grid_parameters, require_torch_20, require_torch_gpu +from optimum.utils.testing_utils import grid_parameters, require_torch_gpu class BetterTransformersEncoderDecoderTest(BetterTransformersTestMixin, unittest.TestCase): @@ -71,39 +71,35 @@ def prepare_inputs_for_class(self, model_id, model_type, **preprocessor_kwargs): ) ) def test_logits_without_cache(self, test_name: str, model_type: str, padding, max_length=20): - self._skip_on_torch_version(model_type) model_id = MODELS_DICT[model_type] self._test_logits(model_id, model_type=model_type, padding=padding, max_length=max_length) - @parameterized.expand(SUPPORTED_ARCH) - def test_raise_autocast(self, model_type: str): - self._skip_on_torch_version(model_type) - model_id = MODELS_DICT[model_type] - self._test_raise_autocast(model_id, model_type=model_type) + @parameterized.expand( + grid_parameters( + { + "model_type": SUPPORTED_ARCH, + "padding": ["max_length", True], + } + ) + ) + def test_logits_backward(self, test_name: str, model_type: str, padding, max_length=20): + if model_type in ["fsmt", "prophetnet"]: + self.skipTest(f"Training support not implemented for {model_type}") - @parameterized.expand(SUPPORTED_ARCH) - def test_raise_train(self, model_type: str): - self._skip_on_torch_version(model_type) model_id = MODELS_DICT[model_type] - if model_type not in ["blenderbot", "pegasus", "t5"]: - self._test_raise_train(model_id, model_type=model_type) - else: - self._test_train_decoder(model_id, model_type=model_type) + self._test_logits_backward(model_id, model_type=model_type, padding=padding, max_length=max_length) @parameterized.expand(grid_parameters(FULL_GRID)) - @require_torch_20 def test_invert_modules(self, test_name: str, model_type: str, keep_original_model=False): model_id = MODELS_DICT[model_type] self._test_invert_modules(model_id=model_id, keep_original_model=keep_original_model) @parameterized.expand(grid_parameters(FULL_GRID)) - @require_torch_20 def test_save_load_invertible(self, test_name: str, model_type: str, keep_original_model=False): model_id = MODELS_DICT[model_type] self._test_save_load_invertible(model_id=model_id, keep_original_model=keep_original_model) @parameterized.expand(grid_parameters(FULL_GRID)) - @require_torch_20 def 
test_invert_model_logits(self, test_name: str, model_type: str, keep_original_model=False): model_id = MODELS_DICT[model_type] self._test_invert_model_logits( @@ -122,8 +118,6 @@ def test_invert_model_logits(self, test_name: str, model_type: str, keep_origina @require_torch_gpu @pytest.mark.gpu_test def test_fp16_inference(self, test_name: str, model_type: str, use_to_operator: bool): - self._skip_on_torch_version(model_type) - # TODO: fix in transformers if model_type == "fsmt": self.skipTest("fsmt is broken is transformers when loaded through torch_dtype=torch.float16") @@ -137,7 +131,6 @@ def test_fp16_inference(self, test_name: str, model_type: str, use_to_operator: grid_parameters({"model_type": SUPPORTED_ARCH, "batch_size": [1, 3], "padding": [True, "max_length"]}) ) def test_generation(self, test_name: str, model_type: str, batch_size: int, padding: str): - self._skip_on_torch_version(model_type) if batch_size == 1 and padding == "max_length": self.skipTest("batch_size=1 + padding='max_length' is unsupported") diff --git a/tests/bettertransformer/test_vision.py b/tests/bettertransformer/test_vision.py index 025c539330..ea04936fab 100644 --- a/tests/bettertransformer/test_vision.py +++ b/tests/bettertransformer/test_vision.py @@ -20,7 +20,7 @@ from testing_utils import MODELS_DICT, BetterTransformersTestMixin from transformers import AutoFeatureExtractor, AutoProcessor -from optimum.utils.testing_utils import grid_parameters, require_torch_20 +from optimum.utils.testing_utils import grid_parameters class BetterTransformersVisionTest(BetterTransformersTestMixin, unittest.TestCase): @@ -73,18 +73,6 @@ def test_logits(self, model_type: str): model_id = MODELS_DICT[model_type] self._test_logits(model_id, model_type=model_type) - @parameterized.expand(SUPPORTED_ARCH) - def test_raise_autocast(self, model_type: str): - model_id = MODELS_DICT[model_type] - self._test_raise_autocast(model_id, model_type=model_type) - - @parameterized.expand(SUPPORTED_ARCH) - def test_raise_train(self, model_type: str): - if model_type in ["blip-2"]: - self.skipTest("can be trained") - model_id = MODELS_DICT[model_type] - self._test_raise_train(model_id, model_type=model_type) - @parameterized.expand( grid_parameters( { @@ -93,7 +81,6 @@ def test_raise_train(self, model_type: str): } ) ) - @require_torch_20 def test_invert_modules(self, test_name: str, model_type: str, keep_original_model=False): model_id = MODELS_DICT[model_type] self._test_invert_modules(model_id=model_id, keep_original_model=keep_original_model) @@ -106,7 +93,6 @@ def test_invert_modules(self, test_name: str, model_type: str, keep_original_mod } ) ) - @require_torch_20 def test_save_load_invertible(self, test_name: str, model_type: str, keep_original_model=False): model_id = MODELS_DICT[model_type] self._test_save_load_invertible(model_id=model_id, keep_original_model=keep_original_model) @@ -119,7 +105,6 @@ def test_save_load_invertible(self, test_name: str, model_type: str, keep_origin } ) ) - @require_torch_20 def test_invert_model_logits(self, test_name: str, model_type: str, keep_original_model=False): model_id = MODELS_DICT[model_type] self._test_invert_model_logits( diff --git a/tests/bettertransformer/testing_utils.py b/tests/bettertransformer/testing_utils.py index 8b908fa905..c63d5d241e 100644 --- a/tests/bettertransformer/testing_utils.py +++ b/tests/bettertransformer/testing_utils.py @@ -19,10 +19,9 @@ import unittest import torch -from packaging.version import parse from transformers import AutoModel -from 
optimum.bettertransformer import BetterTransformer, BetterTransformerManager +from optimum.bettertransformer import BetterTransformer from optimum.utils.testing_utils import flatten_dict, require_torch_gpu @@ -57,7 +56,7 @@ "opt": "hf-internal-testing/tiny-random-OPTModel", "pegasus": "hf-internal-testing/tiny-random-PegasusModel", "prophetnet": "hirotasoshu/tiny-random-prophetnet", # the other tiny ones have a too small max_position_embeddings - "rembert": "hf-internal-testing/tiny-random-rembert", + "rembert": "hf-internal-testing/tiny-random-RemBertModel", "roberta": "hf-internal-testing/tiny-random-RobertaModel", "rocbert": "hf-internal-testing/tiny-random-RoCBertModel", "roformer": "hf-internal-testing/tiny-random-RoFormerModel", @@ -74,6 +73,33 @@ "yolos": "hf-internal-testing/tiny-random-YolosModel", } +known_dropout_keys = [ + "attention_probs_dropout_prob", + "hidden_dropout_prob", + "classifier_dropout_prob", + "attention_dropout", + "dropout", + "qa_dropout", + "seq_classif_dropout", + "summary_last_dropout", + "classifier_dropout", + "activation_dropout", + "classif_dropout", + "dropout_rate", + "attn_pdrop", + "embd_pdrop", + "resid_pdrop", + "summary_first_dropout", +] + + +def set_dropout_to_zero(config): + for attr_name in known_dropout_keys: + if hasattr(config, attr_name): + setattr(config, attr_name, 0.0) + + return config + class BetterTransformersTestMixin(unittest.TestCase): r""" @@ -82,18 +108,11 @@ class BetterTransformersTestMixin(unittest.TestCase): - `test_logits`: This tests if the converted model produces the same logits than the original model. - `test_raise_on_save`: Test if the converion properly raises an error if someone tries to save the model using `save_pretrained`. - - `test_raise_autocast`: A tests that checks if the conversion raises an error if the model is run under - `torch.cuda.amp.autocast`. - - `test_raise_train`: A tests that checks if the conversion raises an error if the model is run in training mode. """ def prepare_inputs_for_class(self, model_id=None, model_type=None): raise NotImplementedError - def _skip_on_torch_version(self, model_type: str): - if BetterTransformerManager.requires_torch_20(model_type) and parse(torch.__version__) < parse("1.14"): - self.skipTest(f"The model type {model_type} require PyTorch 2.0 for BetterTransformer") - @require_torch_gpu def _test_fp16_inference( self, model_id: str, model_type: str, automodel_class, use_to_operator=False, **preprocessor_kwargs @@ -136,6 +155,66 @@ def _test_fp16_inference( f"Maxdiff: {(output_hf - output_bt).abs().max()}", ) + def _test_logits_backward(self, model_id: str, model_type: str, **preprocessor_kwargs): + inputs = self.prepare_inputs_for_class(model_id=model_id, model_type=model_type, **preprocessor_kwargs) + + hf_random_model = AutoModel.from_pretrained(model_id).eval() + random_config = hf_random_model.config + + # I could not obtain reproducible results with `torch.manual_seed` nor with + # `torch.random.set_rng_state`. An alternative could be to make dropout stateful, + # and to replace them with a static pattern for this test. Currently, we use + # functional dropout though. + # We need to be in train mode to take the right path. 
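
# Aside (a minimal check, not part of the test itself): functional dropout with p=0.0 is
# the identity even with training=True, which is why zeroing every dropout probability in
# the config (next line) makes both train-mode forward passes deterministic and comparable:
#     x = torch.randn(2, 3)
#     assert torch.equal(torch.nn.functional.dropout(x, p=0.0, training=True), x)
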
+ random_config = set_dropout_to_zero(random_config) + + # m2m_100 randomly drops layers, which makes testing flaky (see `skip_the_layer` in transformers, some other models use it as well) + if model_type == "m2m_100": + random_config.encoder_layerdrop = 0 + random_config.decoder_layerdrop = 0 + + hf_random_model = hf_random_model.__class__(random_config) + + converted_model = copy.deepcopy(hf_random_model) + converted_model = BetterTransformer.transform(converted_model) + + hf_random_model = hf_random_model.train() + converted_model = converted_model.train() + + optimizer_hf = torch.optim.SGD(hf_random_model.parameters(), lr=0.2) + optimizer_bt = torch.optim.SGD(converted_model.parameters(), lr=0.2) + + tol = 2e-3 + + hf_hidden_states = hf_random_model(**inputs)[0] + bt_hidden_states = converted_model(**inputs)[0] + + self.assert_equal( + hf_hidden_states, + bt_hidden_states, + atol=tol, + model_name=hf_random_model.__class__.__name__, + ) + + loss_hf = hf_hidden_states.abs().mean() + loss_bt = bt_hidden_states.abs().mean() + + loss_hf.backward() + loss_bt.backward() + + optimizer_hf.step() + optimizer_bt.step() + + hf_hidden_states = hf_random_model(**inputs)[0] + bt_hidden_states = converted_model(**inputs)[0] + + self.assert_equal( + hf_hidden_states, + bt_hidden_states, + atol=tol, + model_name=hf_random_model.__class__.__name__, + ) + def _test_logits(self, model_id: str, model_type: str, **preprocessor_kwargs): r""" This tests if the converted model produces the same logits @@ -148,9 +227,13 @@ def _test_logits(self, model_id: str, model_type: str, **preprocessor_kwargs): hf_random_model = AutoModel.from_pretrained(model_id).eval() random_config = hf_random_model.config + hf_random_model = hf_random_model.eval() + torch.manual_seed(0) converted_model = BetterTransformer.transform(hf_random_model, keep_original_model=True) + self.assertFalse(hf_random_model.training) + self.assertFalse(converted_model.training) self.assertFalse( hasattr(hf_random_model, "use_bettertransformer"), f"The model {hf_random_model.__class__.__name__} has been converted to a `fast` model by mistake.", @@ -209,33 +292,6 @@ def assert_equal(self, tensor1, tensor2, atol: float, model_name: str): f" Maxdiff: {torch.abs(tensor1 - tensor2).max()}", ) - def _test_raise_autocast(self, model_id: str, model_type: str, **kwargs): - r""" - A tests that checks if the conversion raises an error if the model is run under - `torch.cuda.amp.autocast`. - """ - inputs = self.prepare_inputs_for_class(model_id=model_id, model_type=model_type, **kwargs) - hf_random_model = AutoModel.from_pretrained(model_id).eval() - - # Check for the autocast on CPU - with self.assertRaises(ValueError), torch.amp.autocast("cpu"): - bt_model = BetterTransformer.transform(hf_random_model, keep_original_model=True) - _ = bt_model(**inputs) - - def _test_raise_train(self, model_id: str, model_type: str, **kwargs): - r""" - A tests that checks if the conversion raises an error if the model is run under - `model.train()`. - """ - inputs = self.prepare_inputs_for_class(model_id=model_id, model_type=model_type, **kwargs) - - hf_random_model = AutoModel.from_pretrained(model_id).eval() - # Check for training mode - with self.assertRaises(ValueError): - bt_model = BetterTransformer.transform(hf_random_model, keep_original_model=True) - bt_model.train() - _ = bt_model(**inputs) - def _test_train_decoder(self, model_id: str, model_type: str, **kwargs): r""" A tests that checks if the training works as expected for decoder models.
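
# A minimal, hedged sketch of the backward-equivalence pattern that _test_logits_backward
# above implements, assuming a tiny random BERT checkpoint and a BetterTransformer version
# with training support enabled (as in this patch series): zero out dropout, put both
# models in train(), compare outputs, backpropagate the same scalar loss, take one SGD
# step, and compare the outputs again.
import copy

import torch
from transformers import AutoModel

from optimum.bettertransformer import BetterTransformer


def check_backward_equivalence(model_id="hf-internal-testing/tiny-random-BertModel", atol=2e-3):
    reference = AutoModel.from_pretrained(model_id)

    # set_dropout_to_zero is the helper added above in this file; without it, functional
    # dropout makes the two forward/backward passes non-comparable.
    config = set_dropout_to_zero(reference.config)
    reference = reference.__class__(config)

    # Deep copy first so both models share the same random weights, then convert the copy.
    converted = BetterTransformer.transform(copy.deepcopy(reference))

    reference.train()
    converted.train()

    inputs = {
        "input_ids": torch.ones(2, 6, dtype=torch.long),
        "attention_mask": torch.ones(2, 6, dtype=torch.long),
    }

    optimizer_ref = torch.optim.SGD(reference.parameters(), lr=0.2)
    optimizer_bt = torch.optim.SGD(converted.parameters(), lr=0.2)

    out_ref = reference(**inputs)[0]
    out_bt = converted(**inputs)[0]
    assert torch.allclose(out_ref, out_bt, atol=atol)

    # Same scalar loss on both sides, so gradients and the SGD update should stay aligned.
    out_ref.abs().mean().backward()
    out_bt.abs().mean().backward()
    optimizer_ref.step()
    optimizer_bt.step()

    assert torch.allclose(reference(**inputs)[0], converted(**inputs)[0], atol=atol)
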