From 2f1fe6b1f5ad786653b8e221e0c2e56e51f3aa6d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?F=C3=A9lix=20Marty?=
 <9808326+fxmarty@users.noreply.github.com>
Date: Mon, 28 Aug 2023 13:41:14 +0200
Subject: [PATCH] Inline pipeline __call__ docstrings and remove commented-out ORT __call__ wrappers

---
 optimum/onnxruntime/modeling_diffusion.py     | 260 +-----------------
 .../diffusers/pipeline_stable_diffusion.py    | 142 +++++-----
 .../pipeline_stable_diffusion_img2img.py      | 130 ++++-----
 .../pipeline_stable_diffusion_inpaint.py      | 139 +++++-----
 .../diffusers/pipeline_stable_diffusion_xl.py | 139 +++++-----
 .../pipeline_stable_diffusion_xl_img2img.py   | 144 +++++-----
 6 files changed, 346 insertions(+), 608 deletions(-)

diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py
index 16157866e0..5a90d8953e 100644
--- a/optimum/onnxruntime/modeling_diffusion.py
+++ b/optimum/onnxruntime/modeling_diffusion.py
@@ -40,21 +40,11 @@
 
 from ..exporters.onnx import main_export
 from ..onnx.utils import _get_external_data_paths
-from ..pipelines.diffusers.pipeline_stable_diffusion import (
-    StableDiffusionPipelineMixin,
-)
-from ..pipelines.diffusers.pipeline_stable_diffusion_img2img import (
-    StableDiffusionImg2ImgPipelineMixin,
-)
-from ..pipelines.diffusers.pipeline_stable_diffusion_inpaint import (
-    StableDiffusionInpaintPipelineMixin,
-)
-from ..pipelines.diffusers.pipeline_stable_diffusion_xl import (
-    StableDiffusionXLPipelineMixin,
-)
-from ..pipelines.diffusers.pipeline_stable_diffusion_xl_img2img import (
-    StableDiffusionXLImg2ImgPipelineMixin,
-)
+from ..pipelines.diffusers.pipeline_stable_diffusion import StableDiffusionPipelineMixin
+from ..pipelines.diffusers.pipeline_stable_diffusion_img2img import StableDiffusionImg2ImgPipelineMixin
+from ..pipelines.diffusers.pipeline_stable_diffusion_inpaint import StableDiffusionInpaintPipelineMixin
+from ..pipelines.diffusers.pipeline_stable_diffusion_xl import StableDiffusionXLPipelineMixin
+from ..pipelines.diffusers.pipeline_stable_diffusion_xl_img2img import StableDiffusionXLImg2ImgPipelineMixin
 from ..utils import (
     DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER,
     DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER,
@@ -551,48 +541,6 @@ class ORTStableDiffusionPipeline(ORTStableDiffusionPipelineBase, StableDiffusion
     """
 
     __call__ = StableDiffusionPipelineMixin.__call__
-    """
-    @add_end_docstrings(STABLE_DIFFUSION_PIPELINE_CALL_DOCSTRING)
-    def __call__(
-        self,
-        prompt: Optional[Union[str, List[str]]] = None,
-        height: Optional[int] = None,
-        width: Optional[int] = None,
-        num_inference_steps: int = 50,
-        guidance_scale: float = 7.5,
-        negative_prompt: Optional[Union[str, List[str]]] = None,
-        num_images_per_prompt: int = 1,
-        eta: float = 0.0,
-        generator: Optional[np.random.RandomState] = None,
-        latents: Optional[np.ndarray] = None,
-        prompt_embeds: Optional[np.ndarray] = None,
-        negative_prompt_embeds: Optional[np.ndarray] = None,
-        output_type: str = "pil",
-        return_dict: bool = True,
-        callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
-        callback_steps: int = 1,
-        guidance_rescale: float = 0.0,
-    ):
-        return StableDiffusionPipelineMixin.__call__(
-            prompt,
-            height,
-            width,
-            num_inference_steps,
-            guidance_scale,
-            negative_prompt,
-            num_images_per_prompt,
-            eta,
-            generator,
-            latents,
-            prompt_embeds,
-            negative_prompt_embeds,
-            output_type,
-            return_dict,
-            callback,
-            callback_steps,
-            guidance_rescale,
-        )
-    """
 
 
 @add_end_docstrings(ONNX_MODEL_END_DOCSTRING)
@@ -603,45 +551,6 @@ class ORTStableDiffusionImg2ImgPipeline(ORTStableDiffusionPipelineBase, StableDi
 
     __call__ = StableDiffusionImg2ImgPipelineMixin.__call__
 
-    """
-    @add_end_docstrings(STABLE_DIFFUSION_PIPELINE_IMG2IMG_CALL_DOCSTRING)
-    def __call__(
-        self,
-        prompt: Optional[Union[str, List[str]]] = None,
-        image: Union[np.ndarray, PIL.Image.Image] = None,
-        strength: float = 0.8,
-        num_inference_steps: int = 50,
-        guidance_scale: float = 7.5,
-        negative_prompt: Optional[Union[str, List[str]]] = None,
-        num_images_per_prompt: int = 1,
-        eta: float = 0.0,
-        generator: Optional[np.random.RandomState] = None,
-        prompt_embeds: Optional[np.ndarray] = None,
-        negative_prompt_embeds: Optional[np.ndarray] = None,
-        output_type: str = "pil",
-        return_dict: bool = True,
-        callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
-        callback_steps: int = 1,
-    ):
-        StableDiffusionImg2ImgPipelineMixin.__call__(
-            prompt,
-            image,
-            strength,
-            num_inference_steps,
-            guidance_scale,
-            negative_prompt,
-            num_images_per_prompt,
-            eta,
-            generator,
-            prompt_embeds,
-            negative_prompt_embeds,
-            output_type,
-            return_dict,
-            callback,
-            callback_steps,
-        )
-    """
-
 
 @add_end_docstrings(ONNX_MODEL_END_DOCSTRING)
 class ORTStableDiffusionInpaintPipeline(ORTStableDiffusionPipelineBase, StableDiffusionInpaintPipelineMixin):
@@ -651,51 +560,6 @@ class ORTStableDiffusionInpaintPipeline(ORTStableDiffusionPipelineBase, StableDi
 
     __call__ = StableDiffusionInpaintPipelineMixin.__call__
 
-    """
-    @add_end_docstrings(STABLE_DIFFUSION_PIPELINE_INPAINT_CALL_DOCSTRING)
-    def __call__(
-        self,
-        prompt: Union[str, List[str]],
-        image: PIL.Image.Image,
-        mask_image: PIL.Image.Image,
-        height: Optional[int] = None,
-        width: Optional[int] = None,
-        num_inference_steps: int = 50,
-        guidance_scale: float = 7.5,
-        negative_prompt: Optional[Union[str, List[str]]] = None,
-        num_images_per_prompt: int = 1,
-        eta: float = 0.0,
-        generator: Optional[np.random.RandomState] = None,
-        latents: Optional[np.ndarray] = None,
-        prompt_embeds: Optional[np.ndarray] = None,
-        negative_prompt_embeds: Optional[np.ndarray] = None,
-        output_type: str = "pil",
-        return_dict: bool = True,
-        callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
-        callback_steps: int = 1,
-    ):
-        StableDiffusionInpaintPipelineMixin.__call__(
-            prompt,
-            image,
-            mask_image,
-            height,
-            width,
-            num_inference_steps,
-            guidance_scale,
-            negative_prompt,
-            num_images_per_prompt,
-            eta,
-            generator,
-            latents,
-            prompt_embeds,
-            negative_prompt_embeds,
-            output_type,
-            return_dict,
-            callback,
-            callback_steps,
-        )
-    """
-
 
 class ORTStableDiffusionXLPipelineBase(ORTStableDiffusionPipelineBase):
     auto_model_class = StableDiffusionXLImg2ImgPipeline
@@ -744,61 +608,6 @@ class ORTStableDiffusionXLPipeline(ORTStableDiffusionXLPipelineBase, StableDiffu
 
     __call__ = StableDiffusionXLPipelineMixin.__call__
 
-    """
-    @add_end_docstrings(STABLE_DIFFUSION_PIPELINE_XL_CALL_DOCSTRING)
-    def __call__(
-        self,
-        prompt: Optional[Union[str, List[str]]] = None,
-        height: Optional[int] = None,
-        width: Optional[int] = None,
-        num_inference_steps: int = 50,
-        guidance_scale: float = 5.0,
-        negative_prompt: Optional[Union[str, List[str]]] = None,
-        num_images_per_prompt: int = 1,
-        eta: float = 0.0,
-        generator: Optional[np.random.RandomState] = None,
-        latents: Optional[np.ndarray] = None,
-        prompt_embeds: Optional[np.ndarray] = None,
-        negative_prompt_embeds: Optional[np.ndarray] = None,
-        pooled_prompt_embeds: Optional[np.ndarray] = None,
-        negative_pooled_prompt_embeds: Optional[np.ndarray] = None,
-        output_type: str = "pil",
-        return_dict: bool = True,
-        callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
-        callback_steps: int = 1,
-        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
-        guidance_rescale: float = 0.0,
-        original_size: Optional[Tuple[int, int]] = None,
-        crops_coords_top_left: Tuple[int, int] = (0, 0),
-        target_size: Optional[Tuple[int, int]] = None,
-    ):
-        StableDiffusionXLPipelineMixin.__call__(
-            prompt,
-            height,
-            width,
-            num_inference_steps,
-            guidance_scale,
-            negative_prompt,
-            num_images_per_prompt,
-            eta,
-            generator,
-            latents,
-            prompt_embeds,
-            negative_prompt_embeds,
-            pooled_prompt_embeds,
-            negative_pooled_prompt_embeds,
-            output_type,
-            return_dict,
-            callback,
-            callback_steps,
-            cross_attention_kwargs,
-            guidance_rescale,
-            original_size,
-            crops_coords_top_left,
-            target_size,
-        )
-    """
-
 
 @add_end_docstrings(ONNX_MODEL_END_DOCSTRING)
 class ORTStableDiffusionXLImg2ImgPipeline(ORTStableDiffusionXLPipelineBase, StableDiffusionXLImg2ImgPipelineMixin):
@@ -807,62 +616,3 @@ class ORTStableDiffusionXLImg2ImgPipeline(ORTStableDiffusionXLPipelineBase, Stab
     """
 
     __call__ = StableDiffusionXLImg2ImgPipelineMixin.__call__
-
-    """
-    @add_end_docstrings(STABLE_DIFFUSION_PIPELINE_XL_IMG2IMG_CALL_DOCSTRING)
-    def __call__(
-        self,
-        prompt: Optional[Union[str, List[str]]] = None,
-        image: Union[np.ndarray, PIL.Image.Image] = None,
-        strength: float = 0.3,
-        num_inference_steps: int = 50,
-        guidance_scale: float = 5.0,
-        negative_prompt: Optional[Union[str, List[str]]] = None,
-        num_images_per_prompt: int = 1,
-        eta: float = 0.0,
-        generator: Optional[np.random.RandomState] = None,
-        latents: Optional[np.ndarray] = None,
-        prompt_embeds: Optional[np.ndarray] = None,
-        negative_prompt_embeds: Optional[np.ndarray] = None,
-        pooled_prompt_embeds: Optional[np.ndarray] = None,
-        negative_pooled_prompt_embeds: Optional[np.ndarray] = None,
-        output_type: str = "pil",
-        return_dict: bool = True,
-        callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
-        callback_steps: int = 1,
-        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
-        guidance_rescale: float = 0.0,
-        original_size: Optional[Tuple[int, int]] = None,
-        crops_coords_top_left: Tuple[int, int] = (0, 0),
-        target_size: Optional[Tuple[int, int]] = None,
-        aesthetic_score: float = 6.0,
-        negative_aesthetic_score: float = 2.5,
-    ):
-        StableDiffusionXLImg2ImgPipelineMixin.__call__(
-            prompt,
-            image,
-            strength,
-            num_inference_steps,
-            guidance_scale,
-            negative_prompt,
-            num_images_per_prompt,
-            eta,
-            generator,
-            latents,
-            prompt_embeds,
-            negative_prompt_embeds,
-            pooled_prompt_embeds,
-            negative_pooled_prompt_embeds,
-            output_type,
-            return_dict,
-            callback,
-            callback_steps,
-            cross_attention_kwargs,
-            guidance_rescale,
-            original_size,
-            crops_coords_top_left,
-            target_size,
-            aesthetic_score,
-            negative_aesthetic_score,
-        )
-    """
diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion.py
index eb205f9010..0f5b3c3b33 100644
--- a/optimum/pipelines/diffusers/pipeline_stable_diffusion.py
+++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion.py
@@ -19,7 +19,6 @@
 import numpy as np
 import torch
 from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
-from transformers.file_utils import add_end_docstrings
 
 from .pipeline_utils import DiffusionPipelineMixin, rescale_noise_cfg
 
@@ -27,75 +26,6 @@
 logger = logging.getLogger(__name__)
 
 
-STABLE_DIFFUSION_PIPELINE_CALL_DOCSTRING = """
-Function invoked when calling the pipeline for generation.
-
-Args:
-    prompt (`Optional[Union[str, List[str]]]`, defaults to None):
-        The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
-        instead.
-    height (`Optional[int]`, defaults to `None`):
-        The height in pixels of the generated image. If `None`, defaults to `self.unet.config.sample_size * self.vae_scale_factor`
-    width (`Optional[int]`, defaults to `None`):
-        The width in pixels of the generated image. If `None`, defaults to `self.unet.config.sample_size * self.vae_scale_factor`
-    num_inference_steps (`int`, defaults to 50):
-        The number of denoising steps. More denoising steps usually lead to a higher quality image at the
-        expense of slower inference.
-    guidance_scale (`float`, defaults to 7.5):
-        Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
-        `guidance_scale` is defined as `w` of equation 2. of [Imagen
-        Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
-        1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
-        usually at the expense of lower image quality.
-    negative_prompt (`Optional[Union[str, list]]`):
-        The prompt or prompts not to guide the image generation. If not defined, one has to pass
-        `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale`
-        is less than `1`).
-    num_images_per_prompt (`int`, defaults to 1):
-        The number of images to generate per prompt.
-    eta (`float`, defaults to 0.0):
-        Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
-        [`schedulers.DDIMScheduler`], will be ignored for others.
-    generator (`Optional[np.random.RandomState]`, defaults to `None`):
-        A np.random.RandomState to make generation deterministic.
-    latents (`Optional[np.ndarray]`, defaults to `None`):
-        Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
-        generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
-        tensor will ge generated by sampling using the supplied random `generator`.
-    prompt_embeds (`Optional[np.ndarray]`, defaults to `None`):
-        Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-        provided, text embeddings will be generated from `prompt` input argument.
-    negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`):
-        Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-        weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
-        argument.
-    output_type (`str`, defaults to `"pil"`):
-        The output format of the generate image. Choose between
-        [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
-    return_dict (`bool`, defaults to `True`):
-        Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
-        plain tuple.
-    callback (Optional[Callable], defaults to `None`):
-        A function that will be called every `callback_steps` steps during inference. The function will be
-        called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
-    callback_steps (`int`, defaults to 1):
-        The frequency at which the `callback` function will be called. If not specified, the callback will be
-        called at every step.
-    guidance_rescale (`float`, defaults to 0.0):
-        Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
-        Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
-        [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
-        Guidance rescale factor should fix overexposure when using zero terminal SNR.
-
-Returns:
-    [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
-    [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
-    When returning a tuple, the first element is a list with the generated images, and the second element is a
-    list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
-    (nsfw) content, according to the `safety_checker`.
-"""
-
-
 class StableDiffusionPipelineMixin(DiffusionPipelineMixin):
     # Copied from https://github.com/huggingface/diffusers/blob/v0.17.1/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py#L114
     def _encode_prompt(
@@ -205,8 +135,8 @@ def _encode_prompt(
     def check_inputs(
         self,
         prompt: Union[str, List[str]],
-        height: int,
-        width: int,
+        height: Optional[int],
+        width: Optional[int],
         callback_steps: int,
         negative_prompt: Optional[str] = None,
         prompt_embeds: Optional[np.ndarray] = None,
@@ -269,7 +199,6 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype
         return latents
 
     # Adapted from https://github.com/huggingface/diffusers/blob/v0.17.1/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py#L264
-    @add_end_docstrings(STABLE_DIFFUSION_PIPELINE_CALL_DOCSTRING)
     def __call__(
         self,
         prompt: Optional[Union[str, List[str]]] = None,
@@ -290,6 +219,73 @@ def __call__(
         callback_steps: int = 1,
         guidance_rescale: float = 0.0,
     ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`Optional[Union[str, List[str]]]`, defaults to None):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
+                instead.
+            height (`Optional[int]`, defaults to None):
+                The height in pixels of the generated image.
+            width (`Optional[int]`, defaults to None):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`Optional[Union[str, list]]`):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale`
+                is less than `1`).
+            num_images_per_prompt (`int`, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`Optional[np.random.RandomState]`, defaults to `None`):
+                A np.random.RandomState to make generation deterministic.
+            latents (`Optional[np.ndarray]`, defaults to `None`):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will be generated by sampling using the supplied random `generator`.
+            prompt_embeds (`Optional[np.ndarray]`, defaults to `None`):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            output_type (`str`, defaults to `"pil"`):
+                The output format of the generated image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Optional[Callable]`, defaults to `None`):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+            callback_steps (`int`, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            guidance_rescale (`float`, defaults to 0.0):
+                Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
+                Flawed](https://arxiv.org/pdf/2305.08891.pdf). `guidance_rescale` is defined as `φ` in equation 16. of
+                [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
+                Guidance rescale factor should fix overexposure when using zero terminal SNR.
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
         height = height or self.unet.config.get("sample_size", 64) * self.vae_scale_factor
         width = width or self.unet.config.get("sample_size", 64) * self.vae_scale_factor
 
diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py
index ee9803cb5a..d2c23b2b04 100644
--- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py
+++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py
@@ -21,7 +21,6 @@
 import torch
 from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
 from diffusers.utils import deprecate
-from transformers.file_utils import add_end_docstrings
 
 from .pipeline_stable_diffusion import StableDiffusionPipelineMixin
 from .pipeline_utils import preprocess
@@ -29,69 +28,6 @@
 
 logger = logging.getLogger(__name__)
 
-STABLE_DIFFUSION_PIPELINE_IMG2IMG_CALL_DOCSTRING = """
-Function invoked when calling the pipeline for generation.
-
-Args:
-    prompt (`Optional[Union[str, List[str]]]`, defaults to None):
-        The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
-        instead.
-    image (`Union[np.ndarray, PIL.Image.Image]`):
-        `Image`, or tensor representing an image batch which will be upscaled.
-    strength (`float`, defaults to 0.8):
-        Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
-        will be used as a starting point, adding more noise to it the larger the `strength`. The number of
-        denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
-        be maximum and the denoising process will run for the full number of iterations specified in
-        `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
-    num_inference_steps (`int`, defaults to 50):
-        The number of denoising steps. More denoising steps usually lead to a higher quality image at the
-        expense of slower inference.
-    guidance_scale (`float`, defaults to 7.5):
-        Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
-        `guidance_scale` is defined as `w` of equation 2. of [Imagen
-        Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
-        1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
-        usually at the expense of lower image quality.
-    negative_prompt (`Optional[Union[str, list]]`):
-        The prompt or prompts not to guide the image generation. If not defined, one has to pass
-        `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale`
-        is less than `1`).
-    num_images_per_prompt (`int`, defaults to 1):
-        The number of images to generate per prompt.
-    eta (`float`, defaults to 0.0):
-        Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
-        [`schedulers.DDIMScheduler`], will be ignored for others.
-    generator (`Optional[np.random.RandomState]`, defaults to `None`)::
-        A np.random.RandomState to make generation deterministic.
-    prompt_embeds (`Optional[np.ndarray]`, defaults to `None`):
-        Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-        provided, text embeddings will be generated from `prompt` input argument.
-    negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`):
-        Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-        weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
-        argument.
-    output_type (`str`, defaults to `"pil"`):
-        The output format of the generate image. Choose between
-        [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
-    return_dict (`bool`, defaults to `True`):
-        Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
-        plain tuple.
-    callback (Optional[Callable], defaults to `None`):
-        A function that will be called every `callback_steps` steps during inference. The function will be
-        called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
-    callback_steps (`int`, defaults to 1):
-        The frequency at which the `callback` function will be called. If not specified, the callback will be
-        called at every step.
-
-Returns:
-    [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
-    [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
-    When returning a tuple, the first element is a list with the generated images, and the second element is a
-    list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
-    (nsfw) content, according to the `safety_checker`.
-"""
-
 
 class StableDiffusionImg2ImgPipelineMixin(StableDiffusionPipelineMixin):
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_onnx_stable_diffusion.OnnxStableDiffusionImg2ImgPipeline.check_inputs
@@ -142,7 +78,6 @@ def check_inputs(
                 )
 
     # Adapted from diffusers.pipelines.stable_diffusion.pipeline_onnx_stable_diffusion.OnnxStableDiffusionImg2ImgPipeline.__call__
-    @add_end_docstrings(STABLE_DIFFUSION_PIPELINE_IMG2IMG_CALL_DOCSTRING)
     def __call__(
         self,
         prompt: Optional[Union[str, List[str]]] = None,
@@ -161,6 +96,71 @@ def __call__(
         callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
         callback_steps: int = 1,
     ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+
+        Args:
+            prompt (`Optional[Union[str, List[str]]]`, defaults to None):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
+                instead.
+            image (`Union[np.ndarray, PIL.Image.Image]`):
+                `Image`, or tensor representing an image batch which will be used as the starting point.
+            strength (`float`, defaults to 0.8):
+                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
+                will be used as a starting point, adding more noise to it the larger the `strength`. The number of
+                denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
+                be maximum and the denoising process will run for the full number of iterations specified in
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
+            num_inference_steps (`int`, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`Optional[Union[str, list]]`):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale`
+                is less than `1`).
+            num_images_per_prompt (`int`, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`Optional[np.random.RandomState]`, defaults to `None`):
+                A np.random.RandomState to make generation deterministic.
+            prompt_embeds (`Optional[np.ndarray]`, defaults to `None`):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            output_type (`str`, defaults to `"pil"`):
+                The output format of the generated image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Optional[Callable]`, defaults to `None`):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: np.ndarray)`.
+            callback_steps (`int`, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+
         # check inputs. Raise error if not correct
         self.check_inputs(prompt, strength, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds)
 
diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py
index b88ffe6675..e2a7ac7c9e 100644
--- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py
+++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py
@@ -21,80 +21,12 @@
 import torch
 from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
 from diffusers.utils import PIL_INTERPOLATION
-from transformers.file_utils import add_end_docstrings
 
 from .pipeline_stable_diffusion import StableDiffusionPipelineMixin
 
 
 logger = logging.getLogger(__name__)
 
-STABLE_DIFFUSION_PIPELINE_INPAINT_CALL_DOCSTRING = """
-Function invoked when calling the pipeline for generation.
-
-Args:
-    prompt (`Union[str, List[str]]`):
-        The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
-        instead.
-    image (`PIL.Image.Image`):
-        `Image`, or tensor representing an image batch which will be upscaled.
-    mask_image (`PIL.Image.Image`):
-        `Image`, or tensor representing a masked image batch which will be upscaled.
-    height (`Optional[int]`, defaults to None):
-        The height in pixels of the generated image. If `None`, defaults to `self.unet.config.sample_size * self.vae_scale_factor`.
-    width (`Optional[int]`, defaults to None):
-        The width in pixels of the generated image. If `None`, defaults to `self.unet.config.sample_size * self.vae_scale_factor`.
-    num_inference_steps (`int`, defaults to 50):
-        The number of denoising steps. More denoising steps usually lead to a higher quality image at the
-        expense of slower inference.
-    guidance_scale (`float`, defaults to 7.5):
-        Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
-        `guidance_scale` is defined as `w` of equation 2. of [Imagen
-        Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
-        1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
-        usually at the expense of lower image quality.
-    negative_prompt (`Optional[Union[str, list]]`):
-        The prompt or prompts not to guide the image generation. If not defined, one has to pass
-        `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale`
-        is less than `1`).
-    num_images_per_prompt (`int`, defaults to 1):
-        The number of images to generate per prompt.
-    eta (`float`, defaults to 0.0):
-        Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
-        [`schedulers.DDIMScheduler`], will be ignored for others.
-    generator (`Optional[np.random.RandomState]`, defaults to `None`)::
-        A np.random.RandomState to make generation deterministic.
-    latents (`Optional[np.ndarray]`, defaults to `None`):
-        Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
-        generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
-        tensor will ge generated by sampling using the supplied random `generator`.
-    prompt_embeds (`Optional[np.ndarray]`, defaults to `None`):
-        Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-        provided, text embeddings will be generated from `prompt` input argument.
-    negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`):
-        Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-        weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
-        argument.
-    output_type (`str`, defaults to `"pil"`):
-        The output format of the generate image. Choose between
-        [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
-    return_dict (`bool`, defaults to `True`):
-        Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
-        plain tuple.
-    callback (Optional[Callable], defaults to `None`):
-        A function that will be called every `callback_steps` steps during inference. The function will be
-        called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
-    callback_steps (`int`, defaults to 1):
-        The frequency at which the `callback` function will be called. If not specified, the callback will be
-        called at every step.
-
-Returns:
-    [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
-    [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
-    When returning a tuple, the first element is a list with the generated images, and the second element is a
-    list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
-    (nsfw) content, according to the `safety_checker`.
-"""
-
 
 def prepare_mask_and_masked_image(image, mask, latents_shape, vae_scale_factor):
     image = np.array(
@@ -123,8 +55,8 @@ class StableDiffusionInpaintPipelineMixin(StableDiffusionPipelineMixin):
     def check_inputs(
         self,
         prompt: Union[str, List[str]],
-        height: int,
-        width: int,
+        height: Optional[int],
+        width: Optional[int],
         callback_steps: int,
         negative_prompt: Optional[str] = None,
         prompt_embeds: Optional[np.ndarray] = None,
@@ -167,7 +99,6 @@ def check_inputs(
                     f" {negative_prompt_embeds.shape}."
                 )
 
-    @add_end_docstrings(STABLE_DIFFUSION_PIPELINE_INPAINT_CALL_DOCSTRING)
     @torch.no_grad()
     def __call__(
         self,
@@ -190,6 +121,72 @@ def __call__(
         callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
         callback_steps: int = 1,
     ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`Union[str, List[str]]`):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
+                instead.
+            image (`PIL.Image.Image`):
+                `Image`, or tensor representing an image batch which will be inpainted.
+            mask_image (`PIL.Image.Image`):
+                `Image`, or tensor representing the mask image batch used to mask `image`.
+            height (`Optional[int]`, defaults to None):
+                The height in pixels of the generated image.
+            width (`Optional[int]`, defaults to None):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`Optional[Union[str, list]]`):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale`
+                is less than `1`).
+            num_images_per_prompt (`int`, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`Optional[np.random.RandomState]`, defaults to `None`):
+                A np.random.RandomState to make generation deterministic.
+            latents (`Optional[np.ndarray]`, defaults to `None`):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will be generated by sampling using the supplied random `generator`.
+            prompt_embeds (`Optional[np.ndarray]`, defaults to `None`):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            output_type (`str`, defaults to `"pil"`):
+                The output format of the generated image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Optional[Callable]`, defaults to `None`):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: np.ndarray)`.
+            callback_steps (`int`, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
         height = height or self.unet.config.get("sample_size", 64) * self.vae_scale_factor
         width = width or self.unet.config.get("sample_size", 64) * self.vae_scale_factor
 
diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py
index c46de2c3eb..4c8c015fed 100644
--- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py
+++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py
@@ -19,7 +19,6 @@
 import numpy as np
 import torch
 from diffusers.pipelines.stable_diffusion_xl import StableDiffusionXLPipelineOutput
-from transformers.file_utils import add_end_docstrings
 
 from .pipeline_utils import DiffusionPipelineMixin, rescale_noise_cfg
 
@@ -27,75 +26,6 @@
 logger = logging.getLogger(__name__)
 
 
-STABLE_DIFFUSION_PIPELINE_XL_CALL_DOCSTRING = """
-Function invoked when calling the pipeline for generation.
-
-Args:
-    prompt (`Optional[Union[str, List[str]]]`, defaults to None):
-        The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
-        instead.
-    height (`Optional[int]`, defaults to `None`):
-        The height in pixels of the generated image. If `None`, defaults to `self.unet.config.sample_size * self.vae_scale_factor`.
-    width (`Optional[int]`, defaults to `None`):
-        The width in pixels of the generated image. If `None`, defaults to `self.unet.config.sample_size * self.vae_scale_factor`.
-    num_inference_steps (`int`, defaults to 50):
-        The number of denoising steps. More denoising steps usually lead to a higher quality image at the
-        expense of slower inference.
-    guidance_scale (`float`, defaults to 5):
-        Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
-        `guidance_scale` is defined as `w` of equation 2. of [Imagen
-        Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
-        1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
-        usually at the expense of lower image quality.
-    negative_prompt (`Optional[Union[str, list]]`):
-        The prompt or prompts not to guide the image generation. If not defined, one has to pass
-        `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale`
-        is less than `1`).
-    num_images_per_prompt (`int`, defaults to 1):
-        The number of images to generate per prompt.
-    eta (`float`, defaults to 0.0):
-        Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
-        [`schedulers.DDIMScheduler`], will be ignored for others.
-    generator (`Optional[np.random.RandomState]`, defaults to `None`):
-        A np.random.RandomState to make generation deterministic.
-    latents (`Optional[np.ndarray]`, defaults to `None`):
-        Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
-        generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
-        tensor will ge generated by sampling using the supplied random `generator`.
-    prompt_embeds (`Optional[np.ndarray]`, defaults to `None`):
-        Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-        provided, text embeddings will be generated from `prompt` input argument.
-    negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`):
-        Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-        weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
-        argument.
-    output_type (`str`, defaults to `"pil"`):
-        The output format of the generate image. Choose between
-        [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
-    return_dict (`bool`, defaults to `True`):
-        Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] instead of a
-        plain tuple.
-    callback (Optional[Callable], defaults to `None`):
-        A function that will be called every `callback_steps` steps during inference. The function will be
-        called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
-    callback_steps (`int`, defaults to 1):
-        The frequency at which the `callback` function will be called. If not specified, the callback will be
-        called at every step.
-    guidance_rescale (`float`, defaults to 0.7):
-        Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
-        Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
-        [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
-        Guidance rescale factor should fix overexposure when using zero terminal SNR.
-
-Returns:
-    [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] or `tuple`:
-    [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
-    When returning a tuple, the first element is a list with the generated images, and the second element is a
-    list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
-    (nsfw) content, according to the `safety_checker`.
-"""
-
-
 class StableDiffusionXLPipelineMixin(DiffusionPipelineMixin):
     # Adapted from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
     def _encode_prompt(
@@ -330,7 +260,6 @@ def prepare_extra_step_kwargs(self, generator, eta):
         return extra_step_kwargs
 
     # Adapted from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.__call__
-    @add_end_docstrings(STABLE_DIFFUSION_PIPELINE_XL_CALL_DOCSTRING)
     def __call__(
         self,
         prompt: Optional[Union[str, List[str]]] = None,
@@ -357,6 +286,74 @@ def __call__(
         crops_coords_top_left: Tuple[int, int] = (0, 0),
         target_size: Optional[Tuple[int, int]] = None,
     ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`Optional[Union[str, List[str]]]`, defaults to None):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
+                instead.
+            height (`Optional[int]`, defaults to None):
+                The height in pixels of the generated image.
+            width (`Optional[int]`, defaults to None):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, defaults to 5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`Optional[Union[str, list]]`):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale`
+                is less than `1`).
+            num_images_per_prompt (`int`, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`Optional[np.random.RandomState]`, defaults to `None`):
+                A np.random.RandomState to make generation deterministic.
+            latents (`Optional[np.ndarray]`, defaults to `None`):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will be generated by sampling using the supplied random `generator`.
+            prompt_embeds (`Optional[np.ndarray]`, defaults to `None`):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            output_type (`str`, defaults to `"pil"`):
+                The output format of the generated image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Optional[Callable]`, defaults to `None`):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: np.ndarray)`.
+            callback_steps (`int`, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            guidance_rescale (`float`, defaults to 0.7):
+                Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
+                Flawed](https://arxiv.org/pdf/2305.08891.pdf). `guidance_rescale` is defined as `φ` in equation 16. of
+                [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
+                Guidance rescale factor should fix overexposure when using zero terminal SNR.
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a `tuple`.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+
         # 0. Default height and width to unet
         height = height or self.unet.config["sample_size"] * self.vae_scale_factor
         width = width or self.unet.config["sample_size"] * self.vae_scale_factor
diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py
index 52cb114fd4..4a2b48d38e 100644
--- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py
+++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py
@@ -27,79 +27,6 @@
 logger = logging.getLogger(__name__)
 
 
-STABLE_DIFFUSION_PIPELINE_XL_IMG2IMG_CALL_DOCSTRING = """
-Function invoked when calling the pipeline for generation.
-
-Args:
-    prompt (`Optional[Union[str, List[str]]]`, defaults to None):
-        The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
-        instead.
-    image (`Union[np.ndarray, PIL.Image.Image]`):
-        `Image`, or tensor representing an image batch which will be upscaled.
-    strength (`float`, defaults to 0.8):
-        Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
-        will be used as a starting point, adding more noise to it the larger the `strength`. The number of
-        denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
-        be maximum and the denoising process will run for the full number of iterations specified in
-        `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
-    num_inference_steps (`int`, defaults to 50):
-        The number of denoising steps. More denoising steps usually lead to a higher quality image at the
-        expense of slower inference.
-    guidance_scale (`float`, defaults to 5):
-        Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
-        `guidance_scale` is defined as `w` of equation 2. of [Imagen
-        Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
-        1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
-        usually at the expense of lower image quality.
-    negative_prompt (`Optional[Union[str, list]]`):
-        The prompt or prompts not to guide the image generation. If not defined, one has to pass
-        `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale`
-        is less than `1`).
-    num_images_per_prompt (`int`, defaults to 1):
-        The number of images to generate per prompt.
-    eta (`float`, defaults to 0.0):
-        Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
-        [`schedulers.DDIMScheduler`], will be ignored for others.
-    generator (`Optional[np.random.RandomState]`, defaults to `None`):
-        A np.random.RandomState to make generation deterministic.
-    latents (`Optional[np.ndarray]`, defaults to `None`):
-        Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
-        generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
-        tensor will ge generated by sampling using the supplied random `generator`.
-    prompt_embeds (`Optional[np.ndarray]`, defaults to `None`):
-        Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-        provided, text embeddings will be generated from `prompt` input argument.
-    negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`):
-        Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-        weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
-        argument.
-    output_type (`str`, defaults to `"pil"`):
-        The output format of the generate image. Choose between
-        [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
-    return_dict (`bool`, defaults to `True`):
-        Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] instead of a
-        plain tuple.
-    callback (Optional[Callable], defaults to `None`):
-        A function that will be called every `callback_steps` steps during inference. The function will be
-        called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
-    callback_steps (`int`, defaults to 1):
-        The frequency at which the `callback` function will be called. If not specified, the callback will be
-        called at every step.
-    guidance_rescale (`float`, defaults to 0.7):
-        Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
-        Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
-        [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
-        Guidance rescale factor should fix overexposure when using zero terminal SNR.
-
-Returns:
-    [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] or `tuple`:
-    [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
-    When returning a tuple, the first element is a list with the generated images, and the second element is a
-    list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
-    (nsfw) content, according to the `safety_checker`.
-"""
-
-
 class StableDiffusionXLImg2ImgPipelineMixin(DiffusionPipelineMixin):
     # Adapted from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
     def _encode_prompt(
@@ -365,6 +292,77 @@ def __call__(
         aesthetic_score: float = 6.0,
         negative_aesthetic_score: float = 2.5,
     ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`Optional[Union[str, List[str]]]`, defaults to None):
+                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
+                instead.
+            image (`Union[np.ndarray, PIL.Image.Image]`):
+                `Image`, or tensor representing an image batch, that will be used as the starting point for the process.
+            strength (`float`, defaults to 0.8):
+                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
+                will be used as a starting point, adding more noise to it the larger the `strength`. The number of
+                denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
+                be maximum and the denoising process will run for the full number of iterations specified in
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
+            num_inference_steps (`int`, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, defaults to 5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`Optional[Union[str, list]]`):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale`
+                is less than `1`).
+            num_images_per_prompt (`int`, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`Optional[np.random.RandomState]`, defaults to `None`):
+                A np.random.RandomState to make generation deterministic.
+            latents (`Optional[np.ndarray]`, defaults to `None`):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will be generated by sampling using the supplied random `generator`.
+            prompt_embeds (`Optional[np.ndarray]`, defaults to `None`):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            output_type (`str`, defaults to `"pil"`):
+                The output format of the generated image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Optional[Callable]`, defaults to `None`):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+            callback_steps (`int`, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            guidance_rescale (`float`, defaults to 0.7):
+                Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
+                Flawed](https://arxiv.org/pdf/2305.08891.pdf). `guidance_rescale` is defined as `φ` in equation 16 of
+                [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
+                Guidance rescale factor should fix overexposure when using zero terminal SNR.
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a `tuple`.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
         # 0. Check inputs. Raise error if not correct
         self.check_inputs(prompt, strength, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds)