huggingface · echarlaix · Sep 5, 2023 · Jul 21, 2023 · Aug 30, 2023 · Aug 31, 2023
diff --git a/.github/workflows/test_onnxruntime.yml b/.github/workflows/test_onnxruntime.yml
@@ -29,6 +29,7 @@ jobs:
         python-version: ${{ matrix.python-version }}
     - name: Install dependencies
       run: |
+        pip install git+https://github.com/huggingface/diffusers
         pip install .[tests,onnxruntime]
     - name: Test with pytest
       working-directory: tests

diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py
@@ -31,7 +31,7 @@
     StableDiffusionXLImg2ImgPipeline,
 )
 from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME
-from diffusers.utils import CONFIG_NAME
+from diffusers.utils import CONFIG_NAME, is_invisible_watermark_available
 from huggingface_hub import snapshot_download
 from transformers import CLIPFeatureExtractor, CLIPTokenizer
 from transformers.file_utils import add_end_docstrings
@@ -45,6 +45,7 @@
 from ..pipelines.diffusers.pipeline_stable_diffusion_inpaint import StableDiffusionInpaintPipelineMixin
 from ..pipelines.diffusers.pipeline_stable_diffusion_xl import StableDiffusionXLPipelineMixin
 from ..pipelines.diffusers.pipeline_stable_diffusion_xl_img2img import StableDiffusionXLImg2ImgPipelineMixin
+from ..pipelines.diffusers.pipeline_utils import OptimumVaeImageProcessor
 from ..utils import (
     DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER,
     DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER,
@@ -171,6 +172,8 @@ def __init__(
         else:
             self.vae_scale_factor = 8
 
+        self.image_processor = OptimumVaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+
     @staticmethod
     def load_model(
         vae_decoder_path: Union[str, Path],
@@ -578,6 +581,7 @@ def __init__(
         tokenizer_2: Optional[CLIPTokenizer] = None,
         use_io_binding: Optional[bool] = None,
         model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None,
+        add_watermarker: Optional[bool] = None,
     ):
         super().__init__(
             vae_decoder_session=vae_decoder_session,
@@ -593,11 +597,14 @@ def __init__(
             use_io_binding=use_io_binding,
             model_save_dir=model_save_dir,
         )
+        add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available()
 
-        # additional invisible-watermark dependency for SD XL
-        from ..pipelines.diffusers.watermark import StableDiffusionXLWatermarker
+        if add_watermarker:
+            from ..pipelines.diffusers.watermark import StableDiffusionXLWatermarker
 
-        self.watermark = StableDiffusionXLWatermarker()
+            self.watermark = StableDiffusionXLWatermarker()
+        else:
+            self.watermark = None
 
 
 @add_end_docstrings(ONNX_MODEL_END_DOCSTRING)

diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion.py
@@ -380,32 +380,33 @@ def __call__(
             image = latents
             has_nsfw_concept = None
         else:
-            latents = 1 / self.vae_decoder.config.get("scaling_factor", 0.18215) * latents
+            latents /= self.vae_decoder.config.get("scaling_factor", 0.18215)
             # it seems likes there is a strange result for using half-precision vae decoder if batchsize>1
             image = np.concatenate(
                 [self.vae_decoder(latent_sample=latents[i : i + 1])[0] for i in range(latents.shape[0])]
             )
-            # TODO: add image_processor
-            image = np.clip(image / 2 + 0.5, 0, 1)
-            image = image.transpose((0, 2, 3, 1))
             image, has_nsfw_concept = self.run_safety_checker(image)
 
-        if output_type == "pil":
-            image = self.numpy_to_pil(image)
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
 
         if not return_dict:
             return (image, has_nsfw_concept)
 
         return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
 
-    def run_safety_checker(self, image):
+    def run_safety_checker(self, image: np.ndarray):
         if self.safety_checker is None:
             has_nsfw_concept = None
         else:
+            feature_extractor_input = self.image_processor.numpy_to_pil(image)
             safety_checker_input = self.feature_extractor(
-                self.numpy_to_pil(image), return_tensors="np"
+                feature_extractor_input, return_tensors="np"
             ).pixel_values.astype(image.dtype)
-
             images, has_nsfw_concept = [], []
             for i in range(image.shape[0]):
                 image_i, has_nsfw_concept_i = self.safety_checker(

diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py
@@ -13,7 +13,6 @@
 #  limitations under the License.
 
 import inspect
-import logging
 from typing import Callable, List, Optional, Union
 
 import numpy as np
@@ -23,10 +22,6 @@
 from diffusers.utils import deprecate
 
 from .pipeline_stable_diffusion import StableDiffusionPipelineMixin
-from .pipeline_utils import preprocess
-
-
-logger = logging.getLogger(__name__)
 
 
 class StableDiffusionImg2ImgPipelineMixin(StableDiffusionPipelineMixin):
@@ -178,7 +173,7 @@ def __call__(
         # set timesteps
         self.scheduler.set_timesteps(num_inference_steps)
 
-        image = preprocess(image)
+        image = self.image_processor.preprocess(image)
 
         # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
         # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
@@ -287,17 +282,19 @@ def __call__(
             image = latents
             has_nsfw_concept = None
         else:
-            latents = 1 / scaling_factor * latents
+            latents /= scaling_factor
             # it seems likes there is a strange result for using half-precision vae decoder if batchsize>1
             image = np.concatenate(
                 [self.vae_decoder(latent_sample=latents[i : i + 1])[0] for i in range(latents.shape[0])]
             )
-            image = np.clip(image / 2 + 0.5, 0, 1)
-            image = image.transpose((0, 2, 3, 1))
             image, has_nsfw_concept = self.run_safety_checker(image)
 
-        if output_type == "pil":
-            image = self.numpy_to_pil(image)
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
 
         if not return_dict:
             return (image, has_nsfw_concept)

diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py
@@ -13,7 +13,6 @@
 #  limitations under the License.
 
 import inspect
-import logging
 from typing import Callable, List, Optional, Union
 
 import numpy as np
@@ -25,9 +24,6 @@
 from .pipeline_stable_diffusion import StableDiffusionPipelineMixin
 
 
-logger = logging.getLogger(__name__)
-
-
 def prepare_mask_and_masked_image(image, mask, latents_shape, vae_scale_factor):
     image = np.array(
         image.convert("RGB").resize((latents_shape[1] * vae_scale_factor, latents_shape[0] * vae_scale_factor))
@@ -329,17 +325,19 @@ def __call__(
             image = latents
             has_nsfw_concept = None
         else:
-            latents = 1 / scaling_factor * latents
+            latents /= scaling_factor
             # it seems likes there is a strange result for using half-precision vae decoder if batchsize>1
             image = np.concatenate(
                 [self.vae_decoder(latent_sample=latents[i : i + 1])[0] for i in range(latents.shape[0])]
             )
-            image = np.clip(image / 2 + 0.5, 0, 1)
-            image = image.transpose((0, 2, 3, 1))
             image, has_nsfw_concept = self.run_safety_checker(image)
 
-        if output_type == "pil":
-            image = self.numpy_to_pil(image)
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
+
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
 
         if not return_dict:
             return (image, has_nsfw_concept)

diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py
@@ -480,18 +480,15 @@ def __call__(
         if output_type == "latent":
             image = latents
         else:
-            latents = latents / self.vae_decoder.config.get("scaling_factor", 0.18215)
+            latents /= self.vae_decoder.config.get("scaling_factor", 0.18215)
             # it seems likes there is a strange result for using half-precision vae decoder if batchsize>1
             image = np.concatenate(
                 [self.vae_decoder(latent_sample=latents[i : i + 1])[0] for i in range(latents.shape[0])]
             )
-            image = self.watermark.apply_watermark(image)
-
-            # TODO: add image_processor
-            image = np.clip(image / 2 + 0.5, 0, 1).transpose((0, 2, 3, 1))
-
-        if output_type == "pil":
-            image = self.numpy_to_pil(image)
+            # watermarking is optional: skip it when no watermarker is set (self.watermark is None)
+            if self.watermark is not None:
+                image = self.watermark.apply_watermark(image)
+            image = self.image_processor.postprocess(image, output_type=output_type)
 
         if not return_dict:
             return (image,)

diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py
@@ -21,7 +21,7 @@
 import torch
 from diffusers.pipelines.stable_diffusion_xl import StableDiffusionXLPipelineOutput
 
-from .pipeline_utils import DiffusionPipelineMixin, preprocess, rescale_noise_cfg
+from .pipeline_utils import DiffusionPipelineMixin, rescale_noise_cfg
 
 
 logger = logging.getLogger(__name__)
@@ -400,7 +400,7 @@ def __call__(
         )
 
         # 3. Preprocess image
-        image = preprocess(image)
+        image = self.image_processor.preprocess(image)
 
         # 4. Prepare timesteps
         self.scheduler.set_timesteps(num_inference_steps)
@@ -487,18 +487,15 @@ def __call__(
         if output_type == "latent":
             image = latents
         else:
-            latents = latents / self.vae_decoder.config.get("scaling_factor", 0.18215)
+            latents /= self.vae_decoder.config.get("scaling_factor", 0.18215)
             # it seems likes there is a strange result for using half-precision vae decoder if batchsize>1
             image = np.concatenate(
                 [self.vae_decoder(latent_sample=latents[i : i + 1])[0] for i in range(latents.shape[0])]
             )
-            image = self.watermark.apply_watermark(image)
-
-            # TODO: add image_processor
-            image = np.clip(image / 2 + 0.5, 0, 1).transpose((0, 2, 3, 1))
-
-        if output_type == "pil":
-            image = self.numpy_to_pil(image)
+            # watermarking is optional: skip it when no watermarker is set (self.watermark is None)
+            if self.watermark is not None:
+                image = self.watermark.apply_watermark(image)
+            image = self.image_processor.postprocess(image, output_type=output_type)
 
         if not return_dict:
             return (image,)