[SDXL] Add SDXL image to image support #239

Merged Oct 6, 2023 · 55 commits
eabcc2f
draft img2img sd pipe
JingyaHuang Aug 7, 2023
ce6b506
Merge branch 'main' into add-sd-img2img
JingyaHuang Aug 7, 2023
0de6a76
refactor inheritance
JingyaHuang Aug 16, 2023
ae8ae70
Merge branch 'main' into add-sd-img2img
JingyaHuang Aug 16, 2023
848888c
refactoring
JingyaHuang Aug 16, 2023
b816f8e
refactoring
JingyaHuang Aug 16, 2023
9ce8dde
Merge branch 'main' into add-sd-img2img
JingyaHuang Aug 24, 2023
41ee9d7
fix sdxl unet inf
JingyaHuang Sep 4, 2023
010304f
inference done
JingyaHuang Sep 5, 2023
ffe164d
add post processing and doc
JingyaHuang Sep 5, 2023
050b50d
fix style
JingyaHuang Sep 5, 2023
ff2b904
Update docs/source/guides/models.mdx
JingyaHuang Sep 6, 2023
712d410
add test
JingyaHuang Sep 6, 2023
5021f90
update doc prompt
JingyaHuang Sep 6, 2023
aca666f
fix num images per prompt issue
JingyaHuang Sep 6, 2023
4390ceb
fix test
JingyaHuang Sep 6, 2023
d8e14c9
Merge branch 'add-sdxl-inf' into add-sd-img2img
JingyaHuang Sep 6, 2023
708387a
add img2img pipe
JingyaHuang Sep 7, 2023
6b7ebe4
better scale
JingyaHuang Sep 7, 2023
6e6f2d5
test no resize
JingyaHuang Sep 10, 2023
5f345db
Merge branch 'main' into add-sd-img2img
JingyaHuang Sep 11, 2023
cabe5ce
remove image
JingyaHuang Sep 11, 2023
c6232a6
remove hack
JingyaHuang Sep 11, 2023
823ec98
hack for debug vae encoder
JingyaHuang Sep 14, 2023
045b688
Merge branch 'main' into add-sd-img2img
JingyaHuang Sep 19, 2023
8a4bd86
img2img done
JingyaHuang Sep 19, 2023
cd2e09c
add inpaint pipe
JingyaHuang Sep 20, 2023
0aefd5b
add tests
JingyaHuang Sep 20, 2023
fc28225
update doc
JingyaHuang Sep 20, 2023
743196c
title upper class
JingyaHuang Sep 20, 2023
f60e666
add results img
JingyaHuang Sep 20, 2023
0d5c979
fix shape
JingyaHuang Sep 20, 2023
6530550
Merge branch 'main' into add-sd-img2img
JingyaHuang Sep 20, 2023
5da8f4b
address comments & api doc
JingyaHuang Sep 21, 2023
5852370
fix doc
JingyaHuang Sep 21, 2023
581b344
due with name
JingyaHuang Sep 21, 2023
2fb532d
improve api doc
JingyaHuang Sep 21, 2023
0660230
update doc
JingyaHuang Sep 21, 2023
f45dc46
Update docs/source/guides/models.mdx
JingyaHuang Sep 21, 2023
a97552e
Update docs/source/guides/models.mdx
JingyaHuang Sep 21, 2023
f6c6e2f
Update docs/source/guides/models.mdx
JingyaHuang Sep 21, 2023
b786cbb
apply suggestion
JingyaHuang Sep 21, 2023
c17ba14
refactoring other pipes
JingyaHuang Sep 21, 2023
1ff6e9b
Merge branch 'main' into add-sdxl-refiner
JingyaHuang Sep 21, 2023
a04efaf
update sdxl base with neg
JingyaHuang Sep 22, 2023
56c88f6
add refiner export support
JingyaHuang Oct 2, 2023
b7d04d9
finish img2img pipe
JingyaHuang Oct 3, 2023
837c480
add inpaint pipe
JingyaHuang Oct 3, 2023
5c4a0e8
update img2img doc
JingyaHuang Oct 4, 2023
a9348b4
update doc refiner
JingyaHuang Oct 5, 2023
d1c3544
add tests
JingyaHuang Oct 5, 2023
c87b05a
add title for doc
JingyaHuang Oct 5, 2023
9de1e8f
complete docstring
JingyaHuang Oct 6, 2023
873a229
apply comments
JingyaHuang Oct 6, 2023
23869a3
add changes indicators
JingyaHuang Oct 6, 2023
118 changes: 117 additions & 1 deletion docs/source/guides/models.mdx
@@ -67,6 +67,12 @@ And the next time when you want to run inference, just load your compiled model
As you can see, there is no need to pass the neuron arguments used during export, as they are
saved in a `config.json` file and will be restored automatically by the `NeuronModelForXXX` class.

<Tip>

When running inference for the first time, there is a warmup phase that can take 3x-4x the latency of a regular run.

</Tip>
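A framework-agnostic sketch for measuring the warmup effect — the `stable_diffusion` pipeline name in the trailing comments is an assumption for illustration, not part of the API:

```python
import time

def timed_call(fn, *args, **kwargs):
    """Run fn once and return (result, elapsed_seconds)."""
    start = time.perf_counter()
    result = fn(*args, **kwargs)
    return result, time.perf_counter() - start

# Hypothetical usage with a compiled pipeline:
# _, first_latency = timed_call(stable_diffusion, prompt)   # warmup run, ~3x-4x slower
# _, steady_latency = timed_call(stable_diffusion, prompt)  # regular run
```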

## Discriminative NLP models

As explained in the previous section, you only need a few modifications to your Transformers code to export and run NLP models:
@@ -282,6 +288,7 @@ prompt = "ghibli style, a fantasy landscape with snowcapped mountains, trees, la
image = pipeline(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images[0]
image.save("fantasy_landscape.png")
```

`image` | `prompt` | output |
:-------------------------:|:-------------------------:|:-------------------------:
<img src="https://huggingface.co/datasets/optimum/documentation-images/resolve/main/neuron/models/03-sd-img2img-init.png" alt="landscape photo" width="256" height="256"/> | ***ghibli style, a fantasy landscape with snowcapped mountains, trees, lake with detailed reflection. warm colors, 8K*** | <img src="https://huggingface.co/datasets/optimum/documentation-images/resolve/main/neuron/models/04-sd-img2img.png" alt="drawing" width="250"/> |
@@ -322,6 +329,8 @@ image.save("cat_on_bench.png")

## Stable Diffusion XL

### Text-to-Image

Similar to Stable Diffusion, you can use the `NeuronStableDiffusionXLPipeline` API to export and run inference on Neuron devices with SDXL models.

```python
... )
```

Now generate an image with a prompt on neuron:
Now generate an image with a text prompt on neuron:

```python
>>> prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
@@ -355,5 +364,112 @@ Now generate an image with a prompt on neuron:
alt="sdxl generated image"
/>

### Image-to-Image

With `NeuronStableDiffusionXLImg2ImgPipeline`, you can pass an initial image and a text prompt to condition the generated images:

```python
from optimum.neuron import NeuronStableDiffusionXLImg2ImgPipeline
from diffusers.utils import load_image

prompt = "a dog running, lake, moat"
url = "https://huggingface.co/datasets/optimum/documentation-images/resolve/main/intel/openvino/sd_xl/castle_friedrich.png"
init_image = load_image(url).convert("RGB")

pipe = NeuronStableDiffusionXLImg2ImgPipeline.from_pretrained("sd_neuron_xl/", device_ids=[0, 1])
image = pipe(prompt=prompt, image=init_image).images[0]
```

`image` | `prompt` | output |
:-------------------------:|:-------------------------:|:-------------------------:
<img src="https://huggingface.co/datasets/optimum/documentation-images/resolve/main/intel/openvino/sd_xl/castle_friedrich.png" alt="castle photo" width="256" height="256"/> | ***a dog running, lake, moat*** | <img src="https://huggingface.co/datasets/optimum/documentation-images/resolve/main/neuron/models/06-sdxl-img2img.png" alt="castle with dog" width="250"/> |

### Inpaint

With `NeuronStableDiffusionXLInpaintPipeline`, pass the original image and a mask marking the area you want to replace; the masked area is then filled with the content described in the prompt.

```python
from optimum.neuron import NeuronStableDiffusionXLInpaintPipeline
from diffusers.utils import load_image

img_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl-text2img.png"
mask_url = (
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl-inpaint-mask.png"
)

init_image = load_image(img_url).convert("RGB")
mask_image = load_image(mask_url).convert("RGB")
prompt = "A deep sea diver floating"

pipe = NeuronStableDiffusionXLInpaintPipeline.from_pretrained("sd_neuron_xl/", device_ids=[0, 1])
image = pipe(prompt=prompt, image=init_image, mask_image=mask_image, strength=0.85, guidance_scale=12.5).images[0]
```

`image` | `mask_image` | `prompt` | output |
:-------------------------:|:-------------------------:|:-------------------------:|-------------------------:|
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl-text2img.png" alt="drawing" width="250"/> | <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl-inpaint-mask.png" alt="drawing" width="250"/> | ***A deep sea diver floating*** | <img src="https://huggingface.co/datasets/optimum/documentation-images/resolve/main/neuron/models/07-sdxl-inpaint.png" alt="drawing" width="250"/> |
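If you don't have a ready-made mask, one can be drawn with Pillow — white pixels mark the region the pipeline repaints (the size and rectangle coordinates below are arbitrary, for illustration only):

```python
from PIL import Image, ImageDraw

def make_rect_mask(size, box):
    """Return an RGB mask: white rectangle (area to replace) on a black background."""
    mask = Image.new("L", size, 0)                 # start fully black (keep everything)
    ImageDraw.Draw(mask).rectangle(box, fill=255)  # white = area to repaint
    return mask.convert("RGB")

mask_image = make_rect_mask((1024, 1024), (300, 200, 724, 824))
```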

### Refine Image Quality

SDXL includes a [refiner model](https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0) specialized in denoising the low-noise-stage images generated by the base model. There are two ways to use the refiner:

1. Use the base and refiner models together to produce a refined image.
2. Use the base model to produce an image, then use the refiner model to add more detail to it.

#### Base + refiner model

```python
from optimum.neuron import NeuronStableDiffusionXLPipeline, NeuronStableDiffusionXLImg2ImgPipeline

prompt = "A majestic lion jumping from a big stone at night"
num_images_per_prompt = 1
base = NeuronStableDiffusionXLPipeline.from_pretrained("sd_neuron_xl/", device_ids=[0, 1])
image = base(
prompt=prompt,
num_images_per_prompt=num_images_per_prompt,
num_inference_steps=40,
denoising_end=0.8,
output_type="latent",
).images[0]
del base # To avoid neuron device OOM

refiner = NeuronStableDiffusionXLImg2ImgPipeline.from_pretrained("sd_neuron_xl_refiner/", device_ids=[0, 1])
image = refiner(
prompt=prompt,
num_inference_steps=40,
denoising_start=0.8,
image=image,
).images[0]
```
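`denoising_end`/`denoising_start` split the noise schedule between the two models; roughly (actual schedulers may round the boundary differently):

```python
num_inference_steps = 40
denoising_end = 0.8  # the base model handles the first 80% of the schedule

base_steps = int(num_inference_steps * denoising_end)  # steps run by the base model
refiner_steps = num_inference_steps - base_steps       # steps left for the refiner
print(base_steps, refiner_steps)                       # prints: 32 8
```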

<img
src="https://huggingface.co/datasets/optimum/documentation-images/resolve/main/neuron/models/08-sdxl-base-refine.png"
width="256"
height="256"
alt="sdxl base + refiner"
/>

#### Base to refiner model

```python
from optimum.neuron import NeuronStableDiffusionXLPipeline, NeuronStableDiffusionXLImg2ImgPipeline

prompt = "A majestic lion jumping from a big stone at night"
base = NeuronStableDiffusionXLPipeline.from_pretrained("sd_neuron_xl/", device_ids=[0, 1])
image = base(prompt=prompt, output_type="latent").images[0]
del base # To avoid neuron device OOM

refiner = NeuronStableDiffusionXLImg2ImgPipeline.from_pretrained("sd_neuron_xl_refiner/", device_ids=[0, 1])
image = refiner(prompt=prompt, image=image[None, :]).images[0]
```
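`image[None, :]` adds the leading batch dimension the refiner expects; the same indexing is illustrated here with NumPy as a stand-in for the latent tensor (the latent shape is assumed):

```python
import numpy as np

latent = np.zeros((4, 128, 128))  # stand-in for a single latent image
batched = latent[None, :]         # prepend a batch axis, as in image[None, :]
print(batched.shape)              # prints: (1, 4, 128, 128)
```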

`Base Image` | Refined Image |
:-------------------------:|-------------------------:|
<img src="https://huggingface.co/datasets/optimum/documentation-images/resolve/main/neuron/models/09-sdxl-base-full.png" alt="drawing" width="250"/> | <img src="https://huggingface.co/datasets/optimum/documentation-images/resolve/main/neuron/models/010-sdxl-refiner-detailed.png" alt="drawing" width="250"/> |

<Tip>

To avoid running out of memory on the Neuron device, it is suggested to finish all base-model inference and release the device memory before running the refiner.

</Tip>

Happy inference with Neuron! 🚀
8 changes: 8 additions & 0 deletions docs/source/package_reference/modeling.mdx
@@ -87,4 +87,12 @@ The following Neuron model classes are available for natural language processing
### NeuronStableDiffusionXLPipeline

[[autodoc]] modeling_diffusion.NeuronStableDiffusionXLPipeline
- __call__

### NeuronStableDiffusionXLImg2ImgPipeline
[[autodoc]] modeling_diffusion.NeuronStableDiffusionXLImg2ImgPipeline
- __call__

### NeuronStableDiffusionXLInpaintPipeline
[[autodoc]] modeling_diffusion.NeuronStableDiffusionXLInpaintPipeline
- __call__
19 changes: 14 additions & 5 deletions optimum/exporters/neuron/__main__.py
@@ -138,7 +138,12 @@ def infer_stable_diffusion_shapes_from_diffusers(
input_shapes: Dict[str, Dict[str, int]],
model: Union["StableDiffusionPipeline", "StableDiffusionXLPipeline"],
):
sequence_length = model.tokenizer.model_max_length
if model.tokenizer is not None:
sequence_length = model.tokenizer.model_max_length
elif hasattr(model, "tokenizer_2") and model.tokenizer_2 is not None:
sequence_length = model.tokenizer_2.model_max_length
else:
raise AttributeError(f"Cannot infer sequence_length from {type(model)} as it has no tokenizer attribute.")
unet_num_channels = model.unet.config.in_channels
vae_encoder_num_channels = model.vae.config.in_channels
vae_decoder_num_channels = model.vae.config.latent_channels
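The tokenizer fallback introduced above (the SDXL refiner checkpoint has no first tokenizer) can be sketched as a standalone helper — a simplified illustration, not the exporter's actual code:

```python
def infer_sequence_length(model):
    """Prefer model.tokenizer, fall back to model.tokenizer_2 (e.g. the SDXL refiner)."""
    if getattr(model, "tokenizer", None) is not None:
        return model.tokenizer.model_max_length
    if getattr(model, "tokenizer_2", None) is not None:
        return model.tokenizer_2.model_max_length
    raise AttributeError(f"Cannot infer sequence_length from {type(model)}: no tokenizer attribute.")
```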
@@ -227,8 +232,9 @@ def main_export(

# Saving the model config and preprocessor as this is needed sometimes.
model.scheduler.save_pretrained(output.joinpath("scheduler"))
model.tokenizer.save_pretrained(output.joinpath("tokenizer"))
if hasattr(model, "tokenizer_2"):
if hasattr(model, "tokenizer") and model.tokenizer is not None:
model.tokenizer.save_pretrained(output.joinpath("tokenizer"))
if hasattr(model, "tokenizer_2") and model.tokenizer_2 is not None:
model.tokenizer_2.save_pretrained(output.joinpath("tokenizer_2"))
if hasattr(model, "feature_extractor"):
model.feature_extractor.save_pretrained(output.joinpath("feature_extractor"))
**input_shapes,
)
output_model_names = {
DIFFUSION_MODEL_TEXT_ENCODER_NAME: os.path.join(DIFFUSION_MODEL_TEXT_ENCODER_NAME, NEURON_FILE_NAME),
DIFFUSION_MODEL_UNET_NAME: os.path.join(DIFFUSION_MODEL_UNET_NAME, NEURON_FILE_NAME),
DIFFUSION_MODEL_VAE_ENCODER_NAME: os.path.join(DIFFUSION_MODEL_VAE_ENCODER_NAME, NEURON_FILE_NAME),
DIFFUSION_MODEL_VAE_DECODER_NAME: os.path.join(DIFFUSION_MODEL_VAE_DECODER_NAME, NEURON_FILE_NAME),
}
if hasattr(model, "text_encoder_2"):
if hasattr(model, "text_encoder") and model.text_encoder is not None:
output_model_names[DIFFUSION_MODEL_TEXT_ENCODER_NAME] = os.path.join(
DIFFUSION_MODEL_TEXT_ENCODER_NAME, NEURON_FILE_NAME
)
if hasattr(model, "text_encoder_2") and model.text_encoder_2 is not None:
output_model_names[DIFFUSION_MODEL_TEXT_ENCODER_2_NAME] = os.path.join(
DIFFUSION_MODEL_TEXT_ENCODER_2_NAME, NEURON_FILE_NAME
)
4 changes: 4 additions & 0 deletions optimum/exporters/neuron/utils.py
@@ -19,6 +19,7 @@
from collections import OrderedDict
from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union

import torch
from transformers import PretrainedConfig

from ...neuron.utils import (
@@ -282,6 +283,9 @@ def _get_submodels_for_export_stable_diffusion(
)
models_for_export.append((DIFFUSION_MODEL_UNET_NAME, copy.deepcopy(pipeline.unet)))

if pipeline.vae.config.get("force_upcast", None) is True:
pipeline.vae.to(dtype=torch.float32)

# VAE Encoder
vae_encoder = copy.deepcopy(pipeline.vae)
vae_encoder.forward = lambda sample: {"latent_sample": vae_encoder.encode(x=sample)["latent_dist"].sample()}
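The export trick above — deep-copying the VAE and overriding `forward` so that tracing captures only the encode path — can be mimicked with a plain dummy object (a sketch; the real pipeline uses a diffusers `AutoencoderKL`):

```python
import copy

class DummyVAE:
    """Stand-in for an autoencoder whose encode() returns a dict-like result."""
    def encode(self, x):
        return {"latent_dist": [v * 0.5 for v in x]}  # placeholder transform

vae_encoder = copy.deepcopy(DummyVAE())
# Route forward through encode, mirroring the lambda used for the Neuron export
vae_encoder.forward = lambda sample: {"latent_sample": vae_encoder.encode(sample)["latent_dist"]}

out = vae_encoder.forward([2.0, 4.0])  # → {"latent_sample": [1.0, 2.0]}
```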
4 changes: 4 additions & 0 deletions optimum/neuron/__init__.py
@@ -37,6 +37,8 @@
"NeuronStableDiffusionImg2ImgPipeline",
"NeuronStableDiffusionInpaintPipeline",
"NeuronStableDiffusionXLPipeline",
"NeuronStableDiffusionXLImg2ImgPipeline",
"NeuronStableDiffusionXLInpaintPipeline",
],
"modeling_decoder": ["NeuronDecoderModel"],
"accelerate": [
@@ -65,6 +67,8 @@
NeuronStableDiffusionImg2ImgPipeline,
NeuronStableDiffusionInpaintPipeline,
NeuronStableDiffusionPipeline,
NeuronStableDiffusionXLImg2ImgPipeline,
NeuronStableDiffusionXLInpaintPipeline,
NeuronStableDiffusionXLPipeline,
)
from .pipelines import pipeline