From e0d6c958dc3a5ce3c67b46d10d7f5224557a346d Mon Sep 17 00:00:00 2001
From: Linoy Tsaban <57615435+linoytsaban@users.noreply.github.com>
Date: Tue, 3 Sep 2024 09:54:09 +0300
Subject: [PATCH] improve image-to-image task page (#867)

some changes to improve clarity of task description, and general updates
to improve task page

---------

Co-authored-by: Pedro Cuenca
Co-authored-by: Merve Noyan
Co-authored-by: Omar Sanseviero
---
 .../tasks/src/tasks/image-to-image/about.md | 135 +++++++++++++++++++-----
 .../tasks/src/tasks/image-to-image/data.ts  |   2 +-
 2 files changed, 114 insertions(+), 23 deletions(-)

diff --git a/packages/tasks/src/tasks/image-to-image/about.md b/packages/tasks/src/tasks/image-to-image/about.md
index 63f490f82..3750b34e5 100644
--- a/packages/tasks/src/tasks/image-to-image/about.md
+++ b/packages/tasks/src/tasks/image-to-image/about.md
@@ -1,15 +1,10 @@
-## Use Cases
-
-### Style transfer
+Image-to-image pipelines can also be used in text-to-image tasks to provide visual guidance to the text-guided generation process.
 
-One of the most popular use cases of image-to-image is style transfer. Style transfer models can convert a normal photography into a painting in the style of a famous painter.
-
-## Task Variants
+## Use Cases
 
 ### Image inpainting
 
-Image inpainting is widely used during photography editing to remove unwanted objects, such as poles, wires, or sensor
-dust.
+Image inpainting is widely used during photography editing to remove unwanted objects, such as poles, wires, or sensor dust.
 
 ### Image colorization
 
@@ -24,18 +19,49 @@ Super-resolution models increase the resolution of an image, allowing for higher
-You can use pipelines for image-to-image in 🧨diffusers library to easily use image-to-image models. See an example for `StableDiffusionImg2ImgPipeline` below.
+You can use image-to-image pipelines in the 🧨diffusers library to easily run image-to-image models. See an example with `AutoPipelineForImage2Image` below.
 
 ```python
-from PIL import Image
-from diffusers import StableDiffusionImg2ImgPipeline
+import torch
+from diffusers import AutoPipelineForImage2Image
+from diffusers.utils import make_image_grid, load_image
 
-model_id_or_path = "runwayml/stable-diffusion-v1-5"
-pipe = StableDiffusionImg2ImgPipeline.from_pretrained(model_id_or_path, torch_dtype=torch.float16)
-pipe = pipe.to(cuda)
+pipeline = AutoPipelineForImage2Image.from_pretrained(
+    "stabilityai/stable-diffusion-xl-refiner-1.0", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
+)
 
-init_image = Image.open("mountains_image.jpeg").convert("RGB").resize((768, 512))
-prompt = "A fantasy landscape, trending on artstation"
+# SDXL is a large model, so offloading it to the CPU while idle
+# reduces memory usage without hurting performance
+pipeline.enable_model_cpu_offload()
 
-images = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images
-images[0].save("fantasy_landscape.png")
+# prepare the input image
+url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-sdxl-init.png"
+init_image = load_image(url)
+
+prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
+
+# pass the prompt and image to the pipeline
+image = pipeline(prompt, image=init_image, strength=0.5).images[0]
+make_image_grid([init_image, image], rows=1, cols=2)
 ```
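+
+Inpainting pipelines follow the same pattern, but also take a mask image that marks the region to regenerate. The snippet below is a minimal sketch rather than a canonical recipe: it assumes `AutoPipelineForInpainting`, an example inpainting checkpoint, and placeholder file paths.
+
+```python
+import torch
+from diffusers import AutoPipelineForInpainting
+from diffusers.utils import load_image
+
+# example checkpoint - any inpainting-capable model can be used here
+pipeline = AutoPipelineForInpainting.from_pretrained(
+    "stabilityai/stable-diffusion-2-inpainting", torch_dtype=torch.float16
+)
+pipeline.enable_model_cpu_offload()
+
+# placeholder paths - the mask is white where content should be regenerated
+init_image = load_image("photo.png")
+mask_image = load_image("photo_mask.png")
+
+prompt = "a clear blue sky"
+image = pipeline(prompt=prompt, image=init_image, mask_image=mask_image).images[0]
+image.save("inpainted.png")
+```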
 
 You can use [huggingface.js](https://github.com/huggingface/huggingface.js) to infer image-to-image models on Hugging Face Hub.
@@ -53,13 +79,73 @@ await inference.imageToImage({
 });
 ```
 
-## ControlNet
+## Use Cases for Text-Guided Image Generation
 
-Controlling the outputs of diffusion models only with a text prompt is a challenging problem. ControlNet is a neural network model that provides image-based control to diffusion models. Control images can be edges or other landmarks extracted from a source image.
+### Style Transfer
+
+One of the most popular use cases of image-to-image is style transfer. With style transfer models:
 
-Many ControlNet models were trained in our community event, JAX Diffusers sprint. You can see the full list of the ControlNet models available [here](https://huggingface.co/spaces/jax-diffusers-event/leaderboard).
+- a regular photo can be transformed into a variety of artistic styles or genres, such as a watercolor painting, a comic book illustration, and more.
+- new images can be generated using a text prompt, in the style of a reference input image.
+
+See the 🧨diffusers example for style transfer with `AutoPipelineForText2Image` below.
+
+```python
+from diffusers import AutoPipelineForText2Image
+from diffusers.utils import load_image
+import torch
+
+# load the pipeline
+pipeline = AutoPipelineForText2Image.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16).to("cuda")
+pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin")
+
+# set the adapter scales - the IP-Adapter adds style control from a reference image to the text-to-image model
+scale = {
+    "down": {"block_2": [0.0, 1.0]},
+    "up": {"block_0": [0.0, 1.0, 0.0]},
+}
+pipeline.set_ip_adapter_scale(scale)
+
+style_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg")
+
+generator = torch.Generator(device="cpu").manual_seed(26)
+image = pipeline(
+    prompt="a cat, masterpiece, best quality, high quality",
+    ip_adapter_image=style_image,
+    negative_prompt="text, watermark, lowres, low quality, worst quality, deformed, glitch, low contrast, noisy, saturation, blurry",
+    guidance_scale=5,
+    num_inference_steps=30,
+    generator=generator,
+).images[0]
+image
+```
+
+### ControlNet
+
+Controlling the outputs of diffusion models with only a text prompt is a challenging problem. ControlNet is a neural network model that provides image-based control to diffusion models. Control images can be edges or other landmarks extracted from a source image.
+![Examples](https://huggingface.co/datasets/optimum/documentation-images/resolve/main/neuron/models/12-sdxl-text2img-controlnet.png)
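+
+As a minimal sketch of how this works in 🧨diffusers (the checkpoint names and the control-image path below are illustrative, not prescriptive), a ControlNet is loaded alongside a base Stable Diffusion pipeline:
+
+```python
+import torch
+from diffusers import StableDiffusionControlNetPipeline, ControlNetModel
+from diffusers.utils import load_image
+
+# example checkpoints: a ControlNet conditioned on canny edge maps, plus a base model
+controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
+pipeline = StableDiffusionControlNetPipeline.from_pretrained(
+    "stable-diffusion-v1-5/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16
+)
+pipeline.enable_model_cpu_offload()
+
+# placeholder path - a canny edge map extracted from a source image
+canny_image = load_image("canny_edges.png")
+
+image = pipeline("a futuristic city at night", image=canny_image, num_inference_steps=30).images[0]
+```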
 
-## Most Used Model for the Task
+## Pix2Pix
 
 Pix2Pix is a popular model used for image-to-image translation tasks. It is based on a conditional-GAN (generative adversarial network) where instead of a noise vector a 2D image is given as input. More information about Pix2Pix can be retrieved from this [link](https://phillipi.github.io/pix2pix/) where the associated paper and the GitHub repository can be found.
 
@@ -70,8 +156,13 @@ The images below show some examples extracted from the Pix2Pix paper. This model
 
 ## Useful Resources
 
 - [Image-to-image guide with diffusers](https://huggingface.co/docs/diffusers/using-diffusers/img2img)
+- Image inpainting: [inpainting with 🧨diffusers](https://huggingface.co/docs/diffusers/main/en/api/pipelines/stable_diffusion/inpaint), [demo](https://huggingface.co/spaces/diffusers/stable-diffusion-xl-inpainting)
+- Colorization: [demo](https://huggingface.co/spaces/modelscope/old_photo_restoration)
+- Super-resolution: [image upscaling with 🧨diffusers](https://huggingface.co/docs/diffusers/main/en/api/pipelines/stable_diffusion/upscale#super-resolution), [demo](https://huggingface.co/spaces/radames/Enhance-This-HiDiffusion-SDXL)
+- [Style transfer and layout control with diffusers 🧨](https://huggingface.co/docs/diffusers/main/en/using-diffusers/ip_adapter#style--layout-control)
 - [Train your ControlNet with diffusers 🧨](https://huggingface.co/blog/train-your-controlnet)
 - [Ultra fast ControlNet with 🧨 Diffusers](https://huggingface.co/blog/controlnet)
+- [List of ControlNets trained in the community JAX Diffusers sprint](https://huggingface.co/spaces/jax-diffusers-event/leaderboard)
 
 ## References
diff --git a/packages/tasks/src/tasks/image-to-image/data.ts b/packages/tasks/src/tasks/image-to-image/data.ts
index 99e91557a..65200fd92 100644
--- a/packages/tasks/src/tasks/image-to-image/data.ts
+++ b/packages/tasks/src/tasks/image-to-image/data.ts
@@ -93,7 +93,7 @@ const taskData: TaskDataCustom = {
 		},
 	],
 	summary:
-		"Image-to-image is the task of transforming a source image to match the characteristics of a target image or a target image domain. Any image manipulation and enhancement is possible with image to image models.",
+		"Image-to-image is the task of transforming an input image through a variety of possible manipulations and enhancements, such as super-resolution, image inpainting, colorization, and more.",
 	widgetModels: ["lllyasviel/sd-controlnet-canny"],
 	youtubeId: "",
 };