From e0d6c958dc3a5ce3c67b46d10d7f5224557a346d Mon Sep 17 00:00:00 2001
From: Linoy Tsaban <57615435+linoytsaban@users.noreply.github.com>
Date: Tue, 3 Sep 2024 09:54:09 +0300
Subject: [PATCH] improve image-to-image task page (#867)

some changes to improve clarity of task description, and general updates
to improve task page

---------

Co-authored-by: Pedro Cuenca
Co-authored-by: Merve Noyan
Co-authored-by: Omar Sanseviero
---
 .../tasks/src/tasks/image-to-image/about.md | 135 +++++++++++++++++++-----
 .../tasks/src/tasks/image-to-image/data.ts  |   2 +-
 2 files changed, 114 insertions(+), 23 deletions(-)

diff --git a/packages/tasks/src/tasks/image-to-image/about.md b/packages/tasks/src/tasks/image-to-image/about.md
index 63f490f82..3750b34e5 100644
--- a/packages/tasks/src/tasks/image-to-image/about.md
+++ b/packages/tasks/src/tasks/image-to-image/about.md
@@ -1,15 +1,10 @@
-## Use Cases
-
-### Style transfer
+Image-to-image pipelines can also be used in text-to-image tasks to provide visual guidance to the text-guided generation process.
 
-One of the most popular use cases of image-to-image is style transfer. Style transfer models can convert a normal photography into a painting in the style of a famous painter.
-
-## Task Variants
+## Use Cases
 
 ### Image inpainting
 
-Image inpainting is widely used during photography editing to remove unwanted objects, such as poles, wires, or sensor
-dust.
+Image inpainting is widely used during photography editing to remove unwanted objects, such as poles, wires, or sensor dust.
 
 ### Image colorization
 
@@ -24,18 +19,49 @@ Super-resolution models increase the resolution of an image, allowing for higher
-You can use pipelines for image-to-image in 🧨diffusers library to easily use image-to-image models. See an example for `StableDiffusionImg2ImgPipeline` below.
+You can use image-to-image pipelines in the 🧨diffusers library to easily run image-to-image models. See an example with `AutoPipelineForImage2Image` below.
 
 ```python
-from PIL import Image
-from diffusers import StableDiffusionImg2ImgPipeline
+import torch
+from diffusers import AutoPipelineForImage2Image
+from diffusers.utils import make_image_grid, load_image
 
-model_id_or_path = "runwayml/stable-diffusion-v1-5"
-pipe = StableDiffusionImg2ImgPipeline.from_pretrained(model_id_or_path, torch_dtype=torch.float16)
-pipe = pipe.to(cuda)
+pipeline = AutoPipelineForImage2Image.from_pretrained(
+    "stabilityai/stable-diffusion-xl-refiner-1.0", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
+)
 
-init_image = Image.open("mountains_image.jpeg").convert("RGB").resize((768, 512))
-prompt = "A fantasy landscape, trending on artstation"
+# SDXL is a large model, so offloading it to the CPU while idle
+# reduces memory usage without hurting performance
+pipeline.enable_model_cpu_offload()
 
-images = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images
-images[0].save("fantasy_landscape.png")
+# prepare the input image
+url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-sdxl-init.png"
+init_image = load_image(url)
+
+prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
+
+# pass the prompt and image to the pipeline
+image = pipeline(prompt, image=init_image, strength=0.5).images[0]
+make_image_grid([init_image, image], rows=1, cols=2)
 ```
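+
+Inpainting pipelines follow the same pattern, but also take a mask image that marks the region to regenerate. The snippet below is a minimal sketch rather than a canonical recipe: it assumes `AutoPipelineForInpainting`, an example inpainting checkpoint, and placeholder file paths.
+
+```python
+import torch
+from diffusers import AutoPipelineForInpainting
+from diffusers.utils import load_image
+
+# example checkpoint - any inpainting-capable model can be used here
+pipeline = AutoPipelineForInpainting.from_pretrained(
+    "stabilityai/stable-diffusion-2-inpainting", torch_dtype=torch.float16
+)
+pipeline.enable_model_cpu_offload()
+
+# placeholder paths - the mask is white where content should be regenerated
+init_image = load_image("photo.png")
+mask_image = load_image("photo_mask.png")
+
+prompt = "a clear blue sky"
+image = pipeline(prompt=prompt, image=init_image, mask_image=mask_image).images[0]
+image.save("inpainted.png")
+```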
 
 You can use [huggingface.js](https://github.com/huggingface/huggingface.js) to infer image-to-image models on Hugging Face Hub.
@@ -53,13 +79,73 @@ await inference.imageToImage({
 });
 ```
 
-## ControlNet
+## Use Cases for Text-Guided Image Generation
 
-Controlling the outputs of diffusion models only with a text prompt is a challenging problem. ControlNet is a neural network model that provides image-based control to diffusion models. Control images can be edges or other landmarks extracted from a source image.
+### Style Transfer
+
+One of the most popular use cases of image-to-image is style transfer. With style transfer models:
 
-Many ControlNet models were trained in our community event, JAX Diffusers sprint. You can see the full list of the ControlNet models available [here](https://huggingface.co/spaces/jax-diffusers-event/leaderboard).
+- a regular photo can be transformed into a variety of artistic styles or genres, such as a watercolor painting, a comic book illustration, and more.
+- new images can be generated using a text prompt, in the style of a reference input image.
+
+See the 🧨diffusers example for style transfer with `AutoPipelineForText2Image` below.
+
+```python
+from diffusers import AutoPipelineForText2Image
+from diffusers.utils import load_image
+import torch
+
+# load the pipeline
+pipeline = AutoPipelineForText2Image.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16).to("cuda")
+pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin")
+
+# set the adapter scales - the IP-Adapter adds style control from a reference image to the text-to-image model
+scale = {
+    "down": {"block_2": [0.0, 1.0]},
+    "up": {"block_0": [0.0, 1.0, 0.0]},
+}
+pipeline.set_ip_adapter_scale(scale)
+
+style_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg")
+
+generator = torch.Generator(device="cpu").manual_seed(26)
+image = pipeline(
+    prompt="a cat, masterpiece, best quality, high quality",
+    ip_adapter_image=style_image,
+    negative_prompt="text, watermark, lowres, low quality, worst quality, deformed, glitch, low contrast, noisy, saturation, blurry",
+    guidance_scale=5,
+    num_inference_steps=30,
+    generator=generator,
+).images[0]
+image
+```
+
+### ControlNet
+
+Controlling the outputs of diffusion models with only a text prompt is a challenging problem. ControlNet is a neural network model that provides image-based control to diffusion models. Control images can be edges or other landmarks extracted from a source image.
+![Examples](https://huggingface.co/datasets/optimum/documentation-images/resolve/main/neuron/models/12-sdxl-text2img-controlnet.png)
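+
+As a minimal sketch of how this works in 🧨diffusers (the checkpoint names and the control-image path below are illustrative, not prescriptive), a ControlNet is loaded alongside a base Stable Diffusion pipeline:
+
+```python
+import torch
+from diffusers import StableDiffusionControlNetPipeline, ControlNetModel
+from diffusers.utils import load_image
+
+# example checkpoints: a ControlNet conditioned on canny edge maps, plus a base model
+controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
+pipeline = StableDiffusionControlNetPipeline.from_pretrained(
+    "stable-diffusion-v1-5/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16
+)
+pipeline.enable_model_cpu_offload()
+
+# placeholder path - a canny edge map extracted from a source image
+canny_image = load_image("canny_edges.png")
+
+image = pipeline("a futuristic city at night", image=canny_image, num_inference_steps=30).images[0]
+```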
 
-## Most Used Model for the Task
+## Pix2Pix
 
 Pix2Pix is a popular model used for image-to-image translation tasks. It is based on a conditional-GAN (generative adversarial network) where instead of a noise vector a 2D image is given as input. More information about Pix2Pix can be retrieved from this [link](https://phillipi.github.io/pix2pix/) where the associated paper and the GitHub repository can be found.
 
@@ -70,8 +156,13 @@ The images below show some examples extracted from the Pix2Pix paper. This model
 
 ## Useful Resources
 
 - [Image-to-image guide with diffusers](https://huggingface.co/docs/diffusers/using-diffusers/img2img)
+- Image inpainting: [inpainting with 🧨diffusers](https://huggingface.co/docs/diffusers/main/en/api/pipelines/stable_diffusion/inpaint), [demo](https://huggingface.co/spaces/diffusers/stable-diffusion-xl-inpainting)
+- Colorization: [demo](https://huggingface.co/spaces/modelscope/old_photo_restoration)
+- Super-resolution: [image upscaling with 🧨diffusers](https://huggingface.co/docs/diffusers/main/en/api/pipelines/stable_diffusion/upscale#super-resolution), [demo](https://huggingface.co/spaces/radames/Enhance-This-HiDiffusion-SDXL)
+- [Style transfer and layout control with diffusers 🧨](https://huggingface.co/docs/diffusers/main/en/using-diffusers/ip_adapter#style--layout-control)
 - [Train your ControlNet with diffusers 🧨](https://huggingface.co/blog/train-your-controlnet)
 - [Ultra fast ControlNet with 🧨 Diffusers](https://huggingface.co/blog/controlnet)
+- [List of ControlNets trained in the community JAX Diffusers sprint](https://huggingface.co/spaces/jax-diffusers-event/leaderboard)
 
 ## References
diff --git a/packages/tasks/src/tasks/image-to-image/data.ts b/packages/tasks/src/tasks/image-to-image/data.ts
index 99e91557a..65200fd92 100644
--- a/packages/tasks/src/tasks/image-to-image/data.ts
+++ b/packages/tasks/src/tasks/image-to-image/data.ts
@@ -93,7 +93,7 @@ const taskData: TaskDataCustom = {
 		},
 	],
 	summary:
-		"Image-to-image is the task of transforming a source image to match the characteristics of a target image or a target image domain. Any image manipulation and enhancement is possible with image to image models.",
+		"Image-to-image is the task of transforming an input image through a variety of possible manipulations and enhancements, such as super-resolution, image inpainting, colorization, and more.",
 	widgetModels: ["lllyasviel/sd-controlnet-canny"],
 	youtubeId: "",
 };