
Commit

update
JingyaHuang committed Jan 9, 2024
1 parent 22d4565 commit 8b1c101
Showing 5 changed files with 185 additions and 66 deletions.
1 change: 1 addition & 0 deletions optimum/exporters/neuron/__init__.py
@@ -15,6 +15,7 @@

from .__main__ import (
infer_stable_diffusion_shapes_from_diffusers,
infer_stable_video_diffusion_shapes_from_diffusers,
main_export,
normalize_input_shapes,
)
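For reference, a minimal smoke test of the import surface after this change; a sketch assuming the commit is installed together with the diffusers extra, not part of the diff itself:

# Hypothetical smoke test for the re-export added above (assumes this
# commit is installed with its diffusers dependency).
from optimum.exporters.neuron import (
    infer_stable_diffusion_shapes_from_diffusers,
    infer_stable_video_diffusion_shapes_from_diffusers,
    main_export,
    normalize_input_shapes,
)

# The new helper is defined in utils.py (see below) and surfaced here
# via __main__.py:
assert infer_stable_video_diffusion_shapes_from_diffusers.__module__.endswith("utils")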
114 changes: 54 additions & 60 deletions optimum/exporters/neuron/__main__.py
@@ -21,13 +21,13 @@
from pathlib import Path
from typing import TYPE_CHECKING, Any, Dict, Optional, Union

from requests.exceptions import ConnectionError as RequestsConnectionError
from transformers import AutoConfig, PretrainedConfig

from ...neuron.utils import (
DECODER_NAME,
DIFFUSION_MODEL_TEXT_ENCODER_2_NAME,
DIFFUSION_MODEL_TEXT_ENCODER_NAME,
DIFFUSION_MODEL_IMAGE_ENCODER_NAME,
DIFFUSION_MODEL_UNET_NAME,
DIFFUSION_MODEL_VAE_DECODER_NAME,
DIFFUSION_MODEL_VAE_ENCODER_NAME,
@@ -44,6 +44,9 @@
from .convert import export_models, validate_models_outputs
from .model_configs import * # noqa: F403
from .utils import (
infer_task,
infer_stable_diffusion_shapes_from_diffusers,
infer_stable_video_diffusion_shapes_from_diffusers,
build_stable_diffusion_components_mandatory_shapes,
build_stable_video_diffusion_components_mandatory_shapes,
get_encoder_decoder_models_for_export,
@@ -63,15 +66,12 @@

NEURON_COMPILER = "Neuronx"

if is_diffusers_available():
from diffusers import StableDiffusionXLPipeline


if TYPE_CHECKING:
from transformers import PreTrainedModel

if is_diffusers_available():
from diffusers import DiffusionPipeline, StableDiffusionPipeline
from diffusers import DiffusionPipeline


logger = logging.get_logger()
@@ -91,22 +91,6 @@ def infer_compiler_kwargs(args: argparse.Namespace) -> Dict[str, Any]:
return compiler_kwargs


def infer_task(task: str, model_name_or_path: str) -> str:
if task == "auto":
try:
task = TasksManager.infer_task_from_model(model_name_or_path)
except KeyError as e:
raise KeyError(
"The task could not be automatically inferred. Please provide the argument --task with the task "
f"from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}"
)
except RequestsConnectionError as e:
raise RequestsConnectionError(
f"The task could not be automatically inferred as this is available only for models hosted on the Hugging Face Hub. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}"
)
return task


def customize_optional_outputs(args: argparse.Namespace) -> Dict[str, bool]:
"""
Customize optional outputs of the traced model, eg. if `output_attentions=True`, the attentions tensors will be traced.
@@ -207,44 +191,6 @@ def _normalize_stable_video_diffusion_input_shapes(
return input_shapes


def infer_stable_diffusion_shapes_from_diffusers(
input_shapes: Dict[str, Dict[str, int]],
model: Union["StableDiffusionPipeline", "StableDiffusionXLPipeline"],
):
if model.tokenizer is not None:
sequence_length = model.tokenizer.model_max_length
elif hasattr(model, "tokenizer_2") and model.tokenizer_2 is not None:
sequence_length = model.tokenizer_2.model_max_length
else:
raise AttributeError(f"Cannot infer sequence_length from {type(model)} as there is no tokenizer as attribute.")
unet_num_channels = model.unet.config.in_channels
vae_encoder_num_channels = model.vae.config.in_channels
vae_decoder_num_channels = model.vae.config.latent_channels
vae_scale_factor = 2 ** (len(model.vae.config.block_out_channels) - 1) or 8
height = input_shapes["unet_input_shapes"]["height"]
scaled_height = height // vae_scale_factor
width = input_shapes["unet_input_shapes"]["width"]
scaled_width = width // vae_scale_factor

input_shapes["text_encoder_input_shapes"].update({"sequence_length": sequence_length})
input_shapes["unet_input_shapes"].update(
{
"sequence_length": sequence_length,
"num_channels": unet_num_channels,
"height": scaled_height,
"width": scaled_width,
}
)
input_shapes["vae_encoder_input_shapes"].update(
{"num_channels": vae_encoder_num_channels, "height": height, "width": width}
)
input_shapes["vae_decoder_input_shapes"].update(
{"num_channels": vae_decoder_num_channels, "height": scaled_height, "width": scaled_width}
)

return input_shapes


def _get_submodels_and_neuron_configs(
model: Union["PreTrainedModel", "DiffusionPipeline"],
input_shapes: Dict[str, int],
@@ -261,7 +207,16 @@ def _get_submodels_and_neuron_configs(
getattr(model.config, "is_encoder_decoder", False) if isinstance(model.config, PretrainedConfig) else False
)

if is_stable_diffusion:
if task == "stable-video-diffusion":
# TODO: Enable optional outputs for Stable Video Diffusion
if output_attentions or output_hidden_states:
raise ValueError(
f"`output_attentions` and `output_hidden_states` are not supported by the {task} task yet."
)
models_and_neuron_configs, output_model_names = _get_submodels_and_neuron_configs_for_stable_video_diffusion(
model, input_shapes, task, output, dynamic_batch_size,
)
elif is_stable_diffusion:
# TODO: Enable optional outputs for Stable Diffusion
if output_attentions or output_hidden_states:
raise ValueError(
@@ -351,6 +306,45 @@ def _get_submodels_and_neuron_configs_for_stable_diffusion(
return models_and_neuron_configs, output_model_names


def _get_submodels_and_neuron_configs_for_stable_video_diffusion(
model: Union["PreTrainedModel", "DiffusionPipeline"],
input_shapes: Dict[str, int],
task: str,
output: Path,
dynamic_batch_size: bool = False,
):
if is_neuron_available():
raise RuntimeError(
"Stable diffusion export is not supported by neuron-cc on inf1, please use neuronx-cc on either inf2/trn1 instead."
)
input_shapes = infer_stable_video_diffusion_shapes_from_diffusers(input_shapes, model)

# Save the scheduler, feature extractor and model config, as they are needed to reload the pipeline.
model.scheduler.save_pretrained(output.joinpath("scheduler"))
if getattr(model, "feature_extractor", None) is not None:
model.feature_extractor.save_pretrained(output.joinpath("feature_extractor"))
model.save_config(output)

models_and_neuron_configs = get_stable_diffusion_models_for_export(
pipeline=model,
task=task,
dynamic_batch_size=dynamic_batch_size,
**input_shapes,
)
output_model_names = {
DIFFUSION_MODEL_UNET_NAME: os.path.join(DIFFUSION_MODEL_UNET_NAME, NEURON_FILE_NAME),
DIFFUSION_MODEL_VAE_ENCODER_NAME: os.path.join(DIFFUSION_MODEL_VAE_ENCODER_NAME, NEURON_FILE_NAME),
DIFFUSION_MODEL_VAE_DECODER_NAME: os.path.join(DIFFUSION_MODEL_VAE_DECODER_NAME, NEURON_FILE_NAME),
}
if getattr(model, "image_encoder", None) is not None:
output_model_names[DIFFUSION_MODEL_IMAGE_ENCODER_NAME] = os.path.join(
DIFFUSION_MODEL_IMAGE_ENCODER_NAME, NEURON_FILE_NAME
)
del model

return models_and_neuron_configs, output_model_names


def _get_submodels_and_neuron_configs_for_encoder_decoder(
model: "PreTrainedModel",
input_shapes: Dict[str, int],
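Aside: `infer_task` leaves `__main__.py` above verbatim and lands in `utils.py` below. A behavioural sketch of its contract, with the import path assumed from its new location:

# Sketch of the relocated infer_task contract (new import path assumed).
from optimum.exporters.neuron.utils import infer_task

# An explicit task is returned unchanged:
assert infer_task("stable-video-diffusion", "any/model") == "stable-video-diffusion"

# task="auto" defers to TasksManager.infer_task_from_model, which needs the
# Hugging Face Hub: unknown architectures raise KeyError, offline runs raise
# a requests ConnectionError, both re-raised with a hint to pass --task.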
134 changes: 128 additions & 6 deletions optimum/exporters/neuron/utils.py
@@ -18,6 +18,7 @@
import os
from collections import OrderedDict
from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union
from requests.exceptions import ConnectionError as RequestsConnectionError

import torch
from transformers import PretrainedConfig
@@ -26,6 +27,7 @@
DECODER_NAME,
DIFFUSION_MODEL_TEXT_ENCODER_2_NAME,
DIFFUSION_MODEL_TEXT_ENCODER_NAME,
DIFFUSION_MODEL_IMAGE_ENCODER_NAME,
DIFFUSION_MODEL_UNET_NAME,
DIFFUSION_MODEL_VAE_DECODER_NAME,
DIFFUSION_MODEL_VAE_ENCODER_NAME,
@@ -52,7 +54,11 @@
f"We found an older version of diffusers {_diffusers_version} but we require diffusers to be >= {DIFFUSERS_MINIMUM_VERSION}. "
"Please update diffusers by running `pip install --upgrade diffusers`"
)
from diffusers import UNet2DConditionModel
from diffusers import (
UNet2DConditionModel,
StableDiffusionXLPipeline,
StableVideoDiffusionPipeline,
)
from diffusers.models.attention_processor import (
Attention,
AttnAddedKVProcessor,
@@ -86,6 +92,96 @@ def to_dict(self):
return output


def infer_task(task: str, model_name_or_path: str) -> str:
if task == "auto":
try:
task = TasksManager.infer_task_from_model(model_name_or_path)
except KeyError as e:
raise KeyError(
"The task could not be automatically inferred. Please provide the argument --task with the task "
f"from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}"
)
except RequestsConnectionError as e:
raise RequestsConnectionError(
f"The task could not be automatically inferred as this is available only for models hosted on the Hugging Face Hub. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}"
)
return task


def infer_stable_diffusion_shapes_from_diffusers(
input_shapes: Dict[str, Dict[str, int]],
model: Union["StableDiffusionPipeline", "StableDiffusionXLPipeline"],
):
if getattr(model, "tokenizer", None):
sequence_length = model.tokenizer.model_max_length
elif hasattr(model, "tokenizer_2") and model.tokenizer_2 is not None:
sequence_length = model.tokenizer_2.model_max_length
else:
raise AttributeError(f"Cannot infer sequence_length from {type(model)} as there is no tokenizer attribute.")
unet_num_channels = model.unet.config.in_channels
vae_encoder_num_channels = model.vae.config.in_channels
vae_decoder_num_channels = model.vae.config.latent_channels
vae_scale_factor = 2 ** (len(model.vae.config.block_out_channels) - 1) or 8
height = input_shapes["unet_input_shapes"]["height"]
scaled_height = height // vae_scale_factor
width = input_shapes["unet_input_shapes"]["width"]
scaled_width = width // vae_scale_factor

input_shapes["text_encoder_input_shapes"].update({"sequence_length": sequence_length})
input_shapes["unet_input_shapes"].update(
{
"sequence_length": sequence_length,
"num_channels": unet_num_channels,
"height": scaled_height,
"width": scaled_width,
}
)
input_shapes["vae_encoder_input_shapes"].update(
{"num_channels": vae_encoder_num_channels, "height": height, "width": width}
)
input_shapes["vae_decoder_input_shapes"].update(
{"num_channels": vae_decoder_num_channels, "height": scaled_height, "width": scaled_width}
)

return input_shapes


def infer_stable_video_diffusion_shapes_from_diffusers(
input_shapes: Dict[str, Dict[str, int]],
model: "StableVideoDiffusionPipeline",
):
image_encoder_num_channels = model.image_encoder.config.num_channels
unet_num_channels = model.unet.config.in_channels
vae_encoder_num_channels = model.vae.config.in_channels
vae_decoder_num_channels = model.vae.config.latent_channels
vae_scale_factor = 2 ** (len(model.vae.config.block_out_channels) - 1) or 8
height = input_shapes["unet_input_shapes"]["height"]
scaled_height = height // vae_scale_factor
width = input_shapes["unet_input_shapes"]["width"]
scaled_width = width // vae_scale_factor

input_shapes["image_encoder_input_shapes"].update({"num_channels": image_encoder_num_channels})
input_shapes["unet_input_shapes"].update(
{"num_channels": unet_num_channels, "height": scaled_height, "width": scaled_width}
)
input_shapes["vae_encoder_input_shapes"].update(
{"num_channels": vae_encoder_num_channels, "height": height, "width": width}
)
input_shapes["vae_decoder_input_shapes"].update(
{"num_channels": vae_decoder_num_channels, "height": scaled_height, "width": scaled_width}
)

default_num_frames = model.unet.config.num_frames
if input_shapes["unet_input_shapes"]["num_frames"] is None:
input_shapes["unet_input_shapes"]["num_frames"] = default_num_frames
input_shapes["vae_decoder_input_shapes"]["num_frames"] = default_num_frames


return input_shapes


def build_stable_diffusion_components_mandatory_shapes(
batch_size: Optional[int] = None,
sequence_length: Optional[int] = None,
@@ -177,10 +273,11 @@ def build_stable_video_diffusion_components_mandatory_shapes(
def get_stable_diffusion_models_for_export(
pipeline: Union["StableDiffusionPipeline", "StableDiffusionXLImg2ImgPipeline"],
task: str,
text_encoder_input_shapes: Dict[str, int],
unet_input_shapes: Dict[str, int],
vae_encoder_input_shapes: Dict[str, int],
vae_decoder_input_shapes: Dict[str, int],
text_encoder_input_shapes: Optional[Dict[str, int]] = None,
image_encoder_input_shapes: Optional[Dict[str, int]] = None,
dynamic_batch_size: Optional[bool] = False,
) -> Dict[str, Tuple[Union["PreTrainedModel", "ModelMixin"], "NeuronConfig"]]:
"""
@@ -194,14 +291,16 @@ def get_stable_diffusion_models_for_export(
The model to export.
task (`str`):
Task name, should be either "stable-diffusion" or "stable-diffusion-xl".
text_encoder_input_shapes (`Dict[str, int]`):
Static shapes used for compiling text encoder.
unet_input_shapes (`Dict[str, int]`):
Static shapes used for compiling unet.
vae_encoder_input_shapes (`Dict[str, int]`):
Static shapes used for compiling vae encoder.
vae_decoder_input_shapes (`Dict[str, int]`):
Static shapes used for compiling vae decoder.
text_encoder_input_shapes (`Optional[Dict[str, int]]`, defaults to `None`):
Static shapes used for compiling text encoder.
image_encoder_input_shapes (`Optional[Dict[str, int]]`, defaults to `None`):
Static shapes used for compiling image encoder.
dynamic_batch_size (`bool`, defaults to `False`):
Whether the Neuron compiled model supports dynamic batch size.
@@ -240,6 +339,22 @@
**text_encoder_input_shapes,
)
models_for_export[DIFFUSION_MODEL_TEXT_ENCODER_2_NAME] = (text_encoder_2, text_encoder_neuron_config_2)

# Image encoder
if DIFFUSION_MODEL_IMAGE_ENCODER_NAME in models_for_export:
image_encoder = models_for_export[DIFFUSION_MODEL_IMAGE_ENCODER_NAME]
image_encoder_config_constructor = TasksManager.get_exporter_config_constructor(
model=image_encoder, exporter="neuron", task="feature-extraction"
)
image_encoder_neuron_config = image_encoder_config_constructor(
image_encoder.config,
task="feature-extraction",
dynamic_batch_size=dynamic_batch_size,
**image_encoder_input_shapes,
)
models_for_export[DIFFUSION_MODEL_IMAGE_ENCODER_NAME] = (image_encoder, image_encoder_neuron_config)

# U-NET
unet = models_for_export[DIFFUSION_MODEL_UNET_NAME]
@@ -292,7 +407,7 @@ def get_stable_diffusion_models_for_export(


def _get_submodels_for_export_stable_diffusion(
pipeline: Union["StableDiffusionPipeline", "StableDiffusionXLImg2ImgPipeline"],
pipeline: Union["StableDiffusionPipeline", "StableDiffusionXLImg2ImgPipeline", "StableVideoDiffusionPipeline"],
task: str,
) -> Dict[str, Union["PreTrainedModel", "ModelMixin"]]:
"""
@@ -307,7 +422,8 @@ def _get_submodels_for_export_stable_diffusion(
projection_dim = pipeline.text_encoder.config.projection_dim

# Text encoders
if pipeline.text_encoder is not None:
text_encoder = getattr(pipeline, "text_encoder", None)
if text_encoder is not None:
if is_sdxl:
pipeline.text_encoder.config.output_hidden_states = True
models_for_export.append((DIFFUSION_MODEL_TEXT_ENCODER_NAME, copy.deepcopy(pipeline.text_encoder)))
@@ -316,6 +432,12 @@
if text_encoder_2 is not None:
text_encoder_2.config.output_hidden_states = True
models_for_export.append((DIFFUSION_MODEL_TEXT_ENCODER_2_NAME, copy.deepcopy(text_encoder_2)))

# Image encoder
image_encoder = getattr(pipeline, "image_encoder", None)
if image_encoder is not None:
image_encoder.config.output_hidden_states = True
models_for_export.append((DIFFUSION_MODEL_IMAGE_ENCODER_NAME, copy.deepcopy(image_encoder)))

# U-NET
pipeline.unet.set_attn_processor(AttnProcessor())
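As a sanity check on infer_stable_video_diffusion_shapes_from_diffusers above: latent height and width are integer-divided by the VAE scale factor, and num_frames falls back to the UNet config when unset. A self-contained sketch with illustrative values; the dict keys mirror the function body, the numbers are stand-ins rather than a real checkpoint:

# Toy reproduction of the shape inference above; keys mirror the function
# body, values are illustrative only.
block_out_channels = [128, 256, 512, 512]               # typical SD-style VAE config
vae_scale_factor = 2 ** (len(block_out_channels) - 1)   # -> 8

input_shapes = {
    "image_encoder_input_shapes": {"batch_size": 1},
    "unet_input_shapes": {"batch_size": 1, "height": 576, "width": 1024, "num_frames": None},
    "vae_encoder_input_shapes": {"batch_size": 1},
    "vae_decoder_input_shapes": {"batch_size": 1},
}

height = input_shapes["unet_input_shapes"]["height"]
width = input_shapes["unet_input_shapes"]["width"]
scaled_height, scaled_width = height // vae_scale_factor, width // vae_scale_factor
assert (scaled_height, scaled_width) == (72, 128)       # latents fed to the UNet / VAE decoder

# num_frames defaults to the UNet config when the user leaves it unset:
default_num_frames = 25  # stand-in for model.unet.config.num_frames
if input_shapes["unet_input_shapes"]["num_frames"] is None:
    input_shapes["unet_input_shapes"]["num_frames"] = default_num_frames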
