diff --git a/optimum/exporters/neuron/__init__.py b/optimum/exporters/neuron/__init__.py
index 03dc97aa6..3b01f3c2f 100644
--- a/optimum/exporters/neuron/__init__.py
+++ b/optimum/exporters/neuron/__init__.py
@@ -15,6 +15,7 @@
 from .__main__ import (
     infer_stable_diffusion_shapes_from_diffusers,
+    infer_stable_video_diffusion_shapes_from_diffusers,
     main_export,
     normalize_input_shapes,
 )

diff --git a/optimum/exporters/neuron/__main__.py b/optimum/exporters/neuron/__main__.py
index 2901cac27..7ed1dfc37 100644
--- a/optimum/exporters/neuron/__main__.py
+++ b/optimum/exporters/neuron/__main__.py
@@ -21,13 +21,13 @@
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Dict, Optional, Union
 
-from requests.exceptions import ConnectionError as RequestsConnectionError
 from transformers import AutoConfig, PretrainedConfig
 
 from ...neuron.utils import (
     DECODER_NAME,
     DIFFUSION_MODEL_TEXT_ENCODER_2_NAME,
     DIFFUSION_MODEL_TEXT_ENCODER_NAME,
+    DIFFUSION_MODEL_IMAGE_ENCODER_NAME,
     DIFFUSION_MODEL_UNET_NAME,
     DIFFUSION_MODEL_VAE_DECODER_NAME,
     DIFFUSION_MODEL_VAE_ENCODER_NAME,
@@ -44,6 +44,9 @@
 from .convert import export_models, validate_models_outputs
 from .model_configs import *  # noqa: F403
 from .utils import (
+    infer_stable_diffusion_shapes_from_diffusers,
+    infer_stable_video_diffusion_shapes_from_diffusers,
+    infer_task,
     build_stable_diffusion_components_mandatory_shapes,
     build_stable_video_diffusion_components_mandatory_shapes,
     get_encoder_decoder_models_for_export,
@@ -63,15 +66,12 @@
 NEURON_COMPILER = "Neuronx"
 
 
-if is_diffusers_available():
-    from diffusers import StableDiffusionXLPipeline
-
 
 if TYPE_CHECKING:
     from transformers import PreTrainedModel
 
     if is_diffusers_available():
-        from diffusers import DiffusionPipeline, StableDiffusionPipeline
+        from diffusers import DiffusionPipeline
 
 
 logger = logging.get_logger()
@@ -91,22 +91,6 @@ def infer_compiler_kwargs(args: argparse.Namespace) -> Dict[str, Any]:
     return compiler_kwargs
 
 
-def infer_task(task: str, model_name_or_path: str) -> str:
-    if task == "auto":
-        try:
-            task = TasksManager.infer_task_from_model(model_name_or_path)
-        except KeyError as e:
-            raise KeyError(
-                "The task could not be automatically inferred. Please provide the argument --task with the task "
-                f"from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}"
-            )
-        except RequestsConnectionError as e:
-            raise RequestsConnectionError(
-                f"The task could not be automatically inferred as this is available only for models hosted on the Hugging Face Hub. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}"
-            )
-    return task
-
-
 def customize_optional_outputs(args: argparse.Namespace) -> Dict[str, bool]:
     """
     Customize optional outputs of the traced model, eg. if `output_attentions=True`, the attentions tensors will be traced.
@@ -207,44 +191,6 @@ def _normalize_stable_video_diffusion_input_shapes(
     return input_shapes
 
 
-def infer_stable_diffusion_shapes_from_diffusers(
-    input_shapes: Dict[str, Dict[str, int]],
-    model: Union["StableDiffusionPipeline", "StableDiffusionXLPipeline"],
-):
-    if model.tokenizer is not None:
-        sequence_length = model.tokenizer.model_max_length
-    elif hasattr(model, "tokenizer_2") and model.tokenizer_2 is not None:
-        sequence_length = model.tokenizer_2.model_max_length
-    else:
-        raise AttributeError(f"Cannot infer sequence_length from {type(model)} as there is no tokenizer as attribute.")
-    unet_num_channels = model.unet.config.in_channels
-    vae_encoder_num_channels = model.vae.config.in_channels
-    vae_decoder_num_channels = model.vae.config.latent_channels
-    vae_scale_factor = 2 ** (len(model.vae.config.block_out_channels) - 1) or 8
-    height = input_shapes["unet_input_shapes"]["height"]
-    scaled_height = height // vae_scale_factor
-    width = input_shapes["unet_input_shapes"]["width"]
-    scaled_width = width // vae_scale_factor
-
-    input_shapes["text_encoder_input_shapes"].update({"sequence_length": sequence_length})
-    input_shapes["unet_input_shapes"].update(
-        {
-            "sequence_length": sequence_length,
-            "num_channels": unet_num_channels,
-            "height": scaled_height,
-            "width": scaled_width,
-        }
-    )
-    input_shapes["vae_encoder_input_shapes"].update(
-        {"num_channels": vae_encoder_num_channels, "height": height, "width": width}
-    )
-    input_shapes["vae_decoder_input_shapes"].update(
-        {"num_channels": vae_decoder_num_channels, "height": scaled_height, "width": scaled_width}
-    )
-
-    return input_shapes
-
-
 def _get_submodels_and_neuron_configs(
     model: Union["PreTrainedModel", "DiffusionPipeline"],
     input_shapes: Dict[str, int],
@@ -261,7 +207,16 @@
         getattr(model.config, "is_encoder_decoder", False) if isinstance(model.config, PretrainedConfig) else False
     )
 
-    if is_stable_diffusion:
+    if task == "stable-video-diffusion":
+        # TODO: Enable optional outputs for Stable Video Diffusion
+        if output_attentions or output_hidden_states:
+            raise ValueError(
+                f"`output_attentions` and `output_hidden_states` are not supported by the {task} task yet."
+            )
+        models_and_neuron_configs, output_model_names = _get_submodels_and_neuron_configs_for_stable_video_diffusion(
+            model, input_shapes, task, output, dynamic_batch_size
+        )
+    elif is_stable_diffusion:
         # TODO: Enable optional outputs for Stable Diffusion
         if output_attentions or output_hidden_states:
             raise ValueError(
@@ -351,6 +306,46 @@ def _get_submodels_and_neuron_configs_for_stable_diffusion(
     return models_and_neuron_configs, output_model_names
 
 
+def _get_submodels_and_neuron_configs_for_stable_video_diffusion(
+    model: Union["PreTrainedModel", "DiffusionPipeline"],
+    input_shapes: Dict[str, int],
+    task: str,
+    output: Path,
+    dynamic_batch_size: bool = False,
+):
+    if is_neuron_available():
+        raise RuntimeError(
+            "Stable video diffusion export is not supported by neuron-cc on inf1, please use neuronx-cc on either inf2/trn1 instead."
+        )
+    input_shapes = infer_stable_video_diffusion_shapes_from_diffusers(input_shapes, model)
+
+    # Save the model config and the preprocessors, as they are sometimes needed at inference.
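+    # The scheduler and the feature extractor are not compiled: they are saved as-is and reloaded from the exported directory.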
+    model.scheduler.save_pretrained(output.joinpath("scheduler"))
+    if getattr(model, "feature_extractor", None) is not None:
+        model.feature_extractor.save_pretrained(output.joinpath("feature_extractor"))
+    model.save_config(output)
+
+    models_and_neuron_configs = get_stable_diffusion_models_for_export(
+        pipeline=model,
+        task=task,
+        dynamic_batch_size=dynamic_batch_size,
+        **input_shapes,
+    )
+    output_model_names = {
+        DIFFUSION_MODEL_UNET_NAME: os.path.join(DIFFUSION_MODEL_UNET_NAME, NEURON_FILE_NAME),
+        DIFFUSION_MODEL_VAE_ENCODER_NAME: os.path.join(DIFFUSION_MODEL_VAE_ENCODER_NAME, NEURON_FILE_NAME),
+        DIFFUSION_MODEL_VAE_DECODER_NAME: os.path.join(DIFFUSION_MODEL_VAE_DECODER_NAME, NEURON_FILE_NAME),
+    }
+    if getattr(model, "image_encoder", None) is not None:
+        output_model_names[DIFFUSION_MODEL_IMAGE_ENCODER_NAME] = os.path.join(
+            DIFFUSION_MODEL_IMAGE_ENCODER_NAME, NEURON_FILE_NAME
+        )
+    del model
+
+    return models_and_neuron_configs, output_model_names
+
+
 def _get_submodels_and_neuron_configs_for_encoder_decoder(
     model: "PreTrainedModel",
     input_shapes: Dict[str, int],
diff --git a/optimum/exporters/neuron/utils.py b/optimum/exporters/neuron/utils.py
index 696271408..f5cdfd7df 100644
--- a/optimum/exporters/neuron/utils.py
+++ b/optimum/exporters/neuron/utils.py
@@ -18,6 +18,7 @@
 import os
 from collections import OrderedDict
 from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union
+from requests.exceptions import ConnectionError as RequestsConnectionError
 
 import torch
 from transformers import PretrainedConfig
@@ -26,6 +27,7 @@
     DECODER_NAME,
     DIFFUSION_MODEL_TEXT_ENCODER_2_NAME,
     DIFFUSION_MODEL_TEXT_ENCODER_NAME,
+    DIFFUSION_MODEL_IMAGE_ENCODER_NAME,
     DIFFUSION_MODEL_UNET_NAME,
     DIFFUSION_MODEL_VAE_DECODER_NAME,
     DIFFUSION_MODEL_VAE_ENCODER_NAME,
@@ -52,7 +54,11 @@
         f"We found an older version of diffusers {_diffusers_version} but we require diffusers to be >= {DIFFUSERS_MINIMUM_VERSION}. "
         "Please update diffusers by running `pip install --upgrade diffusers`"
     )
-    from diffusers import UNet2DConditionModel
+    from diffusers import (
+        StableDiffusionXLPipeline,
+        StableVideoDiffusionPipeline,
+        UNet2DConditionModel,
+    )
     from diffusers.models.attention_processor import (
         Attention,
         AttnAddedKVProcessor,
@@ -86,6 +92,93 @@ def to_dict(self):
         return output
 
 
+def infer_task(task: str, model_name_or_path: str) -> str:
+    if task == "auto":
+        try:
+            task = TasksManager.infer_task_from_model(model_name_or_path)
+        except KeyError as e:
+            raise KeyError(
+                "The task could not be automatically inferred. Please provide the argument --task with the task "
+                f"from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}"
+            )
+        except RequestsConnectionError as e:
+            raise RequestsConnectionError(
+                f"The task could not be automatically inferred as this is available only for models hosted on the Hugging Face Hub. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}"
+            )
+    return task
+
+
+def infer_stable_diffusion_shapes_from_diffusers(
+    input_shapes: Dict[str, Dict[str, int]],
+    model: Union["StableDiffusionPipeline", "StableDiffusionXLPipeline"],
+):
+    if getattr(model, "tokenizer", None):
+        sequence_length = model.tokenizer.model_max_length
+    elif hasattr(model, "tokenizer_2") and model.tokenizer_2 is not None:
+        sequence_length = model.tokenizer_2.model_max_length
+    else:
+        raise AttributeError(f"Cannot infer sequence_length from {type(model)} as there is no tokenizer as attribute.")
+    unet_num_channels = model.unet.config.in_channels
+    vae_encoder_num_channels = model.vae.config.in_channels
+    vae_decoder_num_channels = model.vae.config.latent_channels
+    vae_scale_factor = 2 ** (len(model.vae.config.block_out_channels) - 1) or 8
+    height = input_shapes["unet_input_shapes"]["height"]
+    scaled_height = height // vae_scale_factor
+    width = input_shapes["unet_input_shapes"]["width"]
+    scaled_width = width // vae_scale_factor
+
+    input_shapes["text_encoder_input_shapes"].update({"sequence_length": sequence_length})
+    input_shapes["unet_input_shapes"].update(
+        {
+            "sequence_length": sequence_length,
+            "num_channels": unet_num_channels,
+            "height": scaled_height,
+            "width": scaled_width,
+        }
+    )
+    input_shapes["vae_encoder_input_shapes"].update(
+        {"num_channels": vae_encoder_num_channels, "height": height, "width": width}
+    )
+    input_shapes["vae_decoder_input_shapes"].update(
+        {"num_channels": vae_decoder_num_channels, "height": scaled_height, "width": scaled_width}
+    )
+
+    return input_shapes
+
+
+def infer_stable_video_diffusion_shapes_from_diffusers(
+    input_shapes: Dict[str, Dict[str, int]],
+    model: "StableVideoDiffusionPipeline",
+):
+    image_encoder_num_channels = model.image_encoder.config.num_channels
+    unet_num_channels = model.unet.config.in_channels
+    vae_encoder_num_channels = model.vae.config.in_channels
+    vae_decoder_num_channels = model.vae.config.latent_channels
+    vae_scale_factor = 2 ** (len(model.vae.config.block_out_channels) - 1) or 8
+    height = input_shapes["unet_input_shapes"]["height"]
+    scaled_height = height // vae_scale_factor
+    width = input_shapes["unet_input_shapes"]["width"]
+    scaled_width = width // vae_scale_factor
+
+    input_shapes["image_encoder_input_shapes"].update({"num_channels": image_encoder_num_channels})
+    input_shapes["unet_input_shapes"].update(
+        {"num_channels": unet_num_channels, "height": scaled_height, "width": scaled_width}
+    )
+    input_shapes["vae_encoder_input_shapes"].update(
+        {"num_channels": vae_encoder_num_channels, "height": height, "width": width}
+    )
+    input_shapes["vae_decoder_input_shapes"].update(
+        {"num_channels": vae_decoder_num_channels, "height": scaled_height, "width": scaled_width}
+    )
+
+    # Default `num_frames` to the unet config value, and keep the vae decoder consistent with the unet.
+    if input_shapes["unet_input_shapes"]["num_frames"] is None:
+        input_shapes["unet_input_shapes"]["num_frames"] = model.unet.config.num_frames
+    input_shapes["vae_decoder_input_shapes"]["num_frames"] = input_shapes["unet_input_shapes"]["num_frames"]
+
+    return input_shapes
+
+
 def build_stable_diffusion_components_mandatory_shapes(
     batch_size: Optional[int] = None,
     sequence_length: Optional[int] = None,
@@ -177,10 +270,11 @@ def build_stable_video_diffusion_components_mandatory_shapes(
 def get_stable_diffusion_models_for_export(
     pipeline: Union["StableDiffusionPipeline", "StableDiffusionXLImg2ImgPipeline"],
     task: str,
-    text_encoder_input_shapes: Dict[str, int],
     unet_input_shapes: Dict[str, int],
     vae_encoder_input_shapes: Dict[str, int],
     vae_decoder_input_shapes: Dict[str, int],
+    text_encoder_input_shapes: Optional[Dict[str, int]] = None,
+    image_encoder_input_shapes: Optional[Dict[str, int]] = None,
     dynamic_batch_size: Optional[bool] = False,
 ) -> Dict[str, Tuple[Union["PreTrainedModel", "ModelMixin"], "NeuronConfig"]]:
     """
@@ -194,14 +288,16 @@
             The model to export.
         task (`str`):
             Task name, should be either "stable-diffusion" or "stable-diffusion-xl".
-        text_encoder_input_shapes (`Dict[str, int]`):
-            Static shapes used for compiling text encoder.
         unet_input_shapes (`Dict[str, int]`):
             Static shapes used for compiling unet.
         vae_encoder_input_shapes (`Dict[str, int]`):
            Static shapes used for compiling vae encoder.
         vae_decoder_input_shapes (`Dict[str, int]`):
            Static shapes used for compiling vae decoder.
+        text_encoder_input_shapes (`Optional[Dict[str, int]]`, defaults to `None`):
+            Static shapes used for compiling text encoder.
+        image_encoder_input_shapes (`Optional[Dict[str, int]]`, defaults to `None`):
+            Static shapes used for compiling image encoder.
         dynamic_batch_size (`bool`, defaults to `False`):
             Whether the Neuron compiled model supports dynamic batch size.
 
@@ -240,6 +336,20 @@
             **text_encoder_input_shapes,
         )
         models_for_export[DIFFUSION_MODEL_TEXT_ENCODER_2_NAME] = (text_encoder_2, text_encoder_neuron_config_2)
+
+    # Image encoder
+    if DIFFUSION_MODEL_IMAGE_ENCODER_NAME in models_for_export:
+        image_encoder = models_for_export[DIFFUSION_MODEL_IMAGE_ENCODER_NAME]
+        image_encoder_config_constructor = TasksManager.get_exporter_config_constructor(
+            model=image_encoder, exporter="neuron", task="feature-extraction"
+        )
+        image_encoder_neuron_config = image_encoder_config_constructor(
+            image_encoder.config,
+            task="feature-extraction",
+            dynamic_batch_size=dynamic_batch_size,
+            **image_encoder_input_shapes,
+        )
+        models_for_export[DIFFUSION_MODEL_IMAGE_ENCODER_NAME] = (image_encoder, image_encoder_neuron_config)
 
     # U-NET
     unet = models_for_export[DIFFUSION_MODEL_UNET_NAME]
@@ -292,7 +402,7 @@
 
 
 def _get_submodels_for_export_stable_diffusion(
-    pipeline: Union["StableDiffusionPipeline", "StableDiffusionXLImg2ImgPipeline"],
+    pipeline: Union["StableDiffusionPipeline", "StableDiffusionXLImg2ImgPipeline", "StableVideoDiffusionPipeline"],
     task: str,
 ) -> Dict[str, Union["PreTrainedModel", "ModelMixin"]]:
     """
@@ -307,7 +417,8 @@
     projection_dim = pipeline.text_encoder.config.projection_dim
 
     # Text encoders
-    if pipeline.text_encoder is not None:
+    text_encoder = getattr(pipeline, "text_encoder", None)
+    if text_encoder is not None:
         if is_sdxl:
             pipeline.text_encoder.config.output_hidden_states = True
         models_for_export.append((DIFFUSION_MODEL_TEXT_ENCODER_NAME, copy.deepcopy(pipeline.text_encoder)))
@@ -316,6 +427,12 @@
     text_encoder_2 = getattr(pipeline, "text_encoder_2", None)
     if text_encoder_2 is not None:
         text_encoder_2.config.output_hidden_states = True
         models_for_export.append((DIFFUSION_MODEL_TEXT_ENCODER_2_NAME, copy.deepcopy(text_encoder_2)))
+
+    # Image encoder
+    image_encoder = getattr(pipeline, "image_encoder", None)
+    if image_encoder is not None:
+        image_encoder.config.output_hidden_states = True
+        models_for_export.append((DIFFUSION_MODEL_IMAGE_ENCODER_NAME, copy.deepcopy(image_encoder)))
 
     # U-NET
     pipeline.unet.set_attn_processor(AttnProcessor())
diff --git a/optimum/neuron/utils/__init__.py b/optimum/neuron/utils/__init__.py
index c859ba71b..d115995b6 100644
--- a/optimum/neuron/utils/__init__.py
+++ b/optimum/neuron/utils/__init__.py
@@ -18,6 +18,7 @@
     DECODER_NAME,
     DIFFUSION_MODEL_TEXT_ENCODER_2_NAME,
     DIFFUSION_MODEL_TEXT_ENCODER_NAME,
+    DIFFUSION_MODEL_IMAGE_ENCODER_NAME,
     DIFFUSION_MODEL_UNET_NAME,
     DIFFUSION_MODEL_VAE_DECODER_NAME,
     DIFFUSION_MODEL_VAE_ENCODER_NAME,
diff --git a/optimum/neuron/utils/constant.py b/optimum/neuron/utils/constant.py
index edc6eebb8..38a497f4c 100644
--- a/optimum/neuron/utils/constant.py
+++ b/optimum/neuron/utils/constant.py
@@ -19,6 +19,7 @@
 DECODER_NAME = "decoder"
 DIFFUSION_MODEL_TEXT_ENCODER_NAME = "text_encoder"
 DIFFUSION_MODEL_TEXT_ENCODER_2_NAME = "text_encoder_2"
+DIFFUSION_MODEL_IMAGE_ENCODER_NAME = "image_encoder"
 DIFFUSION_MODEL_UNET_NAME = "unet"
 DIFFUSION_MODEL_VAE_ENCODER_NAME = "vae_encoder"
 DIFFUSION_MODEL_VAE_DECODER_NAME = "vae_decoder"
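Reviewer note: a minimal usage sketch of the new shape-inference helper, not part of the patch. The checkpoint name and the concrete batch_size/height/width values are illustrative assumptions; the nested key names and the None-based num_frames fallback follow the functions added above.

from diffusers import StableVideoDiffusionPipeline

from optimum.exporters.neuron import infer_stable_video_diffusion_shapes_from_diffusers

pipeline = StableVideoDiffusionPipeline.from_pretrained("stabilityai/stable-video-diffusion-img2vid")

# User-provided static shapes; "num_frames" is left as None so that the helper
# falls back to unet.config.num_frames.
input_shapes = {
    "image_encoder_input_shapes": {"batch_size": 1},
    "unet_input_shapes": {"batch_size": 1, "height": 576, "width": 1024, "num_frames": None},
    "vae_encoder_input_shapes": {"batch_size": 1},
    "vae_decoder_input_shapes": {"batch_size": 1},
}

# The helper fills in num_channels for every submodel from the pipeline configs,
# divides height/width by the VAE scale factor (2 ** (len(block_out_channels) - 1))
# for the unet and the vae decoder, and propagates the resolved num_frames.
input_shapes = infer_stable_video_diffusion_shapes_from_diffusers(input_shapes, pipeline)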