
Commit

update
JingyaHuang committed Jan 9, 2024
1 parent 22d4565 commit 8b1c101
Showing 5 changed files with 185 additions and 66 deletions.
1 change: 1 addition & 0 deletions optimum/exporters/neuron/__init__.py
@@ -15,6 +15,7 @@

from .__main__ import (
infer_stable_diffusion_shapes_from_diffusers,
infer_stable_video_diffusion_shapes_from_diffusers,
main_export,
normalize_input_shapes,
)
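For reference, a minimal smoke test of the import surface after this change; a sketch assuming the commit is installed together with the diffusers extra, not part of the diff itself:

# Hypothetical smoke test for the re-export added above (assumes this
# commit is installed with its diffusers dependency).
from optimum.exporters.neuron import (
    infer_stable_diffusion_shapes_from_diffusers,
    infer_stable_video_diffusion_shapes_from_diffusers,
    main_export,
    normalize_input_shapes,
)

# The new helper is defined in utils.py (see below) and surfaced here
# via __main__.py:
assert infer_stable_video_diffusion_shapes_from_diffusers.__module__.endswith("utils")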
114 changes: 54 additions & 60 deletions optimum/exporters/neuron/__main__.py
@@ -21,13 +21,13 @@
from pathlib import Path
from typing import TYPE_CHECKING, Any, Dict, Optional, Union

from requests.exceptions import ConnectionError as RequestsConnectionError
from transformers import AutoConfig, PretrainedConfig

from ...neuron.utils import (
DECODER_NAME,
DIFFUSION_MODEL_TEXT_ENCODER_2_NAME,
DIFFUSION_MODEL_TEXT_ENCODER_NAME,
DIFFUSION_MODEL_IMAGE_ENCODER_NAME,
DIFFUSION_MODEL_UNET_NAME,
DIFFUSION_MODEL_VAE_DECODER_NAME,
DIFFUSION_MODEL_VAE_ENCODER_NAME,
@@ -44,6 +44,9 @@
from .convert import export_models, validate_models_outputs
from .model_configs import * # noqa: F403
from .utils import (
infer_task,
infer_stable_diffusion_shapes_from_diffusers,
infer_stable_video_diffusion_shapes_from_diffusers,
build_stable_diffusion_components_mandatory_shapes,
build_stable_video_diffusion_components_mandatory_shapes,
get_encoder_decoder_models_for_export,
@@ -63,15 +66,12 @@

NEURON_COMPILER = "Neuronx"

if is_diffusers_available():
from diffusers import StableDiffusionXLPipeline


if TYPE_CHECKING:
from transformers import PreTrainedModel

if is_diffusers_available():
from diffusers import DiffusionPipeline, StableDiffusionPipeline
from diffusers import DiffusionPipeline


logger = logging.get_logger()
@@ -91,22 +91,6 @@ def infer_compiler_kwargs(args: argparse.Namespace) -> Dict[str, Any]:
return compiler_kwargs


def infer_task(task: str, model_name_or_path: str) -> str:
if task == "auto":
try:
task = TasksManager.infer_task_from_model(model_name_or_path)
except KeyError as e:
raise KeyError(
"The task could not be automatically inferred. Please provide the argument --task with the task "
f"from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}"
)
except RequestsConnectionError as e:
raise RequestsConnectionError(
f"The task could not be automatically inferred as this is available only for models hosted on the Hugging Face Hub. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}"
)
return task


def customize_optional_outputs(args: argparse.Namespace) -> Dict[str, bool]:
"""
Customize optional outputs of the traced model, eg. if `output_attentions=True`, the attentions tensors will be traced.
@@ -207,44 +191,6 @@ def _normalize_stable_video_diffusion_input_shapes(
return input_shapes


def infer_stable_diffusion_shapes_from_diffusers(
input_shapes: Dict[str, Dict[str, int]],
model: Union["StableDiffusionPipeline", "StableDiffusionXLPipeline"],
):
if model.tokenizer is not None:
sequence_length = model.tokenizer.model_max_length
elif hasattr(model, "tokenizer_2") and model.tokenizer_2 is not None:
sequence_length = model.tokenizer_2.model_max_length
else:
raise AttributeError(f"Cannot infer sequence_length from {type(model)} as there is no tokenizer as attribute.")
unet_num_channels = model.unet.config.in_channels
vae_encoder_num_channels = model.vae.config.in_channels
vae_decoder_num_channels = model.vae.config.latent_channels
vae_scale_factor = 2 ** (len(model.vae.config.block_out_channels) - 1) or 8
height = input_shapes["unet_input_shapes"]["height"]
scaled_height = height // vae_scale_factor
width = input_shapes["unet_input_shapes"]["width"]
scaled_width = width // vae_scale_factor

input_shapes["text_encoder_input_shapes"].update({"sequence_length": sequence_length})
input_shapes["unet_input_shapes"].update(
{
"sequence_length": sequence_length,
"num_channels": unet_num_channels,
"height": scaled_height,
"width": scaled_width,
}
)
input_shapes["vae_encoder_input_shapes"].update(
{"num_channels": vae_encoder_num_channels, "height": height, "width": width}
)
input_shapes["vae_decoder_input_shapes"].update(
{"num_channels": vae_decoder_num_channels, "height": scaled_height, "width": scaled_width}
)

return input_shapes


def _get_submodels_and_neuron_configs(
model: Union["PreTrainedModel", "DiffusionPipeline"],
input_shapes: Dict[str, int],
@@ -261,7 +207,16 @@ def _get_submodels_and_neuron_configs(
getattr(model.config, "is_encoder_decoder", False) if isinstance(model.config, PretrainedConfig) else False
)

if is_stable_diffusion:
if task == "stable-video-diffusion":
# TODO: Enable optional outputs for Stable Video Diffusion
if output_attentions or output_hidden_states:
raise ValueError(
f"`output_attentions` and `output_hidden_states` are not supported by the {task} task yet."
)
models_and_neuron_configs, output_model_names = _get_submodels_and_neuron_configs_for_stable_video_diffusion(
model, input_shapes, task, output, dynamic_batch_size,
)
elif is_stable_diffusion:
# TODO: Enable optional outputs for Stable Diffusion
if output_attentions or output_hidden_states:
raise ValueError(
@@ -351,6 +306,45 @@ def _get_submodels_and_neuron_configs_for_stable_diffusion(
return models_and_neuron_configs, output_model_names


def _get_submodels_and_neuron_configs_for_stable_video_diffusion(
model: Union["PreTrainedModel", "DiffusionPipeline"],
input_shapes: Dict[str, int],
task: str,
output: Path,
dynamic_batch_size: bool = False,
):
if is_neuron_available():
raise RuntimeError(
"Stable diffusion export is not supported by neuron-cc on inf1, please use neuronx-cc on either inf2/trn1 instead."
)
input_shapes = infer_stable_video_diffusion_shapes_from_diffusers(input_shapes, model)

# Save the scheduler, feature extractor and model config, as they are needed to reload the pipeline.
model.scheduler.save_pretrained(output.joinpath("scheduler"))
if getattr(model, "feature_extractor", None) is not None:
model.feature_extractor.save_pretrained(output.joinpath("feature_extractor"))
model.save_config(output)

models_and_neuron_configs = get_stable_diffusion_models_for_export(
pipeline=model,
task=task,
dynamic_batch_size=dynamic_batch_size,
**input_shapes,
)
output_model_names = {
DIFFUSION_MODEL_UNET_NAME: os.path.join(DIFFUSION_MODEL_UNET_NAME, NEURON_FILE_NAME),
DIFFUSION_MODEL_VAE_ENCODER_NAME: os.path.join(DIFFUSION_MODEL_VAE_ENCODER_NAME, NEURON_FILE_NAME),
DIFFUSION_MODEL_VAE_DECODER_NAME: os.path.join(DIFFUSION_MODEL_VAE_DECODER_NAME, NEURON_FILE_NAME),
}
if getattr(model, "image_encoder", None) is not None:
output_model_names[DIFFUSION_MODEL_IMAGE_ENCODER_NAME] = os.path.join(
DIFFUSION_MODEL_IMAGE_ENCODER_NAME, NEURON_FILE_NAME
)
del model

return models_and_neuron_configs, output_model_names


def _get_submodels_and_neuron_configs_for_encoder_decoder(
model: "PreTrainedModel",
input_shapes: Dict[str, int],
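Aside: `infer_task` leaves `__main__.py` above verbatim and lands in `utils.py` below. A behavioural sketch of its contract, with the import path assumed from its new location:

# Sketch of the relocated infer_task contract (new import path assumed).
from optimum.exporters.neuron.utils import infer_task

# An explicit task is returned unchanged:
assert infer_task("stable-video-diffusion", "any/model") == "stable-video-diffusion"

# task="auto" defers to TasksManager.infer_task_from_model, which needs the
# Hugging Face Hub: unknown architectures raise KeyError, offline runs raise
# a requests ConnectionError, both re-raised with a hint to pass --task.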
134 changes: 128 additions & 6 deletions optimum/exporters/neuron/utils.py
@@ -18,6 +18,7 @@
import os
from collections import OrderedDict
from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union
from requests.exceptions import ConnectionError as RequestsConnectionError

import torch
from transformers import PretrainedConfig
@@ -26,6 +27,7 @@
DECODER_NAME,
DIFFUSION_MODEL_TEXT_ENCODER_2_NAME,
DIFFUSION_MODEL_TEXT_ENCODER_NAME,
DIFFUSION_MODEL_IMAGE_ENCODER_NAME,
DIFFUSION_MODEL_UNET_NAME,
DIFFUSION_MODEL_VAE_DECODER_NAME,
DIFFUSION_MODEL_VAE_ENCODER_NAME,
@@ -52,7 +54,11 @@
f"We found an older version of diffusers {_diffusers_version} but we require diffusers to be >= {DIFFUSERS_MINIMUM_VERSION}. "
"Please update diffusers by running `pip install --upgrade diffusers`"
)
from diffusers import UNet2DConditionModel
from diffusers import (
UNet2DConditionModel,
StableDiffusionXLPipeline,
StableVideoDiffusionPipeline,
)
from diffusers.models.attention_processor import (
Attention,
AttnAddedKVProcessor,
@@ -86,6 +92,96 @@ def to_dict(self):
return output


def infer_task(task: str, model_name_or_path: str) -> str:
if task == "auto":
try:
task = TasksManager.infer_task_from_model(model_name_or_path)
except KeyError as e:
raise KeyError(
"The task could not be automatically inferred. Please provide the argument --task with the task "
f"from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}"
)
except RequestsConnectionError as e:
raise RequestsConnectionError(
f"The task could not be automatically inferred as this is available only for models hosted on the Hugging Face Hub. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. Detailed error: {e}"
)
return task


def infer_stable_diffusion_shapes_from_diffusers(
input_shapes: Dict[str, Dict[str, int]],
model: Union["StableDiffusionPipeline", "StableDiffusionXLPipeline"],
):
if getattr(model, "tokenizer", None):
sequence_length = model.tokenizer.model_max_length
elif hasattr(model, "tokenizer_2") and model.tokenizer_2 is not None:
sequence_length = model.tokenizer_2.model_max_length
else:
raise AttributeError(f"Cannot infer sequence_length from {type(model)} as there is no tokenizer attribute.")
unet_num_channels = model.unet.config.in_channels
vae_encoder_num_channels = model.vae.config.in_channels
vae_decoder_num_channels = model.vae.config.latent_channels
vae_scale_factor = 2 ** (len(model.vae.config.block_out_channels) - 1) or 8
height = input_shapes["unet_input_shapes"]["height"]
scaled_height = height // vae_scale_factor
width = input_shapes["unet_input_shapes"]["width"]
scaled_width = width // vae_scale_factor

input_shapes["text_encoder_input_shapes"].update({"sequence_length": sequence_length})
input_shapes["unet_input_shapes"].update(
{
"sequence_length": sequence_length,
"num_channels": unet_num_channels,
"height": scaled_height,
"width": scaled_width,
}
)
input_shapes["vae_encoder_input_shapes"].update(
{"num_channels": vae_encoder_num_channels, "height": height, "width": width}
)
input_shapes["vae_decoder_input_shapes"].update(
{"num_channels": vae_decoder_num_channels, "height": scaled_height, "width": scaled_width}
)

return input_shapes


def infer_stable_video_diffusion_shapes_from_diffusers(
input_shapes: Dict[str, Dict[str, int]],
model: "StableVideoDiffusionPipeline",
):
image_encoder_num_channels = model.image_encoder.config.num_channels
unet_num_channels = model.unet.config.in_channels
vae_encoder_num_channels = model.vae.config.in_channels
vae_decoder_num_channels = model.vae.config.latent_channels
vae_scale_factor = 2 ** (len(model.vae.config.block_out_channels) - 1) or 8
height = input_shapes["unet_input_shapes"]["height"]
scaled_height = height // vae_scale_factor
width = input_shapes["unet_input_shapes"]["width"]
scaled_width = width // vae_scale_factor

input_shapes["image_encoder_input_shapes"].update({"num_channels": image_encoder_num_channels})
input_shapes["unet_input_shapes"].update(
{"num_channels": unet_num_channels, "height": scaled_height, "width": scaled_width}
)
input_shapes["vae_encoder_input_shapes"].update(
{"num_channels": vae_encoder_num_channels, "height": height, "width": width}
)
input_shapes["vae_decoder_input_shapes"].update(
{"num_channels": vae_decoder_num_channels, "height": scaled_height, "width": scaled_width}
)

default_num_frames = model.unet.config.num_frames
if input_shapes["unet_input_shapes"]["num_frames"] is None:
input_shapes["unet_input_shapes"]["num_frames"] = default_num_frames
input_shapes["vae_decoder_input_shapes"]["num_frames"] = default_num_frames


return input_shapes


def build_stable_diffusion_components_mandatory_shapes(
batch_size: Optional[int] = None,
sequence_length: Optional[int] = None,
@@ -177,10 +273,11 @@ def build_stable_video_diffusion_components_mandatory_shapes(
def get_stable_diffusion_models_for_export(
pipeline: Union["StableDiffusionPipeline", "StableDiffusionXLImg2ImgPipeline"],
task: str,
text_encoder_input_shapes: Dict[str, int],
unet_input_shapes: Dict[str, int],
vae_encoder_input_shapes: Dict[str, int],
vae_decoder_input_shapes: Dict[str, int],
text_encoder_input_shapes: Optional[Dict[str, int]] = None,
image_encoder_input_shapes: Optional[Dict[str, int]] = None,
dynamic_batch_size: Optional[bool] = False,
) -> Dict[str, Tuple[Union["PreTrainedModel", "ModelMixin"], "NeuronConfig"]]:
"""
@@ -194,14 +291,16 @@ def get_stable_diffusion_models_for_export(
The model to export.
task (`str`):
Task name, should be either "stable-diffusion" or "stable-diffusion-xl".
text_encoder_input_shapes (`Dict[str, int]`):
Static shapes used for compiling text encoder.
unet_input_shapes (`Dict[str, int]`):
Static shapes used for compiling unet.
vae_encoder_input_shapes (`Dict[str, int]`):
Static shapes used for compiling vae encoder.
vae_decoder_input_shapes (`Dict[str, int]`):
Static shapes used for compiling vae decoder.
text_encoder_input_shapes (`Optional[Dict[str, int]]`, defaults to `None`):
Static shapes used for compiling text encoder.
image_encoder_input_shapes (`Optional[Dict[str, int]]`, defaults to `None`):
Static shapes used for compiling image encoder.
dynamic_batch_size (`bool`, defaults to `False`):
Whether the Neuron compiled model supports dynamic batch size.
@@ -240,6 +339,22 @@
**text_encoder_input_shapes,
)
models_for_export[DIFFUSION_MODEL_TEXT_ENCODER_2_NAME] = (text_encoder_2, text_encoder_neuron_config_2)

# Image encoder
if DIFFUSION_MODEL_IMAGE_ENCODER_NAME in models_for_export:
image_encoder = models_for_export[DIFFUSION_MODEL_IMAGE_ENCODER_NAME]
image_encoder_config_constructor = TasksManager.get_exporter_config_constructor(
model=image_encoder, exporter="neuron", task="feature-extraction"
)
image_encoder_neuron_config = image_encoder_config_constructor(
image_encoder.config,
task="feature-extraction",
dynamic_batch_size=dynamic_batch_size,
**image_encoder_input_shapes,
)
models_for_export[DIFFUSION_MODEL_IMAGE_ENCODER_NAME] = (image_encoder, image_encoder_neuron_config)

# U-NET
unet = models_for_export[DIFFUSION_MODEL_UNET_NAME]
@@ -292,7 +407,7 @@ def get_stable_diffusion_models_for_export(


def _get_submodels_for_export_stable_diffusion(
pipeline: Union["StableDiffusionPipeline", "StableDiffusionXLImg2ImgPipeline"],
pipeline: Union["StableDiffusionPipeline", "StableDiffusionXLImg2ImgPipeline", "StableVideoDiffusionPipeline"],
task: str,
) -> Dict[str, Union["PreTrainedModel", "ModelMixin"]]:
"""
@@ -307,7 +422,8 @@ def _get_submodels_for_export_stable_diffusion(
projection_dim = pipeline.text_encoder.config.projection_dim

# Text encoders
if pipeline.text_encoder is not None:
text_encoder = getattr(pipeline, "text_encoder", None)
if text_encoder is not None:
if is_sdxl:
pipeline.text_encoder.config.output_hidden_states = True
models_for_export.append((DIFFUSION_MODEL_TEXT_ENCODER_NAME, copy.deepcopy(pipeline.text_encoder)))
@@ -316,6 +432,12 @@
if text_encoder_2 is not None:
text_encoder_2.config.output_hidden_states = True
models_for_export.append((DIFFUSION_MODEL_TEXT_ENCODER_2_NAME, copy.deepcopy(text_encoder_2)))

# Image encoder
image_encoder = getattr(pipeline, "image_encoder", None)
if image_encoder is not None:
image_encoder.config.output_hidden_states = True
models_for_export.append((DIFFUSION_MODEL_IMAGE_ENCODER_NAME, copy.deepcopy(image_encoder)))

# U-NET
pipeline.unet.set_attn_processor(AttnProcessor())
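As a sanity check on infer_stable_video_diffusion_shapes_from_diffusers above: latent height and width are integer-divided by the VAE scale factor, and num_frames falls back to the UNet config when unset. A self-contained sketch with illustrative values; the dict keys mirror the function body, the numbers are stand-ins rather than a real checkpoint:

# Toy reproduction of the shape inference above; keys mirror the function
# body, values are illustrative only.
block_out_channels = [128, 256, 512, 512]               # typical SD-style VAE config
vae_scale_factor = 2 ** (len(block_out_channels) - 1)   # -> 8

input_shapes = {
    "image_encoder_input_shapes": {"batch_size": 1},
    "unet_input_shapes": {"batch_size": 1, "height": 576, "width": 1024, "num_frames": None},
    "vae_encoder_input_shapes": {"batch_size": 1},
    "vae_decoder_input_shapes": {"batch_size": 1},
}

height = input_shapes["unet_input_shapes"]["height"]
width = input_shapes["unet_input_shapes"]["width"]
scaled_height, scaled_width = height // vae_scale_factor, width // vae_scale_factor
assert (scaled_height, scaled_width) == (72, 128)       # latents fed to the UNet / VAE decoder

# num_frames defaults to the UNet config when the user leaves it unset:
default_num_frames = 25  # stand-in for model.unet.config.num_frames
if input_shapes["unet_input_shapes"]["num_frames"] is None:
    input_shapes["unet_input_shapes"]["num_frames"] = default_num_frames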
