Commit
[Inference] Neuron cache for traced torchscript models (encoders, stable diffusion) (#510)

* poc: cache single model

* poc: work for single model

* sd caching cli support

* fix for single

* found the mismatch issue, mark!

* sd support completed

* fix style

* fix

* placeholder for tests

* fix doc

* add tests

* fix tests

* all inf2 tests clear up

* fix compiler dir path in test

* add peft to test requirement

* fix error message assertion for inf1

* applied comments of David

* Update optimum/commands/export/neuronx.py

* Update optimum/exporters/neuron/convert.py

* Update optimum/exporters/neuron/convert.py

* Update optimum/neuron/utils/hub_neuronx_cache.py

* apply suggestions from Michael

* tolerance

* exclude flaky for now
JingyaHuang authored Mar 28, 2024
1 parent d06b07e commit bf46e2a
Showing 18 changed files with 835 additions and 237 deletions.
7 changes: 6 additions & 1 deletion optimum/commands/export/neuron.py
@@ -68,6 +68,11 @@ def parse_args_neuron(parser: "ArgumentParser"):
help="If specified, the absolute difference tolerance when validating the model. Otherwise, the default atol for the model will be used.",
)
optional_group.add_argument("--cache_dir", type=str, default=None, help="Path indicating where to store cache.")
optional_group.add_argument(
"--disable_neuron_cache",
action="store_true",
help="Whether to disable automatic caching of compiled models (not applicable for JIT compilation).",
)
optional_group.add_argument(
"--trust-remote-code",
action="store_true",
@@ -79,7 +84,7 @@ def parse_args_neuron(parser: "ArgumentParser"):
help="Path indicating the directory where to store intermediary files generated by Neuron compiler.",
)
optional_group.add_argument(
"--disable-weights-neff-inline",
"--inline-weights-neff",
action="store_true",
help="Whether to disable the weights / neff graph inline. You can only replace weights of neuron-compiled models when the weights-neff inlining has been disabled during the compilation.",
)
16 changes: 13 additions & 3 deletions optimum/commands/export/neuronx.py
@@ -74,7 +74,17 @@ def parse_args_neuronx(parser: "ArgumentParser"):
default=None,
help="If specified, the absolute difference tolerance when validating the model. Otherwise, the default atol for the model will be used.",
)
optional_group.add_argument("--cache_dir", type=str, default=None, help="Path indicating where to store cache.")
optional_group.add_argument(
"--cache_dir",
type=str,
default=None,
help="Path to a directory in which a downloaded pretrained PyTorch model weights have been cached.",
)
optional_group.add_argument(
"--disable_neuron_cache",
action="store_true",
help="Whether to disable automatic caching of compiled models (not applicable for JIT compilation).",
)
optional_group.add_argument(
"--trust-remote-code",
action="store_true",
@@ -86,9 +96,9 @@ def parse_args_neuronx(parser: "ArgumentParser"):
help="Path indicating the directory where to store intermediary files generated by Neuronx compiler.",
)
optional_group.add_argument(
"--disable-weights-neff-inline",
"--inline-weights-neff",
action="store_true",
help="Whether to disable the weights / neff graph inline. You can only replace weights of neuron-compiled models when the weights-neff inlining has been disabled during the compilation.",
help="Whether to inline the weights / neff graph. It is possible to replace weights of neuron-compiled models only when the weights-neff inlining has been disabled during the compilation. So the caching will not work when this option is enabled.",
)
optional_group.add_argument(
"--disable-validation",
2 changes: 1 addition & 1 deletion optimum/commands/neuron/cache.py
@@ -219,7 +219,7 @@ class CustomCacheRepoCommand(BaseOptimumCLICommand):
),
CommandInfo(
name="set",
help="Set the name of the Neuron cache repo to use locally (trainium only).",
help="Set the name of the Neuron cache repo to use locally.",
subcommand_class=SetCustomCacheRepoCommand,
),
CommandInfo(
8 changes: 4 additions & 4 deletions optimum/exporters/neuron/__init__.py
@@ -21,31 +21,31 @@
"__main__": [
"infer_stable_diffusion_shapes_from_diffusers",
"main_export",
"normalize_input_shapes",
"normalize_stable_diffusion_input_shapes",
],
"base": ["NeuronDefaultConfig"],
"convert": ["export", "export_models", "validate_model_outputs", "validate_models_outputs"],
"utils": [
"DiffusersPretrainedConfig",
"build_stable_diffusion_components_mandatory_shapes",
"get_stable_diffusion_models_for_export",
"replace_stable_diffusion_submodels",
"get_submodels_for_export_stable_diffusion",
],
}

if TYPE_CHECKING:
from .__main__ import (
infer_stable_diffusion_shapes_from_diffusers,
main_export,
normalize_input_shapes,
normalize_stable_diffusion_input_shapes,
)
from .base import NeuronDefaultConfig
from .convert import export, export_models, validate_model_outputs, validate_models_outputs
from .utils import (
DiffusersPretrainedConfig,
build_stable_diffusion_components_mandatory_shapes,
get_stable_diffusion_models_for_export,
get_submodels_for_export_stable_diffusion,
replace_stable_diffusion_submodels,
)
else:
import sys
45 changes: 30 additions & 15 deletions optimum/exporters/neuron/__main__.py
@@ -24,7 +24,6 @@
from requests.exceptions import ConnectionError as RequestsConnectionError
from transformers import AutoConfig, PretrainedConfig

from ...neuron import NeuronModelForCausalLM
from ...neuron.utils import (
DECODER_NAME,
DIFFUSION_MODEL_TEXT_ENCODER_2_NAME,
@@ -38,7 +37,9 @@
is_neuronx_available,
)
from ...neuron.utils.misc import maybe_save_preprocessors
from ...neuron.utils.version_utils import check_compiler_compatibility_for_stable_diffusion
from ...neuron.utils.version_utils import (
check_compiler_compatibility_for_stable_diffusion,
)
from ...utils import is_diffusers_available, logging
from ..error_utils import AtolError, OutputMatchError, ShapeError
from ..tasks import TasksManager
@@ -47,6 +48,7 @@
from .model_configs import * # noqa: F403
from .utils import (
build_stable_diffusion_components_mandatory_shapes,
check_mandatory_input_shapes,
get_encoder_decoder_models_for_export,
get_stable_diffusion_models_for_export,
replace_stable_diffusion_submodels,
@@ -72,7 +74,7 @@
from transformers import PreTrainedModel

if is_diffusers_available():
from diffusers import DiffusionPipeline, StableDiffusionPipeline
from diffusers import DiffusionPipeline, ModelMixin, StableDiffusionPipeline


logger = logging.get_logger()
@@ -209,24 +211,24 @@ def infer_stable_diffusion_shapes_from_diffusers(
vae_encoder_num_channels = model.vae.config.in_channels
vae_decoder_num_channels = model.vae.config.latent_channels
vae_scale_factor = 2 ** (len(model.vae.config.block_out_channels) - 1) or 8
height = input_shapes["unet_input_shapes"]["height"]
height = input_shapes["unet"]["height"]
scaled_height = height // vae_scale_factor
width = input_shapes["unet_input_shapes"]["width"]
width = input_shapes["unet"]["width"]
scaled_width = width // vae_scale_factor

input_shapes["text_encoder_input_shapes"].update({"sequence_length": sequence_length})
input_shapes["unet_input_shapes"].update(
input_shapes["text_encoder"].update({"sequence_length": sequence_length})
if hasattr(model, "text_encoder_2"):
input_shapes["text_encoder_2"] = input_shapes["text_encoder"]
input_shapes["unet"].update(
{
"sequence_length": sequence_length,
"num_channels": unet_num_channels,
"height": scaled_height,
"width": scaled_width,
}
)
input_shapes["vae_encoder_input_shapes"].update(
{"num_channels": vae_encoder_num_channels, "height": height, "width": width}
)
input_shapes["vae_decoder_input_shapes"].update(
input_shapes["vae_encoder"].update({"num_channels": vae_encoder_num_channels, "height": height, "width": width})
input_shapes["vae_decoder"].update(
{"num_channels": vae_decoder_num_channels, "height": scaled_height, "width": scaled_width}
)
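For intuition, the scale-factor arithmetic above works out as follows for an SD 1.5-style VAE (illustrative values, not part of the diff):

# model.vae.config.block_out_channels has length 4 on SD 1.5-style VAEs.
block_out_channels = [128, 256, 512, 512]
vae_scale_factor = 2 ** (len(block_out_channels) - 1)   # 2**3 = 8

height, width = 512, 512                                 # requested image size
scaled_height, scaled_width = height // vae_scale_factor, width // vae_scale_factor
assert (scaled_height, scaled_width) == (64, 64)         # latent dims for the UNet / VAE decoder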

@@ -290,6 +292,7 @@ def _get_submodels_and_neuron_configs(
task=task,
library_name=library_name,
)
input_shapes = check_mandatory_input_shapes(neuron_config_constructor, task, input_shapes)
neuron_config = neuron_config_constructor(model.config, dynamic_batch_size=dynamic_batch_size, **input_shapes)
model_name = getattr(model, "name_or_path", None) or model_name_or_path
model_name = model_name.split("/")[-1] if model_name else model.config.model_type
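The newly inserted `check_mandatory_input_shapes` call filters user-supplied shapes before they reach the config constructor. A plausible reconstruction from its name and this call site; the real helper lives in optimum/exporters/neuron/utils.py and may differ:

# Hypothetical reconstruction, not the actual implementation.
def check_mandatory_input_shapes(neuron_config_constructor, task, input_shapes):
    mandatory_axes = neuron_config_constructor.func.get_mandatory_axes_for_task(task)
    missing = [axis for axis in mandatory_axes if input_shapes.get(axis) is None]
    if missing:
        raise AttributeError(f"Mandatory input shapes {missing} are missing for task {task}.")
    # Keep only the mandatory axes so no extraneous kwargs reach the config.
    return {axis: input_shapes[axis] for axis in mandatory_axes}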
@@ -355,12 +358,15 @@ def _get_submodels_and_neuron_configs_for_stable_diffusion(
models_and_neuron_configs = get_stable_diffusion_models_for_export(
pipeline=model,
task=task,
text_encoder_input_shapes=input_shapes["text_encoder"],
unet_input_shapes=input_shapes["unet"],
vae_encoder_input_shapes=input_shapes["vae_encoder"],
vae_decoder_input_shapes=input_shapes["vae_decoder"],
dynamic_batch_size=dynamic_batch_size,
lora_model_ids=lora_model_ids,
lora_weight_names=lora_weight_names,
lora_adapter_names=lora_adapter_names,
lora_scales=lora_scales,
**input_shapes,
)
output_model_names = {
DIFFUSION_MODEL_UNET_NAME: os.path.join(DIFFUSION_MODEL_UNET_NAME, NEURON_FILE_NAME),
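After the key renaming earlier in this file (`unet_input_shapes` becomes `unet`, and so on), the `input_shapes` mapping consumed above plausibly looks like this for a 512x512 pipeline (illustrative values):

# Keys follow the renaming in this commit; values assume an SD 1.5-style
# pipeline with a VAE scale factor of 8. SDXL-style pipelines additionally
# get a "text_encoder_2" entry mirroring "text_encoder".
input_shapes = {
    "text_encoder": {"batch_size": 1, "sequence_length": 77},
    "unet": {"batch_size": 1, "sequence_length": 77, "num_channels": 4, "height": 64, "width": 64},
    "vae_encoder": {"batch_size": 1, "num_channels": 3, "height": 512, "width": 512},
    "vae_decoder": {"batch_size": 1, "num_channels": 4, "height": 64, "width": 64},
}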
@@ -416,12 +422,14 @@ def main_export(
model_name_or_path: str,
output: Union[str, Path],
compiler_kwargs: Dict[str, Any],
model: Optional[Union["PreTrainedModel", "ModelMixin"]] = None,
task: str = "auto",
dynamic_batch_size: bool = False,
atol: Optional[float] = None,
cache_dir: Optional[str] = None,
disable_neuron_cache: Optional[bool] = False,
compiler_workdir: Optional[Union[str, Path]] = None,
inline_weights_to_neff: bool = True,
inline_weights_to_neff: bool = False,
optlevel: str = "2",
trust_remote_code: bool = False,
subfolder: str = "",
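A hedged usage sketch of the extended signature above; the checkpoint, shapes, and compiler kwargs are placeholders, and the trailing shape kwargs assume `main_export` forwards extra keyword arguments as input shapes, as the CLI path below suggests:

from optimum.exporters.neuron import main_export

main_export(
    model_name_or_path="distilbert-base-uncased",  # illustrative checkpoint
    output="distilbert_neuron/",
    compiler_kwargs={},                  # explicit dict, per the signature above
    task="text-classification",
    disable_neuron_cache=True,           # new: skip reading/writing the Neuron cache
    inline_weights_to_neff=False,        # new default: keep weights replaceable
    batch_size=1,                        # assumed **input_shapes passthrough
    sequence_length=128,
)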
@@ -463,7 +471,8 @@
"framework": "pt",
"library_name": library_name,
}
model = TasksManager.get_model_from_task(**model_kwargs)
if model is None:
model = TasksManager.get_model_from_task(**model_kwargs)

models_and_neuron_configs, output_model_names = _get_submodels_and_neuron_configs(
model=model,
@@ -486,11 +495,13 @@
_, neuron_outputs = export_models(
models_and_neuron_configs=models_and_neuron_configs,
output_dir=output,
disable_neuron_cache=disable_neuron_cache,
compiler_workdir=compiler_workdir,
inline_weights_to_neff=inline_weights_to_neff,
optlevel=optlevel,
output_file_names=output_model_names,
compiler_kwargs=compiler_kwargs,
model_name_or_path=model_name_or_path,
)

# Validate compiled model
@@ -537,6 +548,8 @@ def decoder_export(
output: Union[str, Path],
**kwargs,
):
from ...neuron import NeuronModelForCausalLM

output = Path(output)
if not output.parent.exists():
output.parent.mkdir(parents=True)
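Note: `NeuronModelForCausalLM` is now imported inside `decoder_export` rather than at module level (the top-level import is dropped in the first hunk of this file), presumably to break an import cycle between this exporter and `optimum.neuron`.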
@@ -583,6 +596,7 @@ def main():
return
submodels = None

disable_neuron_cache = args.disable_neuron_cache
compiler_kwargs = infer_compiler_kwargs(args)
optional_outputs = customize_optional_outputs(args)
optlevel = parse_optlevel(args)
Expand All @@ -595,8 +609,9 @@ def main():
dynamic_batch_size=args.dynamic_batch_size,
atol=args.atol,
cache_dir=args.cache_dir,
disable_neuron_cache=disable_neuron_cache,
compiler_workdir=args.compiler_workdir,
inline_weights_to_neff=not args.disable_weights_neff_inline,
inline_weights_to_neff=args.inline_weights_neff,
optlevel=optlevel,
trust_remote_code=args.trust_remote_code,
subfolder=args.subfolder,
2 changes: 1 addition & 1 deletion optimum/exporters/neuron/base.py
@@ -157,7 +157,7 @@ def __init__(
audio_sequence_length: Optional[int] = None,
point_batch_size: Optional[int] = None,
nb_points_per_image: Optional[int] = None,
num_beams: int = 1,
num_beams: Optional[int] = None,
output_attentions: bool = False,
output_hidden_states: bool = False,
# TODO: add custom dtype after optimum 1.13 release
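Relaxing `num_beams` to `Optional[int] = None` presumably lets non-generative exports leave the axis unset instead of recording a default of 1, which matters once configs are compared or hashed for cache lookups.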