Commit
[Inference] Neuron cache for traced torchscript models (encoders, stable diffusion) (#510)

* poc: cache single model

* poc: work for single model

* sd caching cli support

* fix for single

* found the mismatch issue, mark!

* sd support completed

* fix style

* fix

* placeholder for tests

* fix doc

* add tests

* fix tests

* all inf2 tests clear up

* fix compiler dir path in test

* add peft to test requirement

* fix error message assertion for inf1

* applied comments of David

* Update optimum/commands/export/neuronx.py

* Update optimum/exporters/neuron/convert.py

* Update optimum/exporters/neuron/convert.py

* Update optimum/neuron/utils/hub_neuronx_cache.py

* apply suggestions from Michael

* tolerance

* exclude flaky for now
JingyaHuang authored Mar 28, 2024
1 parent d06b07e commit bf46e2a
Showing 18 changed files with 835 additions and 237 deletions.
7 changes: 6 additions & 1 deletion optimum/commands/export/neuron.py
@@ -68,6 +68,11 @@ def parse_args_neuron(parser: "ArgumentParser"):
help="If specified, the absolute difference tolerance when validating the model. Otherwise, the default atol for the model will be used.",
)
optional_group.add_argument("--cache_dir", type=str, default=None, help="Path indicating where to store cache.")
optional_group.add_argument(
"--disable_neuron_cache",
action="store_true",
help="Whether to disable automatic caching of compiled models (not applicable for JIT compilation).",
)
optional_group.add_argument(
"--trust-remote-code",
action="store_true",
@@ -79,7 +84,7 @@ def parse_args_neuron(parser: "ArgumentParser"):
help="Path indicating the directory where to store intermediary files generated by Neuron compiler.",
)
optional_group.add_argument(
"--disable-weights-neff-inline",
"--inline-weights-neff",
action="store_true",
help="Whether to disable the weights / neff graph inline. You can only replace weights of neuron-compiled models when the weights-neff inlining has been disabled during the compilation.",
)
16 changes: 13 additions & 3 deletions optimum/commands/export/neuronx.py
@@ -74,7 +74,17 @@ def parse_args_neuronx(parser: "ArgumentParser"):
default=None,
help="If specified, the absolute difference tolerance when validating the model. Otherwise, the default atol for the model will be used.",
)
optional_group.add_argument("--cache_dir", type=str, default=None, help="Path indicating where to store cache.")
optional_group.add_argument(
"--cache_dir",
type=str,
default=None,
help="Path to a directory in which a downloaded pretrained PyTorch model weights have been cached.",
)
optional_group.add_argument(
"--disable_neuron_cache",
action="store_true",
help="Whether to disable automatic caching of compiled models (not applicable for JIT compilation).",
)
optional_group.add_argument(
"--trust-remote-code",
action="store_true",
@@ -86,9 +96,9 @@ def parse_args_neuronx(parser: "ArgumentParser"):
help="Path indicating the directory where to store intermediary files generated by Neuronx compiler.",
)
optional_group.add_argument(
"--disable-weights-neff-inline",
"--inline-weights-neff",
action="store_true",
help="Whether to disable the weights / neff graph inline. You can only replace weights of neuron-compiled models when the weights-neff inlining has been disabled during the compilation.",
help="Whether to inline the weights / neff graph. It is possible to replace weights of neuron-compiled models only when the weights-neff inlining has been disabled during the compilation. So the caching will not work when this option is enabled.",
)
optional_group.add_argument(
"--disable-validation",
2 changes: 1 addition & 1 deletion optimum/commands/neuron/cache.py
@@ -219,7 +219,7 @@ class CustomCacheRepoCommand(BaseOptimumCLICommand):
),
CommandInfo(
name="set",
help="Set the name of the Neuron cache repo to use locally (trainium only).",
help="Set the name of the Neuron cache repo to use locally.",
subcommand_class=SetCustomCacheRepoCommand,
),
CommandInfo(
8 changes: 4 additions & 4 deletions optimum/exporters/neuron/__init__.py
@@ -21,31 +21,31 @@
"__main__": [
"infer_stable_diffusion_shapes_from_diffusers",
"main_export",
"normalize_input_shapes",
"normalize_stable_diffusion_input_shapes",
],
"base": ["NeuronDefaultConfig"],
"convert": ["export", "export_models", "validate_model_outputs", "validate_models_outputs"],
"utils": [
"DiffusersPretrainedConfig",
"build_stable_diffusion_components_mandatory_shapes",
"get_stable_diffusion_models_for_export",
"replace_stable_diffusion_submodels",
"get_submodels_for_export_stable_diffusion",
],
}

if TYPE_CHECKING:
from .__main__ import (
infer_stable_diffusion_shapes_from_diffusers,
main_export,
normalize_input_shapes,
normalize_stable_diffusion_input_shapes,
)
from .base import NeuronDefaultConfig
from .convert import export, export_models, validate_model_outputs, validate_models_outputs
from .utils import (
DiffusersPretrainedConfig,
build_stable_diffusion_components_mandatory_shapes,
get_stable_diffusion_models_for_export,
get_submodels_for_export_stable_diffusion,
replace_stable_diffusion_submodels,
)
else:
import sys
45 changes: 30 additions & 15 deletions optimum/exporters/neuron/__main__.py
@@ -24,7 +24,6 @@
from requests.exceptions import ConnectionError as RequestsConnectionError
from transformers import AutoConfig, PretrainedConfig

from ...neuron import NeuronModelForCausalLM
from ...neuron.utils import (
DECODER_NAME,
DIFFUSION_MODEL_TEXT_ENCODER_2_NAME,
@@ -38,7 +37,9 @@
is_neuronx_available,
)
from ...neuron.utils.misc import maybe_save_preprocessors
from ...neuron.utils.version_utils import check_compiler_compatibility_for_stable_diffusion
from ...neuron.utils.version_utils import (
check_compiler_compatibility_for_stable_diffusion,
)
from ...utils import is_diffusers_available, logging
from ..error_utils import AtolError, OutputMatchError, ShapeError
from ..tasks import TasksManager
@@ -47,6 +48,7 @@
from .model_configs import * # noqa: F403
from .utils import (
build_stable_diffusion_components_mandatory_shapes,
check_mandatory_input_shapes,
get_encoder_decoder_models_for_export,
get_stable_diffusion_models_for_export,
replace_stable_diffusion_submodels,
@@ -72,7 +74,7 @@
from transformers import PreTrainedModel

if is_diffusers_available():
from diffusers import DiffusionPipeline, StableDiffusionPipeline
from diffusers import DiffusionPipeline, ModelMixin, StableDiffusionPipeline


logger = logging.get_logger()
@@ -209,24 +211,24 @@ def infer_stable_diffusion_shapes_from_diffusers(
vae_encoder_num_channels = model.vae.config.in_channels
vae_decoder_num_channels = model.vae.config.latent_channels
vae_scale_factor = 2 ** (len(model.vae.config.block_out_channels) - 1) or 8
height = input_shapes["unet_input_shapes"]["height"]
height = input_shapes["unet"]["height"]
scaled_height = height // vae_scale_factor
width = input_shapes["unet_input_shapes"]["width"]
width = input_shapes["unet"]["width"]
scaled_width = width // vae_scale_factor

input_shapes["text_encoder_input_shapes"].update({"sequence_length": sequence_length})
input_shapes["unet_input_shapes"].update(
input_shapes["text_encoder"].update({"sequence_length": sequence_length})
if hasattr(model, "text_encoder_2"):
input_shapes["text_encoder_2"] = input_shapes["text_encoder"]
input_shapes["unet"].update(
{
"sequence_length": sequence_length,
"num_channels": unet_num_channels,
"height": scaled_height,
"width": scaled_width,
}
)
input_shapes["vae_encoder_input_shapes"].update(
{"num_channels": vae_encoder_num_channels, "height": height, "width": width}
)
input_shapes["vae_decoder_input_shapes"].update(
input_shapes["vae_encoder"].update({"num_channels": vae_encoder_num_channels, "height": height, "width": width})
input_shapes["vae_decoder"].update(
{"num_channels": vae_decoder_num_channels, "height": scaled_height, "width": scaled_width}
)
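For intuition, the scale-factor arithmetic above works out as follows for an SD 1.5-style VAE (illustrative values, not part of the diff):

# model.vae.config.block_out_channels has length 4 on SD 1.5-style VAEs.
block_out_channels = [128, 256, 512, 512]
vae_scale_factor = 2 ** (len(block_out_channels) - 1)   # 2**3 = 8

height, width = 512, 512                                 # requested image size
scaled_height, scaled_width = height // vae_scale_factor, width // vae_scale_factor
assert (scaled_height, scaled_width) == (64, 64)         # latent dims for the UNet / VAE decoder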

@@ -290,6 +292,7 @@ def _get_submodels_and_neuron_configs(
task=task,
library_name=library_name,
)
input_shapes = check_mandatory_input_shapes(neuron_config_constructor, task, input_shapes)
neuron_config = neuron_config_constructor(model.config, dynamic_batch_size=dynamic_batch_size, **input_shapes)
model_name = getattr(model, "name_or_path", None) or model_name_or_path
model_name = model_name.split("/")[-1] if model_name else model.config.model_type
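The newly inserted `check_mandatory_input_shapes` call filters user-supplied shapes before they reach the config constructor. A plausible reconstruction from its name and this call site; the real helper lives in optimum/exporters/neuron/utils.py and may differ:

# Hypothetical reconstruction, not the actual implementation.
def check_mandatory_input_shapes(neuron_config_constructor, task, input_shapes):
    mandatory_axes = neuron_config_constructor.func.get_mandatory_axes_for_task(task)
    missing = [axis for axis in mandatory_axes if input_shapes.get(axis) is None]
    if missing:
        raise AttributeError(f"Mandatory input shapes {missing} are missing for task {task}.")
    # Keep only the mandatory axes so no extraneous kwargs reach the config.
    return {axis: input_shapes[axis] for axis in mandatory_axes}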
@@ -355,12 +358,15 @@ def _get_submodels_and_neuron_configs_for_stable_diffusion(
models_and_neuron_configs = get_stable_diffusion_models_for_export(
pipeline=model,
task=task,
text_encoder_input_shapes=input_shapes["text_encoder"],
unet_input_shapes=input_shapes["unet"],
vae_encoder_input_shapes=input_shapes["vae_encoder"],
vae_decoder_input_shapes=input_shapes["vae_decoder"],
dynamic_batch_size=dynamic_batch_size,
lora_model_ids=lora_model_ids,
lora_weight_names=lora_weight_names,
lora_adapter_names=lora_adapter_names,
lora_scales=lora_scales,
**input_shapes,
)
output_model_names = {
DIFFUSION_MODEL_UNET_NAME: os.path.join(DIFFUSION_MODEL_UNET_NAME, NEURON_FILE_NAME),
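After the key renaming earlier in this file (`unet_input_shapes` becomes `unet`, and so on), the `input_shapes` mapping consumed above plausibly looks like this for a 512x512 pipeline (illustrative values):

# Keys follow the renaming in this commit; values assume an SD 1.5-style
# pipeline with a VAE scale factor of 8. SDXL-style pipelines additionally
# get a "text_encoder_2" entry mirroring "text_encoder".
input_shapes = {
    "text_encoder": {"batch_size": 1, "sequence_length": 77},
    "unet": {"batch_size": 1, "sequence_length": 77, "num_channels": 4, "height": 64, "width": 64},
    "vae_encoder": {"batch_size": 1, "num_channels": 3, "height": 512, "width": 512},
    "vae_decoder": {"batch_size": 1, "num_channels": 4, "height": 64, "width": 64},
}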
@@ -416,12 +422,14 @@ def main_export(
model_name_or_path: str,
output: Union[str, Path],
compiler_kwargs: Dict[str, Any],
model: Optional[Union["PreTrainedModel", "ModelMixin"]] = None,
task: str = "auto",
dynamic_batch_size: bool = False,
atol: Optional[float] = None,
cache_dir: Optional[str] = None,
disable_neuron_cache: Optional[bool] = False,
compiler_workdir: Optional[Union[str, Path]] = None,
inline_weights_to_neff: bool = True,
inline_weights_to_neff: bool = False,
optlevel: str = "2",
trust_remote_code: bool = False,
subfolder: str = "",
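A hedged usage sketch of the extended signature above; the checkpoint, shapes, and compiler kwargs are placeholders, and the trailing shape kwargs assume `main_export` forwards extra keyword arguments as input shapes, as the CLI path below suggests:

from optimum.exporters.neuron import main_export

main_export(
    model_name_or_path="distilbert-base-uncased",  # illustrative checkpoint
    output="distilbert_neuron/",
    compiler_kwargs={},                  # explicit dict, per the signature above
    task="text-classification",
    disable_neuron_cache=True,           # new: skip reading/writing the Neuron cache
    inline_weights_to_neff=False,        # new default: keep weights replaceable
    batch_size=1,                        # assumed **input_shapes passthrough
    sequence_length=128,
)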
@@ -463,7 +471,8 @@
"framework": "pt",
"library_name": library_name,
}
model = TasksManager.get_model_from_task(**model_kwargs)
if model is None:
model = TasksManager.get_model_from_task(**model_kwargs)

models_and_neuron_configs, output_model_names = _get_submodels_and_neuron_configs(
model=model,
@@ -486,11 +495,13 @@
_, neuron_outputs = export_models(
models_and_neuron_configs=models_and_neuron_configs,
output_dir=output,
disable_neuron_cache=disable_neuron_cache,
compiler_workdir=compiler_workdir,
inline_weights_to_neff=inline_weights_to_neff,
optlevel=optlevel,
output_file_names=output_model_names,
compiler_kwargs=compiler_kwargs,
model_name_or_path=model_name_or_path,
)

# Validate compiled model
@@ -537,6 +548,8 @@ def decoder_export(
output: Union[str, Path],
**kwargs,
):
from ...neuron import NeuronModelForCausalLM

output = Path(output)
if not output.parent.exists():
output.parent.mkdir(parents=True)
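Note: `NeuronModelForCausalLM` is now imported inside `decoder_export` rather than at module level (the top-level import is dropped in the first hunk of this file), presumably to break an import cycle between this exporter and `optimum.neuron`.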
@@ -583,6 +596,7 @@ def main():
return
submodels = None

disable_neuron_cache = args.disable_neuron_cache
compiler_kwargs = infer_compiler_kwargs(args)
optional_outputs = customize_optional_outputs(args)
optlevel = parse_optlevel(args)
Expand All @@ -595,8 +609,9 @@ def main():
dynamic_batch_size=args.dynamic_batch_size,
atol=args.atol,
cache_dir=args.cache_dir,
disable_neuron_cache=disable_neuron_cache,
compiler_workdir=args.compiler_workdir,
inline_weights_to_neff=not args.disable_weights_neff_inline,
inline_weights_to_neff=args.inline_weights_neff,
optlevel=optlevel,
trust_remote_code=args.trust_remote_code,
subfolder=args.subfolder,
2 changes: 1 addition & 1 deletion optimum/exporters/neuron/base.py
@@ -157,7 +157,7 @@ def __init__(
audio_sequence_length: Optional[int] = None,
point_batch_size: Optional[int] = None,
nb_points_per_image: Optional[int] = None,
num_beams: int = 1,
num_beams: Optional[int] = None,
output_attentions: bool = False,
output_hidden_states: bool = False,
# TODO: add custom dtype after optimum 1.13 release
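Relaxing `num_beams` to `Optional[int] = None` presumably lets non-generative exports leave the axis unset instead of recording a default of 1, which matters once configs are compared or hashed for cache lookups.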