diff --git a/optimum/commands/export/neuron.py b/optimum/commands/export/neuron.py
index 402498e3b..5172fdb54 100644
--- a/optimum/commands/export/neuron.py
+++ b/optimum/commands/export/neuron.py
@@ -68,6 +68,11 @@ def parse_args_neuron(parser: "ArgumentParser"):
         help="If specified, the absolute difference tolerance when validating the model. Otherwise, the default atol for the model will be used.",
     )
     optional_group.add_argument("--cache_dir", type=str, default=None, help="Path indicating where to store cache.")
+    optional_group.add_argument(
+        "--disable_neuron_cache",
+        action="store_true",
+        help="Whether to disable automatic caching of compiled models (not applicable for JIT compilation).",
+    )
     optional_group.add_argument(
         "--trust-remote-code",
         action="store_true",
diff --git a/optimum/commands/export/neuronx.py b/optimum/commands/export/neuronx.py
index d54bcbde3..127904e37 100644
--- a/optimum/commands/export/neuronx.py
+++ b/optimum/commands/export/neuronx.py
@@ -83,7 +83,7 @@ def parse_args_neuronx(parser: "ArgumentParser"):
     optional_group.add_argument(
         "--disable_neuron_cache",
         action="store_true",
-        help="Whether to disable automatic caching of compiled TorchScript models (not applicable for JIT compilation).",
+        help="Whether to disable automatic caching of compiled models (not applicable for JIT compilation).",
     )
     optional_group.add_argument(
         "--trust-remote-code",
diff --git a/optimum/exporters/neuron/convert.py b/optimum/exporters/neuron/convert.py
index 747d0b438..e787ce16c 100644
--- a/optimum/exporters/neuron/convert.py
+++ b/optimum/exporters/neuron/convert.py
@@ -465,7 +465,7 @@ def export_neuronx(
     config: "NeuronDefaultConfig",
     output: Path,
     compiler_workdir: Optional[Path] = None,
-    inline_weights_to_neff: bool = True,
+    inline_weights_to_neff: bool = False,
     optlevel: str = "2",
     auto_cast: Optional[str] = None,
     auto_cast_type: str = "bf16",
@@ -482,7 +482,7 @@ def export_neuronx(
             Directory to store the exported Neuron model.
         compiler_workdir (`Optional[Path]`, defaults to `None`):
             The directory used by neuronx-cc, where you can find intermediary outputs (neff, weight, hlo...).
-        inline_weights_to_neff (`bool`, defaults to `True`):
+        inline_weights_to_neff (`bool`, defaults to `False`):
             Whether to inline the weights to the neff graph. If set to False, weights will be seperated from the neff.
         optlevel (`str`, defaults to `"2"`):
             The level of optimization the compiler should perform. Can be `"1"`, `"2"` or `"3"`, defaults to "2".
@@ -546,6 +546,12 @@ def export_neuronx(
     # diffusers specific
     compiler_args = add_stable_diffusion_compiler_args(config, compiler_args)

+    if config.dynamic_batch_size is True and not inline_weights_to_neff:
+        logger.warning(
+            "Dynamic batching is not yet compatible with the weights/neff non-inlined model. `inline_weights_to_neff` is set to True. If you still want to separate the neff and weights, please set `dynamic_batch_size=False`."
+        )
+        inline_weights_to_neff = True
+
     neuron_model = neuronx.trace(
         checked_model,
         dummy_inputs_tuple,
@@ -556,10 +562,6 @@ def export_neuronx(
     )

     if config.dynamic_batch_size is True:
-        if not inline_weights_to_neff:
-            raise ValueError(
-                "Dynamic batching is not yet compatible with the weights/neff non-inlined model. Please set `dynamic_batch_size=False` or `inline_weights_to_neff=True`."
-            )
         neuron_model = neuronx.dynamic_batch(neuron_model)

     # diffusers specific
@@ -608,7 +610,7 @@ def export_neuron(
     config: "NeuronDefaultConfig",
     output: Path,
     compiler_workdir: Optional[Path] = None,
-    inline_weights_to_neff: bool = True,
+    inline_weights_to_neff: bool = False,
     auto_cast: Optional[str] = None,
     auto_cast_type: str = "bf16",
     disable_fast_relayout: bool = False,
@@ -626,7 +628,7 @@ def export_neuron(
             Directory to store the exported Neuron model.
         compiler_workdir (`Optional[Path]`, defaults to `None`):
             The directory used by neuron-cc, where you can find intermediary outputs (neff, weight, hlo...).
-        inline_weights_to_neff (`bool`, defaults to `True`):
+        inline_weights_to_neff (`bool`, defaults to `False`):
             Whether to inline the weights to the neff graph. If set to False, weights will be seperated from the neff.
         auto_cast (`Optional[str]`, defaults to `None`):
             Whether to cast operations from FP32 to lower precision to speed up the inference. Can be `None`, `"matmul"` or `"all"`, you should use `None` to disable any auto-casting, use `"matmul"` to cast FP32 matrix multiplication operations, and use `"all"` to cast all FP32 operations.
@@ -666,6 +668,12 @@ def export_neuron(
     checked_model = config.patch_model_for_export(model, dummy_inputs)
     compiler_args = convert_neuronx_compiler_args_to_neuron(auto_cast, auto_cast_type, disable_fast_relayout)

+    if config.dynamic_batch_size is True and not inline_weights_to_neff:
+        logger.warning(
+            "Dynamic batching is not yet compatible with the weights/neff non-inlined model. `inline_weights_to_neff` is set to True. If you still want to separate the neff and weights, please set `dynamic_batch_size=False`."
+        )
+        inline_weights_to_neff = True
+
     neuron_model = neuron.trace(
         checked_model,
         dummy_inputs_tuple,
diff --git a/optimum/neuron/utils/hub_neuronx_cache.py b/optimum/neuron/utils/hub_neuronx_cache.py
index 94f91ebd2..99daa944f 100644
--- a/optimum/neuron/utils/hub_neuronx_cache.py
+++ b/optimum/neuron/utils/hub_neuronx_cache.py
@@ -152,10 +152,7 @@ def exists(self, path: str):
         if self.default_cache.exists(path):
             return True
         rel_path = self._rel_path(path)
-        file_exists = self.api.file_exists(self.repo_id, rel_path)
-        folder_info = self.api.list_repo_tree(self.repo_id, rel_path)
-        folder_exists = len(list(folder_info)) > 1
-        exists = file_exists or folder_exists
+        exists = self.api.file_exists(self.repo_id, rel_path)
         if not exists:
             logger.warning(
                 f"{rel_path} not found in {self.repo_id}: the corresponding graph will be recompiled."
@@ -184,7 +181,7 @@ def download_folder(self, folder_path: str, dst_path: str):
             folder_info = list(self.api.list_repo_tree(self.repo_id, rel_folder_path))
             folder_exists = len(folder_info) > 1
         except Exception as e:
-            logger.warning(f"{rel_folder_path} not found in {self.repo_id}: {e} \nThe model will be recompiled.")
+            logger.info(f"{rel_folder_path} not found in {self.repo_id}: {e} \nThe model will be recompiled.")
             folder_exists = False

         if folder_exists:
@@ -239,7 +236,7 @@ def download_file_to_string(self, filename: str, limit: int = None):
 def get_hub_cache():
     HUB_CACHE = "aws-neuron/optimum-neuron-cache"
     custom_hub_cache = load_custom_cache_repo_name_from_hf_home()
-    if custom_hub_cache is not None:
+    if custom_hub_cache is not None and len(custom_hub_cache) > 0:
         return custom_hub_cache
     else:
         return os.getenv("CUSTOM_CACHE_REPO", HUB_CACHE)
@@ -351,7 +348,7 @@ def hf_create_compile_cache(cache_url):
         # Create cache entry in local cache: it can be later synchronized with the hub cache
         registry_path = default_cache.get_cache_dir_with_cache_key(registry_folder)
         model_type = entry.config["model_type"]
-        entry_path = f"{registry_path}/{model_type}/{entry.model_id}"  # TODO: this is not applicable for checkpoints with multiple models, eg. stable diffusion
+        entry_path = f"{registry_path}/{model_type}/{entry.model_id}"
         config_path = f"{entry_path}/{entry.hash}.json"
         if not default_cache.exists(config_path):
             oldmask = os.umask(000)
@@ -549,9 +546,14 @@ def build_cache_config(
             config = exclude_white_list_from_config(config, white_list, neuron_white_list)
         clean_configs[name] = config

-    if "unet" in configs:
-        clean_configs["model_type"] = "stable-diffusion"
     if len(clean_configs) > 1:
+        if "unet" in configs:
+            # stable diffusion
+            clean_configs["model_type"] = "stable-diffusion"
+        else:
+            # seq-to-seq
+            clean_configs["model_type"] = next(iter(clean_configs.values()))["model_type"]
+
         return clean_configs
     else:
         return next(iter(clean_configs.values()))
diff --git a/tests/cache/test_neuronx_cache.py b/tests/cache/test_neuronx_cache.py
index 1c400bef8..84ee0d669 100644
--- a/tests/cache/test_neuronx_cache.py
+++ b/tests/cache/test_neuronx_cache.py
@@ -29,6 +29,8 @@
 from optimum.neuron import NeuronModelForCausalLM, NeuronModelForSequenceClassification, NeuronStableDiffusionPipeline
 from optimum.neuron.utils import get_hub_cached_entries, synchronize_hub_cache
 from optimum.neuron.utils.cache_utils import (
+    CACHE_REPO_FILENAME,
+    HF_HOME,
     load_custom_cache_repo_name_from_hf_home,
     set_custom_cache_repo_name_in_hf_home,
 )
@@ -48,7 +50,9 @@ def cache_repos():
         cache_repo_id = api.create_repo(cache_repo_id, private=True).repo_id
         api.repo_info(cache_repo_id, repo_type="model")
     cache_dir = TemporaryDirectory()
-    set_custom_cache_repo_name_in_hf_home(cache_repo_id, api=api)
+    set_custom_cache_repo_name_in_hf_home(
+        cache_repo_id, api=api
+    )  # The custom repo will be registered under `HF_HOME`; we need to restore the env at the end of each test.
     assert load_custom_cache_repo_name_from_hf_home() == cache_repo_id
     cache_path = cache_dir.name
     # Modify environment to force neuronx cache to use temporary caches
@@ -70,6 +74,12 @@ def cache_repos():
             os.environ[var] = previous_env[var]


+def unset_custom_cache_repo_name_in_hf_home(hf_home: str = HF_HOME):
+    hf_home_cache_repo_file = f"{hf_home}/{CACHE_REPO_FILENAME}"
+    if os.path.isfile(hf_home_cache_repo_file):
+        os.remove(hf_home_cache_repo_file)
+
+
 def export_decoder_model(model_id):
     batch_size = 2
     sequence_length = 512
@@ -196,6 +206,7 @@ def test_decoder_cache(cache_repos):
         check_decoder_generation(model)
         # Verify the local cache directory has not been populated
         assert len(get_local_cached_files(cache_path, "neff")) == 0
+    unset_custom_cache_repo_name_in_hf_home()


 @is_inferentia_test
@@ -227,6 +238,7 @@ def test_encoder_cache(cache_repos):
         check_encoder_inference(model, tokenizer)
         # Verify the local cache directory has not been populated
         assert len(get_local_cached_files(cache_path, ".neuron")) == 0
+    unset_custom_cache_repo_name_in_hf_home()


 @is_inferentia_test
@@ -257,6 +269,7 @@ def test_stable_diffusion_cache(cache_repos):
         check_stable_diffusion_inference(model)
         # Verify the local cache directory has not been populated
         assert len(get_local_cached_files(cache_path, ".neuron")) == 0
+    unset_custom_cache_repo_name_in_hf_home()


 @is_inferentia_test
@@ -271,6 +284,7 @@ def test_stable_diffusion_cache(cache_repos):
     ids=["invalid_repo", "invalid_endpoint", "invalid_token"],
 )
 def test_decoder_cache_unavailable(cache_repos, var, value, match):
+    unset_custom_cache_repo_name_in_hf_home()  # clean the repo set by the CLI since it takes priority over the env variable
     # Modify the specified environment variable to trigger an error
     os.environ[var] = value
     # Just exporting the model will only emit a warning
@@ -301,3 +315,4 @@ def test_optimum_neuron_cli_cache_synchronize(cache_repos):
     stdout = stdout.decode("utf-8")
     assert p.returncode == 0
     assert f"1 entrie(s) found in cache for {model_id}" in stdout
+    unset_custom_cache_repo_name_in_hf_home()
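
Note for reviewers: below is a minimal usage sketch, not part of the diff, of how the new `inline_weights_to_neff=False` default and the dynamic-batching fallback added in `convert.py` could surface through the Python API. The model id, input shapes, and output path are illustrative assumptions.

# Illustrative sketch only; model id, shapes and output path are assumptions.
from optimum.neuron import NeuronModelForSequenceClassification

# With the new default, exported weights stay separate from the NEFF, so the
# compiled graph can be cached and reused. Requesting dynamic_batch_size=True
# would now fall back to inlining with a warning instead of raising an error.
model = NeuronModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english",
    export=True,
    batch_size=1,
    sequence_length=128,
    dynamic_batch_size=False,
)
model.save_pretrained("sst2_neuron/")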