diff --git a/optimum/commands/export/neuron.py b/optimum/commands/export/neuron.py
index 402498e3b..5172fdb54 100644
--- a/optimum/commands/export/neuron.py
+++ b/optimum/commands/export/neuron.py
@@ -68,6 +68,11 @@ def parse_args_neuron(parser: "ArgumentParser"):
         help="If specified, the absolute difference tolerance when validating the model. Otherwise, the default atol for the model will be used.",
     )
     optional_group.add_argument("--cache_dir", type=str, default=None, help="Path indicating where to store cache.")
+    optional_group.add_argument(
+        "--disable_neuron_cache",
+        action="store_true",
+        help="Whether to disable automatic caching of compiled models (not applicable for JIT compilation).",
+    )
     optional_group.add_argument(
         "--trust-remote-code",
         action="store_true",
diff --git a/optimum/commands/export/neuronx.py b/optimum/commands/export/neuronx.py
index d54bcbde3..127904e37 100644
--- a/optimum/commands/export/neuronx.py
+++ b/optimum/commands/export/neuronx.py
@@ -83,7 +83,7 @@ def parse_args_neuronx(parser: "ArgumentParser"):
     optional_group.add_argument(
         "--disable_neuron_cache",
         action="store_true",
-        help="Whether to disable automatic caching of compiled TorchScript models (not applicable for JIT compilation).",
+        help="Whether to disable automatic caching of compiled models (not applicable for JIT compilation).",
     )
     optional_group.add_argument(
         "--trust-remote-code",
diff --git a/optimum/exporters/neuron/convert.py b/optimum/exporters/neuron/convert.py
index 747d0b438..e787ce16c 100644
--- a/optimum/exporters/neuron/convert.py
+++ b/optimum/exporters/neuron/convert.py
@@ -465,7 +465,7 @@ def export_neuronx(
     config: "NeuronDefaultConfig",
     output: Path,
     compiler_workdir: Optional[Path] = None,
-    inline_weights_to_neff: bool = True,
+    inline_weights_to_neff: bool = False,
     optlevel: str = "2",
     auto_cast: Optional[str] = None,
     auto_cast_type: str = "bf16",
@@ -482,7 +482,7 @@ def export_neuronx(
             Directory to store the exported Neuron model.
         compiler_workdir (`Optional[Path]`, defaults to `None`):
             The directory used by neuronx-cc, where you can find intermediary outputs (neff, weight, hlo...).
-        inline_weights_to_neff (`bool`, defaults to `True`):
+        inline_weights_to_neff (`bool`, defaults to `False`):
             Whether to inline the weights to the neff graph. If set to False, weights will be seperated from the neff.
         optlevel (`str`, defaults to `"2"`):
             The level of optimization the compiler should perform. Can be `"1"`, `"2"` or `"3"`, defaults to "2".
@@ -546,6 +546,12 @@ def export_neuronx(
     # diffusers specific
     compiler_args = add_stable_diffusion_compiler_args(config, compiler_args)

+    if config.dynamic_batch_size is True and not inline_weights_to_neff:
+        logger.warning(
+            "Dynamic batching is not yet compatible with the weights/neff non-inlined model. `inline_weights_to_neff` is set to True. If you still want to separate the neff and weights, please set `dynamic_batch_size=False`."
+        )
+        inline_weights_to_neff = True
+
     neuron_model = neuronx.trace(
         checked_model,
         dummy_inputs_tuple,
@@ -556,10 +562,6 @@ def export_neuronx(
     )

     if config.dynamic_batch_size is True:
-        if not inline_weights_to_neff:
-            raise ValueError(
-                "Dynamic batching is not yet compatible with the weights/neff non-inlined model. Please set `dynamic_batch_size=False` or `inline_weights_to_neff=True`."
-            )
         neuron_model = neuronx.dynamic_batch(neuron_model)

     # diffusers specific
@@ -608,7 +610,7 @@ def export_neuron(
     config: "NeuronDefaultConfig",
     output: Path,
     compiler_workdir: Optional[Path] = None,
-    inline_weights_to_neff: bool = True,
+    inline_weights_to_neff: bool = False,
     auto_cast: Optional[str] = None,
     auto_cast_type: str = "bf16",
     disable_fast_relayout: bool = False,
@@ -626,7 +628,7 @@ def export_neuron(
             Directory to store the exported Neuron model.
         compiler_workdir (`Optional[Path]`, defaults to `None`):
             The directory used by neuron-cc, where you can find intermediary outputs (neff, weight, hlo...).
-        inline_weights_to_neff (`bool`, defaults to `True`):
+        inline_weights_to_neff (`bool`, defaults to `False`):
             Whether to inline the weights to the neff graph. If set to False, weights will be seperated from the neff.
         auto_cast (`Optional[str]`, defaults to `None`):
             Whether to cast operations from FP32 to lower precision to speed up the inference. Can be `None`, `"matmul"` or `"all"`, you should use `None` to disable any auto-casting, use `"matmul"` to cast FP32 matrix multiplication operations, and use `"all"` to cast all FP32 operations.
@@ -666,6 +668,12 @@ def export_neuron(
     checked_model = config.patch_model_for_export(model, dummy_inputs)
     compiler_args = convert_neuronx_compiler_args_to_neuron(auto_cast, auto_cast_type, disable_fast_relayout)

+    if config.dynamic_batch_size is True and not inline_weights_to_neff:
+        logger.warning(
+            "Dynamic batching is not yet compatible with the weights/neff non-inlined model. `inline_weights_to_neff` is set to True. If you still want to separate the neff and weights, please set `dynamic_batch_size=False`."
+        )
+        inline_weights_to_neff = True
+
     neuron_model = neuron.trace(
         checked_model,
         dummy_inputs_tuple,
diff --git a/optimum/neuron/utils/hub_neuronx_cache.py b/optimum/neuron/utils/hub_neuronx_cache.py
index 94f91ebd2..99daa944f 100644
--- a/optimum/neuron/utils/hub_neuronx_cache.py
+++ b/optimum/neuron/utils/hub_neuronx_cache.py
@@ -152,10 +152,7 @@ def exists(self, path: str):
         if self.default_cache.exists(path):
             return True
         rel_path = self._rel_path(path)
-        file_exists = self.api.file_exists(self.repo_id, rel_path)
-        folder_info = self.api.list_repo_tree(self.repo_id, rel_path)
-        folder_exists = len(list(folder_info)) > 1
-        exists = file_exists or folder_exists
+        exists = self.api.file_exists(self.repo_id, rel_path)
         if not exists:
             logger.warning(
                 f"{rel_path} not found in {self.repo_id}: the corresponding graph will be recompiled."
@@ -184,7 +181,7 @@ def download_folder(self, folder_path: str, dst_path: str):
             folder_info = list(self.api.list_repo_tree(self.repo_id, rel_folder_path))
             folder_exists = len(folder_info) > 1
         except Exception as e:
-            logger.warning(f"{rel_folder_path} not found in {self.repo_id}: {e} \nThe model will be recompiled.")
+            logger.info(f"{rel_folder_path} not found in {self.repo_id}: {e} \nThe model will be recompiled.")
             folder_exists = False

         if folder_exists:
@@ -239,7 +236,7 @@ def download_file_to_string(self, filename: str, limit: int = None):
 def get_hub_cache():
     HUB_CACHE = "aws-neuron/optimum-neuron-cache"
     custom_hub_cache = load_custom_cache_repo_name_from_hf_home()
-    if custom_hub_cache is not None:
+    if custom_hub_cache is not None and len(custom_hub_cache) > 0:
         return custom_hub_cache
     else:
         return os.getenv("CUSTOM_CACHE_REPO", HUB_CACHE)
@@ -351,7 +348,7 @@ def hf_create_compile_cache(cache_url):
         # Create cache entry in local cache: it can be later synchronized with the hub cache
         registry_path = default_cache.get_cache_dir_with_cache_key(registry_folder)
         model_type = entry.config["model_type"]
-        entry_path = f"{registry_path}/{model_type}/{entry.model_id}"  # TODO: this is not applicable for checkpoints with multiple models, eg. stable diffusion
+        entry_path = f"{registry_path}/{model_type}/{entry.model_id}"
         config_path = f"{entry_path}/{entry.hash}.json"
         if not default_cache.exists(config_path):
             oldmask = os.umask(000)
@@ -549,9 +546,14 @@ def build_cache_config(
             config = exclude_white_list_from_config(config, white_list, neuron_white_list)
         clean_configs[name] = config

-    if "unet" in configs:
-        clean_configs["model_type"] = "stable-diffusion"
     if len(clean_configs) > 1:
+        if "unet" in configs:
+            # stable diffusion
+            clean_configs["model_type"] = "stable-diffusion"
+        else:
+            # seq-to-seq
+            clean_configs["model_type"] = next(iter(clean_configs.values()))["model_type"]
+
         return clean_configs
     else:
         return next(iter(clean_configs.values()))
diff --git a/tests/cache/test_neuronx_cache.py b/tests/cache/test_neuronx_cache.py
index 1c400bef8..84ee0d669 100644
--- a/tests/cache/test_neuronx_cache.py
+++ b/tests/cache/test_neuronx_cache.py
@@ -29,6 +29,8 @@
 from optimum.neuron import NeuronModelForCausalLM, NeuronModelForSequenceClassification, NeuronStableDiffusionPipeline
 from optimum.neuron.utils import get_hub_cached_entries, synchronize_hub_cache
 from optimum.neuron.utils.cache_utils import (
+    CACHE_REPO_FILENAME,
+    HF_HOME,
     load_custom_cache_repo_name_from_hf_home,
     set_custom_cache_repo_name_in_hf_home,
 )
@@ -48,7 +50,9 @@ def cache_repos():
         cache_repo_id = api.create_repo(cache_repo_id, private=True).repo_id
         api.repo_info(cache_repo_id, repo_type="model")
     cache_dir = TemporaryDirectory()
-    set_custom_cache_repo_name_in_hf_home(cache_repo_id, api=api)
+    set_custom_cache_repo_name_in_hf_home(
+        cache_repo_id, api=api
+    )  # The custom repo will be registered under `HF_HOME`; we need to restore the env at the end of each test.
     assert load_custom_cache_repo_name_from_hf_home() == cache_repo_id
     cache_path = cache_dir.name
     # Modify environment to force neuronx cache to use temporary caches
@@ -70,6 +74,12 @@ def cache_repos():
             os.environ[var] = previous_env[var]


+def unset_custom_cache_repo_name_in_hf_home(hf_home: str = HF_HOME):
+    hf_home_cache_repo_file = f"{hf_home}/{CACHE_REPO_FILENAME}"
+    if os.path.isfile(hf_home_cache_repo_file):
+        os.remove(hf_home_cache_repo_file)
+
+
 def export_decoder_model(model_id):
     batch_size = 2
     sequence_length = 512
@@ -196,6 +206,7 @@ def test_decoder_cache(cache_repos):
         check_decoder_generation(model)
         # Verify the local cache directory has not been populated
         assert len(get_local_cached_files(cache_path, "neff")) == 0
+    unset_custom_cache_repo_name_in_hf_home()


 @is_inferentia_test
@@ -227,6 +238,7 @@ def test_encoder_cache(cache_repos):
         check_encoder_inference(model, tokenizer)
         # Verify the local cache directory has not been populated
         assert len(get_local_cached_files(cache_path, ".neuron")) == 0
+    unset_custom_cache_repo_name_in_hf_home()


 @is_inferentia_test
@@ -257,6 +269,7 @@ def test_stable_diffusion_cache(cache_repos):
         check_stable_diffusion_inference(model)
         # Verify the local cache directory has not been populated
         assert len(get_local_cached_files(cache_path, ".neuron")) == 0
+    unset_custom_cache_repo_name_in_hf_home()


 @is_inferentia_test
@@ -271,6 +284,7 @@ def test_stable_diffusion_cache(cache_repos):
     ids=["invalid_repo", "invalid_endpoint", "invalid_token"],
 )
 def test_decoder_cache_unavailable(cache_repos, var, value, match):
+    unset_custom_cache_repo_name_in_hf_home()  # clean the repo set by the CLI since it takes priority over the env variable
     # Modify the specified environment variable to trigger an error
     os.environ[var] = value
     # Just exporting the model will only emit a warning
@@ -301,3 +315,4 @@ def test_optimum_neuron_cli_cache_synchronize(cache_repos):
     stdout = stdout.decode("utf-8")
     assert p.returncode == 0
     assert f"1 entrie(s) found in cache for {model_id}" in stdout
+    unset_custom_cache_repo_name_in_hf_home()
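
Note for reviewers: below is a minimal usage sketch, not part of the diff, of how the new `inline_weights_to_neff=False` default and the dynamic-batching fallback added in `convert.py` could surface through the Python API. The model id, input shapes, and output path are illustrative assumptions.

# Illustrative sketch only; model id, shapes and output path are assumptions.
from optimum.neuron import NeuronModelForSequenceClassification

# With the new default, exported weights stay separate from the NEFF, so the
# compiled graph can be cached and reused. Requesting dynamic_batch_size=True
# would now fall back to inlining with a warning instead of raising an error.
model = NeuronModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english",
    export=True,
    batch_size=1,
    sequence_length=128,
    dynamic_batch_size=False,
)
model.save_pretrained("sst2_neuron/")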