huggingface · JingyaHuang · May 7, 2024 · May 7, 2024 · May 7, 2024
diff --git a/optimum/exporters/neuron/__main__.py b/optimum/exporters/neuron/__main__.py
@@ -499,7 +499,7 @@ def main_export(
     cache_dir: Optional[str] = None,
     disable_neuron_cache: Optional[bool] = False,
     compiler_workdir: Optional[Union[str, Path]] = None,
-    inline_weights_to_neff: bool = False,
+    inline_weights_to_neff: bool = True,
     optlevel: str = "2",
     trust_remote_code: bool = False,
     subfolder: str = "",

diff --git a/optimum/exporters/neuron/convert.py b/optimum/exporters/neuron/convert.py
@@ -465,7 +465,7 @@ def export_neuronx(
     config: "NeuronDefaultConfig",
     output: Path,
     compiler_workdir: Optional[Path] = None,
-    inline_weights_to_neff: bool = False,
+    inline_weights_to_neff: bool = True,
     optlevel: str = "2",
     auto_cast: Optional[str] = None,
     auto_cast_type: str = "bf16",
@@ -482,7 +482,7 @@ def export_neuronx(
             Directory to store the exported Neuron model.
         compiler_workdir (`Optional[Path]`, defaults to `None`):
             The directory used by neuronx-cc, where you can find intermediary outputs (neff, weight, hlo...).
-        inline_weights_to_neff (`bool`, defaults to `False`):
+        inline_weights_to_neff (`bool`, defaults to `True`):
             Whether to inline the weights to the neff graph. If set to False, weights will be separated from the neff.
         optlevel (`str`, defaults to `"2"`):
             The level of optimization the compiler should perform. Can be `"1"`, `"2"` or `"3"`, defaults to "2".
@@ -610,7 +610,7 @@ def export_neuron(
     config: "NeuronDefaultConfig",
     output: Path,
     compiler_workdir: Optional[Path] = None,
-    inline_weights_to_neff: bool = False,
+    inline_weights_to_neff: bool = True,
     auto_cast: Optional[str] = None,
     auto_cast_type: str = "bf16",
     disable_fast_relayout: bool = False,
@@ -628,7 +628,7 @@ def export_neuron(
             Directory to store the exported Neuron model.
         compiler_workdir (`Optional[Path]`, defaults to `None`):
             The directory used by neuron-cc, where you can find intermediary outputs (neff, weight, hlo...).
-        inline_weights_to_neff (`bool`, defaults to `False`):
+        inline_weights_to_neff (`bool`, defaults to `True`):
             Whether to inline the weights to the neff graph. If set to False, weights will be separated from the neff.
         auto_cast (`Optional[str]`, defaults to `None`):
             Whether to cast operations from FP32 to lower precision to speed up the inference. Can be `None`, `"matmul"` or `"all"`, you should use `None` to disable any auto-casting, use `"matmul"` to cast FP32 matrix multiplication operations, and use `"all"` to cast all FP32 operations.

diff --git a/optimum/neuron/modeling_base.py b/optimum/neuron/modeling_base.py
@@ -238,7 +238,7 @@ def _export(
         cache_dir: Optional[str] = None,
         compiler_workdir: Optional[Union[str, Path]] = None,
         disable_neuron_cache: bool = False,
-        inline_weights_to_neff: bool = False,
+        inline_weights_to_neff: bool = True,
         optlevel: str = "2",
         subfolder: str = "",
         local_files_only: bool = False,

diff --git a/optimum/neuron/modeling_diffusion.py b/optimum/neuron/modeling_diffusion.py
@@ -577,7 +577,7 @@ def _export(
         cache_dir: Optional[str] = None,
         compiler_workdir: Optional[str] = None,
         disable_neuron_cache: bool = False,
-        inline_weights_to_neff: bool = False,
+        inline_weights_to_neff: bool = True,
         optlevel: str = "2",
         subfolder: str = "",
         local_files_only: bool = False,
@@ -623,7 +623,7 @@ def _export(
                 Path to a directory in which the neuron compiler will store all intermediary files during the compilation(neff, weight, hlo graph...).
             disable_neuron_cache (`bool`, defaults to `False`):
                 Whether to disable automatic caching of compiled models. If set to True, will not load neuron cache nor cache the compiled artifacts.
-            inline_weights_to_neff (`bool`, defaults to `False`):
+            inline_weights_to_neff (`bool`, defaults to `True`):
                 Whether to inline the weights to the neff graph. If set to False, weights will be separated from the neff.
             optlevel (`str`, defaults to `"2"`):
                 The level of optimization the compiler should perform. Can be `"1"`, `"2"` or `"3"`, defaults to "2".