huggingface · JingyaHuang · Apr 12, 2024 · Mar 4, 2024 · Mar 4, 2024 · Apr 8, 2024
diff --git a/.github/workflows/test_inf1_export.yml b/.github/workflows/test_inf1_export.yml
@@ -29,7 +29,7 @@ jobs:
         uses: actions/checkout@v2
       - name: Install system packages
         run: |
-          sudo apt install python3.8-venv -y
+          sudo apt install python3.8-venv python3-dev -y
       - name: Install python packages
         run: |
           python3 -m venv aws_neuron_venv_pytorch

diff --git a/.github/workflows/test_inf1_full_export.yml b/.github/workflows/test_inf1_full_export.yml
@@ -27,7 +27,7 @@ jobs:
         uses: actions/checkout@v2
       - name: Install system packages
         run: |
-          sudo apt install python3.8-venv -y
+          sudo apt install python3.8-venv python3-dev -y
       - name: Install python packages
         run: |
           python3 -m venv aws_neuron_venv_pytorch

diff --git a/.github/workflows/test_inf1_inference.yml b/.github/workflows/test_inf1_inference.yml
@@ -29,7 +29,7 @@ jobs:
         uses: actions/checkout@v2
       - name: Install system packages
         run: |
-          sudo apt install python3.8-venv -y
+          sudo apt install python3.8-venv python3-dev -y
       - name: Install python packages
         run: |
           python3 -m venv aws_neuron_venv_pytorch

diff --git a/.github/workflows/test_inf1_pipelines.yml b/.github/workflows/test_inf1_pipelines.yml
@@ -27,7 +27,7 @@ jobs:
         uses: actions/checkout@v2
       - name: Install system packages
         run: |
-          sudo apt install python3.8-venv -y
+          sudo apt install python3.8-venv python3-dev -y
       - name: Install python packages
         run: |
           python3 -m venv aws_neuron_venv_pytorch

diff --git a/.github/workflows/test_inf2.yml b/.github/workflows/test_inf2.yml
@@ -37,7 +37,7 @@ jobs:
         uses: actions/checkout@v2
       - name: Install python dependencies
         run: |
-          sudo apt install python3.8-venv -y
+          sudo apt install python3.8-venv python3-dev -y
           python3 -m venv aws_neuron_venv_pytorch
           source aws_neuron_venv_pytorch/bin/activate
           python -m pip install -U pip

diff --git a/.github/workflows/test_inf2_export.yml b/.github/workflows/test_inf2_export.yml
@@ -37,7 +37,7 @@ jobs:
         uses: actions/checkout@v2
       - name: Install python dependencies
         run: |
-          sudo apt install python3.8-venv -y
+          sudo apt install python3.8-venv python3-dev -y
           python3 -m venv aws_neuron_venv_pytorch
           source aws_neuron_venv_pytorch/bin/activate
           python -m pip install -U pip

diff --git a/.github/workflows/test_inf2_full_export.yml b/.github/workflows/test_inf2_full_export.yml
@@ -35,7 +35,7 @@ jobs:
         uses: actions/checkout@v2
       - name: Install python dependencies
         run: |
-          sudo apt install python3.8-venv -y
+          sudo apt install python3.8-venv python3-dev -y
           python3 -m venv aws_neuron_venv_pytorch
           source aws_neuron_venv_pytorch/bin/activate
           python -m pip install -U pip

diff --git a/.github/workflows/test_inf2_inference.yml b/.github/workflows/test_inf2_inference.yml
@@ -37,7 +37,7 @@ jobs:
         uses: actions/checkout@v2
       - name: Install python dependencies
         run: |
-          sudo apt install python3.8-venv -y
+          sudo apt install python3.8-venv python3-dev -y
           python3 -m venv aws_neuron_venv_pytorch
           source aws_neuron_venv_pytorch/bin/activate
           python -m pip install -U pip

diff --git a/.github/workflows/test_inf2_tgi.yml b/.github/workflows/test_inf2_tgi.yml
@@ -39,7 +39,7 @@ jobs:
         uses: actions/checkout@v2
       - name: Install python and create venv
         run: |
-          sudo apt install python3.8-venv -y
+          sudo apt install python3.8-venv python3-dev -y
           python3 -m venv aws_neuron_venv_pytorch
           source aws_neuron_venv_pytorch/bin/activate
           python -m pip install -U pip

diff --git a/benchmark/text-generation-inference/mistral-7b/tgi-results.csv b/benchmark/text-generation-inference/mistral-7b/tgi-results.csv
@@ -1,11 +1,11 @@
 model_id,concurrent requests,throughput (t/s),Time-to-first-token @ P50 (s),average latency (ms)
-huggingface/mistralai/Mistral-7B-Instruct-v0.2,1,34.662810045679024,0.46342812800048705,27.74296394585929
+huggingface/mistralai/Mistral-7B-Instruct-v0.2,1,34.87827703823185,0.4793029465017753,27.654747289616235
 huggingface/mistralai/Mistral-7B-Instruct-v0.2,2,67.55520390730916,0.46188541100036673,27.32067234909958
-huggingface/mistralai/Mistral-7B-Instruct-v0.2,4,115.9644253080536,0.4719622849997904,29.599952973112146
-huggingface/mistralai/Mistral-7B-Instruct-v0.2,8,177.15609277817416,0.51119948700034,33.335737027419185
-huggingface/mistralai/Mistral-7B-Instruct-v0.2,16,156.52392957214906,0.9595348704997377,86.39206521348669
-huggingface/mistralai/Mistral-7B-Instruct-v0.2,32,247.29299604071295,2.5056241824995595,100.72862078096863
-huggingface/mistralai/Mistral-7B-Instruct-v0.2,64,384.5781500641263,4.886728052500075,108.16498200178273
-huggingface/mistralai/Mistral-7B-Instruct-v0.2,128,560.878982504929,10.410015015499994,130.6066071497773
-huggingface/mistralai/Mistral-7B-Instruct-v0.2,256,623.9707062587075,23.141914837000513,190.67140038075857
-huggingface/mistralai/Mistral-7B-Instruct-v0.2,512,572.8680705363325,41.84460775000116,283.4274198954966
+huggingface/mistralai/Mistral-7B-Instruct-v0.2,4,120.48139377787439,0.533387835999747,29.776895463051282
+huggingface/mistralai/Mistral-7B-Instruct-v0.2,8,182.33681081540968,0.589324303500689,34.503086370812504
+huggingface/mistralai/Mistral-7B-Instruct-v0.2,16,298.4798999555292,1.0481106424995232,41.59342073600634
+huggingface/mistralai/Mistral-7B-Instruct-v0.2,32,362.1868809824997,2.0948955119993116,68.46259462377448
+huggingface/mistralai/Mistral-7B-Instruct-v0.2,64,470.67410898967245,4.491813536500558,91.98977897460762
+huggingface/mistralai/Mistral-7B-Instruct-v0.2,128,652.4156296736516,9.770283270499931,117.43685839085013
+huggingface/mistralai/Mistral-7B-Instruct-v0.2,256,712.5097315120686,20.532419881998067,170.33580425005005
+huggingface/mistralai/Mistral-7B-Instruct-v0.2,512,663.244139330743,34.291523927000526,240.47153416154381
diff --git a/benchmark/text-generation-inference/tgi_live_metrics.py b/benchmark/text-generation-inference/tgi_live_metrics.py
@@ -1,4 +1,3 @@
-
 import requests
 from prometheus_client.parser import text_string_to_metric_families
 

diff --git a/optimum/exporters/neuron/convert.py b/optimum/exporters/neuron/convert.py
@@ -342,7 +342,6 @@ def export_models(
         output_path.parent.mkdir(parents=True, exist_ok=True)
 
         try:
-
             # TODO: Remove after the weights/neff separation compilation of sdxl is patched by a neuron sdk release: https://github.com/aws-neuron/aws-neuron-sdk/issues/859
             if not inline_weights_to_neff and getattr(sub_neuron_config, "is_sdxl", False):
                 logger.warning(

diff --git a/optimum/neuron/accelerate/accelerator.py b/optimum/neuron/accelerate/accelerator.py
@@ -57,7 +57,7 @@
     patch_accelerate_is_tpu_available,
     tie_parameters,
 )
-from .utils.misc import create_patched_finfo
+from .utils.misc import apply_activation_checkpointing, create_patched_finfo
 from .utils.operations import _xla_gather
 
 
@@ -418,13 +418,24 @@ def prepare_model(
         model.config.output_attentions = False
         model.config.output_hidden_states = False
 
+        # Needed for now: with PyTorch 2.x available, SDPA would be picked unless it is explicitly disabled on each module.
+        for module in model.modules():
+            if getattr(module, "_use_sdpa", False):
+                module._use_sdpa = False
+            if getattr(module, "_use_flash_attention_2", False):
+                module._use_flash_attention_2 = False
+
         if self.distributed_type is NeuronDistributedType.MODEL_PARALLELISM:
-            return self._prepare_model_for_mp(
+            model = self._prepare_model_for_mp(
                 model, device_placement=device_placement, evaluation_mode=evaluation_mode
             )
-        move_model_to_device(model, xm.xla_device())
-        device_placement = False
-        return super().prepare_model(model, device_placement=device_placement, evaluation_mode=evaluation_mode)
+            apply_activation_checkpointing(model)
+            return model
+        else:
+            apply_activation_checkpointing(model)
+            move_model_to_device(model, xm.xla_device())
+            device_placement = False
+            return super().prepare_model(model, device_placement=device_placement, evaluation_mode=evaluation_mode)
 
     def backward(self, loss, **kwargs):
         if self.distributed_type != DistributedType.DEEPSPEED:

diff --git a/optimum/neuron/accelerate/utils/misc.py b/optimum/neuron/accelerate/utils/misc.py
@@ -25,6 +25,8 @@
 
 
 if TYPE_CHECKING:
+    from transformers import PreTrainedModel
+
     if is_torch_neuronx_available():
         from neuronx_distributed.pipeline import NxDPPModel
 
@@ -44,7 +46,6 @@ def patch_accelerate_is_tpu_available():
 
 
 def create_patched_finfo(xla_downcast_bf16: bool = False, use_amp: bool = False, xla_use_bf16: bool = False):
-
     def patched_finfo(dtype):
         if xla_downcast_bf16 or use_amp or xla_use_bf16:
             return _ORIG_TORCH_FINFO(torch.bfloat16)
@@ -108,3 +109,48 @@ def tie_parameters(model: Union["torch.nn.Module", "NxDPPModel"], tied_parameter
         if param_to_tie is not param:
             del param_to_tie
             setattr(param_to_tie_parent_module, param_to_tie_name[1], param)
+
+
+@requires_neuronx_distributed
+def apply_activation_checkpointing(model: Union["PreTrainedModel", "NxDPPModel"]):
+    """
+    Applies `neuronx_distributed` activation checkpointing to the modules of `model` that have gradient
+    checkpointing enabled.
+
+    Every module whose `gradient_checkpointing` attribute is truthy gets that attribute reset to `False` and is
+    then selected, through a `check_fn`, for `neuronx_distributed.utils.activation_checkpoint`'s
+    `apply_activation_checkpointing`. If no module has the flag set, the function does nothing.
+
+    Args:
+        model (`Union[PreTrainedModel, NxDPPModel]`):
+            The model to process. For an `NxDPPModel`, the modules inspected are those of `model.local_module`.
+    """
+    # `NxDPPModel` is only imported under `TYPE_CHECKING` at module level, so it is imported here for the
+    # runtime `isinstance` check below.
+    from neuronx_distributed.pipeline import NxDPPModel
+    from neuronx_distributed.utils.activation_checkpoint import (
+        apply_activation_checkpointing as nxd_apply_activation_checkpointing,
+    )
+
+    # An `NxDPPModel` is inspected through its `local_module`; any other model is walked directly.
+    if isinstance(model, NxDPPModel):
+        modules = model.local_module.modules()
+    else:
+        modules = model.modules()
+
+    # Collect the modules flagged for gradient checkpointing, clearing the flag on each of them.
+    # NOTE(review): presumably the flag is cleared so that the module's own checkpointing path stays off once
+    # the neuronx_distributed wrapper is applied - confirm.
+    gradient_checkpointing_modules = set()
+    for module in modules:
+        if getattr(module, "gradient_checkpointing", False):
+            module.gradient_checkpointing = False
+            gradient_checkpointing_modules.add(module)
+
+    # Selects only the modules collected above.
+    def check_fn(m: torch.nn.Module) -> bool:
+        return m in gradient_checkpointing_modules
+
+    # Nothing to wrap when no module had gradient checkpointing enabled.
+    if gradient_checkpointing_modules:
+        nxd_apply_activation_checkpointing(model, check_fn=check_fn)
diff --git a/optimum/neuron/distributed/base.py b/optimum/neuron/distributed/base.py
@@ -44,7 +44,6 @@
     OptimumNeuronFXTracer,
     ParameterMetadata,
     WeightInformation,
-    apply_activation_checkpointing,
     get_linear_weight_info,
     get_output_projection_qualified_names_after_qga_qkv_replacement,
     get_parameter_names_mapping_after_gqa_qkv_replacement,
@@ -738,8 +737,6 @@ def should_parallelize_layer_predicate_func(layer):
                     use_zero1_optimizer=pipeline_parallel_use_zero1_optimizer,
                     tracer_cls=OptimumNeuronFXTracer,
                 )
-                if pipeline_parallel_gradient_checkpointing_enabled:
-                    apply_activation_checkpointing(model)
 
             xm.rendezvous("End of pipeline paralellism")
             if is_main_worker():

diff --git a/optimum/neuron/distributed/utils.py b/optimum/neuron/distributed/utils.py
@@ -44,7 +44,6 @@
 if is_neuronx_distributed_available():
     from neuronx_distributed.modules.qkv_linear import GQAQKVColumnParallelLinear
     from neuronx_distributed.parallel_layers import layers
-    from neuronx_distributed.pipeline import NxDPPModel
     from neuronx_distributed.pipeline.trace import HFTracerWrapper
 else:
 
@@ -1098,34 +1097,6 @@ def parameter_can_be_initialized(model: torch.nn.Module, parent_module: torch.nn
     )
 
 
-@requires_neuronx_distributed
-def apply_activation_checkpointing(
-    model: Union[torch.nn.Module, "NxDPPModel"],
-    activation_checkpoint_classes: Optional[Union[Tuple[Type[torch.nn.Module]], List[Type[torch.nn.Module]]]] = None,
-):
-    from neuronx_distributed.pipeline import NxDPPModel
-    from neuronx_distributed.utils.activation_checkpoint import apply_activation_checkpointing
-
-    if isinstance(model, NxDPPModel):
-        if activation_checkpoint_classes is not None:
-            logger.warning(
-                "Cannot specify activation checkpoint classes under pipeline parallism setting. Will use the layers "
-                f"{model.transformer_layer_cls}"
-            )
-    else:
-        # TODO support this as well.
-        raise ValueError("Not supported yet outside of the pipeline parallelism scheme.")
-
-    check_fn = None
-    if activation_checkpoint_classes is not None:
-        activation_checkpoint_classes = tuple(activation_checkpoint_classes)
-        assert len(activation_checkpoint_classes) > 0
-        assert all(issubclass(c, torch.nn.Module) for c in activation_checkpoint_classes)
-        check_fn = (lambda m: isinstance(m, activation_checkpoint_classes),)
-
-    apply_activation_checkpointing(model, check_fn=check_fn)
-
-
 @classmethod
 @requires_torch_xla
 def from_pretrained_for_mp(

diff --git a/optimum/neuron/modeling_base.py b/optimum/neuron/modeling_base.py
@@ -50,12 +50,10 @@
     from ..exporters.neuron import NeuronDefaultConfig
 
 if is_neuron_available():
-
     NEURON_COMPILER_TYPE = "neuron-cc"
     NEURON_COMPILER_VERSION = get_neuroncc_version()
 
 if is_neuronx_available():
-
     NEURON_COMPILER_TYPE = "neuronx-cc"
     NEURON_COMPILER_VERSION = get_neuronxcc_version()
 

diff --git a/optimum/neuron/utils/misc.py b/optimum/neuron/utils/misc.py
@@ -573,6 +573,7 @@ def replace_weights(
     """
     Replaces the weights in a Neuron Model with weights from another model, the original neuron model should have separated weights(by setting `inline_weights_to_neff=Talse` during the tracing).
     """
+
     if isinstance(weights, torch.nn.Module):
         weights = weights.state_dict()
 

diff --git a/setup.py b/setup.py
@@ -56,10 +56,10 @@
     "neuronx": [
         "wheel",
         "neuronx-cc==2.13.66.0",
-        "torch-neuronx==1.13.1.1.14.0",
+        "torch-neuronx==2.1.2.2.1.0",
         "transformers-neuronx==0.10.0.21",
-        "torch==1.13.1.*",
-        "torchvision==0.14.*",
+        "torch==2.1.2.*",
+        "torchvision==0.16.*",
         "neuronx_distributed==0.7.0",
     ],
     "diffusers": ["diffusers ~= 0.26.1", "peft"],