Fix test
michaelbenayoun committed Mar 20, 2024
1 parent b3d10e1 commit c47e7a2
Showing 4 changed files with 9 additions and 6 deletions.
2 changes: 2 additions & 0 deletions optimum/neuron/accelerate/accelerator.py
@@ -495,6 +495,8 @@ def prepare_model(

        # We do not want to use the cache here as it would imply more communication than we need.
        model.config.use_cache = False
+        model.config.output_attentions = False
+        model.config.output_hidden_states = False

        if self.distributed_type is NeuronDistributedType.XLA_FSDP:
            return self.prepare_model_for_xla_fsdp(
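
For context, the flags set above are plain `transformers` configuration attributes. A minimal sketch of their effect, assuming any `PreTrainedModel` (the checkpoint name below is only an illustrative choice, not part of this commit):

```python
from transformers import AutoModelForCausalLM

# Any transformers model works here; "gpt2" is just an illustrative choice.
model = AutoModelForCausalLM.from_pretrained("gpt2")

# Turning these off stops every forward pass from returning the KV cache,
# attention maps, and per-layer hidden states, i.e. extra tensors that would
# have to be materialized (and communicated in a distributed setting).
model.config.use_cache = False
model.config.output_attentions = False
model.config.output_hidden_states = False
```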
8 changes: 2 additions & 6 deletions optimum/neuron/distributed/base.py
@@ -24,7 +24,7 @@
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Mapping, Optional, Set, Tuple, Type, Union

import torch
-from transformers import PreTrainedModel
+from transformers import PreTrainedModel, PretrainedConfig
from transformers.utils import WEIGHTS_NAME

from ...utils import logging
@@ -588,11 +588,6 @@ def should_parallelize_layer_predicate_func(layer):
            names = {parameter_to_name[p] for p in layer.parameters()}
            return names < names_of_the_parameters_to_consider

-        # It solves some compilation issues.
-        # Investigate if using the cache becomes needed.
-        # Note: it is mandatory to set it to False when using pipeline parallelism.
-        model.config.use_cache = False
-
        if tp_size > 1:
            # TODO: remove that once it is solved on the `neuronx_distributed` side.
            try:
@@ -688,6 +683,7 @@ def should_parallelize_layer_predicate_func(layer):
        if not cls.supports_pipeline_parallelism():
            raise NotImplementedError(f"{cls} does not support pipeline parallelism.")

+        model.config.use_cache = False
        model.config.return_dict = False
        model.config.output_attentions = False
        model.config.output_hidden_states = False
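
The pipeline-parallel path above now forces all four flags off, since the blanket `use_cache = False` assignment was removed from the generic code path. A hedged illustration of why this matters for pipeline parallelism: with `return_dict=False` a `transformers` forward pass returns a plain tuple, and with caching, attentions, and hidden states disabled that tuple carries only the logits, which is the simplest form to hand from one pipeline stage to the next. The checkpoint below is again only an example:

```python
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("gpt2")  # illustrative checkpoint
model.config.use_cache = False
model.config.return_dict = False
model.config.output_attentions = False
model.config.output_hidden_states = False

# With return_dict=False the output is a tuple rather than a ModelOutput,
# and with the other flags off it contains only the logits tensor.
outputs = model(input_ids=torch.tensor([[1, 2, 3]]))
print(type(outputs), len(outputs))  # <class 'tuple'> 1
```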
3 changes: 3 additions & 0 deletions tests/distributed/test_model_parallelization.py
@@ -299,6 +299,9 @@ def _parallel_model_matches_original_model(
        )
        orig_model = NeuronAccelerator.patch_model_for_neuron(orig_model)

+        # TODO: enable this again once it is working; it seems to be an AWS issue.
+        orig_model.config.use_cache = False
+
        set_neuron_cc_optlevel_for_model(orig_model)

        move_model_to_device(orig_model, xm.xla_device())
2 changes: 2 additions & 0 deletions tests/test_cache_utils.py
@@ -17,6 +17,7 @@
import json
import logging
import os
+import pytest
import random
from dataclasses import FrozenInstanceError
from pathlib import Path
@@ -483,6 +484,7 @@ def test_neuron_hash_is_private(self):

@is_trainium_test
@is_staging_test
+@pytest.mark.skip("This is not needed anymore and will be removed.")
class CachedModelOnTheHubTestCase(StagingTestMixin, TestCase):
    def test_push_to_hub_fails_with_private_model_and_public_repo(self):
        with TemporaryDirectory() as tmpdirname:
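
The last hunk deactivates the whole staging test class. As a small illustration of the mechanism (the class and test names below are hypothetical stand-ins, not the real test suite): `pytest.mark.skip` applied at class level marks every test method of a `unittest.TestCase` subclass as skipped at collection time, so none of their bodies run.

```python
import pytest
from unittest import TestCase


@pytest.mark.skip(reason="This is not needed anymore and will be removed.")
class CachedModelOnTheHubExample(TestCase):  # hypothetical stand-in class
    def test_never_runs(self):
        # Reported as "skipped" by pytest; this body is never executed.
        self.fail("should not execute")
```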
