Fix broken tests (#274)
* Fix tests

* Fixes test_runner

* Skip generate tests as long as #262 has not been merged

* Won't fail if secondary caches don't have write access

* Remove support for old neuron compilation cache naming

* Update test_trainium_common.yml

* Add cleanups to CI

* Experiment with new mixin class

* Remove comment in workflow

* Skipping GPTNeoX test as it is flaky

---------

Co-authored-by: Guillaume LEGENDRE <[email protected]>
michaelbenayoun and glegendre01 authored Nov 14, 2023
1 parent b9a1ef2 commit 50e31a5
Showing 23 changed files with 471 additions and 496 deletions.
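The commit message mentions skipping the generate tests until #262 is merged and skipping a flaky GPTNeoX test. As a rough illustration only (not taken from this diff; the test names and reason strings below are invented), such skips are usually expressed with pytest markers:

import pytest

# Hypothetical test names; the real skips live in the repository's test suite.
@pytest.mark.skip(reason="Skipped until #262 is merged")
def test_generate():
    ...

@pytest.mark.skip(reason="Flaky on Trainium CI")
def test_gptneox_training():
    ...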
68 changes: 5 additions & 63 deletions .github/workflows/test_trainium_common.yml
@@ -16,45 +16,8 @@ concurrency:


jobs:
start-runner:
name: Start self-hosted EC2 runner
runs-on: ubuntu-latest
env:
AWS_REGION: us-east-1
EC2_AMI_ID: ${{ vars.TRAINIUM_AMI_ID }}
EC2_INSTANCE_TYPE: trn1.2xlarge
EC2_SUBNET_ID: subnet-859322b4,subnet-b7533b96,subnet-47cfad21,subnet-a396b2ad,subnet-06576a4b,subnet-df0f6180
EC2_SECURITY_GROUP: sg-0bb210cd3ec725a13
EC2_IAM_ROLE: optimum-ec2-github-actions-role
outputs:
label: ${{ steps.start-ec2-runner.outputs.label }}
ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
steps:
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v1
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: ${{ env.AWS_REGION }}
- name: Start EC2 runner
id: start-ec2-runner
uses: philschmid/philschmid-ec2-github-runner@main
with:
mode: start
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
ec2-image-id: ${{ env.EC2_AMI_ID }}
ec2-instance-type: ${{ env.EC2_INSTANCE_TYPE }}
subnet-id: ${{ env.EC2_SUBNET_ID }}
security-group-id: ${{ env.EC2_SECURITY_GROUP }}
iam-role-name: ${{ env.EC2_IAM_ROLE }}
aws-resource-tags: > # optional, requires additional permissions
[
{"Key": "Name", "Value": "ec2-optimum-github-runner"},
{"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
]
optimum-neuron-tests:
needs: start-runner # required to start the main job when the runner is ready
runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
runs-on: [self-hosted, 1-aws-trn, 8-cpu, ci] # run the job on the newly created runner
env:
AWS_REGION: us-east-1
TESTS_TO_IGNORE_FLAGS: --ignore tests/distributed/ --ignore tests/test_examples.py
@@ -63,35 +26,14 @@ jobs:
uses: actions/checkout@v2
# - name: Install python3.8-venv
# run: sudo apt update; sudo apt install -y python3.8-venv
- name: Setup PATH
run: echo "/home/ubuntu/.local/bin" >> $GITHUB_PATH
- name: Set pip repository pointing to the Neuron repository
run: pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
- name: Install Python dependencies
run: pip install .[tests,neuronx]
- name: Run tests on Neuron cores
run: |
HF_TOKEN_OPTIMUM_NEURON_CI=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} USE_VENV="false" pytest -m "is_trainium_test" $TESTS_TO_IGNORE_FLAGS tests
- name: Run staging tests on Neuron cores
run: HUGGINGFACE_CO_STAGING=1 pytest -m "is_trainium_test and is_staging_test" $TESTS_TO_IGNORE_FLAGS tests
stop-runner:
name: Stop self-hosted EC2 runner
needs:
- start-runner
- optimum-neuron-tests
runs-on: ubuntu-latest
env:
AWS_REGION: us-east-1
if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs
steps:
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v1
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: ${{ env.AWS_REGION }}
- name: Stop EC2 runner
uses: philschmid/philschmid-ec2-github-runner@main
with:
mode: stop
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
label: ${{ needs.start-runner.outputs.label }}
ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
run: HUGGINGFACE_CO_STAGING=1 pytest -m "is_trainium_test and is_staging_test" $TESTS_TO_IGNORE_FLAGS tests -s
67 changes: 5 additions & 62 deletions .github/workflows/test_trainium_distributed.yml
@@ -16,77 +16,20 @@ concurrency:


jobs:
start-runner:
name: Start self-hosted EC2 runner
runs-on: ubuntu-latest
env:
AWS_REGION: us-east-1
EC2_AMI_ID: ${{ vars.TRAINIUM_AMI_ID }}
EC2_INSTANCE_TYPE: trn1.32xlarge
EC2_SUBNET_ID: subnet-859322b4,subnet-b7533b96,subnet-47cfad21,subnet-a396b2ad,subnet-06576a4b,subnet-df0f6180
EC2_SECURITY_GROUP: sg-0bb210cd3ec725a13
EC2_IAM_ROLE: optimum-ec2-github-actions-role
outputs:
label: ${{ steps.start-ec2-runner.outputs.label }}
ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
steps:
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v1
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: ${{ env.AWS_REGION }}
- name: Start EC2 runner
id: start-ec2-runner
uses: philschmid/philschmid-ec2-github-runner@main
with:
mode: start
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
ec2-image-id: ${{ env.EC2_AMI_ID }}
ec2-instance-type: ${{ env.EC2_INSTANCE_TYPE }}
subnet-id: ${{ env.EC2_SUBNET_ID }}
security-group-id: ${{ env.EC2_SECURITY_GROUP }}
iam-role-name: ${{ env.EC2_IAM_ROLE }}
aws-resource-tags: > # optional, requires additional permissions
[
{"Key": "Name", "Value": "ec2-optimum-github-runner"},
{"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
]
optimum-neuron-tests:
needs: start-runner # required to start the main job when the runner is ready
runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
runs-on: [self-hosted, 16-aws-trn, 128-cpu, ci]
env:
AWS_REGION: us-east-1
steps:
- name: Checkout
uses: actions/checkout@v2
- name: Setup PATH
run: echo "/home/ubuntu/.local/bin" >> $GITHUB_PATH
- name: Set pip repository pointing to the Neuron repository
run: pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
- name: Install Python dependencies
run: pip install .[tests,neuronx]
- name: Run tests on Neuron cores
run: |
HF_TOKEN_OPTIMUM_NEURON_CI=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} pytest -m "is_trainium_test" tests/distributed/
stop-runner:
name: Stop self-hosted EC2 runner
needs:
- start-runner
- optimum-neuron-tests
runs-on: ubuntu-latest
env:
AWS_REGION: us-east-1
if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs
steps:
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v1
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: ${{ env.AWS_REGION }}
- name: Stop EC2 runner
uses: philschmid/philschmid-ec2-github-runner@main
with:
mode: stop
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
label: ${{ needs.start-runner.outputs.label }}
ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
38 changes: 5 additions & 33 deletions optimum/neuron/distributed/base.py
@@ -15,7 +15,6 @@
"""Base class related to `neuronx_distributed` to perform parallelism."""

import contextlib
import gc
import shutil
from abc import ABC, abstractclassmethod
from dataclasses import asdict
@@ -534,7 +533,7 @@ def load_model_sharded_checkpoint(cls, model: "PreTrainedModel", load_dir: Union

if not isinstance(load_dir, Path):
load_dir = Path(load_dir)
parallel_layers.load(load_dir / TENSOR_PARALLEL_SHARDS_DIR_NAME, model=model, sharded=True)
parallel_layers.load(load_dir / TENSOR_PARALLEL_SHARDS_DIR_NAME, model_or_optimizer=model, sharded=True)

@classmethod
def load_model_checkpoint(cls, model: "PreTrainedModel", load_dir: Union[str, Path]):
@@ -560,37 +559,10 @@ def load_optimizer_sharded_checkpoint(cls, optimizer: "torch.optim.Optimizer", l
"It is not possible to load a sharded optimizer checkpoint when using ZeRO-1 yet."
)

from neuronx_distributed.parallel_layers import load

if not isinstance(load_dir, Path):
load_dir = Path(load_dir)

import torch_xla.core.xla_model as xm
from neuronx_distributed.parallel_layers.parallel_state import (
get_pipeline_model_parallel_rank,
get_tensor_model_parallel_rank,
get_tensor_model_parallel_size,
load(
load_dir / TENSOR_PARALLEL_SHARDS_DIR_NAME, model_or_optimizer=optimizer, model_key="optimizer_state_dict"
)

world_size = get_tensor_model_parallel_size()
tp_rank = get_tensor_model_parallel_rank()
pp_rank = get_pipeline_model_parallel_rank()

if not (load_dir / TENSOR_PARALLEL_SHARDS_DIR_NAME).is_dir():
raise FileNotFoundError(f"Could not find a sharded checkpoint directory under {load_dir.as_posix()}.")

checkpoint_name = load_dir / TENSOR_PARALLEL_SHARDS_DIR_NAME / f"tp_rank_{tp_rank:02d}_pp_rank{pp_rank:02d}.pt"

device = "xla"
for group in optimizer.param_groups:
for p in group["params"]:
device = p.device
break

for worker_start in range(0, world_size):
if tp_rank == worker_start:
checkpoint = torch.load(checkpoint_name, map_location="cpu")
optimizer_state_dict = checkpoint["optimizer_state_dict"]
xm.send_cpu_data_to_device(optimizer_state_dict, device)
optimizer.load_state_dict(optimizer_state_dict)
del checkpoint
gc.collect()
xm.rendezvous("neuron.load_checkpoint" + str(worker_start))
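Taken together, the hunks above replace the hand-rolled, per-rank torch.load loop with direct calls to neuronx_distributed.parallel_layers.load for both the model and the optimizer. A condensed sketch of the resulting restore path, assuming a value for TENSOR_PARALLEL_SHARDS_DIR_NAME (the real constant is defined in optimum.neuron.distributed):

from pathlib import Path

from neuronx_distributed import parallel_layers

# Assumed value, for illustration only.
TENSOR_PARALLEL_SHARDS_DIR_NAME = "tensor_parallel_shards"

def load_sharded_state(model, optimizer, load_dir):
    # Sketch of the simplified restore path used after this commit.
    shards_dir = Path(load_dir) / TENSOR_PARALLEL_SHARDS_DIR_NAME
    # Model weights: each tensor-parallel rank loads its own shard.
    parallel_layers.load(shards_dir, model_or_optimizer=model, sharded=True)
    # Optimizer state: same entry point, selected through model_key.
    parallel_layers.load(shards_dir, model_or_optimizer=optimizer, model_key="optimizer_state_dict")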
15 changes: 14 additions & 1 deletion optimum/neuron/distributed/decoder_models.py
@@ -169,6 +169,19 @@ def patch_for_sequence_parallelism(cls, model: "PreTrainedModel", sequence_paral
if not sequence_parallel_enabled:
return

def rotate_half(x):
x1 = x[..., : x.shape[-1] // 2]
x2 = x[..., x.shape[-1] // 2 :]
return torch.cat((-x2, x1), dim=-1)

# Remove this function once Transformers >= 4.36.0 is supported.
def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
cos = cos[position_ids].unsqueeze(unsqueeze_dim)
sin = sin[position_ids].unsqueeze(unsqueeze_dim)
q_embed = (q * cos) + (rotate_half(q) * sin)
k_embed = (k * cos) + (rotate_half(k) * sin)
return q_embed, k_embed

def sequence_parallel_forward(
self,
hidden_states: torch.FloatTensor,
@@ -234,7 +247,7 @@ def sequence_parallel_forward(

# Reshape outputs
if sequence_parallel_enabled:
# [batch, seq_len, num_attention_heads, head_size] -> [seq_len, batch, hidden_size]
# [batch, num_attention_heads, seq_len, head_size] -> [seq_len, batch, hidden_size]
attn_output = attn_output.permute(2, 0, 1, 3).contiguous()
attn_output = attn_output.view(*attn_output.shape[:2], -1)
else:
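The corrected comment above matches the actual layout of attn_output before the reshape: [batch, num_attention_heads, seq_len, head_size], permuted to [seq_len, batch, hidden_size] when sequence parallelism is enabled. A small self-contained check of that transformation (shapes are arbitrary):

import torch

batch, num_heads, seq_len, head_size = 2, 4, 8, 16
attn_output = torch.randn(batch, num_heads, seq_len, head_size)

# [batch, num_attention_heads, seq_len, head_size] -> [seq_len, batch, num_heads, head_size]
attn_output = attn_output.permute(2, 0, 1, 3).contiguous()
# Merge the head dimensions: -> [seq_len, batch, hidden_size]
attn_output = attn_output.view(*attn_output.shape[:2], -1)

assert attn_output.shape == (seq_len, batch, num_heads * head_size)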
1 change: 1 addition & 0 deletions optimum/neuron/distributed/utils.py
@@ -528,6 +528,7 @@ def from_pretrained_for_tp(
token=token,
revision=revision,
use_safetensors=use_safetensors,
use_safetensors_in_priority=True,
convert_to_safetensors=True,
**kwargs,
)
45 changes: 12 additions & 33 deletions optimum/neuron/trainer_callback.py
@@ -27,26 +27,25 @@
import torch
from transformers import TrainerCallback, TrainerState

from optimum.neuron.utils.training_utils import is_precompilation

from ..utils import logging
from .utils import is_torch_xla_available
from .utils.cache_utils import (
NEURON_COMPILE_CACHE_NAME,
NeuronHash,
download_cached_model_from_hub,
follows_new_cache_naming_convention,
get_neuron_cache_path,
list_files_in_neuron_cache,
path_after_folder,
push_to_cache_on_hub,
set_neuron_cache_path,
)
from .utils.training_utils import is_precompilation


if TYPE_CHECKING:
from transformers import PreTrainedModel, TrainerControl, TrainingArguments

from .training_args import NeuronTrainingArguments


if is_torch_xla_available():
import torch_xla.core.xla_model as xm
@@ -108,14 +107,13 @@ def __init__(
else:
self.tmp_neuron_cache_path = tmp_neuron_cache

if self.tmp_neuron_cache_path.name != NEURON_COMPILE_CACHE_NAME:
self.tmp_neuron_cache_path = self.tmp_neuron_cache_path / NEURON_COMPILE_CACHE_NAME

self.tmp_neuron_cache_state = list_files_in_neuron_cache(self.tmp_neuron_cache_path, only_relevant_files=True)
self.fetch_files = set()

# Keys are of format:
# (model, input_shapes, data_type, tensor_parallel_size)
self.neuron_hashes: Dict[
Tuple["PreTrainedModel", Tuple[Tuple[str, Tuple[int]], ...], torch.dtype], NeuronHash
Tuple["PreTrainedModel", Tuple[Tuple[str, Tuple[int]], ...], torch.dtype, int], NeuronHash
] = {}
self.neuron_hash_to_files: Dict[NeuronHash, List[Path]] = defaultdict(list)

@@ -169,14 +167,8 @@ def create_temporary_neuron_cache(cls, neuron_cache_path: Optional[Path]) -> Tem
else:
neuron_cache_files = []

if follows_new_cache_naming_convention():
tmp_neuron_cache_path = tmp_neuron_cache_path / NEURON_COMPILE_CACHE_NAME
set_neuron_cache_path(tmp_neuron_cache_path)
else:
set_neuron_cache_path(tmp_neuron_cache_path)
tmp_neuron_cache_path = tmp_neuron_cache_path / NEURON_COMPILE_CACHE_NAME

tmp_neuron_cache_path.mkdir()
# Setting the Neuron compilation cache to be the temporary Neuron compilation cache.
set_neuron_cache_path(tmp_neuron_cache_path)

cache_stats_exists = False
if neuron_cache_path is not None:
@@ -188,8 +180,6 @@ def create_temporary_neuron_cache(cls, neuron_cache_path: Optional[Path]) -> Tem
if cache_file.name == "cache_stats.json":
continue
path_in_neuron_cache = path_after_folder(cache_file, neuron_cache_path.name)
if NEURON_COMPILE_CACHE_NAME in path_in_neuron_cache.parts:
path_in_neuron_cache = path_after_folder(path_in_neuron_cache, NEURON_COMPILE_CACHE_NAME)
tmp_cache_file = tmp_neuron_cache_path / path_in_neuron_cache
tmp_cache_file.parent.mkdir(parents=True, exist_ok=True)
# TODO: investigate why it is needed. Minor issue.
@@ -206,7 +196,7 @@ def create_temporary_neuron_cache(cls, neuron_cache_path: Optional[Path]) -> Tem

def neuron_hash_for_model(
self,
args: "TrainingArguments",
args: "NeuronTrainingArguments",
model: "PreTrainedModel",
inputs: Dict[str, Any],
try_to_fetch_cached_model: bool = False,
@@ -240,17 +230,13 @@ def full_path_to_path_in_temporary_cache(self, path: Path):
def try_to_fetch_cached_model(self, neuron_hash: NeuronHash) -> bool:
# TODO: needs to be called ONLY when absolutely needed.
files_before_fetching = list_files_in_neuron_cache(self.tmp_neuron_cache_path, only_relevant_files=True)
cache_path = neuron_hash.cache_path

def path_in_repo_to_path_in_target_directory(path):
# The last part of cache_path is the overall hash.
return Path(neuron_hash.neuron_compiler_version_dir_name) / path_after_folder(path, cache_path.name)

found_in_cache = download_cached_model_from_hub(
neuron_hash,
target_directory=self.tmp_neuron_cache_path,
path_in_repo_to_path_in_target_directory=path_in_repo_to_path_in_target_directory,
path_in_repo_to_path_in_target_directory="default",
)

if found_in_cache:
files_after_fetching = list_files_in_neuron_cache(self.tmp_neuron_cache_path, only_relevant_files=True)
diff = [f for f in files_after_fetching if f not in files_before_fetching]
@@ -277,15 +263,8 @@ def synchronize_temporary_neuron_cache_state(self) -> List[Path]:

def synchronize_temporary_neuron_cache(self):
for neuron_hash, files in self.neuron_hash_to_files.items():

def local_path_to_path_in_repo(path):
if follows_new_cache_naming_convention():
return path_after_folder(path, f"neuronxcc-{neuron_hash.neuron_compiler_version}")
else:
return path_after_folder(path, f"USER_neuroncc-{neuron_hash.neuron_compiler_version}")

for path in files:
push_to_cache_on_hub(neuron_hash, path, local_path_to_path_in_repo=local_path_to_path_in_repo)
push_to_cache_on_hub(neuron_hash, path, local_path_to_path_in_repo="default")
if self.use_neuron_cache:
path_in_cache = self.full_path_to_path_in_temporary_cache(path)
target_file = self.neuron_cache_path / path_in_cache
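The __init__ hunk above widens the neuron_hashes annotation to include the tensor parallel size, matching the comment that keys have the form (model, input_shapes, data_type, tensor_parallel_size). An illustrative key with made-up values (DummyModel stands in for a transformers PreTrainedModel here):

import torch

class DummyModel:
    # Stands in for a transformers PreTrainedModel in this illustration.
    pass

model = DummyModel()

# (model, input_shapes, data_type, tensor_parallel_size)
cache_key = (
    model,
    (("input_ids", (1, 512)), ("attention_mask", (1, 512))),  # per-input shapes
    torch.bfloat16,                                            # compilation data type
    2,                                                         # tensor parallel size, the new component
)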
(Diffs for the remaining 17 changed files are not shown.)
