Fix broken tests (#274)
* Fix tests

* Fixes test_runner

* Skip generate tests as long as #262 has not been merged

* Won't fail if secondary caches don't have write access

* Remove support for old neuron compilation cache naming

* Update test_trainium_common.yml

* Add cleanups to CI

* Experiment with new mixin class

* Remove comment in workflow

* Skipping GPTNeoX test as it is flaky

---------

Co-authored-by: Guillaume LEGENDRE <[email protected]>
michaelbenayoun and glegendre01 authored Nov 14, 2023
1 parent b9a1ef2 commit 50e31a5
Showing 23 changed files with 471 additions and 496 deletions.
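The commit message mentions skipping the generate tests until #262 is merged and skipping a flaky GPTNeoX test. As a rough illustration only (not taken from this diff; the test names and reason strings below are invented), such skips are usually expressed with pytest markers:

import pytest

# Hypothetical test names; the real skips live in the repository's test suite.
@pytest.mark.skip(reason="Skipped until #262 is merged")
def test_generate():
    ...

@pytest.mark.skip(reason="Flaky on Trainium CI")
def test_gptneox_training():
    ...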
68 changes: 5 additions & 63 deletions .github/workflows/test_trainium_common.yml
@@ -16,45 +16,8 @@ concurrency:


jobs:
start-runner:
name: Start self-hosted EC2 runner
runs-on: ubuntu-latest
env:
AWS_REGION: us-east-1
EC2_AMI_ID: ${{ vars.TRAINIUM_AMI_ID }}
EC2_INSTANCE_TYPE: trn1.2xlarge
EC2_SUBNET_ID: subnet-859322b4,subnet-b7533b96,subnet-47cfad21,subnet-a396b2ad,subnet-06576a4b,subnet-df0f6180
EC2_SECURITY_GROUP: sg-0bb210cd3ec725a13
EC2_IAM_ROLE: optimum-ec2-github-actions-role
outputs:
label: ${{ steps.start-ec2-runner.outputs.label }}
ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
steps:
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v1
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: ${{ env.AWS_REGION }}
- name: Start EC2 runner
id: start-ec2-runner
uses: philschmid/philschmid-ec2-github-runner@main
with:
mode: start
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
ec2-image-id: ${{ env.EC2_AMI_ID }}
ec2-instance-type: ${{ env.EC2_INSTANCE_TYPE }}
subnet-id: ${{ env.EC2_SUBNET_ID }}
security-group-id: ${{ env.EC2_SECURITY_GROUP }}
iam-role-name: ${{ env.EC2_IAM_ROLE }}
aws-resource-tags: > # optional, requires additional permissions
[
{"Key": "Name", "Value": "ec2-optimum-github-runner"},
{"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
]
optimum-neuron-tests:
needs: start-runner # required to start the main job when the runner is ready
runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
runs-on: [self-hosted, 1-aws-trn, 8-cpu, ci] # run the job on the newly created runner
env:
AWS_REGION: us-east-1
TESTS_TO_IGNORE_FLAGS: --ignore tests/distributed/ --ignore tests/test_examples.py
@@ -63,35 +26,14 @@ jobs:
uses: actions/checkout@v2
# - name: Install python3.8-venv
# run: sudo apt update; sudo apt install -y python3.8-venv
- name: Setup PATH
run: echo "/home/ubuntu/.local/bin" >> $GITHUB_PATH
- name: Set pip repository pointing to the Neuron repository
run: pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
- name: Install Python dependencies
run: pip install .[tests,neuronx]
- name: Run tests on Neuron cores
run: |
HF_TOKEN_OPTIMUM_NEURON_CI=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} USE_VENV="false" pytest -m "is_trainium_test" $TESTS_TO_IGNORE_FLAGS tests
- name: Run staging tests on Neuron cores
run: HUGGINGFACE_CO_STAGING=1 pytest -m "is_trainium_test and is_staging_test" $TESTS_TO_IGNORE_FLAGS tests
stop-runner:
name: Stop self-hosted EC2 runner
needs:
- start-runner
- optimum-neuron-tests
runs-on: ubuntu-latest
env:
AWS_REGION: us-east-1
if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs
steps:
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v1
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: ${{ env.AWS_REGION }}
- name: Stop EC2 runner
uses: philschmid/philschmid-ec2-github-runner@main
with:
mode: stop
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
label: ${{ needs.start-runner.outputs.label }}
ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
run: HUGGINGFACE_CO_STAGING=1 pytest -m "is_trainium_test and is_staging_test" $TESTS_TO_IGNORE_FLAGS tests -s
67 changes: 5 additions & 62 deletions .github/workflows/test_trainium_distributed.yml
@@ -16,77 +16,20 @@ concurrency:


jobs:
start-runner:
name: Start self-hosted EC2 runner
runs-on: ubuntu-latest
env:
AWS_REGION: us-east-1
EC2_AMI_ID: ${{ vars.TRAINIUM_AMI_ID }}
EC2_INSTANCE_TYPE: trn1.32xlarge
EC2_SUBNET_ID: subnet-859322b4,subnet-b7533b96,subnet-47cfad21,subnet-a396b2ad,subnet-06576a4b,subnet-df0f6180
EC2_SECURITY_GROUP: sg-0bb210cd3ec725a13
EC2_IAM_ROLE: optimum-ec2-github-actions-role
outputs:
label: ${{ steps.start-ec2-runner.outputs.label }}
ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
steps:
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v1
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: ${{ env.AWS_REGION }}
- name: Start EC2 runner
id: start-ec2-runner
uses: philschmid/philschmid-ec2-github-runner@main
with:
mode: start
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
ec2-image-id: ${{ env.EC2_AMI_ID }}
ec2-instance-type: ${{ env.EC2_INSTANCE_TYPE }}
subnet-id: ${{ env.EC2_SUBNET_ID }}
security-group-id: ${{ env.EC2_SECURITY_GROUP }}
iam-role-name: ${{ env.EC2_IAM_ROLE }}
aws-resource-tags: > # optional, requires additional permissions
[
{"Key": "Name", "Value": "ec2-optimum-github-runner"},
{"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
]
optimum-neuron-tests:
needs: start-runner # required to start the main job when the runner is ready
runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
runs-on: [self-hosted, 16-aws-trn, 128-cpu, ci]
env:
AWS_REGION: us-east-1
steps:
- name: Checkout
uses: actions/checkout@v2
- name: Setup PATH
run: echo "/home/ubuntu/.local/bin" >> $GITHUB_PATH
- name: Set pip repository pointing to the Neuron repository
run: pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
- name: Install Python dependencies
run: pip install .[tests,neuronx]
- name: Run tests on Neuron cores
run: |
HF_TOKEN_OPTIMUM_NEURON_CI=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} pytest -m "is_trainium_test" tests/distributed/
stop-runner:
name: Stop self-hosted EC2 runner
needs:
- start-runner
- optimum-neuron-tests
runs-on: ubuntu-latest
env:
AWS_REGION: us-east-1
if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs
steps:
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v1
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: ${{ env.AWS_REGION }}
- name: Stop EC2 runner
uses: philschmid/philschmid-ec2-github-runner@main
with:
mode: stop
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
label: ${{ needs.start-runner.outputs.label }}
ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
38 changes: 5 additions & 33 deletions optimum/neuron/distributed/base.py
@@ -15,7 +15,6 @@
"""Base class related to `neuronx_distributed` to perform parallelism."""

import contextlib
import gc
import shutil
from abc import ABC, abstractclassmethod
from dataclasses import asdict
@@ -534,7 +533,7 @@ def load_model_sharded_checkpoint(cls, model: "PreTrainedModel", load_dir: Union

if not isinstance(load_dir, Path):
load_dir = Path(load_dir)
parallel_layers.load(load_dir / TENSOR_PARALLEL_SHARDS_DIR_NAME, model=model, sharded=True)
parallel_layers.load(load_dir / TENSOR_PARALLEL_SHARDS_DIR_NAME, model_or_optimizer=model, sharded=True)

@classmethod
def load_model_checkpoint(cls, model: "PreTrainedModel", load_dir: Union[str, Path]):
@@ -560,37 +559,10 @@ def load_optimizer_sharded_checkpoint(cls, optimizer: "torch.optim.Optimizer", l
"It is not possible to load a sharded optimizer checkpoint when using ZeRO-1 yet."
)

from neuronx_distributed.parallel_layers import load

if not isinstance(load_dir, Path):
load_dir = Path(load_dir)

import torch_xla.core.xla_model as xm
from neuronx_distributed.parallel_layers.parallel_state import (
get_pipeline_model_parallel_rank,
get_tensor_model_parallel_rank,
get_tensor_model_parallel_size,
load(
load_dir / TENSOR_PARALLEL_SHARDS_DIR_NAME, model_or_optimizer=optimizer, model_key="optimizer_state_dict"
)

world_size = get_tensor_model_parallel_size()
tp_rank = get_tensor_model_parallel_rank()
pp_rank = get_pipeline_model_parallel_rank()

if not (load_dir / TENSOR_PARALLEL_SHARDS_DIR_NAME).is_dir():
raise FileNotFoundError(f"Could not find a sharded checkpoint directory under {load_dir.as_posix()}.")

checkpoint_name = load_dir / TENSOR_PARALLEL_SHARDS_DIR_NAME / f"tp_rank_{tp_rank:02d}_pp_rank{pp_rank:02d}.pt"

device = "xla"
for group in optimizer.param_groups:
for p in group["params"]:
device = p.device
break

for worker_start in range(0, world_size):
if tp_rank == worker_start:
checkpoint = torch.load(checkpoint_name, map_location="cpu")
optimizer_state_dict = checkpoint["optimizer_state_dict"]
xm.send_cpu_data_to_device(optimizer_state_dict, device)
optimizer.load_state_dict(optimizer_state_dict)
del checkpoint
gc.collect()
xm.rendezvous("neuron.load_checkpoint" + str(worker_start))
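Taken together, the hunks above replace the hand-rolled, per-rank torch.load loop with direct calls to neuronx_distributed.parallel_layers.load for both the model and the optimizer. A condensed sketch of the resulting restore path, assuming a value for TENSOR_PARALLEL_SHARDS_DIR_NAME (the real constant is defined in optimum.neuron.distributed):

from pathlib import Path

from neuronx_distributed import parallel_layers

# Assumed value, for illustration only.
TENSOR_PARALLEL_SHARDS_DIR_NAME = "tensor_parallel_shards"

def load_sharded_state(model, optimizer, load_dir):
    # Sketch of the simplified restore path used after this commit.
    shards_dir = Path(load_dir) / TENSOR_PARALLEL_SHARDS_DIR_NAME
    # Model weights: each tensor-parallel rank loads its own shard.
    parallel_layers.load(shards_dir, model_or_optimizer=model, sharded=True)
    # Optimizer state: same entry point, selected through model_key.
    parallel_layers.load(shards_dir, model_or_optimizer=optimizer, model_key="optimizer_state_dict")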
15 changes: 14 additions & 1 deletion optimum/neuron/distributed/decoder_models.py
@@ -169,6 +169,19 @@ def patch_for_sequence_parallelism(cls, model: "PreTrainedModel", sequence_paral
if not sequence_parallel_enabled:
return

def rotate_half(x):
x1 = x[..., : x.shape[-1] // 2]
x2 = x[..., x.shape[-1] // 2 :]
return torch.cat((-x2, x1), dim=-1)

# Remove this function once Transformers >= 4.36.0 is supported.
def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
cos = cos[position_ids].unsqueeze(unsqueeze_dim)
sin = sin[position_ids].unsqueeze(unsqueeze_dim)
q_embed = (q * cos) + (rotate_half(q) * sin)
k_embed = (k * cos) + (rotate_half(k) * sin)
return q_embed, k_embed

def sequence_parallel_forward(
self,
hidden_states: torch.FloatTensor,
@@ -234,7 +247,7 @@ def sequence_parallel_forward(

# Reshape outputs
if sequence_parallel_enabled:
# [batch, seq_len, num_attention_heads, head_size] -> [seq_len, batch, hidden_size]
# [batch, num_attention_heads, seq_len, head_size] -> [seq_len, batch, hidden_size]
attn_output = attn_output.permute(2, 0, 1, 3).contiguous()
attn_output = attn_output.view(*attn_output.shape[:2], -1)
else:
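The corrected comment above matches the actual layout of attn_output before the reshape: [batch, num_attention_heads, seq_len, head_size], permuted to [seq_len, batch, hidden_size] when sequence parallelism is enabled. A small self-contained check of that transformation (shapes are arbitrary):

import torch

batch, num_heads, seq_len, head_size = 2, 4, 8, 16
attn_output = torch.randn(batch, num_heads, seq_len, head_size)

# [batch, num_attention_heads, seq_len, head_size] -> [seq_len, batch, num_heads, head_size]
attn_output = attn_output.permute(2, 0, 1, 3).contiguous()
# Merge the head dimensions: -> [seq_len, batch, hidden_size]
attn_output = attn_output.view(*attn_output.shape[:2], -1)

assert attn_output.shape == (seq_len, batch, num_heads * head_size)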
1 change: 1 addition & 0 deletions optimum/neuron/distributed/utils.py
@@ -528,6 +528,7 @@ def from_pretrained_for_tp(
token=token,
revision=revision,
use_safetensors=use_safetensors,
use_safetensors_in_priority=True,
convert_to_safetensors=True,
**kwargs,
)
45 changes: 12 additions & 33 deletions optimum/neuron/trainer_callback.py
@@ -27,26 +27,25 @@
import torch
from transformers import TrainerCallback, TrainerState

from optimum.neuron.utils.training_utils import is_precompilation

from ..utils import logging
from .utils import is_torch_xla_available
from .utils.cache_utils import (
NEURON_COMPILE_CACHE_NAME,
NeuronHash,
download_cached_model_from_hub,
follows_new_cache_naming_convention,
get_neuron_cache_path,
list_files_in_neuron_cache,
path_after_folder,
push_to_cache_on_hub,
set_neuron_cache_path,
)
from .utils.training_utils import is_precompilation


if TYPE_CHECKING:
from transformers import PreTrainedModel, TrainerControl, TrainingArguments

from .training_args import NeuronTrainingArguments


if is_torch_xla_available():
import torch_xla.core.xla_model as xm
@@ -108,14 +107,13 @@ def __init__(
else:
self.tmp_neuron_cache_path = tmp_neuron_cache

if self.tmp_neuron_cache_path.name != NEURON_COMPILE_CACHE_NAME:
self.tmp_neuron_cache_path = self.tmp_neuron_cache_path / NEURON_COMPILE_CACHE_NAME

self.tmp_neuron_cache_state = list_files_in_neuron_cache(self.tmp_neuron_cache_path, only_relevant_files=True)
self.fetch_files = set()

# Keys are of format:
# (model, input_shapes, data_type, tensor_parallel_size)
self.neuron_hashes: Dict[
Tuple["PreTrainedModel", Tuple[Tuple[str, Tuple[int]], ...], torch.dtype], NeuronHash
Tuple["PreTrainedModel", Tuple[Tuple[str, Tuple[int]], ...], torch.dtype, int], NeuronHash
] = {}
self.neuron_hash_to_files: Dict[NeuronHash, List[Path]] = defaultdict(list)

@@ -169,14 +167,8 @@ def create_temporary_neuron_cache(cls, neuron_cache_path: Optional[Path]) -> Tem
else:
neuron_cache_files = []

if follows_new_cache_naming_convention():
tmp_neuron_cache_path = tmp_neuron_cache_path / NEURON_COMPILE_CACHE_NAME
set_neuron_cache_path(tmp_neuron_cache_path)
else:
set_neuron_cache_path(tmp_neuron_cache_path)
tmp_neuron_cache_path = tmp_neuron_cache_path / NEURON_COMPILE_CACHE_NAME

tmp_neuron_cache_path.mkdir()
# Setting the Neuron compilation cache to be the temporary Neuron compilation cache.
set_neuron_cache_path(tmp_neuron_cache_path)

cache_stats_exists = False
if neuron_cache_path is not None:
@@ -188,8 +180,6 @@ def create_temporary_neuron_cache(cls, neuron_cache_path: Optional[Path]) -> Tem
if cache_file.name == "cache_stats.json":
continue
path_in_neuron_cache = path_after_folder(cache_file, neuron_cache_path.name)
if NEURON_COMPILE_CACHE_NAME in path_in_neuron_cache.parts:
path_in_neuron_cache = path_after_folder(path_in_neuron_cache, NEURON_COMPILE_CACHE_NAME)
tmp_cache_file = tmp_neuron_cache_path / path_in_neuron_cache
tmp_cache_file.parent.mkdir(parents=True, exist_ok=True)
# TODO: investigate why it is needed. Minor issue.
@@ -206,7 +196,7 @@ def create_temporary_neuron_cache(cls, neuron_cache_path: Optional[Path]) -> Tem

def neuron_hash_for_model(
self,
args: "TrainingArguments",
args: "NeuronTrainingArguments",
model: "PreTrainedModel",
inputs: Dict[str, Any],
try_to_fetch_cached_model: bool = False,
@@ -240,17 +230,13 @@ def full_path_to_path_in_temporary_cache(self, path: Path):
def try_to_fetch_cached_model(self, neuron_hash: NeuronHash) -> bool:
# TODO: needs to be called ONLY when absolutely needed.
files_before_fetching = list_files_in_neuron_cache(self.tmp_neuron_cache_path, only_relevant_files=True)
cache_path = neuron_hash.cache_path

def path_in_repo_to_path_in_target_directory(path):
# The last part of cache_path is the overall hash.
return Path(neuron_hash.neuron_compiler_version_dir_name) / path_after_folder(path, cache_path.name)

found_in_cache = download_cached_model_from_hub(
neuron_hash,
target_directory=self.tmp_neuron_cache_path,
path_in_repo_to_path_in_target_directory=path_in_repo_to_path_in_target_directory,
path_in_repo_to_path_in_target_directory="default",
)

if found_in_cache:
files_after_fetching = list_files_in_neuron_cache(self.tmp_neuron_cache_path, only_relevant_files=True)
diff = [f for f in files_after_fetching if f not in files_before_fetching]
@@ -277,15 +263,8 @@ def synchronize_temporary_neuron_cache_state(self) -> List[Path]:

def synchronize_temporary_neuron_cache(self):
for neuron_hash, files in self.neuron_hash_to_files.items():

def local_path_to_path_in_repo(path):
if follows_new_cache_naming_convention():
return path_after_folder(path, f"neuronxcc-{neuron_hash.neuron_compiler_version}")
else:
return path_after_folder(path, f"USER_neuroncc-{neuron_hash.neuron_compiler_version}")

for path in files:
push_to_cache_on_hub(neuron_hash, path, local_path_to_path_in_repo=local_path_to_path_in_repo)
push_to_cache_on_hub(neuron_hash, path, local_path_to_path_in_repo="default")
if self.use_neuron_cache:
path_in_cache = self.full_path_to_path_in_temporary_cache(path)
target_file = self.neuron_cache_path / path_in_cache
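The __init__ hunk above widens the neuron_hashes annotation to include the tensor parallel size, matching the comment that keys have the form (model, input_shapes, data_type, tensor_parallel_size). An illustrative key with made-up values (DummyModel stands in for a transformers PreTrainedModel here):

import torch

class DummyModel:
    # Stands in for a transformers PreTrainedModel in this illustration.
    pass

model = DummyModel()

# (model, input_shapes, data_type, tensor_parallel_size)
cache_key = (
    model,
    (("input_ids", (1, 512)), ("attention_mask", (1, 512))),  # per-input shapes
    torch.bfloat16,                                            # compilation data type
    2,                                                         # tensor parallel size, the new component
)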
(Diffs for the remaining 17 changed files are not shown.)
