Commit

Refactored test
michaelbenayoun committed Apr 22, 2024
1 parent e86a0f1 commit 4833aab
Showing 3 changed files with 133 additions and 178 deletions.
2 changes: 1 addition & 1 deletion tests/conftest.py
@@ -145,7 +145,7 @@ def _hub_test(create_local_cache: bool = False):
set_custom_cache_repo_name_in_hf_home(custom_cache_repo_with_seed)

if create_local_cache:
- yield tuple([custom_cache_repo_with_seed, local_cache_path_with_seed])
+ yield (custom_cache_repo_with_seed, local_cache_path_with_seed)
else:
yield custom_cache_repo_with_seed
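
The change is behavior-preserving: tuple([a, b]) and (a, b) build the same 2-tuple, so tests consuming the fixture unpack it exactly as before. A minimal sketch of such a consumer, assuming the hub_test_with_local_cache fixture used further down wraps _hub_test(create_local_cache=True) (that wiring and the test name are assumptions, not shown in this diff):

def test_uses_local_cache(hub_test_with_local_cache):
    # Hypothetical consumer for illustration only.
    # The fixture still yields a (custom_cache_repo, local_cache_path) pair after the refactor.
    custom_cache_repo, local_cache_path = hub_test_with_local_cache
    assert custom_cache_repo and local_cache_path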

170 changes: 0 additions & 170 deletions tests/distributed/test_training.py

This file was deleted.

139 changes: 132 additions & 7 deletions tests/test_trainers.py
@@ -15,13 +15,15 @@
"""Tests related to the Trainer derived classes."""

import copy
+ import json
import shutil
import time
from pathlib import Path

import pytest
+ from datasets import load_dataset
from huggingface_hub import HfApi
- from transformers import AutoTokenizer, LlamaForCausalLM
+ from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, LlamaForCausalLM

from optimum.neuron import NeuronTrainer, NeuronTrainingArguments
from optimum.neuron.distributed.utils import MODEL_PARALLEL_SHARDS_DIR_NAME
@@ -49,7 +51,7 @@


# LLAMA_V2_MODEL_NAME = "michaelbenayoun/llama-2-tiny-16layers-32kv-heads-random"
- LLAMA_V2_MODEL_NAME = "michaelbenayoun/llama-2-tiny-4kv-heads-4layers-random"
+ MODEL_NAME = "michaelbenayoun/llama-2-tiny-4kv-heads-4layers-random"


@is_trainium_test
@@ -66,7 +68,7 @@ def test_get_model_param_count(self, parallel_sizes, tmpdir):
_, tp_size, pp_size = parallel_sizes
output_dir = Path(tmpdir)

- model = get_model(LlamaForCausalLM, LLAMA_V2_MODEL_NAME, tp_size=tp_size, pp_size=pp_size)
+ model = get_model(LlamaForCausalLM, MODEL_NAME, tp_size=tp_size, pp_size=pp_size)

target_num_parameters = sum(p.numel() for p in model.parameters())

@@ -100,8 +102,8 @@ def test_save_checkpoint(self, hub_test, tmpdir, parallel_sizes):
tp_rank = get_tensor_model_parallel_rank()
pp_rank = get_pipeline_model_parallel_rank()

- tokenizer = AutoTokenizer.from_pretrained(LLAMA_V2_MODEL_NAME)
- model = get_model(LlamaForCausalLM, LLAMA_V2_MODEL_NAME, tp_size=tp_size, pp_size=pp_size)
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+ model = get_model(LlamaForCausalLM, MODEL_NAME, tp_size=tp_size, pp_size=pp_size)
datasets = create_dummy_causal_lm_dataset(model.config.vocab_size, 120, 1)

args = NeuronTrainingArguments(
@@ -176,8 +178,8 @@ def test_train_and_eval_use_remote_cache(self, hub_test_with_local_cache, tmpdir
num_eval_samples = 100
per_device_eval_batch_size = 16

- tokenizer = AutoTokenizer.from_pretrained(LLAMA_V2_MODEL_NAME)
- model = get_model(LlamaForCausalLM, LLAMA_V2_MODEL_NAME, tp_size=tp_size, pp_size=pp_size)
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+ model = get_model(LlamaForCausalLM, MODEL_NAME, tp_size=tp_size, pp_size=pp_size)
clone = copy.deepcopy(model)

datasets = create_dummy_causal_lm_dataset(model.config.vocab_size, num_train_samples, num_eval_samples)
@@ -263,3 +265,126 @@ def test_train_and_eval_use_remote_cache(self, hub_test_with_local_cache, tmpdir
assert (
second_training_duration < first_training_duration
), "Second training should be faster because cached graphs can be used."

def test_save_and_resume_from_checkpoint(self, parallel_sizes, tmpdir):

tmpdir = Path(tmpdir)
_, tp_size, pp_size = parallel_sizes
train_batch_size = 2
eval_batch_size = 2
max_steps = 10
do_eval = True
max_train_samples = 100
max_eval_samples = 16

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
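# The LLaMA tokenizer ships without a padding token, so reuse the EOS token for padding.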
tokenizer.pad_token = tokenizer.eos_token

def create_training_args(output_dir, resume_from_checkpoint=None, max_steps=max_steps):
if isinstance(output_dir, Path):
output_dir = output_dir.as_posix()
if isinstance(resume_from_checkpoint, Path):
resume_from_checkpoint = resume_from_checkpoint.as_posix()
args = NeuronTrainingArguments(
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
bf16=True,
per_device_train_batch_size=train_batch_size,
per_device_eval_batch_size=eval_batch_size,
max_steps=max_steps,
logging_steps=1,
save_steps=5,
do_eval=do_eval,
output_dir=output_dir,
resume_from_checkpoint=resume_from_checkpoint,
skip_cache_push=False,
)
return args

def create_model():
config = AutoConfig.from_pretrained(MODEL_NAME)
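# Shrink the model for the test while keeping the layer count divisible by the pipeline parallel size (two layers per stage).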
config.num_hidden_layers = 2 * max(1, pp_size)
config.num_attention_heads = 2
config.num_key_value_heads = 2
config.problem_type = "single_label_classification"
# config.use_cache = False
model = AutoModelForSequenceClassification.from_pretrained(
MODEL_NAME, config=config, ignore_mismatched_sizes=True
)
return model

# First run setting.
first_output_dir = tmpdir / "first_run"
args = create_training_args(first_output_dir)
model = create_model()

# Dataset preprocessing
raw_datasets = load_dataset("glue", "sst2")
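# SST-2 is a single-sentence GLUE task, so there is no second text column and labels need no remapping.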
sentence1_key = "sentence"
sentence2_key = None
label_to_id = None
max_seq_length = 32
padding = "max_length"

def preprocess_function(examples):
# Tokenize the texts
args = (
(examples[sentence1_key],)
if sentence2_key is None
else (examples[sentence1_key], examples[sentence2_key])
)
result = tokenizer(*args, padding=padding, max_length=max_seq_length, truncation=True)

# Map labels to IDs (not necessary for GLUE tasks)
if label_to_id is not None and "label" in examples:
result["label"] = [(label_to_id[l] if l != -1 else -1) for l in examples["label"]]
return result

with args.main_process_first(desc="dataset map pre-processing"):
raw_datasets = raw_datasets.map(preprocess_function, batched=True)
train_dataset = raw_datasets["train"]
train_dataset = train_dataset.select(range(max_train_samples))
eval_dataset = raw_datasets["validation"]
eval_dataset = eval_dataset.select(range(max_eval_samples))

trainer = NeuronTrainer(
model, args=args, train_dataset=train_dataset, eval_dataset=eval_dataset, tokenizer=tokenizer
)

train_result = trainer.train()
trainer.evaluate()
trainer.save_metrics("train", train_result.metrics)

with open(first_output_dir / "train_results.json") as fp:
first_training_report = json.load(fp)

# Case 1: Resuming from checkpoint by specifying a checkpoint directory.
second_output_dir = tmpdir / "second_run"
resume_from_checkpoint = first_output_dir / "checkpoint-5"
args = create_training_args(second_output_dir, resume_from_checkpoint=resume_from_checkpoint)
model = create_model()
trainer = NeuronTrainer(
model, args=args, train_dataset=train_dataset, eval_dataset=eval_dataset, tokenizer=tokenizer
)

train_result = trainer.train(resume_from_checkpoint=resume_from_checkpoint.as_posix())
trainer.evaluate()
trainer.save_metrics("train", train_result.metrics)

with open(second_output_dir / "train_results.json") as fp:
second_training_report = json.load(fp)

assert first_training_report["train_loss"] == second_training_report["train_loss"]

# Case 2: Resuming from checkpoint by specifying an output_dir with checkpoints.
# max_steps + 10 to do some more training steps than the previous run.
second_output_dir = first_output_dir
args = create_training_args(second_output_dir, max_steps=max_steps + 10)
model = create_model()

trainer = NeuronTrainer(
model, args=args, train_dataset=train_dataset, eval_dataset=eval_dataset, tokenizer=tokenizer
)

trainer.train(resume_from_checkpoint=True)
trainer.evaluate()
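
In Case 2, resume_from_checkpoint=True asks the trainer to pick up the most recent checkpoint-* directory found in output_dir instead of an explicit path. A rough illustration of that lookup, assuming NeuronTrainer keeps the transformers Trainer semantics here (the directory name below is only an example):

from transformers.trainer_utils import get_last_checkpoint

# With save_steps=5 and 10 steps already completed in first_run, this would typically
# resolve to something like "first_run/checkpoint-10".
last_checkpoint = get_last_checkpoint("first_run")  # hypothetical directory name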
