Update tests/test_examples.py for AWS team (#242)
* Minor changes to test examples

* Update example

* Update default value for USE_VENV

* Add llama

* Remove comment
michaelbenayoun authored Sep 22, 2023
1 parent 0c5167f commit 4e23bc0
Showing 4 changed files with 92 additions and 45 deletions.
6 changes: 4 additions & 2 deletions .github/workflows/test_trainium_examples.yml
@@ -4,15 +4,17 @@ on:
workflow_dispatch:
inputs:
coverage:
- description: The coverage of the models to test, useful to perform filtering
+ description: Coverage
type: choice
options:
- all
- high
- middle
- low
required: true
model_size:
- description: The size of the models to tests
+ description: Size of models
type: choice
options:
- regular
- tiny
13 changes: 13 additions & 0 deletions optimum/neuron/utils/cache_utils.py
@@ -44,6 +44,7 @@
from ...utils import logging
from ...utils.logging import warn_once
from .constant import NEURON_BINARIES_PATH
+ from .misc import string_to_bool
from .version_utils import get_neuronxcc_version


@@ -82,6 +83,16 @@

_NEW_CACHE_NAMING_CONVENTION_NEURONXCC_VERSION = "2.7.0.40+f7c6cf2a3"

+ # For testing purposes.
+ _DISABLE_IS_PRIVATE_REPO_CHECK: bool = string_to_bool(
+ os.environ.get("OPTIMUM_NEURON_DISABLE_IS_PRIVATE_REPO_CHECK", "false")
+ )
+ if _DISABLE_IS_PRIVATE_REPO_CHECK:
+ logger.warning(
+ "The check that prevents you from pushing compiled files from private models is disabled. This is allowed only "
+ "for testing purposes."
+ )


def follows_new_cache_naming_convention(neuronxcc_version: Optional[str] = None) -> bool:
"""
@@ -139,6 +150,8 @@ def create_custom_cache_repo(repo_id: str = CACHE_REPO_NAME, private: bool = Tru


def is_private_repo(repo_id: str) -> bool:
+ if _DISABLE_IS_PRIVATE_REPO_CHECK:
+ return False
HfApi().list_repo_files(repo_id=repo_id, token=HfFolder.get_token())
private = False
try:
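The rest of the function body is collapsed in this view. A plausible reading of the check, for illustration only (the collapsed lines may differ): the first list_repo_files call verifies the repo is reachable with the caller's token, and the repo is treated as private when an anonymous listing fails.

from huggingface_hub import HfApi
from huggingface_hub.utils import HfHubHTTPError

def is_private_repo_sketch(repo_id: str, token: str) -> bool:
    # Hypothetical reconstruction of the collapsed body, not the actual code.
    HfApi().list_repo_files(repo_id=repo_id, token=token)  # raises if unreachable
    try:
        HfApi().list_repo_files(repo_id=repo_id, token=False)  # anonymous probe
    except HfHubHTTPError:
        return True  # not readable without credentials => private
    return False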
33 changes: 29 additions & 4 deletions optimum/neuron/utils/runner.py
@@ -344,9 +344,9 @@ def download_model_repo_and_override_config(
allow_patterns = ["*.json", "*.txt"]
if any(re.match(safetensors_model_file_pattern, filename) for filename in filenames):
# Not downloading PyTorch checkpoints if safetensors checkpoints are available.
allow_patterns.append("*.bin")
else:
allow_patterns.append("*.safetensors")
else:
allow_patterns.append("*.bin")

directory = Path(output_dir) / model_name_or_path.split("/")[-1]

@@ -390,6 +390,7 @@ def run(
output_dir: Optional[Union[Path, str]] = None,
do_precompilation: bool = False,
print_outputs: bool = False,
+ _disable_is_private_model_repo_check: bool = False,
) -> Tuple[int, str]:
if num_cores <= 0 or num_cores > 32:
raise ValueError("The number of Neuron cores to use must be between 1 and 32.")
@@ -419,6 +420,14 @@
if self.should_install_requirements:
self.install_requirements(script_path.parent / "requirements.txt")

+ def compute_max_train_samples(
+ max_steps: int, num_cores: int, tensor_parallel_size: int, per_device_train_batch_size: int
+ ) -> int:
+ total_batch_size = (num_cores // tensor_parallel_size) * per_device_train_batch_size
+ total_num_samples = max_steps * total_batch_size
+ # Adding 10% more examples just to make sure.
+ return int(total_num_samples * 1.1)

cmd = []

cmd.append(self.python_name if num_cores == 1 else f"{self.torchrun_name} --nproc_per_node {num_cores}")
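The helper added above sizes the dataset so it cannot be exhausted before max_steps: each optimization step consumes one batch per data-parallel worker, and the number of data-parallel workers is num_cores // tensor_parallel_size. A worked example with illustrative values:

# With 32 Neuron cores and tensor_parallel_size=8 there are 32 // 8 = 4
# data-parallel workers; at per-device batch size 1, 100 steps consume
# 100 * 4 * 1 = 400 samples, and the 10% safety margin rounds this up to 440.
assert compute_max_train_samples(100, 32, 8, 1) == 440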
@@ -437,6 +446,9 @@ def run(
if max_steps is not None:
cmd.append(f"--max_steps {max_steps}")
max_steps_idx = len(cmd) - 1
+ max_train_samples = compute_max_train_samples(max_steps, num_cores, tensor_parallel_size, train_batch_size)
+ cmd.append(f"--max_train_samples {max_train_samples}")

cmd.append("--do_train")
if do_eval:
cmd.append("--do_eval")
@@ -506,17 +518,28 @@ def split_args_and_value_in_command(cmd: List[str]) -> List[str]:
else:
cmd.append(f"--output_dir {output_dir}")

+ env = dict(os.environ)
+ if _disable_is_private_model_repo_check:
+ env["OPTIMUM_NEURON_DISABLE_IS_PRIVATE_REPO_CHECK"] = "true"

if do_precompilation:
# We need to update both the number of steps and the output directory specifically for the
# precompilation step.
with TemporaryDirectory() as precompilation_tmpdirname:
precompilation_cmd = list(cmd)
precompilation_cmd.pop(-1) # Removing the --output_dir argument.
max_steps_cmd_str = "--max_steps 10"
+ max_train_samples = compute_max_train_samples(
+ 10, num_cores, tensor_parallel_size, train_batch_size
+ )
+ max_train_samples_cmd = f"--max_train_samples {max_train_samples}"
if max_steps_idx >= 0:
precompilation_cmd[max_steps_idx] = max_steps_cmd_str
+ precompilation_cmd[max_steps_idx + 1] = max_train_samples_cmd
else:
precompilation_cmd.append(max_steps_cmd_str)
+ precompilation_cmd.append(max_train_samples_cmd)

precompilation_cmd.append(f"--output_dir {precompilation_tmpdirname}")
precompilation_cmd = ["neuron_parallel_compile"] + precompilation_cmd

@@ -527,7 +550,9 @@ def split_args_and_value_in_command(cmd: List[str]) -> List[str]:
if print_outputs:
returncode, stdout = run_command_with_realtime_output(precompilation_cmd)
else:
- proc = subprocess.Popen(precompilation_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+ proc = subprocess.Popen(
+ precompilation_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, env=env
+ )
stdout, _ = proc.communicate()
stdout = stdout.decode("utf-8")
returncode = proc.returncode
@@ -538,7 +563,7 @@ def split_args_and_value_in_command(cmd: List[str]) -> List[str]:
if print_outputs:
returncode, stdout = run_command_with_realtime_output(cmd)
else:
- proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+ proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, env=env)
stdout, _ = proc.communicate()
stdout = stdout.decode("utf-8")
returncode = proc.returncode
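Both launch paths now pass the same env mapping, so the private-repo-check override reaches the child process without mutating the parent environment. A minimal, self-contained sketch of the pattern (the child command is made up):

import os
import subprocess

env = dict(os.environ)  # copy: the parent environment stays untouched
env["OPTIMUM_NEURON_DISABLE_IS_PRIVATE_REPO_CHECK"] = "true"
proc = subprocess.Popen(
    ["python", "-c", "import os; print(os.environ['OPTIMUM_NEURON_DISABLE_IS_PRIVATE_REPO_CHECK'])"],
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    env=env,
)
stdout, _ = proc.communicate()
assert stdout.decode("utf-8").strip() == "true"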
85 changes: 46 additions & 39 deletions tests/test_examples.py
@@ -83,9 +83,9 @@ class Coverage(str, Enum):
ALL = "all"


- USE_VENV = string_to_bool(os.environ.get("USE_VENV", "false"))
COVERAGE = Coverage(os.environ.get("COVERAGE", "all"))
RUN_TINY = string_to_bool(os.environ.get("RUN_TINY", "false"))
+ USE_VENV = string_to_bool(os.environ.get("USE_VENV", "true"))

MODELS_TO_TEST_MAPPING = {
"albert": (
@@ -173,15 +173,20 @@ class Coverage(str, Enum):
Coverage.MIDDLE,
{"encoder_layers": 2, "decoder_layers": 2},
),
- # TODO: Llama
+ "llama": (
+ "NousResearch/Llama-2-7b-hf",
+ TPSupport.FULL,
+ Coverage.HIGH,
+ {"num_hidden_layers": 2},
+ ),
# "wav2vec2": "facebook/wav2vec2-base",
# Remaining: XLNet, Deberta-v2, MPNet, CLIP
}
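Each entry in MODELS_TO_TEST_MAPPING, as the llama addition above shows, pairs a checkpoint with its tensor-parallelism support, a coverage bucket used for CI filtering, and config overrides that shrink the model for testing. A sketch of how such overrides are typically applied (illustrative; the test suite's actual plumbing is not part of this diff):

from transformers import AutoConfig

config = AutoConfig.from_pretrained("NousResearch/Llama-2-7b-hf")
for name, value in {"num_hidden_layers": 2}.items():
    setattr(config, name, value)  # 2 layers instead of 32: tiny, same architecture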


def _get_supported_models_for_script(
models_to_test: Dict[str, str], task_mapping: Dict[str, str], to_exclude: Optional[Set[str]] = None
- ) -> List[str]:
+ ) -> List[Tuple[str, str, TPSupport, Dict[str, Any]]]:
"""
Filters models that can perform the task from models_to_test.
"""
@@ -210,7 +215,7 @@ def _get_supported_models_for_script(
"run_mlm": _get_supported_models_for_script(MODELS_TO_TEST_MAPPING, MODEL_FOR_MASKED_LM_MAPPING),
"run_swag": _get_supported_models_for_script(MODELS_TO_TEST_MAPPING, MODEL_FOR_MULTIPLE_CHOICE_MAPPING),
"run_qa": _get_supported_models_for_script(
- MODELS_TO_TEST_MAPPING, MODEL_FOR_QUESTION_ANSWERING_MAPPING, to_exclude={"bart"}
+ MODELS_TO_TEST_MAPPING, MODEL_FOR_QUESTION_ANSWERING_MAPPING, to_exclude={"gpt2", "gpt_neo", "bart", "t5"}
),
"run_summarization": _get_supported_models_for_script(
MODELS_TO_TEST_MAPPING, MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, to_exclude={"marian", "m2m_100"}
@@ -219,10 +224,10 @@
MODELS_TO_TEST_MAPPING, MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING
),
"run_glue": _get_supported_models_for_script(
- MODELS_TO_TEST_MAPPING, MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, to_exclude={"bart", "gpt2", "gpt_neo"}
+ MODELS_TO_TEST_MAPPING, MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, to_exclude={"gpt2", "gpt_neo", "bart", "t5"}
),
"run_ner": _get_supported_models_for_script(
- MODELS_TO_TEST_MAPPING, MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, to_exclude={"gpt2"}
+ MODELS_TO_TEST_MAPPING, MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, to_exclude={"gpt2", "gpt_neo"}
),
"run_image_classification": _get_supported_models_for_script(
MODELS_TO_TEST_MAPPING, MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING
@@ -301,15 +306,26 @@ def parse_loss_from_log(log: str) -> List[float]:
return losses

@staticmethod
- def check_that_loss_is_decreasing(losses: List[float], steps: int) -> Tuple[bool, List[float], List[float]]:
- mean_losses = []
- num_mean_losses = len(losses) // steps
- for i in range(num_mean_losses):
- mean = sum(losses[i * steps : (i + 1) * steps]) / steps
- mean_losses.append(mean)
-
- expected_mean_losses = sorted(mean_losses, reverse=True)
- return mean_losses == expected_mean_losses, mean_losses, expected_mean_losses
+ def check_that_loss_is_decreasing(
+ losses: List[float], window_size: int, allowed_miss_rate: float = 0.1
+ ) -> Tuple[bool, List[float]]:
+ def moving_average(values: List[float], window_size: int):
+ averages = []
+ n = len(values)
+ for i in range(n - window_size + 1):
+ window = values[i : i + window_size]
+ averages.append(sum(window) / window_size)
+ return averages
+
+ moving_average_losses = moving_average(losses, window_size)
+ num_losses = len(moving_average_losses)
+ num_misses = 0
+ num_misses_allowed = int(num_losses * allowed_miss_rate)
+ for x, y in zip(moving_average_losses[:-1], moving_average_losses[1:]):
+ if x < y:  # an increase between consecutive averages counts as a miss
+ num_misses += 1
+
+ return num_misses <= num_misses_allowed, moving_average_losses
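A worked illustration of the new check with made-up numbers, where window_size=2 smooths adjacent losses and any increase between consecutive averages counts as a miss:

losses = [4.0, 3.0, 3.5, 2.0]
# windows of size 2 -> averages [3.5, 3.25, 2.75]; no consecutive increase,
# so zero misses against an allowance of int(3 * 0.5) = 1: the check passes.
ok, averages = ExampleTestMeta.check_that_loss_is_decreasing(losses, 2, allowed_miss_rate=0.5)
assert ok and averages == [3.5, 3.25, 2.75]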

@classmethod
def _create_test(
@@ -369,25 +385,32 @@ def test(self):
output_dir=tmpdirname,
do_precompilation=True,
print_outputs=True,
+ _disable_is_private_model_repo_check=True,
)
assert returncode == 0

if self.CHECK_THAT_LOSS_IS_DECREASING:
losses = ExampleTestMeta.parse_loss_from_log(stdout)
- is_decreasing, mean_losses, expected_mean_losses = ExampleTestMeta.check_that_loss_is_decreasing(
- losses, 50
+ allowed_miss_rate = 0.20
+ is_decreasing, moving_average_losses = ExampleTestMeta.check_that_loss_is_decreasing(
+ # The loss might stagnate at some point, so we only check that the first 200 losses are
+ # decreasing on average.
+ losses[:200],
+ 4,
+ allowed_miss_rate=allowed_miss_rate,
)
self.assertTrue(
is_decreasing, f"Expected mean losses to be {expected_mean_losses} but got {mean_losses}"
is_decreasing,
f"The moving average loss does not decrease as expected: {moving_average_losses} (allowed miss "
"rate = {allowed_miss_rate})",
)

- if self.DO_EVAL:
+ if not RUN_TINY and self.DO_EVAL:
with open(Path(tmpdirname) / "all_results.json") as fp:
results = json.load(fp)
- eval_score_threshold = (
- self.EVAL_SCORE_THRESHOLD if not RUN_TINY else self.EVAL_SCORE_THRESHOLD_FOR_TINY
+ eval_score_threshold = ExampleTestMeta.process_class_attribute(
+ self.EVAL_SCORE_THRESHOLD, model_type
)
- eval_score_threshold = ExampleTestMeta.process_class_attribute(eval_score_threshold, model_type)
if self.EVAL_SCORE_GREATER_IS_BETTER:
self.assertGreaterEqual(float(results[self.SCORE_NAME]), eval_score_threshold)
else:
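process_class_attribute resolves the TypeOrDictOfType attributes used throughout these testers; presumably a plain value passes through and a dict is looked up by model type with a "default" fallback, which is what the threshold dicts below rely on. A sketch under that assumption:

def process_class_attribute_sketch(attribute, model_type):
    # Hypothetical stand-in for ExampleTestMeta.process_class_attribute.
    if isinstance(attribute, dict):
        return attribute.get(model_type, attribute["default"])
    return attribute

assert process_class_attribute_sketch({"default": 0.75, "camembert": 0.5}, "camembert") == 0.5
assert process_class_attribute_sketch(0.8, "bert") == 0.8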
@@ -411,21 +434,19 @@ class ExampleTesterBase(TestCase):
TRAIN_BATCH_SIZE: TypeOrDictOfType[int] = 2
EVAL_BATCH_SIZE: TypeOrDictOfType[int] = 2
GRADIENT_ACCUMULATION_STEPS: TypeOrDictOfType[int] = 1
- SEQUENCE_LENGTH: TypeOrDictOfType[Optional[Union[int, Tuple[int, int]]]] = None
+ SEQUENCE_LENGTH: TypeOrDictOfType[Optional[Union[int, Tuple[int, int], List[int]]]] = None

NUM_CORES: int = 32
LOGGING_STEPS: int = 1
SAVE_STEPS: int = 200

TRAIN_LOSS_THRESHOLD: float
- TRAIN_LOSS_THRESHOLD_FOR_TINY: float
CHECK_THAT_LOSS_IS_DECREASING: TypeOrDictOfType[bool] = True

# Camembert is pretrained on French.
DO_EVAL: TypeOrDictOfType[bool]
MAX_EVAL_SAMPLES: Optional[int] = None
EVAL_SCORE_THRESHOLD: TypeOrDictOfType[float]
- EVAL_SCORE_THRESHOLD_FOR_TINY: TypeOrDictOfType[float]
EVAL_SCORE_GREATER_IS_BETTER: bool
SCORE_NAME: str

@@ -440,7 +461,6 @@ class CausalLMExampleTester(ExampleTesterBase, metaclass=ExampleTestMeta, exampl
SEQUENCE_LENGTH = 512

TRAIN_LOSS_THRESHOLD = 1.5
- TRAIN_LOSS_THRESHOLD_FOR_TINY = 2.5

DO_EVAL = False
MAX_EVAL_SAMPLES = 200
@@ -454,13 +474,11 @@ class TextClassificationExampleTester(ExampleTesterBase, metaclass=ExampleTestMe
SEQUENCE_LENGTH = 128

TRAIN_LOSS_THRESHOLD = 0.5
- TRAIN_LOSS_THRESHOLD_FOR_TINY = 0.5

# Camembert is pretrained on French.
DO_EVAL = False # TODO: Evaluation is broken.
MAX_EVAL_SAMPLES = 200
EVAL_SCORE_THRESHOLD = {"default": 0.75, "camembert": 0.5}
EVAL_SCORE_THRESHOLD_FOR_TINY = {"default": 0.75, "camembert": 0.5}
EVAL_SCORE_GREATER_IS_BETTER = True
SCORE_NAME = "eval_accuracy"

@@ -476,13 +494,11 @@ class TokenClassificationExampleTester(ExampleTesterBase, metaclass=ExampleTestM
SEQUENCE_LENGTH = 384

TRAIN_LOSS_THRESHOLD = 0.5
- TRAIN_LOSS_THRESHOLD_FOR_TINY = 0.5

# Camembert is pretrained on French.
DO_EVAL = False # TODO: Evaluation is broken.
MAX_EVAL_SAMPLES = 200
EVAL_SCORE_THRESHOLD = {"default": 0.75, "camembert": 0.5}
EVAL_SCORE_THRESHOLD_FOR_TINY = {"default": 0.75, "camembert": 0.5}
EVAL_SCORE_GREATER_IS_BETTER = True
SCORE_NAME = "eval_accuracy"

@@ -497,13 +513,11 @@ class MultipleChoiceExampleTester(ExampleTesterBase, metaclass=ExampleTestMeta,
SEQUENCE_LENGTH = 512

TRAIN_LOSS_THRESHOLD = 0.5
- TRAIN_LOSS_THRESHOLD_FOR_TINY = 0.5

# Camembert is pretrained on French.
DO_EVAL = False # TODO: Evaluation is broken.
MAX_EVAL_SAMPLES = 200
EVAL_SCORE_THRESHOLD = {"default": 0.75, "camembert": 0.5, "distilbert": 0.645}
EVAL_SCORE_THRESHOLD_FOR_TINY = {"default": 0.75, "camembert": 0.5, "distilbert": 0.645}
EVAL_SCORE_GREATER_IS_BETTER = True
SCORE_NAME = "eval_accuracy"

@@ -517,12 +531,10 @@ class QuestionAnsweringExampleTester(ExampleTesterBase, metaclass=ExampleTestMet
EVAL_BATCH_SIZE = 2

TRAIN_LOSS_THRESHOLD = 0.5
- TRAIN_LOSS_THRESHOLD_FOR_TINY = 0.5

DO_EVAL = False # TODO: Evaluation is broken.
MAX_EVAL_SAMPLES = 200
EVAL_SCORE_THRESHOLD = {"default": 0.75, "camembert": 0.5}
EVAL_SCORE_THRESHOLD_FOR_TINY = {"default": 0.75, "camembert": 0.5}
EVAL_SCORE_GREATER_IS_BETTER = True
SCORE_NAME = "eval_f1"

@@ -537,12 +549,10 @@ class SummarizationExampleTester(ExampleTesterBase, metaclass=ExampleTestMeta, e
SEQUENCE_LENGTH = {"default": [1024, 200], "t5": [768, 200]}

TRAIN_LOSS_THRESHOLD = 0.5
- TRAIN_LOSS_THRESHOLD_FOR_TINY = 0.5

DO_EVAL = False # TODO: Evaluation is broken.
MAX_EVAL_SAMPLES = 200
EVAL_SCORE_THRESHOLD = 30
- EVAL_SCORE_THRESHOLD_FOR_TINY = 30
SCORE_NAME = "eval_rougeLsum"


@@ -558,7 +568,6 @@ class TranslationExampleTester(ExampleTesterBase, metaclass=ExampleTestMeta, exa
DO_EVAL = False
MAX_EVAL_SAMPLES = 200
EVAL_SCORE_THRESHOLD = 22
- EVAL_SCORE_THRESHOLD_FOR_TINY = 20
SCORE_NAME = "eval_bleu"


@@ -573,12 +582,10 @@ class ImageClassificationExampleTester(
EVAL_BATCH_SIZE = 2

TRAIN_LOSS_THRESHOLD = 0.5
- TRAIN_LOSS_THRESHOLD_FOR_TINY = 0.5

DO_EVAL = False # TODO: Evaluation is broken.
MAX_EVAL_SAMPLES = 200
EVAL_SCORE_THRESHOLD = 0.8
- EVAL_SCORE_THRESHOLD_FOR_TINY = 0.70
EVAL_SCORE_GREATER_IS_BETTER = True
SCORE_NAME = "eval_accuracy"

