diff --git a/.github/workflows/test_trainium_examples.yml b/.github/workflows/test_trainium_examples.yml
index 1d7fb93d9..6b648c9ae 100644
--- a/.github/workflows/test_trainium_examples.yml
+++ b/.github/workflows/test_trainium_examples.yml
@@ -4,7 +4,8 @@ on:
   workflow_dispatch:
     inputs:
       coverage:
-        description: The coverage of the models to test, useful to perform filtering
+        description: Coverage
+        type: choice
         options:
           - all
           - high
@@ -12,7 +13,8 @@ on:
           - low
         required: true
       model_size:
-        description: The size of the models to tests
+        description: Size of models
+        type: choice
         options:
           - regular
           - tiny
diff --git a/optimum/neuron/utils/cache_utils.py b/optimum/neuron/utils/cache_utils.py
index 5a7d069fd..e43a91eb7 100644
--- a/optimum/neuron/utils/cache_utils.py
+++ b/optimum/neuron/utils/cache_utils.py
@@ -44,6 +44,7 @@ from ...utils import logging
 from ...utils.logging import warn_once
 from .constant import NEURON_BINARIES_PATH
+from .misc import string_to_bool
 from .version_utils import get_neuronxcc_version
@@ -82,6 +83,16 @@
 _NEW_CACHE_NAMING_CONVENTION_NEURONXCC_VERSION = "2.7.0.40+f7c6cf2a3"
 
+# For testing purposes.
+_DISABLE_IS_PRIVATE_REPO_CHECK: bool = string_to_bool(
+    os.environ.get("OPTIMUM_NEURON_DISABLE_IS_PRIVATE_REPO_CHECK", "false")
+)
+if _DISABLE_IS_PRIVATE_REPO_CHECK:
+    logger.warning(
+        "The check that prevents you from pushing compiled files from private models is disabled. This is allowed only "
+        "for testing purposes."
+    )
+
 def follows_new_cache_naming_convention(neuronxcc_version: Optional[str] = None) -> bool:
     """
@@ -139,6 +150,8 @@ def create_custom_cache_repo(repo_id: str = CACHE_REPO_NAME, private: bool = Tru
 
 def is_private_repo(repo_id: str) -> bool:
+    if _DISABLE_IS_PRIVATE_REPO_CHECK:
+        return False
     HfApi().list_repo_files(repo_id=repo_id, token=HfFolder.get_token())
     private = False
     try:
diff --git a/optimum/neuron/utils/runner.py b/optimum/neuron/utils/runner.py
index 420103e96..3b163fd6f 100644
--- a/optimum/neuron/utils/runner.py
+++ b/optimum/neuron/utils/runner.py
@@ -344,9 +344,9 @@ def download_model_repo_and_override_config(
     allow_patterns = ["*.json", "*.txt"]
     if any(re.match(safetensors_model_file_pattern, filename) for filename in filenames):
         # Not downloading PyTorch checkpoints if safetensors checkpoints are available.
-        allow_patterns.append("*.bin")
-    else:
         allow_patterns.append("*.safetensors")
+    else:
+        allow_patterns.append("*.bin")
 
     directory = Path(output_dir) / model_name_or_path.split("/")[-1]
@@ -390,6 +390,7 @@ def run(
         output_dir: Optional[Union[Path, str]] = None,
         do_precompilation: bool = False,
         print_outputs: bool = False,
+        _disable_is_private_model_repo_check: bool = False,
     ) -> Tuple[int, str]:
         if num_cores <= 0 or num_cores > 32:
             raise ValueError("The number of Neuron cores to use must be between 1 and 32.")
@@ -419,6 +420,14 @@ def run(
         if self.should_install_requirements:
             self.install_requirements(script_path.parent / "requirements.txt")
 
+        def compute_max_train_samples(
+            max_steps: int, num_cores: int, tensor_parallel_size: int, per_device_train_batch_size: int
+        ) -> int:
+            total_batch_size = (num_cores // tensor_parallel_size) * per_device_train_batch_size
+            total_num_samples = max_steps * total_batch_size
+            # Add a 10% margin so the dataset is not exhausted before reaching max_steps.
+            return int(total_num_samples * 1.1)
+
         cmd = []
         cmd.append(self.python_name if num_cores == 1 else f"{self.torchrun_name} --nproc_per_node {num_cores}")
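Note for reviewers: the helper added above derives a sample budget from the step budget. Cores not consumed by tensor parallelism act as data-parallel workers, so one optimizer step consumes (num_cores // tensor_parallel_size) * per_device_train_batch_size samples. A standalone mirror of that arithmetic, with hypothetical example values:

def compute_max_train_samples(
    max_steps: int, num_cores: int, tensor_parallel_size: int, per_device_train_batch_size: int
) -> int:
    # Cores not used for tensor parallelism form the data-parallel group.
    total_batch_size = (num_cores // tensor_parallel_size) * per_device_train_batch_size
    total_num_samples = max_steps * total_batch_size
    # 10% safety margin so the dataset never runs out before max_steps.
    return int(total_num_samples * 1.1)

# e.g. 32 cores, tensor parallel degree 8, per-device batch 2, 10 steps:
# (32 // 8) * 2 * 10 = 80 samples, plus the margin -> 88.
assert compute_max_train_samples(10, 32, 8, 2) == 88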
@@ -437,6 +446,9 @@ def run(
         if max_steps is not None:
             cmd.append(f"--max_steps {max_steps}")
             max_steps_idx = len(cmd) - 1
+            max_train_samples = compute_max_train_samples(max_steps, num_cores, tensor_parallel_size, train_batch_size)
+            cmd.append(f"--max_train_samples {max_train_samples}")
+
         cmd.append("--do_train")
         if do_eval:
             cmd.append("--do_eval")
@@ -506,6 +518,10 @@ def split_args_and_value_in_command(cmd: List[str]) -> List[str]:
         else:
             cmd.append(f"--output_dir {output_dir}")
 
+        env = dict(os.environ)
+        if _disable_is_private_model_repo_check:
+            env["OPTIMUM_NEURON_DISABLE_IS_PRIVATE_REPO_CHECK"] = "true"
+
         if do_precompilation:
             # We need to update both the number of steps and the output directory specifically for the
             # precompilation step.
@@ -513,10 +529,17 @@ def split_args_and_value_in_command(cmd: List[str]) -> List[str]:
                 precompilation_cmd = list(cmd)
                 precompilation_cmd.pop(-1)  # Removing the --output_dir argument.
                 max_steps_cmd_str = "--max_steps 10"
+                max_train_samples = compute_max_train_samples(
+                    10, num_cores, tensor_parallel_size, train_batch_size
+                )
+                max_train_samples_cmd = f"--max_train_samples {max_train_samples}"
                 if max_steps_idx >= 0:
                     precompilation_cmd[max_steps_idx] = max_steps_cmd_str
+                    precompilation_cmd[max_steps_idx + 1] = max_train_samples_cmd
                 else:
                     precompilation_cmd.append(max_steps_cmd_str)
+                    precompilation_cmd.append(max_train_samples_cmd)
+
                 precompilation_cmd.append(f"--output_dir {precompilation_tmpdirname}")
                 precompilation_cmd = ["neuron_parallel_compile"] + precompilation_cmd
@@ -527,7 +550,9 @@ def split_args_and_value_in_command(cmd: List[str]) -> List[str]:
                 if print_outputs:
                     returncode, stdout = run_command_with_realtime_output(precompilation_cmd)
                 else:
-                    proc = subprocess.Popen(precompilation_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+                    proc = subprocess.Popen(
+                        precompilation_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, env=env
+                    )
                     stdout, _ = proc.communicate()
                     stdout = stdout.decode("utf-8")
                     returncode = proc.returncode
@@ -538,7 +563,7 @@ def split_args_and_value_in_command(cmd: List[str]) -> List[str]:
             if print_outputs:
                 returncode, stdout = run_command_with_realtime_output(cmd)
             else:
-                proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+                proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, env=env)
                 stdout, _ = proc.communicate()
                 stdout = stdout.decode("utf-8")
                 returncode = proc.returncode
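The runner now forwards the test-only flag to its subprocesses through a copy of the parent environment rather than mutating os.environ in place. A self-contained sketch of that pattern (the child command below is illustrative, standing in for the real training command):

import os
import subprocess
import sys

# Copy the parent environment, set the opt-out flag, pass the copy to the child.
env = dict(os.environ)
env["OPTIMUM_NEURON_DISABLE_IS_PRIVATE_REPO_CHECK"] = "true"

proc = subprocess.Popen(
    [sys.executable, "-c", "import os; print(os.environ['OPTIMUM_NEURON_DISABLE_IS_PRIVATE_REPO_CHECK'])"],
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    env=env,
)
stdout, _ = proc.communicate()
assert stdout.decode("utf-8").strip() == "true"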
diff --git a/tests/test_examples.py b/tests/test_examples.py
index 05964940a..bed75b4ec 100644
--- a/tests/test_examples.py
+++ b/tests/test_examples.py
@@ -83,9 +83,9 @@ class Coverage(str, Enum):
     ALL = "all"
 
+USE_VENV = string_to_bool(os.environ.get("USE_VENV", "false"))
 COVERAGE = Coverage(os.environ.get("COVERAGE", "all"))
 RUN_TINY = string_to_bool(os.environ.get("RUN_TINY", "false"))
-USE_VENV = string_to_bool(os.environ.get("USE_VENV", "true"))
 
 MODELS_TO_TEST_MAPPING = {
     "albert": (
@@ -173,7 +173,12 @@ class Coverage(str, Enum):
         Coverage.MIDDLE,
         {"encoder_layers": 2, "decoder_layers": 2},
     ),
-    # TODO: Llama
+    "llama": (
+        "NousResearch/Llama-2-7b-hf",
+        TPSupport.FULL,
+        Coverage.HIGH,
+        {"num_hidden_layers": 2},
+    ),
     # "wav2vec2": "facebook/wav2vec2-base",
     # Remaining: XLNet, Deberta-v2, MPNet, CLIP
 }
@@ -181,7 +186,7 @@
 def _get_supported_models_for_script(
     models_to_test: Dict[str, str],
     task_mapping: Dict[str, str],
     to_exclude: Optional[Set[str]] = None
-) -> List[str]:
+) -> List[Tuple[str, str, TPSupport, Dict[str, Any]]]:
     """
     Filters models that can perform the task from models_to_test.
     """
@@ -210,7 +215,7 @@ def _get_supported_models_for_script(
     "run_mlm": _get_supported_models_for_script(MODELS_TO_TEST_MAPPING, MODEL_FOR_MASKED_LM_MAPPING),
     "run_swag": _get_supported_models_for_script(MODELS_TO_TEST_MAPPING, MODEL_FOR_MULTIPLE_CHOICE_MAPPING),
     "run_qa": _get_supported_models_for_script(
-        MODELS_TO_TEST_MAPPING, MODEL_FOR_QUESTION_ANSWERING_MAPPING, to_exclude={"bart"}
+        MODELS_TO_TEST_MAPPING, MODEL_FOR_QUESTION_ANSWERING_MAPPING, to_exclude={"gpt2", "gpt_neo", "bart", "t5"}
     ),
     "run_summarization": _get_supported_models_for_script(
         MODELS_TO_TEST_MAPPING, MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, to_exclude={"marian", "m2m_100"}
@@ -219,10 +224,10 @@ def _get_supported_models_for_script(
         MODELS_TO_TEST_MAPPING, MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING
     ),
     "run_glue": _get_supported_models_for_script(
-        MODELS_TO_TEST_MAPPING, MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, to_exclude={"bart", "gpt2", "gpt_neo"}
+        MODELS_TO_TEST_MAPPING, MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, to_exclude={"gpt2", "gpt_neo", "bart", "t5"}
     ),
     "run_ner": _get_supported_models_for_script(
-        MODELS_TO_TEST_MAPPING, MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, to_exclude={"gpt2"}
+        MODELS_TO_TEST_MAPPING, MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, to_exclude={"gpt2", "gpt_neo"}
     ),
     "run_image_classification": _get_supported_models_for_script(
         MODELS_TO_TEST_MAPPING, MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING
@@ -301,15 +306,26 @@ def parse_loss_from_log(log: str) -> List[float]:
         return losses
 
     @staticmethod
-    def check_that_loss_is_decreasing(losses: List[float], steps: int) -> Tuple[bool, List[float], List[float]]:
-        mean_losses = []
-        num_mean_losses = len(losses) // steps
-        for i in range(num_mean_losses):
-            mean = sum(losses[i * steps : (i + 1) * steps]) / steps
-            mean_losses.append(mean)
-
-        expected_mean_losses = sorted(mean_losses, reverse=True)
-        return mean_losses == expected_mean_losses, mean_losses, expected_mean_losses
+    def check_that_loss_is_decreasing(
+        losses: List[float], window_size: int, allowed_miss_rate: float = 0.1
+    ) -> Tuple[bool, List[float]]:
+        def moving_average(values: List[float], window_size: int):
+            averages = []
+            n = len(values)
+            for i in range(n - window_size + 1):
+                window = values[i : i + window_size]
+                averages.append(sum(window) / window_size)
+            return averages
+
+        moving_average_losses = moving_average(losses, window_size)
+        num_losses = len(moving_average_losses)
+        num_misses = 0
+        num_misses_allowed = int(num_losses * allowed_miss_rate)
+        for x, y in zip(moving_average_losses[:-1], moving_average_losses[1:]):
+            if x < y:
+                num_misses += 1
+
+        return num_misses <= num_misses_allowed, moving_average_losses
 
     @classmethod
     def _create_test(
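The rewritten check above replaces "every chunked mean must be sorted in decreasing order" with a more forgiving criterion: smooth the losses with a moving average and count a "miss" whenever a consecutive pair of averages goes up, passing as long as misses stay under the allowed rate. A self-contained sketch of the same logic, with illustrative values:

from typing import List, Tuple

def check_that_loss_is_decreasing(
    losses: List[float], window_size: int, allowed_miss_rate: float = 0.1
) -> Tuple[bool, List[float]]:
    # Smooth the raw losses with a sliding mean to filter step-to-step noise.
    averages = [
        sum(losses[i : i + window_size]) / window_size
        for i in range(len(losses) - window_size + 1)
    ]
    # A miss is a pair of consecutive averages where the loss went up.
    num_misses = sum(1 for x, y in zip(averages[:-1], averages[1:]) if x < y)
    return num_misses <= int(len(averages) * allowed_miss_rate), averages

# A noisy but globally decreasing curve passes (one miss out of six pairs).
ok, _ = check_that_loss_is_decreasing(
    [3.0, 2.8, 2.9, 2.5, 2.4, 2.6, 2.0, 1.9], window_size=2, allowed_miss_rate=0.2
)
assert ok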
@@ -369,25 +385,32 @@ def test(self):
                 output_dir=tmpdirname,
                 do_precompilation=True,
                 print_outputs=True,
+                _disable_is_private_model_repo_check=True,
             )
             assert returncode == 0
 
             if self.CHECK_THAT_LOSS_IS_DECREASING:
                 losses = ExampleTestMeta.parse_loss_from_log(stdout)
-                is_decreasing, mean_losses, expected_mean_losses = ExampleTestMeta.check_that_loss_is_decreasing(
-                    losses, 50
+                allowed_miss_rate = 0.20
+                is_decreasing, moving_average_losses = ExampleTestMeta.check_that_loss_is_decreasing(
+                    # The loss might stagnate at some point, so we only check that the first 200 losses are
+                    # decreasing on average.
+                    losses[:200],
+                    4,
+                    allowed_miss_rate=allowed_miss_rate,
                 )
                 self.assertTrue(
-                    is_decreasing, f"Expected mean losses to be {expected_mean_losses} but got {mean_losses}"
+                    is_decreasing,
+                    f"The moving average loss does not decrease as expected: {moving_average_losses} (allowed miss "
+                    f"rate = {allowed_miss_rate})",
                 )
 
-            if self.DO_EVAL:
+            if not RUN_TINY and self.DO_EVAL:
                 with open(Path(tmpdirname) / "all_results.json") as fp:
                     results = json.load(fp)
-                eval_score_threshold = (
-                    self.EVAL_SCORE_THRESHOLD if not RUN_TINY else self.EVAL_SCORE_THRESHOLD_FOR_TINY
+                eval_score_threshold = ExampleTestMeta.process_class_attribute(
+                    self.EVAL_SCORE_THRESHOLD, model_type
                 )
-                eval_score_threshold = ExampleTestMeta.process_class_attribute(eval_score_threshold, model_type)
                 if self.EVAL_SCORE_GREATER_IS_BETTER:
                     self.assertGreaterEqual(float(results[self.SCORE_NAME]), eval_score_threshold)
                 else:
@@ -411,21 +434,19 @@ class ExampleTesterBase(TestCase):
     TRAIN_BATCH_SIZE: TypeOrDictOfType[int] = 2
     EVAL_BATCH_SIZE: TypeOrDictOfType[int] = 2
     GRADIENT_ACCUMULATION_STEPS: TypeOrDictOfType[int] = 1
-    SEQUENCE_LENGTH: TypeOrDictOfType[Optional[Union[int, Tuple[int, int]]]] = None
+    SEQUENCE_LENGTH: TypeOrDictOfType[Optional[Union[int, Tuple[int, int], List[int]]]] = None
     NUM_CORES: int = 32
     LOGGING_STEPS: int = 1
     SAVE_STEPS: int = 200
     TRAIN_LOSS_THRESHOLD: float
-    TRAIN_LOSS_THRESHOLD_FOR_TINY: float
     CHECK_THAT_LOSS_IS_DECREASING: TypeOrDictOfType[bool] = True
     # Camembert is pretrained on French.
     DO_EVAL: TypeOrDictOfType[bool]
     MAX_EVAL_SAMPLES: Optional[int] = None
     EVAL_SCORE_THRESHOLD: TypeOrDictOfType[float]
-    EVAL_SCORE_THRESHOLD_FOR_TINY: TypeOrDictOfType[float]
     EVAL_SCORE_GREATER_IS_BETTER: bool
     SCORE_NAME: str
@@ -440,7 +461,6 @@ class CausalLMExampleTester(ExampleTesterBase, metaclass=ExampleTestMeta, exampl
     SEQUENCE_LENGTH = 512
 
     TRAIN_LOSS_THRESHOLD = 1.5
-    TRAIN_LOSS_THRESHOLD_FOR_TINY = 2.5
 
     DO_EVAL = False
     MAX_EVAL_SAMPLES = 200
@@ -454,13 +474,11 @@ class TextClassificationExampleTester(ExampleTesterBase, metaclass=ExampleTestMe
     SEQUENCE_LENGTH = 128
 
     TRAIN_LOSS_THRESHOLD = 0.5
-    TRAIN_LOSS_THRESHOLD_FOR_TINY = 0.5
 
     # Camembert is pretrained on French.
     DO_EVAL = False  # TODO: Evaluation is broken.
     MAX_EVAL_SAMPLES = 200
     EVAL_SCORE_THRESHOLD = {"default": 0.75, "camembert": 0.5}
-    EVAL_SCORE_THRESHOLD_FOR_TINY = {"default": 0.75, "camembert": 0.5}
     EVAL_SCORE_GREATER_IS_BETTER = True
     SCORE_NAME = "eval_accuracy"
@@ -476,13 +494,11 @@ class TokenClassificationExampleTester(ExampleTesterBase, metaclass=ExampleTestM
     SEQUENCE_LENGTH = 384
 
     TRAIN_LOSS_THRESHOLD = 0.5
-    TRAIN_LOSS_THRESHOLD_FOR_TINY = 0.5
 
     # Camembert is pretrained on French.
     DO_EVAL = False  # TODO: Evaluation is broken.
     MAX_EVAL_SAMPLES = 200
     EVAL_SCORE_THRESHOLD = {"default": 0.75, "camembert": 0.5}
-    EVAL_SCORE_THRESHOLD_FOR_TINY = {"default": 0.75, "camembert": 0.5}
     EVAL_SCORE_GREATER_IS_BETTER = True
     SCORE_NAME = "eval_accuracy"
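With the *_FOR_TINY attributes gone, dict-valued attributes such as EVAL_SCORE_THRESHOLD = {"default": 0.75, "camembert": 0.5} are resolved per model type through ExampleTestMeta.process_class_attribute, whose implementation is not part of this diff. A plausible sketch of that resolution (the helper name resolve_attribute below is hypothetical):

from typing import Any, Dict, Union

def resolve_attribute(value: Union[Any, Dict[str, Any]], model_type: str) -> Any:
    # Dict-valued attributes map model types to overrides, with a "default" key
    # as the fallback; scalar attributes apply to every model type as-is.
    if isinstance(value, dict):
        return value.get(model_type, value["default"])
    return value

assert resolve_attribute({"default": 0.75, "camembert": 0.5}, "camembert") == 0.5
assert resolve_attribute({"default": 0.75, "camembert": 0.5}, "bert") == 0.75
assert resolve_attribute(0.8, "vit") == 0.8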
@@ -497,13 +513,11 @@ class MultipleChoiceExampleTester(ExampleTesterBase, metaclass=ExampleTestMeta,
     SEQUENCE_LENGTH = 512
 
     TRAIN_LOSS_THRESHOLD = 0.5
-    TRAIN_LOSS_THRESHOLD_FOR_TINY = 0.5
 
     # Camembert is pretrained on French.
     DO_EVAL = False  # TODO: Evaluation is broken.
     MAX_EVAL_SAMPLES = 200
     EVAL_SCORE_THRESHOLD = {"default": 0.75, "camembert": 0.5, "distilbert": 0.645}
-    EVAL_SCORE_THRESHOLD_FOR_TINY = {"default": 0.75, "camembert": 0.5, "distilbert": 0.645}
     EVAL_SCORE_GREATER_IS_BETTER = True
     SCORE_NAME = "eval_accuracy"
@@ -517,12 +531,10 @@ class QuestionAnsweringExampleTester(ExampleTesterBase, metaclass=ExampleTestMet
     EVAL_BATCH_SIZE = 2
 
     TRAIN_LOSS_THRESHOLD = 0.5
-    TRAIN_LOSS_THRESHOLD_FOR_TINY = 0.5
 
     DO_EVAL = False  # TODO: Evaluation is broken.
     MAX_EVAL_SAMPLES = 200
     EVAL_SCORE_THRESHOLD = {"default": 0.75, "camembert": 0.5}
-    EVAL_SCORE_THRESHOLD_FOR_TINY = {"default": 0.75, "camembert": 0.5}
     EVAL_SCORE_GREATER_IS_BETTER = True
     SCORE_NAME = "eval_f1"
@@ -537,12 +549,10 @@ class SummarizationExampleTester(ExampleTesterBase, metaclass=ExampleTestMeta, e
     SEQUENCE_LENGTH = {"default": [1024, 200], "t5": [768, 200]}
 
     TRAIN_LOSS_THRESHOLD = 0.5
-    TRAIN_LOSS_THRESHOLD_FOR_TINY = 0.5
 
     DO_EVAL = False  # TODO: Evaluation is broken.
     MAX_EVAL_SAMPLES = 200
     EVAL_SCORE_THRESHOLD = 30
-    EVAL_SCORE_THRESHOLD_FOR_TINY = 30
     SCORE_NAME = "eval_rougeLsum"
@@ -558,7 +568,6 @@ class TranslationExampleTester(ExampleTesterBase, metaclass=ExampleTestMeta, exa
     DO_EVAL = False
     MAX_EVAL_SAMPLES = 200
     EVAL_SCORE_THRESHOLD = 22
-    EVAL_SCORE_THRESHOLD_FOR_TINY = 20
     SCORE_NAME = "eval_bleu"
@@ -573,12 +582,10 @@ class ImageClassificationExampleTester(
     EVAL_BATCH_SIZE = 2
 
     TRAIN_LOSS_THRESHOLD = 0.5
-    TRAIN_LOSS_THRESHOLD_FOR_TINY = 0.5
 
     DO_EVAL = False  # TODO: Evaluation is broken.
     MAX_EVAL_SAMPLES = 200
     EVAL_SCORE_THRESHOLD = 0.8
-    EVAL_SCORE_THRESHOLD_FOR_TINY = 0.70
     EVAL_SCORE_GREATER_IS_BETTER = True
     SCORE_NAME = "eval_accuracy"
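One operational note on the new OPTIMUM_NEURON_DISABLE_IS_PRIVATE_REPO_CHECK toggle: cache_utils reads it at module import time, so it must be set before the first import. A hedged sketch, assuming a build of optimum-neuron with this patch is installed (the repo id is illustrative):

import os

# The flag must be exported before cache_utils is first imported, because
# _DISABLE_IS_PRIVATE_REPO_CHECK is evaluated at import time.
os.environ["OPTIMUM_NEURON_DISABLE_IS_PRIVATE_REPO_CHECK"] = "true"

from optimum.neuron.utils import cache_utils  # emits the warning added above

# With the flag set, is_private_repo() short-circuits and never calls the Hub.
assert cache_utils.is_private_repo("some-org/some-repo") is False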