huggingface · michaelbenayoun · Sep 14, 2023 · Sep 12, 2023 · Sep 13, 2023 · Sep 13, 2023
diff --git a/.github/workflows/test_trainium_examples.yml b/.github/workflows/test_trainium_examples.yml
@@ -1,12 +1,22 @@
-name: Optimum Neuron - Example scripts test on Trainium
+name: Optimum Neuron - Test Example Scripts
 
 on:
-  push:
-    branches:
-      - main
-  schedule:
-    # At the end of everyday
-    - cron: 0 22 * * *
+  workflow_dispatch:
+    inputs: 
+      priority: 
+        description: The priority of the models to test, useful to perform filtering
+        options:
+          - all
+          - high
+          - middle
+          - low
+        required: true
+      model_size: 
+        description: The size of the models to tests
+        options:
+          - regular
+          - tiny
+        required: true
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
@@ -55,16 +65,15 @@ jobs:
     runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
     env:
       AWS_REGION: us-east-1
+      RUN_TINY: ${{ github.event.inputs.model_size == "tiny" && "1" || "0" }}
     steps:
       - name: Checkout
         uses: actions/checkout@v2
       - name: Install Python dependencies
         run: pip install .[tests,neuronx]
-      - name: Install python3.8-venv
-        run: sudo apt update; sudo apt install -y python3.8-venv
       - name: Run example tests on Neuron cores
         run: |
-          HF_TOKEN_OPTIMUM_NEURON_CI=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} USE_VENV="false" RUN_SLOW=1 pytest -m "is_trainium_test" tests/test_examples.py -v
+          HF_TOKEN_OPTIMUM_NEURON_CI=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} USE_VENV="false" PRIORITY=${{ github.event.inputs.priority }} RUN_TINY=$RUN_TINY RUN_SLOW=1 pytest -m "is_trainium_test" tests/test_examples.py -v
   stop-runner:
     name: Stop self-hosted EC2 runner
     needs:

diff --git a/optimum/commands/neuron/cache.py b/optimum/commands/neuron/cache.py
@@ -24,7 +24,7 @@
     load_custom_cache_repo_name_from_hf_home,
     set_custom_cache_repo_name_in_hf_home,
 )
-from ...neuron.utils.compilation_utils import ExampleRunner
+from ...neuron.utils.runner import ExampleRunner
 from ...utils import logging
 from ..base import BaseOptimumCLICommand, CommandInfo
 

diff --git a/optimum/neuron/trainer_callback.py b/optimum/neuron/trainer_callback.py
@@ -192,7 +192,9 @@ def create_temporary_neuron_cache(cls, neuron_cache_path: Optional[Path]) -> Tem
                 path_in_neuron_cache = path_after_folder(path_in_neuron_cache, NEURON_COMPILE_CACHE_NAME)
             tmp_cache_file = tmp_neuron_cache_path / path_in_neuron_cache
             tmp_cache_file.parent.mkdir(parents=True, exist_ok=True)
-            tmp_cache_file.symlink_to(cache_file)
+            # TODO: investigate why it is needed. Minor issue.
+            if not tmp_cache_file.exists():
+                tmp_cache_file.symlink_to(cache_file)
 
             cls._insert_in_cache_stats(cache_stats, cache_file, path_in_neuron_cache)
 

diff --git a/optimum/neuron/utils/misc.py b/optimum/neuron/utils/misc.py
@@ -44,6 +44,20 @@
 logger = logging.get_logger()
 
 
+# From https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse
+def string_to_bool(v: Union[str, bool]) -> bool:
+    if isinstance(v, bool):
+        return v
+    if v.lower() in ("yes", "true", "t", "y", "1"):
+        return True
+    elif v.lower() in ("no", "false", "f", "n", "0"):
+        return False
+    else:
+        raise TypeError(
+            f"Truthy value expected: got {v} but expected one of yes/no, true/false, t/f, y/n, 1/0 (case insensitive)."
+        )
+
+
 def args_and_kwargs_to_kwargs_only(
     f: Callable,
     args: Optional[Tuple[Any, ...]] = None,
@@ -150,7 +164,7 @@ def convert_checkpoint_to_safetensors(
     if not already_exists and (not is_distributed or is_main_process):
         if log:
             logger.info(f"Converting {weight_file} to safetensors")
-        checkpoint = torch.load(weight_file)
+        checkpoint = torch.load(weight_file, map_location=torch.device("cpu"))
         data_pointers = set()
         for k, v in checkpoint.items():
             if v.data_ptr() in data_pointers:

diff --git a/optimum/neuron/utils/compilation_utils.py → optimum/neuron/utils/runner.py b/optimum/neuron/utils/compilation_utils.py → optimum/neuron/utils/runner.py
@@ -21,10 +21,15 @@
 from pathlib import Path
 from subprocess import PIPE
 from tempfile import TemporaryDirectory
-from typing import List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 import requests
-from huggingface_hub import HfFolder
+from huggingface_hub import (
+    HfApi,
+    HfFolder,
+    snapshot_download,
+)
+from transformers import AutoConfig
 
 from ...utils import logging
 from .cache_utils import get_hf_hub_cache_repos, has_write_access_to_repo, load_custom_cache_repo_name_from_hf_home
@@ -131,6 +136,8 @@ class ExampleRunner:
             "set_max_target_length": True,
             "extra_command_line_arguments": [
                 "--pad_to_max_length",
+                "--prediction_loss_only",
+                {"t5": "--source_prefix 'summarize: '"},
             ],
         },
         "translation": {
@@ -143,6 +150,7 @@ class ExampleRunner:
                 "--target_lang en",
                 "--pad_to_max_length",
                 "--prediction_loss_only",
+                {"t5": "--source_prefix 'Translate Romanian to Enligsh: '"},
             ],
         },
         "image-classification": {
@@ -155,9 +163,16 @@ class ExampleRunner:
     }
 
     def __init__(
-        self, model_name_or_path: str, task: str, example_dir: Optional[Union[str, Path]] = None, use_venv: bool = True
+        self,
+        model_name_or_path: str,
+        task: str,
+        example_dir: Optional[Union[str, Path]] = None,
+        config_overrides: Optional[Dict[str, Any]] = None,
+        use_venv: bool = True,
+        install_requirements: bool = True,
     ):
         self.model_name_or_path = model_name_or_path
+        self.config_overrides = config_overrides
 
         if task not in _TASK_TO_EXAMPLE_SCRIPT:
             supported_tasks = ", ".join(_TASK_TO_EXAMPLE_SCRIPT.keys())
@@ -177,6 +192,7 @@ def __init__(
                 self.example_dir = example_dir
 
         self.use_venv = use_venv
+        self.should_install_requirements = install_requirements
         self.venv_dir = TemporaryDirectory()
         self.python_name = "python"
         self.pip_name = "pip"
@@ -298,6 +314,42 @@ def check_user_logged_in_and_cache_repo_is_set(self):
                 f"You do not have write access to {main_repo}. Please log in and/or use a custom Tranium cache repo."
             )
 
+    def download_model_repo_and_override_config(
+        self, model_name_or_path: str, config_overrides: Dict[str, Any], output_dir: Union[str, Path]
+    ) -> Union[str, Path]:
+        if not config_overrides:
+            return model_name_or_path
+
+        filenames = HfApi().list_repo_files(repo_id=model_name_or_path, token=HfFolder.get_token())
+        safetensors_model_file_pattern = re.compile(r"\w+(-[0-9]*-of-[0-9]*)?\.safetensors")
+        allow_patterns = ["*.json", "*.txt"]
+        if any(re.match(safetensors_model_file_pattern, filename) for filename in filenames):
+            # Not downloading PyTorch checkpoints if safetensors checkpoints are available.
+            allow_patterns.append("*.bin")
+        else:
+            allow_patterns.append("*.safetensors")
+
+        directory = Path(output_dir) / model_name_or_path.split("/")[-1]
+
+        # local_dir_use_symlinks = "auto" will download big files (>= 5MB) in the cache and create symlinks in
+        # local_dir, while creating copies in local_dir for small files.
+        # Here the goal is to edit the config of the model so this solution seems optimal.
+        snapshot_download(
+            model_name_or_path, allow_patterns=allow_patterns, local_dir=directory, local_dir_use_symlinks="auto"
+        )
+
+        config = AutoConfig.from_pretrained(directory)
+
+        for name, value in config_overrides.items():
+            type_of_attribute = type(getattr(config, name))
+            if type(value) is not type_of_attribute:
+                value = type_of_attribute(value)
+            setattr(config, name, value)
+
+        config.save_pretrained(directory)
+
+        return directory
+
     def run(
         self,
         num_cores: int,
@@ -309,9 +361,16 @@ def run(
         gradient_accumulation_steps: int = 1,
         num_epochs: int = 1,
         max_steps: Optional[int] = None,
+        max_eval_samples: Optional[int] = None,
         logging_steps: int = 1,
         save_steps: int = -1,
         learning_rate: float = 1e-4,
+        tensor_parallel_size: int = 1,
+        disable_embedding_parallelization: bool = False,
+        zero_1: bool = False,
+        output_dir: Optional[Union[Path, str]] = None,
+        do_precompilation: bool = False,
+        print_outputs: bool = False,
     ) -> Tuple[int, str, str]:
         if num_cores <= 0 or num_cores > 32:
             raise ValueError("The number of Neuron cores to use must be between 1 and 32.")
@@ -338,21 +397,32 @@ def run(
                 script_path = candidates[0]
 
         # Installing requirements if needed.
-        self.install_requirements(script_path.parent / "requirements.txt")
+        if self.should_install_requirements:
+            self.install_requirements(script_path.parent / "requirements.txt")
 
         cmd = []
 
         cmd.append(self.python_name if num_cores == 1 else f"{self.torchrun_name} --nproc_per_node {num_cores}")
         cmd.append(script_path.as_posix())
-        cmd.append(f"--model_name_or_path {self.model_name_or_path}")
+
+        model_name_or_path = self.model_name_or_path
+        if self.config_overrides is not None:
+            model_name_or_path = self.download_model_repo_and_override_config(
+                self.model_name_or_path, self.config_overrides, tmpdir.name
+            )
+        cmd.append(f"--model_name_or_path {model_name_or_path}")
 
         # Training steps and batch sizes.
         cmd.append(f"--num_train_epochs {num_epochs}")
+        max_steps_idx = -1
         if max_steps is not None:
             cmd.append(f"--max_steps {max_steps}")
+            max_steps_idx = len(cmd) - 1
         cmd.append("--do_train")
         if do_eval:
             cmd.append("--do_eval")
+            if max_eval_samples is not None:
+                cmd.append("--max_eval_samples {max_eval_samples}")
         cmd.append(f"--learning_rate {learning_rate}")
         cmd.append(f"--per_device_train_batch_size {train_batch_size}")
         if do_eval:
@@ -369,11 +439,20 @@ def run(
         cmd.append(f"--save_steps {save_steps}")
         cmd.append("--save_total_limit 1")
 
+        # Parallelism
+        if tensor_parallel_size > 1:
+            cmd.append(f"--tensor_parallel_size {tensor_parallel_size}")
+        if disable_embedding_parallelization:
+            cmd.append("--disable_embedding_parallelization")
+        if zero_1:
+            cmd.append("--zero_1")
+
         if precision is Precision.bf16:
             cmd.append("--bf16")
 
         # Dataset
         arguments = self._TASK_TO_COMMAND_ARGUMENTS[self.task]
+        model_type = AutoConfig.from_pretrained(model_name_or_path).model_type
         for name, value in arguments.items():
             if name == "set_max_length":
                 if isinstance(sequence_length, (tuple, list)):
@@ -391,7 +470,10 @@ def run(
                     cmd.append(f"--max_target_length {sequence_length[1]}")
             elif name == "extra_command_line_arguments":
                 for argument in value:
-                    cmd.append(argument)
+                    if isinstance(argument, dict):
+                        argument = argument.get(model_type, argument.get("default", None))
+                    if argument is not None:
+                        cmd.append(argument)
             else:
                 cmd.append(f"--{name} {value}")
 
@@ -400,16 +482,48 @@ def split_args_and_value_in_command(cmd: List[str]) -> List[str]:
             return [x for y in cmd for x in re.split(pattern, y) if x]
 
         with TemporaryDirectory() as tmpdirname:
-            cmd.append(f"--output_dir {tmpdirname}")
+            if output_dir is None:
+                cmd.append(f"--output_dir {tmpdirname}")
+            else:
+                cmd.append(f"--output_dir {output_dir}")
+
+            if do_precompilation:
+                # We need to update both the number of steps and the output directory specifically for the
+                # precompilation step.
+                with TemporaryDirectory() as precompilation_tmpdirname:
+                    precompilation_cmd = list(cmd)
+                    precompilation_cmd.pop(-1)  # Removing the --output_dir argument.
+                    max_steps_cmd_str = "--max_steps 10"
+                    if max_steps_idx >= 0:
+                        precompilation_cmd[max_steps_idx] = max_steps_cmd_str
+                    else:
+                        precompilation_cmd.append(max_steps_cmd_str)
+                    precompilation_cmd.append(f"--output_dir {precompilation_tmpdirname}")
+                    precompilation_cmd = ["neuron_parallel_compile"] + precompilation_cmd
+
+                    precompilation_cmd = split_args_and_value_in_command(precompilation_cmd)
+
+                    print(f"RUNNING PRECOMPILATION COMMAND:\n{' '.join(precompilation_cmd)}")
+
+                    proc = subprocess.Popen(precompilation_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+                    stdout, stderr = proc.communicate()
+                    stdout = stdout.decode("utf-8")
+                    stderr = stderr.decode("utf-8")
+
+                    if print_outputs:
+                        print(f"Precompilation standard output:\n{stdout}")
+                        print(f"Precompilation standard error:\n{stderr}")
 
             cmd = split_args_and_value_in_command(cmd)
-
             print(f"RUNNING COMMAND:\n{' '.join(cmd)}")
 
             proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
             stdout, stderr = proc.communicate()
             stdout = stdout.decode("utf-8")
             stderr = stderr.decode("utf-8")
+            if print_outputs:
+                print(f"Standard output:\n{stdout}")
+                print(f"Standard error:\n{stderr}")
 
         tmpdir.cleanup()
 

diff --git a/setup.py b/setup.py
@@ -29,7 +29,7 @@
     "sentencepiece",
     "datasets",
     "sacremoses",
-    "diffusers >= 0.20.0",
+    "diffusers==0.20.2",
     "safetensors",
 ]
 
@@ -58,7 +58,7 @@
         "transformers-neuronx>=0.6.106",
         "torch==1.13.1.*",
         "torchvision==0.14.*",
-        "neuronx_distributed >= 0.3.0",
+        "neuronx_distributed>=0.3.0",
     ],
     "diffusers": ["diffusers"],
 }