Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

test_examples uses ExampleRunner #227

Merged
merged 14 commits into from
Sep 14, 2023
29 changes: 19 additions & 10 deletions .github/workflows/test_trainium_examples.yml
Original file line number Diff line number Diff line change
@@ -1,12 +1,22 @@
name: Optimum Neuron - Example scripts test on Trainium
name: Optimum Neuron - Test Example Scripts

on:
push:
branches:
- main
schedule:
# At the end of everyday
- cron: 0 22 * * *
workflow_dispatch:
inputs:
priority:
description: The priority of the models to test, useful to perform filtering
options:
- all
- high
- middle
- low
required: true
model_size:
description: The size of the models to tests
options:
- regular
- tiny
required: true

concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
Expand Down Expand Up @@ -55,16 +65,15 @@ jobs:
runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
env:
AWS_REGION: us-east-1
RUN_TINY: ${{ github.event.inputs.model_size == "tiny" && "1" || "0" }}
steps:
- name: Checkout
uses: actions/checkout@v2
- name: Install Python dependencies
run: pip install .[tests,neuronx]
- name: Install python3.8-venv
run: sudo apt update; sudo apt install -y python3.8-venv
- name: Run example tests on Neuron cores
run: |
HF_TOKEN_OPTIMUM_NEURON_CI=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} USE_VENV="false" RUN_SLOW=1 pytest -m "is_trainium_test" tests/test_examples.py -v
HF_TOKEN_OPTIMUM_NEURON_CI=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} USE_VENV="false" PRIORITY=${{ github.event.inputs.priority }} RUN_TINY=$RUN_TINY RUN_SLOW=1 pytest -m "is_trainium_test" tests/test_examples.py -v
stop-runner:
name: Stop self-hosted EC2 runner
needs:
Expand Down
2 changes: 1 addition & 1 deletion optimum/commands/neuron/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
load_custom_cache_repo_name_from_hf_home,
set_custom_cache_repo_name_in_hf_home,
)
from ...neuron.utils.compilation_utils import ExampleRunner
from ...neuron.utils.runner import ExampleRunner
from ...utils import logging
from ..base import BaseOptimumCLICommand, CommandInfo

Expand Down
4 changes: 3 additions & 1 deletion optimum/neuron/trainer_callback.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,9 @@ def create_temporary_neuron_cache(cls, neuron_cache_path: Optional[Path]) -> Tem
path_in_neuron_cache = path_after_folder(path_in_neuron_cache, NEURON_COMPILE_CACHE_NAME)
tmp_cache_file = tmp_neuron_cache_path / path_in_neuron_cache
tmp_cache_file.parent.mkdir(parents=True, exist_ok=True)
tmp_cache_file.symlink_to(cache_file)
# TODO: investigate why it is needed. Minor issue.
if not tmp_cache_file.exists():
tmp_cache_file.symlink_to(cache_file)

cls._insert_in_cache_stats(cache_stats, cache_file, path_in_neuron_cache)

Expand Down
16 changes: 15 additions & 1 deletion optimum/neuron/utils/misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,20 @@
logger = logging.get_logger()


# From https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse
def string_to_bool(v: Union[str, bool]) -> bool:
if isinstance(v, bool):
return v
if v.lower() in ("yes", "true", "t", "y", "1"):
return True
elif v.lower() in ("no", "false", "f", "n", "0"):
return False
else:
raise TypeError(
f"Truthy value expected: got {v} but expected one of yes/no, true/false, t/f, y/n, 1/0 (case insensitive)."
)


def args_and_kwargs_to_kwargs_only(
f: Callable,
args: Optional[Tuple[Any, ...]] = None,
Expand Down Expand Up @@ -150,7 +164,7 @@ def convert_checkpoint_to_safetensors(
if not already_exists and (not is_distributed or is_main_process):
if log:
logger.info(f"Converting {weight_file} to safetensors")
checkpoint = torch.load(weight_file)
checkpoint = torch.load(weight_file, map_location=torch.device("cpu"))
data_pointers = set()
for k, v in checkpoint.items():
if v.data_ptr() in data_pointers:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,15 @@
from pathlib import Path
from subprocess import PIPE
from tempfile import TemporaryDirectory
from typing import List, Optional, Tuple, Union
from typing import Any, Dict, List, Optional, Tuple, Union

import requests
from huggingface_hub import HfFolder
from huggingface_hub import (
HfApi,
HfFolder,
snapshot_download,
)
from transformers import AutoConfig

from ...utils import logging
from .cache_utils import get_hf_hub_cache_repos, has_write_access_to_repo, load_custom_cache_repo_name_from_hf_home
Expand Down Expand Up @@ -131,6 +136,8 @@ class ExampleRunner:
"set_max_target_length": True,
"extra_command_line_arguments": [
"--pad_to_max_length",
"--prediction_loss_only",
{"t5": "--source_prefix 'summarize: '"},
],
},
"translation": {
Expand All @@ -143,6 +150,7 @@ class ExampleRunner:
"--target_lang en",
"--pad_to_max_length",
"--prediction_loss_only",
{"t5": "--source_prefix 'Translate Romanian to Enligsh: '"},
michaelbenayoun marked this conversation as resolved.
Show resolved Hide resolved
],
},
"image-classification": {
Expand All @@ -155,9 +163,16 @@ class ExampleRunner:
}

def __init__(
self, model_name_or_path: str, task: str, example_dir: Optional[Union[str, Path]] = None, use_venv: bool = True
self,
model_name_or_path: str,
task: str,
example_dir: Optional[Union[str, Path]] = None,
config_overrides: Optional[Dict[str, Any]] = None,
use_venv: bool = True,
install_requirements: bool = True,
):
self.model_name_or_path = model_name_or_path
self.config_overrides = config_overrides

if task not in _TASK_TO_EXAMPLE_SCRIPT:
supported_tasks = ", ".join(_TASK_TO_EXAMPLE_SCRIPT.keys())
Expand All @@ -177,6 +192,7 @@ def __init__(
self.example_dir = example_dir

self.use_venv = use_venv
self.should_install_requirements = install_requirements
self.venv_dir = TemporaryDirectory()
self.python_name = "python"
self.pip_name = "pip"
Expand Down Expand Up @@ -298,6 +314,42 @@ def check_user_logged_in_and_cache_repo_is_set(self):
f"You do not have write access to {main_repo}. Please log in and/or use a custom Tranium cache repo."
)

def download_model_repo_and_override_config(
self, model_name_or_path: str, config_overrides: Dict[str, Any], output_dir: Union[str, Path]
) -> Union[str, Path]:
if not config_overrides:
return model_name_or_path

filenames = HfApi().list_repo_files(repo_id=model_name_or_path, token=HfFolder.get_token())
safetensors_model_file_pattern = re.compile(r"\w+(-[0-9]*-of-[0-9]*)?\.safetensors")
allow_patterns = ["*.json", "*.txt"]
if any(re.match(safetensors_model_file_pattern, filename) for filename in filenames):
# Not downloading PyTorch checkpoints if safetensors checkpoints are available.
allow_patterns.append("*.bin")
else:
allow_patterns.append("*.safetensors")

directory = Path(output_dir) / model_name_or_path.split("/")[-1]

# local_dir_use_symlinks = "auto" will download big files (>= 5MB) in the cache and create symlinks in
# local_dir, while creating copies in local_dir for small files.
# Here the goal is to edit the config of the model so this solution seems optimal.
snapshot_download(
model_name_or_path, allow_patterns=allow_patterns, local_dir=directory, local_dir_use_symlinks="auto"
)

config = AutoConfig.from_pretrained(directory)

for name, value in config_overrides.items():
type_of_attribute = type(getattr(config, name))
if type(value) is not type_of_attribute:
value = type_of_attribute(value)
setattr(config, name, value)

config.save_pretrained(directory)

return directory

def run(
self,
num_cores: int,
Expand All @@ -309,9 +361,16 @@ def run(
gradient_accumulation_steps: int = 1,
num_epochs: int = 1,
max_steps: Optional[int] = None,
max_eval_samples: Optional[int] = None,
logging_steps: int = 1,
save_steps: int = -1,
learning_rate: float = 1e-4,
tensor_parallel_size: int = 1,
disable_embedding_parallelization: bool = False,
zero_1: bool = False,
output_dir: Optional[Union[Path, str]] = None,
do_precompilation: bool = False,
print_outputs: bool = False,
) -> Tuple[int, str, str]:
if num_cores <= 0 or num_cores > 32:
raise ValueError("The number of Neuron cores to use must be between 1 and 32.")
Expand All @@ -338,21 +397,32 @@ def run(
script_path = candidates[0]

# Installing requirements if needed.
self.install_requirements(script_path.parent / "requirements.txt")
if self.should_install_requirements:
self.install_requirements(script_path.parent / "requirements.txt")

cmd = []

cmd.append(self.python_name if num_cores == 1 else f"{self.torchrun_name} --nproc_per_node {num_cores}")
cmd.append(script_path.as_posix())
cmd.append(f"--model_name_or_path {self.model_name_or_path}")

model_name_or_path = self.model_name_or_path
if self.config_overrides is not None:
model_name_or_path = self.download_model_repo_and_override_config(
self.model_name_or_path, self.config_overrides, tmpdir.name
)
cmd.append(f"--model_name_or_path {model_name_or_path}")

# Training steps and batch sizes.
cmd.append(f"--num_train_epochs {num_epochs}")
max_steps_idx = -1
if max_steps is not None:
cmd.append(f"--max_steps {max_steps}")
max_steps_idx = len(cmd) - 1
cmd.append("--do_train")
if do_eval:
cmd.append("--do_eval")
if max_eval_samples is not None:
cmd.append("--max_eval_samples {max_eval_samples}")
cmd.append(f"--learning_rate {learning_rate}")
cmd.append(f"--per_device_train_batch_size {train_batch_size}")
if do_eval:
Expand All @@ -369,11 +439,20 @@ def run(
cmd.append(f"--save_steps {save_steps}")
cmd.append("--save_total_limit 1")

# Parallelism
if tensor_parallel_size > 1:
cmd.append(f"--tensor_parallel_size {tensor_parallel_size}")
if disable_embedding_parallelization:
cmd.append("--disable_embedding_parallelization")
if zero_1:
cmd.append("--zero_1")

if precision is Precision.bf16:
cmd.append("--bf16")

# Dataset
arguments = self._TASK_TO_COMMAND_ARGUMENTS[self.task]
model_type = AutoConfig.from_pretrained(model_name_or_path).model_type
for name, value in arguments.items():
if name == "set_max_length":
if isinstance(sequence_length, (tuple, list)):
Expand All @@ -391,7 +470,10 @@ def run(
cmd.append(f"--max_target_length {sequence_length[1]}")
elif name == "extra_command_line_arguments":
for argument in value:
cmd.append(argument)
if isinstance(argument, dict):
argument = argument.get(model_type, argument.get("default", None))
if argument is not None:
cmd.append(argument)
else:
cmd.append(f"--{name} {value}")

Expand All @@ -400,16 +482,48 @@ def split_args_and_value_in_command(cmd: List[str]) -> List[str]:
return [x for y in cmd for x in re.split(pattern, y) if x]

with TemporaryDirectory() as tmpdirname:
cmd.append(f"--output_dir {tmpdirname}")
if output_dir is None:
cmd.append(f"--output_dir {tmpdirname}")
else:
cmd.append(f"--output_dir {output_dir}")

if do_precompilation:
# We need to update both the number of steps and the output directory specifically for the
# precompilation step.
with TemporaryDirectory() as precompilation_tmpdirname:
precompilation_cmd = list(cmd)
precompilation_cmd.pop(-1) # Removing the --output_dir argument.
max_steps_cmd_str = "--max_steps 10"
if max_steps_idx >= 0:
precompilation_cmd[max_steps_idx] = max_steps_cmd_str
else:
precompilation_cmd.append(max_steps_cmd_str)
precompilation_cmd.append(f"--output_dir {precompilation_tmpdirname}")
precompilation_cmd = ["neuron_parallel_compile"] + precompilation_cmd

precompilation_cmd = split_args_and_value_in_command(precompilation_cmd)

print(f"RUNNING PRECOMPILATION COMMAND:\n{' '.join(precompilation_cmd)}")

proc = subprocess.Popen(precompilation_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = proc.communicate()
stdout = stdout.decode("utf-8")
stderr = stderr.decode("utf-8")

if print_outputs:
print(f"Precompilation standard output:\n{stdout}")
print(f"Precompilation standard error:\n{stderr}")

cmd = split_args_and_value_in_command(cmd)

print(f"RUNNING COMMAND:\n{' '.join(cmd)}")

proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = proc.communicate()
stdout = stdout.decode("utf-8")
stderr = stderr.decode("utf-8")
if print_outputs:
print(f"Standard output:\n{stdout}")
print(f"Standard error:\n{stderr}")

tmpdir.cleanup()

Expand Down
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
"sentencepiece",
"datasets",
"sacremoses",
"diffusers >= 0.20.0",
"diffusers==0.20.2",
"safetensors",
]

Expand Down Expand Up @@ -58,7 +58,7 @@
"transformers-neuronx>=0.6.106",
"torch==1.13.1.*",
"torchvision==0.14.*",
"neuronx_distributed >= 0.3.0",
"neuronx_distributed>=0.3.0",
],
"diffusers": ["diffusers"],
}
Expand Down
Loading
Loading