Skip to content

Commit

Permalink
Integrate new cache system for training (#472)
Browse files Browse the repository at this point in the history
* Integrate new cache system for training

* ci: reduce export and pipelines test frequency

This runs export and pipelines tests in dedicated pipelines with
stricter path filters to avoid running them on every change.

---------

Co-authored-by: David Corvoysier <[email protected]>
  • Loading branch information
michaelbenayoun and dacorvo authored Feb 16, 2024
1 parent 1b477ba commit d319856
Show file tree
Hide file tree
Showing 20 changed files with 370 additions and 224 deletions.
5 changes: 3 additions & 2 deletions .github/workflows/test_inf1_export.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: Optimum neuron / Test INF1 export
name: Optimum neuron / Test INF1 partial export

on:
push:
Expand All @@ -18,7 +18,7 @@ concurrency:

jobs:
do-the-job:
name: Run INF1 tests
name: Run INF1 export tests
runs-on: [self-hosted, 4-aws-inf1, 24-cpu, ci]
env:
AWS_REGION: us-east-1
Expand Down Expand Up @@ -46,4 +46,5 @@ jobs:
- name: Run export tests
run: |
source aws_neuron_venv_pytorch/bin/activate
export MAX_EXPORT_TEST_COMBINATIONS=1
HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} pytest -m is_inferentia_test tests/exporters
47 changes: 47 additions & 0 deletions .github/workflows/test_inf1_full_export.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Full (all model combinations) INF1 export test suite.
# Path-filtered so the expensive full matrix only runs when the neuron
# exporter sources actually change; the partial suite covers everything else.
name: Optimum neuron / Test INF1 full export

on:
  push:
    branches: [ main ]
    paths:
      - "optimum/exporters/neuron/*.py"
  pull_request:
    branches: [ main ]
    paths:
      - "optimum/exporters/neuron/*.py"

concurrency:
  # One active run per branch/PR; a newer push cancels the in-flight run.
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

jobs:
  do-the-job:
    name: Run INF1 full export tests
    runs-on: [self-hosted, 4-aws-inf1, 24-cpu, ci]
    env:
      AWS_REGION: us-east-1
    steps:
      - name: Check AMI
        # Sanity-check that the runner image ships the Neuron packages.
        run: dpkg -l | grep neuron
      - name: Checkout
        # v2 runs on the deprecated Node 12 runtime; v4 is the supported release.
        uses: actions/checkout@v4
      - name: Install system packages
        run: |
          sudo apt install python3.8-venv -y
      - name: Install python packages
        run: |
          python3 -m venv aws_neuron_venv_pytorch
          source aws_neuron_venv_pytorch/bin/activate
          python -m pip install -U pip
          python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
          python -m pip install .[neuron,tests]
          # Replace the pinned optimum pulled in by the extras with the latest release.
          python -m pip uninstall optimum -y
          python -m pip install optimum
      - name: Run CLI tests
        run: |
          source aws_neuron_venv_pytorch/bin/activate
          HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} pytest -m is_inferentia_test tests/cli
      - name: Run export tests
        run: |
          source aws_neuron_venv_pytorch/bin/activate
          HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} pytest -m is_inferentia_test tests/exporters
6 changes: 1 addition & 5 deletions .github/workflows/test_inf1_inference.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: Optimum neuron / Test INF1 inference & pipelines
name: Optimum neuron / Test INF1 inference

on:
push:
Expand Down Expand Up @@ -43,7 +43,3 @@ jobs:
run: |
source aws_neuron_venv_pytorch/bin/activate
HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} pytest -m is_inferentia_test tests/inference
- name: Run pipelines tests
run: |
source aws_neuron_venv_pytorch/bin/activate
HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} pytest -m is_inferentia_test tests/pipelines
43 changes: 43 additions & 0 deletions .github/workflows/test_inf1_pipelines.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# INF1 pipelines test suite, split out of the inference workflow so it only
# runs when the pipelines sources change (path filters below).
name: Optimum neuron / Test INF1 pipelines

on:
  push:
    branches: [ main ]
    paths:
      - "optimum/neuron/pipelines/**.py"
  pull_request:
    branches: [ main ]
    paths:
      - "optimum/neuron/pipelines/**.py"

concurrency:
  # One active run per branch/PR; a newer push cancels the in-flight run.
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

jobs:
  do-the-job:
    # Named consistently with the other INF1 workflows ("Run INF1 <suite> tests").
    name: Run INF1 pipelines tests
    runs-on: [self-hosted, 4-aws-inf1, 24-cpu, ci]
    env:
      AWS_REGION: us-east-1
    steps:
      - name: Check AMI
        # Sanity-check that the runner image ships the Neuron packages.
        run: dpkg -l | grep neuron
      - name: Checkout
        # v2 runs on the deprecated Node 12 runtime; v4 is the supported release.
        uses: actions/checkout@v4
      - name: Install system packages
        run: |
          sudo apt install python3.8-venv -y
      - name: Install python packages
        run: |
          python3 -m venv aws_neuron_venv_pytorch
          source aws_neuron_venv_pytorch/bin/activate
          python -m pip install -U pip
          python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
          python -m pip install .[neuron,tests]
          # Replace the pinned optimum pulled in by the extras with the latest release.
          python -m pip uninstall optimum -y
          python -m pip install optimum
      - name: Run pipelines tests
        run: |
          source aws_neuron_venv_pytorch/bin/activate
          HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} pytest -m is_inferentia_test tests/pipelines
5 changes: 3 additions & 2 deletions .github/workflows/test_inf2_export.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: Optimum neuron / Test INF2 export
name: Optimum neuron / Test INF2 partial export

on:
push:
Expand All @@ -18,7 +18,7 @@ concurrency:

jobs:
do-the-job:
name: Run INF2 tests
name: Run INF2 export tests
runs-on: [self-hosted, 1-aws-inf2, 32-cpu, ci] # run the job on the newly created runner
env:
AWS_REGION: us-east-1
Expand All @@ -38,4 +38,5 @@ jobs:
- name: Run exporters tests
run: |
source aws_neuron_venv_pytorch/bin/activate
export MAX_EXPORT_TEST_COMBINATIONS=1
HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} pytest -m is_inferentia_test tests/exporters
39 changes: 39 additions & 0 deletions .github/workflows/test_inf2_full_export.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Full (all model combinations) INF2 export test suite.
# Path-filtered so the expensive full matrix only runs when the neuron
# exporter sources actually change; the partial suite covers everything else.
name: Optimum neuron / Test INF2 full export

on:
  push:
    branches: [ main ]
    paths:
      - "optimum/exporters/neuron/*.py"
  pull_request:
    branches: [ main ]
    paths:
      - "optimum/exporters/neuron/*.py"

concurrency:
  # One active run per branch/PR; a newer push cancels the in-flight run.
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

jobs:
  do-the-job:
    name: Run INF2 full export tests
    runs-on: [self-hosted, 1-aws-inf2, 32-cpu, ci] # run the job on the newly created runner
    env:
      AWS_REGION: us-east-1
    steps:
      - name: Check AMI
        # Sanity-check that the runner image ships the Neuron packages.
        run: dpkg -l | grep neuron
      - name: Checkout
        # v2 runs on the deprecated Node 12 runtime; v4 is the supported release.
        uses: actions/checkout@v4
      - name: Install python dependencies
        run: |
          sudo apt install python3.8-venv -y
          python3 -m venv aws_neuron_venv_pytorch
          source aws_neuron_venv_pytorch/bin/activate
          python -m pip install -U pip
          python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
          python -m pip install .[neuronx,tests]
      - name: Run exporters tests
        run: |
          source aws_neuron_venv_pytorch/bin/activate
          HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} pytest -m is_inferentia_test tests/exporters
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,4 @@

include README.md
include LICENSE
include optimum/neuron/utils/neuron_cc_wrapper
77 changes: 19 additions & 58 deletions optimum/commands/neuron/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,6 @@
CACHE_REPO_NAME,
HF_HOME_CACHE_REPO_FILE,
create_custom_cache_repo,
list_in_registry,
load_custom_cache_repo_name_from_hf_home,
set_custom_cache_repo_name_in_hf_home,
)
from ...neuron.utils.runner import ExampleRunner
Expand Down Expand Up @@ -163,52 +161,6 @@ def run(self):
)


class ListRepoCommand(BaseOptimumCLICommand):
    """CLI command that prints the entries registered in a Neuron cache repo."""

    @staticmethod
    def parse_args(parser: "ArgumentParser"):
        # Repo name is optional: when omitted, the locally saved cache repo is used.
        parser.add_argument(
            "name",
            type=str,
            nargs="?",
            default=None,
            help="The name of the repo to list. Will use the locally saved cache repo if left unspecified.",
        )
        parser.add_argument(
            "-m",
            "--model",
            type=str,
            default=None,
            help="The model name or path of the model to consider. If left unspecified, will list all available models.",
        )
        parser.add_argument(
            "-v",
            "--version",
            type=str,
            default=None,
            help=(
                "The version of the Neuron X Compiler to consider. Will list all available versions if left "
                "unspecified."
            ),
        )

    def run(self):
        # Resolve the repo name from the local HF home config when not given explicitly.
        if self.args.name is None:
            saved_repo_name = load_custom_cache_repo_name_from_hf_home()
            if saved_repo_name is None:
                raise ValueError("No custom cache repo was set locally so you need to specify a cache repo name.")
            self.args.name = saved_repo_name

        entries = list_in_registry(
            self.args.name, model_name_or_path_or_hash=self.args.model, neuron_compiler_version=self.args.version
        )
        if not entries:
            entries = ["Nothing was found."]
        # Visually separate each entry with a ruler line.
        separator = "\n" + "=" * 50 + "\n"
        body = separator.join(entries)

        print(f"\n*** Repo id: {self.args.name} ***\n\n{body}")


class SynchronizeRepoCommand(BaseOptimumCLICommand):
@staticmethod
def parse_args(parser: "ArgumentParser"):
Expand All @@ -226,18 +178,32 @@ def parse_args(parser: "ArgumentParser"):
type=str,
help="The model_id to lookup cached versions for.",
)
parser.add_argument(
"--mode",
type=str,
choices=["training", "inference", "all"],
default="all",
help='The mode you wish to lookup compilation files for. Can be either "training", "inference" or "all"',
)
parser.add_argument("--repo_id", type=str, default=None, help="The name of the repo to use as remote cache.")

def run(self):
entries = get_hub_cached_entries(self.args.model_id, cache_repo_id=self.args.repo_id)
def _list_entries(self, mode: str):
    """Prints the hub cache entries found for ``self.args.model_id``.

    Args:
        mode: Which compilation files to look up ("training" or "inference").
    """
    entries = get_hub_cached_entries(self.args.model_id, mode, cache_repo_id=self.args.repo_id)
    n_entries = len(entries)
    # Fixed typo in the user-facing summary: "entrie(s)" -> "entry(ies)".
    output = f"\n*** {n_entries} entry(ies) found in cache for {self.args.model_id} for {mode}.***\n\n"
    for entry in entries:
        for key, value in entry.items():
            output += f"\n{key}: {value}"
        output += "\n"
    print(output)

def run(self):
    """Lists cached entries for the requested mode(s)."""
    # "all" expands to both lookup modes; otherwise honor the single requested one.
    modes = ["training", "inference"] if self.args.mode == "all" else [self.args.mode]
    for mode in modes:
        self._list_entries(mode)


class CustomCacheRepoCommand(BaseOptimumCLICommand):
SUBCOMMANDS = (
Expand All @@ -256,19 +222,14 @@ class CustomCacheRepoCommand(BaseOptimumCLICommand):
help="Add a model to the cache of your choice (trainium only).",
subcommand_class=AddToCacheRepoCommand,
),
CommandInfo(
name="list",
help="List models in a cache repo (trainium only).",
subcommand_class=ListRepoCommand,
),
CommandInfo(
name="synchronize",
help="Synchronize the neuronx compiler cache with a hub cache repo (inferentia only).",
help="Synchronize the neuronx compiler cache with a hub cache repo.",
subcommand_class=SynchronizeRepoCommand,
),
CommandInfo(
name="lookup",
help="Lookup the neuronx compiler hub cache for the specified model id (inferentia only).",
help="Lookup the neuronx compiler hub cache for the specified model id.",
subcommand_class=LookupRepoCommand,
),
)
2 changes: 1 addition & 1 deletion optimum/neuron/modeling_decoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ def __init__(
cache_entry = None if checkpoint_id is None else ModelCacheEntry(checkpoint_id, config)

# Export the model using the Optimum Neuron Cache
with hub_neuronx_cache(entry=cache_entry):
with hub_neuronx_cache("inference", entry=cache_entry):
available_cores = get_available_cores()
if num_cores > available_cores:
raise ValueError(
Expand Down
Loading

0 comments on commit d319856

Please sign in to comment.