# TorchBench Userbenchmark on A100
# Workflow name and triggers.
name: TorchBench Userbenchmark on A100
on:
  schedule:
    # run at 6:00 PM UTC, K8s containers will roll out at 12PM EST
    - cron: '00 18 * * *'
  # Manual trigger. Both inputs are optional free-form strings; when
  # userbenchmark_name is left empty the run step falls back to the
  # scheduler script instead of running a single named benchmark.
  workflow_dispatch:
    inputs:
      userbenchmark_name:
        description: "Name of the user benchmark to run"
      userbenchmark_options:
        description: "Option of the user benchmark to run"
jobs:
  # Single job: clone a scratch conda env, install TorchBench, run either the
  # scheduled set of userbenchmarks or one manually requested benchmark,
  # publish the results, and always remove the scratch env afterwards.
  run-userbenchmark:
    runs-on: [a100-runner]
    timeout-minutes: 1440  # 24 hours
    environment: docker-s3-upload
    env:
      # Existing conda env that is cloned for this run.
      BASE_CONDA_ENV: "torchbench"
      # Name of the per-run clone; removed in the final clean-up step.
      CONDA_ENV: "userbenchmark-a100"
      PLATFORM_NAME: "gcp_a100"
      TORCHBENCH_USERBENCHMARK_SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.TORCHBENCH_USERBENCHMARK_SCRIBE_GRAPHQL_ACCESS_TOKEN }}
      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
      # Script sourced at the top of every shell step; it reads CONDA_ENV
      # (presumably to activate that env -- TODO confirm against the script).
      SETUP_SCRIPT: "/workspace/setup_instance.sh"
    steps:
      - name: Checkout TorchBench
        uses: actions/checkout@v3
        with:
          path: benchmark
      - name: Tune Nvidia GPU
        run: |
          sudo nvidia-smi -pm 1
          sudo nvidia-smi -ac 1215,1410
          nvidia-smi
      - name: Clone and setup conda env
        run: |
          # Source the setup script against the base env, then clone it.
          CONDA_ENV=${BASE_CONDA_ENV} . "${SETUP_SCRIPT}"
          # --yes: never wait on a confirmation prompt in a non-interactive job.
          conda create --yes --name "${CONDA_ENV}" --clone "${BASE_CONDA_ENV}"
      - name: Install TorchBench
        run: |
          set -x
          . "${SETUP_SCRIPT}"
          pushd benchmark
          python install.py
      - name: Run user benchmark
        env:
          # Pass workflow_dispatch inputs through the environment rather than
          # interpolating them into the script text, so that user-supplied
          # input is never parsed as shell code (script-injection hardening).
          USERBENCHMARK_NAME: ${{ github.event.inputs.userbenchmark_name }}
          USERBENCHMARK_OPTIONS: ${{ github.event.inputs.userbenchmark_options }}
        run: |
          set -x
          . "${SETUP_SCRIPT}"
          # remove old results (rm -f is a no-op when the path is absent)
          rm -rf benchmark-output
          pushd benchmark
          rm -rf .userbenchmark
          if [ -z "${USERBENCHMARK_NAME}" ]; then
            # Scheduled run (or manual run without a name):
            # figure out what userbenchmarks we should run, and run it
            python ./.github/scripts/userbenchmark/schedule-benchmarks.py --platform "${PLATFORM_NAME}"
            if [ -d ./.userbenchmark ]; then
              cp -r ./.userbenchmark ../benchmark-output
            else
              # Nothing was produced; still create the directory that the
              # following steps reference.
              mkdir ../benchmark-output
            fi
          else
            # Manual run of one named benchmark. Options are split on
            # whitespace into separate arguments; shell quoting inside the
            # options string is NOT interpreted.
            read -r -a USERBENCHMARK_ARGS <<< "${USERBENCHMARK_OPTIONS}"
            python run_benchmark.py "${USERBENCHMARK_NAME}" "${USERBENCHMARK_ARGS[@]}"
            cp -r ./.userbenchmark/"${USERBENCHMARK_NAME}" ../benchmark-output
          fi
      # NOTE(review): GitHub has deprecated v3 of actions/upload-artifact;
      # consider moving to v4 once runner compatibility is confirmed.
      - name: Upload artifact
        uses: actions/upload-artifact@v3
        with:
          name: TorchBench result
          path: benchmark-output/
      - name: Upload result jsons to Scribe and S3
        run: |
          . "${SETUP_SCRIPT}"
          pushd benchmark
          # Collect metrics files (reverse-sorted by path), one array element
          # per line so paths are not re-split on whitespace. -maxdepth comes
          # before -name because GNU find warns when it follows a test.
          mapfile -t RESULTS < <(find "${PWD}/../benchmark-output" -maxdepth 2 -name "metrics-*.json" | sort -r)
          # [*] prints every element; a bare ${RESULTS} is only the first one.
          echo "Uploading result jsons: ${RESULTS[*]}"
          for r in "${RESULTS[@]}"; do
            python ./scripts/userbenchmark/upload_scribe.py --userbenchmark_json "${r}" --userbenchmark_platform "${PLATFORM_NAME}"
            python ./scripts/userbenchmark/upload_s3.py --upload-file "${r}" --userbenchmark_platform "${PLATFORM_NAME}"
          done
      - name: Clean up Conda env
        # Run even when an earlier step failed so the cloned env does not
        # linger on the runner.
        if: always()
        run: |
          . "${SETUP_SCRIPT}"
          conda deactivate && conda deactivate
          # --yes: never wait on a confirmation prompt in a non-interactive job.
          conda remove -n "${CONDA_ENV}" --all --yes