Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ci: Restore rosetta-t5x-test #749

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 12 additions & 12 deletions .github/workflows/_ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,16 @@ jobs:
DOCKERFILE: .github/container/Dockerfile.t5x.${{ inputs.ARCHITECTURE }}
secrets: inherit

# Calls the reusable _build_rosetta.yaml workflow with t5x as the base library.
build-rosetta-t5x:
  # The base image tag comes from the upstream T5X build, so that job must
  # finish first.
  needs: build-upstream-t5x
  uses: ./.github/workflows/_build_rosetta.yaml
  secrets: inherit
  with:
    BASE_LIBRARY: t5x
    BASE_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}
    ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
    BUILD_DATE: ${{ inputs.BUILD_DATE }}

build-upstream-pax:
needs: build-jax
uses: ./.github/workflows/_build.yaml
Expand All @@ -130,16 +140,6 @@ jobs:
DOCKERFILE: .github/container/Dockerfile.pax.${{ inputs.ARCHITECTURE }}
secrets: inherit

build-rosetta-t5x:
needs: build-upstream-t5x
uses: ./.github/workflows/_build_rosetta.yaml
with:
ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
BUILD_DATE: ${{ inputs.BUILD_DATE }}
BASE_IMAGE: ${{ needs.build-upstream-t5x.outputs.DOCKER_TAG_MEALKIT }}
BASE_LIBRARY: t5x
secrets: inherit

build-rosetta-pax:
needs: build-upstream-pax
uses: ./.github/workflows/_build_rosetta.yaml
Expand Down Expand Up @@ -310,7 +310,7 @@ jobs:
test-rosetta-t5x:
needs: build-rosetta-t5x
if: inputs.ARCHITECTURE == 'amd64' # no images for arm64
uses: ./.github/workflows/_test_t5x_rosetta.yaml
uses: ./.github/workflows/_test_rosetta_t5x.yaml
with:
T5X_IMAGE: ${{ needs.build-rosetta-t5x.outputs.DOCKER_TAG_FINAL }}
secrets: inherit
Expand Down Expand Up @@ -433,7 +433,7 @@ jobs:
test-rosetta-pax:
needs: build-rosetta-pax
if: inputs.ARCHITECTURE == 'amd64' # no images for arm64
uses: ./.github/workflows/_test_pax_rosetta.yaml
uses: ./.github/workflows/_test_rosetta_pax.yaml
with:
PAX_IMAGE: ${{ needs.build-rosetta-pax.outputs.DOCKER_TAG_FINAL }}
secrets: inherit
Expand Down
97 changes: 0 additions & 97 deletions .github/workflows/_test_rosetta.yaml

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -6,34 +6,33 @@ on:
T5X_IMAGE:
type: string
description: T5X image from ghcr.io/nvidia/t5x
default: 'ghcr.io/nvidia/t5x:latest'
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To my understanding, single quotes are preferred in GitHub Actions because they are less likely to cause unintended effects (I'm not 100% sure how much practical impact it has, though). Besides, it is better to be consistent across the codebase 😄

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point! I changed my formatter's default to single-quotes

default: "ghcr.io/nvidia/t5x:latest"
required: false
BADGE_FILENAME:
type: string
description: 'Name of the endpoint JSON file for shields.io badge'
description: "Name of the endpoint JSON file for shields.io badge"
required: false
default: 'badge-rosetta-t5x-mgmn-test.json'
default: "badge-rosetta-t5x-mgmn-test.json"
ARTIFACT_NAME:
type: string
description: 'Name of the artifact zip file'
description: "Name of the artifact zip file"
required: false
default: 'artifact-rosetta-t5x-mgmn-test'
default: "artifact-rosetta-t5x-mgmn-test"
FW_NAME:
type: string
description: 'Name of the framework being used'
description: "Name of the framework being used"
required: false
default: 'rosetta-t5x'
default: "rosetta-t5x"
outputs:
TEST_STATUS:
description: 'Summary pass/fail value indicating if results from tests are acceptable'
description: "Summary pass/fail value indicating if results from tests are acceptable"
value: ${{ jobs.sitrep.outputs.STATUS }}

env:
BATCH_SIZE_PER_GPU: 32
VIT_BATCH_SIZE_PER_GPU: 256

jobs:

single-process-multi-device:
strategy:
matrix:
Expand Down Expand Up @@ -63,10 +62,10 @@ jobs:
uses: webfactory/[email protected]
with:
ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}

- name: Check out the repository under ${GITHUB_WORKSPACE}
uses: actions/checkout@v4

- name: Setup SSH known hosts
id: ssh-known-hosts
run: |
Expand Down Expand Up @@ -182,7 +181,7 @@ jobs:
dump = {'state': "${{ steps.submit.outputs.SLURM_STATE }}", 'exitcode': "${{ steps.submit.outputs.SLURM_EXITCODE }}"}
json.dump(dump, f)
EOF

- name: Generate sitrep
if: success() || failure()
shell: bash -x -e {0}
Expand All @@ -196,7 +195,7 @@ jobs:
passed_tests=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l)
failed_tests=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l)
total_tests=$(ls $EXIT_STATUSES | wc -l)

if [[ ${failed_tests} > 0 ]] || [[ ${total_tests} == 0 ]]; then
badge_message='error'
badge_color=red
Expand Down Expand Up @@ -402,7 +401,7 @@ jobs:
passed_tests=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l)
failed_tests=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l)
total_tests=$(ls $EXIT_STATUSES | wc -l)

if [[ ${failed_tests} > 0 ]] || [[ ${total_tests} == 0 ]]; then
badge_message='error'
badge_color=red
Expand All @@ -429,7 +428,7 @@ jobs:
color="${badge_color}" \
to_json schemaVersion label message color \
> output/${{ env.BADGE_FILENAME_PREFIX }}-${{ steps.meta.outputs.TEST_CASE_NAME }}.json

- name: Upload training logs as artifacts
uses: actions/upload-artifact@v4
with:
Expand Down Expand Up @@ -571,7 +570,7 @@ jobs:
passed_tests=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l)
failed_tests=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l)
total_tests=$(ls $EXIT_STATUSES | wc -l)

if [[ ${failed_tests} > 0 ]] || [[ ${total_tests} == 0 ]]; then
badge_message='error'
badge_color=red
Expand Down Expand Up @@ -744,7 +743,7 @@ jobs:
passed_tests=$(jq -r '. | select ((.state == "COMPLETED") and (.exitcode == "0")) | .state' $EXIT_STATUSES | wc -l)
failed_tests=$(jq -r '. | select ((.state != "COMPLETED") or (.exitcode != "0")) | .state' $EXIT_STATUSES | wc -l)
total_tests=$(ls $EXIT_STATUSES | wc -l)

if [[ ${failed_tests} > 0 ]] || [[ ${total_tests} == 0 ]]; then
badge_message='error'
badge_color=red
Expand All @@ -771,15 +770,21 @@ jobs:
color="${badge_color}" \
to_json schemaVersion label message color \
> output/${{ env.BADGE_FILENAME_PREFIX }}-${{ steps.meta.outputs.TEST_CASE_NAME }}.json

- name: Upload training logs as artifacts
uses: actions/upload-artifact@v4
with:
name: ${{ steps.meta.outputs.JOB_NAME }}
path: output/*

metrics:
needs: [multi-gpu-multi-node, single-process-multi-device, vit-single-process-multi-device, vit-multi-gpu-multi-node]
needs:
[
multi-gpu-multi-node,
single-process-multi-device,
vit-single-process-multi-device,
vit-multi-gpu-multi-node,
]
runs-on: ubuntu-22.04

steps:
Expand Down Expand Up @@ -810,7 +815,7 @@ jobs:
path: |
report.jsonl
*_metrics.json

sitrep:
needs: metrics
if: "!cancelled()"
Expand All @@ -820,10 +825,16 @@ jobs:
BADGE_FILENAME: ${{ inputs.BADGE_FILENAME }}
ARTIFACT_NAME: ${{ inputs.ARTIFACT_NAME }}
FW_NAME: ${{ inputs.FW_NAME }}

summary:
runs-on: ubuntu-22.04
needs: [multi-gpu-multi-node, single-process-multi-device, vit-single-process-multi-device, vit-multi-gpu-multi-node]
needs:
[
multi-gpu-multi-node,
single-process-multi-device,
vit-single-process-multi-device,
vit-multi-gpu-multi-node,
]
if: "!cancelled()"
steps:
- name: Generate TensorBoard query URL
Expand All @@ -848,3 +859,79 @@ jobs:
if [[ ${{ needs.sitrep.outputs.STATUS }} != success ]]; then
exit 1
fi

# Runs the Rosetta test suite inside the T5X image on a self-hosted V100
# runner and uploads the pytest report log as a workflow artifact.
unit-tests:
  runs-on: [self-hosted, V100]
  env:
    TEST_ARTIFACT_NAME: rosetta-test-logs
    TEST_LOG_LOCAL_PATH: /log/unit-report.jsonl
  # Job-level `env` values are not visible to other jobs. Publish the artifact
  # name as a job output so `publish-test` can read it via the `needs` context.
  outputs:
    TEST_ARTIFACT_NAME: ${{ env.TEST_ARTIFACT_NAME }}
  steps:
    - name: Print environment variables
      run: |
        env

    - name: Print GPU information
      run: nvidia-smi

    - name: Login to GitHub Container Registry
      uses: docker/login-action@v3
      with:
        registry: ghcr.io
        username: ${{ github.repository_owner }}
        password: ${{ secrets.GITHUB_TOKEN }}

    - name: Pull Rosetta image
      shell: bash -x -e {0}
      run: |
        docker pull ${{ inputs.T5X_IMAGE }}
        docker tag ${{ inputs.T5X_IMAGE }} rosetta:latest

    # The step script ({0}) is mounted into the container as /cmd.sh and run
    # there; /log is bind-mounted so the report log is written to the host.
    - name: Run Rosetta tests w/ docker
      shell: docker run --gpus all -v {0}:/cmd.sh -v /log:/log rosetta:latest bash -x -e /cmd.sh
      run: |
        ROSETTA_PATH=$(dirname $(python -c "import rosetta; print(*rosetta.__path__)"))
        pip install "${ROSETTA_PATH}[test]" pytest-reportlog
        # `|| true` keeps this step green even when tests fail, so the upload
        # step below still runs. Pass/fail is derived from the report log in
        # the `publish-test` job.
        pytest --report-log=${{ env.TEST_LOG_LOCAL_PATH }} ${ROSETTA_PATH} || true

    - name: Upload unit test json logs
      uses: actions/upload-artifact@v4
      with:
        name: ${{ env.TEST_ARTIFACT_NAME }}
        path: ${{ env.TEST_LOG_LOCAL_PATH }}

# Summarises the pytest report log into badge fields (LABEL / MESSAGE / COLOR)
# and a STATUS output through the reusable _publish_badge.yaml workflow.
publish-test:
  needs: unit-tests
  uses: ./.github/workflows/_publish_badge.yaml
  # Run even when `unit-tests` failed, so a failure badge is still produced.
  if: ( always() )
  secrets: inherit
  with:
    ENDPOINT_FILENAME: 'rosetta-unit-test-status.json'
    PUBLISH: false
    SCRIPT: |
      # The job id under `needs` must match the `needs:` entry above
      # (`unit-tests`). A mismatched id expands to an empty string, which
      # would turn this glob into "/*.jsonl".
      ARTIFACTS="${{ needs.unit-tests.outputs.TEST_ARTIFACT_NAME }}/*.jsonl"
      # Outcome of every report-log entry that is a TestReport for the "call" phase.
      all_outcomes() {
        cat $ARTIFACTS | jq -r '. | select((.["$report_type"] == "TestReport") and (.when == "call")) | .outcome'
      }
      # Number of "call"-phase TestReport entries whose outcome contains $1.
      cnt_type() {
        cat $ARTIFACTS | jq '. | select((.["$report_type"] == "TestReport") and (.when == "call") and (.outcome | contains("'${1}'"))) | .outcome' | wc -l
      }
      SKIPPED_TESTS=$(cnt_type skipped)
      FAILED_TESTS=$(cnt_type failed)
      PASSED_TESTS=$(cnt_type passed)
      TOTAL_TESTS=$(all_outcomes | wc -l)
      echo "## Unit/Integration test breakdown" | tee -a $GITHUB_STEP_SUMMARY
      all_outcomes | sort | uniq -c | tee -a $GITHUB_STEP_SUMMARY
      # Success requires at least one recorded test and zero failures; an
      # empty report counts as a failure.
      if [[ $FAILED_TESTS -eq 0 ]] && [[ $TOTAL_TESTS -gt 0 ]]; then
        BADGE_COLOR=brightgreen
        echo "STATUS=success" >> $GITHUB_OUTPUT
      else
        echo "STATUS=failure" >> $GITHUB_OUTPUT
        # Red when nothing passed, yellow when only some tests failed.
        if [[ $PASSED_TESTS -eq 0 ]]; then
          BADGE_COLOR=red
        else
          BADGE_COLOR=yellow
        fi
      fi
      echo "LABEL='V100 Unit'" >> $GITHUB_OUTPUT
      echo "MESSAGE='${PASSED_TESTS}/${SKIPPED_TESTS}/${FAILED_TESTS} pass/skip/fail'" >> $GITHUB_OUTPUT
      echo "COLOR='${BADGE_COLOR}'" >> $GITHUB_OUTPUT