Skip to content

Parallel cross entropy #51

Parallel cross entropy

Parallel cross entropy #51

# CI workflow for the Trainium test suite.
#
# Three jobs run in sequence:
#   1. start-runner          - starts an EC2 instance registered as a
#                              self-hosted GitHub runner.
#   2. optimum-neuron-tests  - installs the package and runs the tests marked
#                              `is_trainium_test` on that runner.
#   3. stop-runner           - stops the EC2 runner, whatever the outcome of
#                              the previous jobs.
name: Optimum Neuron - Common tests on Trainium

# Run only when Python sources under optimum/ change on (or target) main.
on:
  push:
    branches: [main]
    paths:
      - "optimum/**.py"
  pull_request:
    branches: [main]
    paths:
      - "optimum/**.py"

# Group runs by workflow and PR head branch (falling back to the unique run id
# when there is no head ref), and cancel a run still in progress when a newer
# one starts in the same group.
concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

jobs:
  start-runner:
    name: Start self-hosted EC2 runner
    runs-on: ubuntu-latest
    env:
      AWS_REGION: us-east-1
      EC2_AMI_ID: ${{ vars.TRAINIUM_AMI_ID }}
      EC2_INSTANCE_TYPE: trn1.2xlarge
      EC2_SUBNET_ID: subnet-859322b4,subnet-b7533b96,subnet-47cfad21,subnet-a396b2ad,subnet-06576a4b,subnet-df0f6180
      EC2_SECURITY_GROUP: sg-0bb210cd3ec725a13
      EC2_IAM_ROLE: optimum-ec2-github-actions-role
    # Exposed so the later jobs can target and then stop this exact runner.
    outputs:
      label: ${{ steps.start-ec2-runner.outputs.label }}
      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v1
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ env.AWS_REGION }}
      - name: Start EC2 runner
        id: start-ec2-runner
        # NOTE(review): this action is referenced by branch (`@main`), so its
        # code can change between runs - consider pinning to a tag or commit.
        uses: philschmid/philschmid-ec2-github-runner@main
        with:
          mode: start
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          ec2-image-id: ${{ env.EC2_AMI_ID }}
          ec2-instance-type: ${{ env.EC2_INSTANCE_TYPE }}
          subnet-id: ${{ env.EC2_SUBNET_ID }}
          security-group-id: ${{ env.EC2_SECURITY_GROUP }}
          iam-role-name: ${{ env.EC2_IAM_ROLE }}
          # optional, requires additional permissions
          aws-resource-tags: >
            [
              {"Key": "Name", "Value": "ec2-optimum-github-runner"},
              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
            ]

  optimum-neuron-tests:
    # required to start the main job when the runner is ready
    needs: start-runner
    # run the job on the newly created runner
    runs-on: ${{ needs.start-runner.outputs.label }}
    env:
      AWS_REGION: us-east-1
      # Expanded unquoted in the pytest commands below so that the shell splits
      # it into separate `--ignore <path>` arguments.
      TESTS_TO_IGNORE_FLAGS: "--ignore tests/distributed/ --ignore tests/test_examples.py"
    steps:
      - name: Checkout
        uses: actions/checkout@v2
      # - name: Install python3.8-venv
      #   run: sudo apt update; sudo apt install -y python3.8-venv
      - name: Set pip repository pointing to the Neuron repository
        run: pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
      - name: Install Python dependencies
        # Quoted so the shell never treats the extras list as a glob pattern.
        run: pip install ".[tests,neuronx]"
      - name: Run tests on Neuron cores
        # The token is handed to pytest through the step environment instead
        # of being interpolated into the script text.
        env:
          HF_TOKEN_OPTIMUM_NEURON_CI: ${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }}
          USE_VENV: "false"
        run: pytest -m "is_trainium_test" $TESTS_TO_IGNORE_FLAGS tests
      - name: Run staging tests on Neuron cores
        env:
          HUGGINGFACE_CO_STAGING: "1"
        run: pytest -m "is_trainium_test and is_staging_test" $TESTS_TO_IGNORE_FLAGS tests

  stop-runner:
    name: Stop self-hosted EC2 runner
    needs:
      - start-runner
      - optimum-neuron-tests
    runs-on: ubuntu-latest
    env:
      AWS_REGION: us-east-1
    # required to stop the runner even if the error happened in the previous jobs
    if: ${{ always() }}
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v1
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ env.AWS_REGION }}
      - name: Stop EC2 runner
        uses: philschmid/philschmid-ec2-github-runner@main
        with:
          mode: stop
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          label: ${{ needs.start-runner.outputs.label }}
          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}