# .github/workflows/test_cuda_torch_ort_training.yaml

# Display name of this workflow.
name: 'CUDA OnnxRuntime Training Tests'

# Triggers: manual dispatch, pushes to main, and pull-request activity.
on:
  # Allows the workflow to be started manually.
  workflow_dispatch:
  # Run for every push that lands on the main branch.
  push:
    branches:
      - main
  # Run when a pull request is opened, reopened, or receives new commits.
  pull_request:
    types:
      - opened
      - reopened
      - synchronize

# One active run per workflow and per pull request (or per ref when the
# event is not a pull request); a newer run cancels the one in progress.
concurrency:
  cancel-in-progress: true
  # Falls back to github.ref when there is no pull request number.
  group: '${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}'

jobs:
  # Builds the CUDA 11.8 image from docker/cuda.dockerfile, then runs the
  # CLI / CUDA / ONNX Runtime training test selection inside that image.
  build_image_and_run_gpu_tests:
    # Runner label. NOTE(review): presumably a self-hosted GPU host — confirm.
    runs-on: hf-dgx-01
    steps:
      - name: Checkout
        # v4 runs on Node 20; v3 depended on the deprecated Node 16 runtime.
        uses: actions/checkout@v4

      - name: Build image
        # Folded scalar (>-): the lines below are joined with single spaces
        # into one `docker build` invocation.
        # USER_ID / GROUP_ID forward the runner user's uid and gid as build
        # arguments to the dockerfile.
        run: >-
          docker build
          --file docker/cuda.dockerfile
          --build-arg USER_ID=$(id -u)
          --build-arg GROUP_ID=$(id -g)
          --build-arg CUDA_VERSION=11.8.0
          --build-arg TORCH_CUDA=cu118
          --tag opt-bench-cuda:11.8.0
          .

      - name: Run tests
        # Folded scalar (>-): joined into a single `docker run` invocation.
        # - The checkout is bind-mounted at /workspace/optimum-benchmark and
        #   used as the working directory; "$(pwd)" is quoted so a workspace
        #   path containing spaces is not word-split by the shell.
        # - --gpus '"device=0,1"' exposes GPUs 0 and 1; the inner double
        #   quotes keep the comma-separated device list as one value.
        # - The bash -c script installs the project with its testing,
        #   torch-ort and peft extras, runs torch_ort.configure, then runs
        #   pytest on the keyword-selected tests, stopping at the first
        #   failure (-x).
        run: >-
          docker run
          --rm
          --net host
          --pid host
          --shm-size 64G
          --env USE_CUDA="1"
          --entrypoint /bin/bash
          --volume "$(pwd):/workspace/optimum-benchmark"
          --workdir /workspace/optimum-benchmark
          --gpus '"device=0,1"'
          opt-bench-cuda:11.8.0
          -c "pip install -e .[testing,torch-ort,peft]
          && python -m torch_ort.configure
          && pytest -k 'cli and cuda and onnxruntime and training' -x"