# Add e2e test for train API #49

# End-to-end test for the train API.
#
# Going by the step names below, the job builds the training-operator,
# storage-initializer and trainer test images, deploys the operator to a Kind
# cluster, loads the other two images into that cluster and then runs the
# Python SDK e2e test for the train API.
name: E2E Test with train API

on:
  - pull_request

# One active run per workflow and ref: a newer run cancels the one in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  e2e-test:
    runs-on: ubuntu-latest
    strategy:
      # Let every matrix combination run to completion even if one fails.
      fail-fast: false
      matrix:
        # Versions stay quoted so that e.g. "3.10" is not parsed as float 3.1.
        kubernetes-version: ["v1.28.7"]
        python-version: ["3.9", "3.10", "3.11"]
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      # Local action from this repository, so it must run after the checkout.
      - name: Free-Up Disk Space
        uses: ./.github/workflows/free-up-disk-space

      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      # NOTE(review): the version pin was garbled in the source
      # ("helm/[email protected]"). "v1" is the major tag shown in the action's
      # README; restore the exact pinned release if it is known - TODO confirm.
      - name: Create k8s Kind Cluster
        uses: helm/kind-action@v1
        with:
          node_image: kindest/node:${{ matrix.kubernetes-version }}
          cluster_name: training-operator-cluster
          kubectl_version: ${{ matrix.kubernetes-version }}

      - name: Build training-operator
        run: |
          ./scripts/gha/build-image.sh
        env:
          TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test

      - name: Deploy training operator
        run: |
          ./scripts/gha/setup-training-operator.sh
        env:
          KIND_CLUSTER: training-operator-cluster
          TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test
          GANG_SCHEDULER_NAME: "none"
          KUBERNETES_VERSION: ${{ matrix.kubernetes-version }}

      # Remove every image not used by a container, then report disk usage.
      - name: Prune docker images
        shell: bash
        run: |
          docker image prune -a -f
          docker system df
          df -h

      - name: Build storage initializer and trainer
        run: |
          ./scripts/gha/setup-storage-initializer-and-trainer.sh
          docker system df
          df -h
        env:
          STORAGE_INITIALIZER_CI_IMAGE: kubeflowtraining/storage-initializer:test
          TRAINER_CI_IMAGE: kubeflowtraining/trainer:test

      # The step's env values are read as shell variables instead of being
      # spliced into the script text with ${{ env.* }}.
      - name: Load storage initializer
        run: |
          kind load docker-image "${STORAGE_INITIALIZER_CI_IMAGE}" --name "${KIND_CLUSTER}"
          docker system df
          df -h
        env:
          KIND_CLUSTER: training-operator-cluster
          STORAGE_INITIALIZER_CI_IMAGE: kubeflowtraining/storage-initializer:test

      # Delete the runner-side copy of the image after it was loaded into Kind
      # (presumably to reclaim runner disk space - the df output shows the effect).
      - name: Remove image
        run: |
          docker rmi "${STORAGE_INITIALIZER_CI_IMAGE}"
          docker system df
          df -h
        env:
          STORAGE_INITIALIZER_CI_IMAGE: kubeflowtraining/storage-initializer:test

      - name: Monitor resources usage of node
        run: |
          echo "Monitor resources usage of node"
          kubectl describe nodes training-operator-cluster-control-plane
          echo "Monitor resources usage of pods"
          kubectl get pods --all-namespaces
          echo "Monitor resources usage of storage"
          # No -it here: workflow steps run without a TTY, and requesting one
          # makes docker fail with "the input device is not a TTY".
          docker exec training-operator-cluster-control-plane df -h

      # Load the trainer image into Kind, then drop the runner-side copy.
      - name: Load trainer
        run: |
          kind load docker-image "${TRAINER_CI_IMAGE}" --name "${KIND_CLUSTER}"
          docker rmi "${TRAINER_CI_IMAGE}"
          docker system df
          df -h
        env:
          KIND_CLUSTER: training-operator-cluster
          TRAINER_CI_IMAGE: kubeflowtraining/trainer:test

      - name: Run tests
        run: |
          # Use one interpreter for both installs; the extras spec is quoted
          # so the shell cannot treat [huggingface] as a glob pattern.
          python3 -m pip install pytest
          python3 -m pip install -e "sdk/python[huggingface]"
          pytest -s sdk/python/test/e2e-train-api/test_e2e_train_api.py --log-cli-level=debug
        env:
          STORAGE_INITIALIZER_IMAGE: kubeflowtraining/storage-initializer:test
          TRAINER_TRANSFORMER_IMAGE_DEFAULT: kubeflowtraining/trainer:test