# e2e-test-train-api.yaml
# End-to-end test of the SDK train API.
#
# For every Python version in the matrix this workflow creates a Kind cluster,
# builds and deploys the training operator, builds the storage-initializer and
# trainer images, loads them into the cluster, and runs the e2e pytest suite.
# Several steps print `docker system df` / `df -h` so disk usage on the runner
# can be tracked across the run.
name: E2E Test with train API

on:
  - pull_request

# One run per workflow + ref; a newer run cancels the one still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  e2e-test:
    runs-on: ubuntu-latest
    strategy:
      # Let the remaining matrix entries finish when one of them fails.
      fail-fast: false
      matrix:
        kubernetes-version: ["v1.28.7"]
        # Quoted so that 3.10 is not parsed as the float 3.1.
        python-version: ["3.9", "3.10", "3.11"]
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      # Local action from this repository; requires the checkout above.
      - name: Free-Up Disk Space
        uses: ./.github/workflows/free-up-disk-space

      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      # NOTE(review): the action ref was garbled in the reviewed copy of this
      # file; v1.10.0 is an existing release that accepts these inputs.
      # Confirm the intended pin.
      - name: Create k8s Kind Cluster
        uses: helm/kind-action@v1.10.0
        with:
          node_image: kindest/node:${{ matrix.kubernetes-version }}
          cluster_name: training-operator-cluster
          kubectl_version: ${{ matrix.kubernetes-version }}

      - name: Build training-operator
        run: |
          ./scripts/gha/build-image.sh
        env:
          TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test

      - name: Deploy training operator
        run: |
          ./scripts/gha/setup-training-operator.sh
        env:
          KIND_CLUSTER: training-operator-cluster
          TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test
          GANG_SCHEDULER_NAME: "none"
          KUBERNETES_VERSION: ${{ matrix.kubernetes-version }}

      # Remove every image not used by a container, then report disk usage.
      - name: Prune docker images
        shell: bash
        run: |
          docker image prune -a -f
          docker system df
          df -h

      - name: Build storage initializer and trainer
        run: |
          ./scripts/gha/setup-storage-initializer-and-trainer.sh
          docker system df
          df -h
        env:
          STORAGE_INITIALIZER_CI_IMAGE: kubeflowtraining/storage-initializer:test
          TRAINER_CI_IMAGE: kubeflowtraining/trainer:test

      # The step-level env values are read as shell variables rather than
      # being pasted into the script text via ${{ env.* }}.
      - name: Load storage initializer
        run: |
          kind load docker-image "${STORAGE_INITIALIZER_CI_IMAGE}" --name "${KIND_CLUSTER}"
          docker system df
          df -h
        env:
          KIND_CLUSTER: training-operator-cluster
          STORAGE_INITIALIZER_CI_IMAGE: kubeflowtraining/storage-initializer:test

      # The image has been loaded into the cluster; drop the runner's copy.
      - name: Remove image
        run: |
          docker rmi "${STORAGE_INITIALIZER_CI_IMAGE}"
          docker system df
          df -h
        env:
          STORAGE_INITIALIZER_CI_IMAGE: kubeflowtraining/storage-initializer:test

      - name: Monitor resources usage of node
        run: |
          echo "Monitor resources usage of node"
          kubectl describe nodes training-operator-cluster-control-plane
          echo "Monitor resources usage of pods"
          kubectl get pods --all-namespaces
          echo "Monitor resources usage of storage"
          # No -it: the runner has no TTY, and requesting one makes
          # `docker exec` fail with "the input device is not a TTY".
          docker exec training-operator-cluster-control-plane df -h

      # Load the trainer image into the cluster, then drop the runner's copy.
      - name: Load trainer
        run: |
          kind load docker-image "${TRAINER_CI_IMAGE}" --name "${KIND_CLUSTER}"
          docker rmi "${TRAINER_CI_IMAGE}"
          docker system df
          df -h
        env:
          KIND_CLUSTER: training-operator-cluster
          TRAINER_CI_IMAGE: kubeflowtraining/trainer:test

      - name: Run tests
        run: |
          # Install through the same interpreter for both packages.
          python3 -m pip install pytest
          # Quoted so the shell never treats [huggingface] as a glob pattern.
          python3 -m pip install -e "sdk/python[huggingface]"
          pytest -s sdk/python/test/e2e-train-api/test_e2e_train_api.py --log-cli-level=debug
        env:
          STORAGE_INITIALIZER_IMAGE: kubeflowtraining/storage-initializer:test
          TRAINER_TRANSFORMER_IMAGE_DEFAULT: kubeflowtraining/trainer:test