From 9f4244909b44f0a8be3c7234d90c808728e94b71 Mon Sep 17 00:00:00 2001 From: helenxie-bit Date: Fri, 20 Sep 2024 20:56:38 -0700 Subject: [PATCH] build and verify images of storage-initializer and trainer Signed-off-by: helenxie-bit --- .github/workflows/e2e-test-train-api.yaml | 13 +++++++++++-- scripts/gha/build-image.sh | 2 ++ sdk/python/kubeflow/training/api/training_client.py | 9 +++++++++ 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/.github/workflows/e2e-test-train-api.yaml b/.github/workflows/e2e-test-train-api.yaml index 776aa65a92..20b19ec769 100644 --- a/.github/workflows/e2e-test-train-api.yaml +++ b/.github/workflows/e2e-test-train-api.yaml @@ -33,11 +33,13 @@ jobs: cluster_name: training-operator-cluster kubectl_version: ${{ matrix.kubernetes-version }} - - name: Build training-operator + - name: Build training-operator, storage-initializer, and trainer images run: | ./scripts/gha/build-image.sh env: TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test + STORAGE_INITIALIZER_CI_IMAGE: kubeflowtraining/storage-initializer:test + TRAINER_CI_IMAGE: kubeflowtraining/trainer:test - name: Deploy training operator run: | @@ -50,5 +52,12 @@ jobs: - name: Run tests run: | + kind load docker-image ${{ env.STORAGE_INITIALIZER_IMAGE }} --name ${{ env.KIND_CLUSTER }} + kind load docker-image ${{ env.TRAINER_TRANSFORMER_IMAGE_DEFAULT }} --name ${{ env.KIND_CLUSTER }} pip install pytest - python3 -m pip install -e sdk/python[huggingface]; pytest -s sdk/python/test/e2e-train-api/test_e2e_train_api.py --log-cli-level=debug + python3 -m pip install -e sdk/python[huggingface] + pytest -s sdk/python/test/e2e-train-api/test_e2e_train_api.py --log-cli-level=debug + env: + KIND_CLUSTER: training-operator-cluster + STORAGE_INITIALIZER_IMAGE: kubeflowtraining/storage-initializer:test + TRAINER_TRANSFORMER_IMAGE_DEFAULT: kubeflowtraining/trainer:test diff --git a/scripts/gha/build-image.sh b/scripts/gha/build-image.sh index cb4f0fc832..7c2947bdce 100755 --- 
a/scripts/gha/build-image.sh +++ b/scripts/gha/build-image.sh @@ -22,3 +22,5 @@ set -o nounset set -o pipefail docker build . -t ${TRAINING_CI_IMAGE} -f build/images/training-operator/Dockerfile +docker build . -t ${STORAGE_INITIALIZER_CI_IMAGE} -f sdk/python/kubeflow/storage_initializer/Dockerfile +docker build . -t ${TRAINER_CI_IMAGE} -f sdk/python/kubeflow/trainer/Dockerfile diff --git a/sdk/python/kubeflow/training/api/training_client.py b/sdk/python/kubeflow/training/api/training_client.py index 1626f18820..459e16a046 100644 --- a/sdk/python/kubeflow/training/api/training_client.py +++ b/sdk/python/kubeflow/training/api/training_client.py @@ -258,6 +258,10 @@ def train( ], volume_mounts=[constants.STORAGE_INITIALIZER_VOLUME_MOUNT], ) + base_image1=os.getenv( + "STORAGE_INITIALIZER_IMAGE", constants.STORAGE_INITIALIZER_IMAGE_DEFAULT + ) + print("base_image1: " + base_image1) # create app container spec container_spec = utils.get_container_spec( @@ -287,6 +291,11 @@ def train( volume_mounts=[constants.STORAGE_INITIALIZER_VOLUME_MOUNT], resources=resources_per_worker, ) + base_image2=os.getenv( + "TRAINER_TRANSFORMER_IMAGE_DEFAULT", + constants.TRAINER_TRANSFORMER_IMAGE_DEFAULT, + ) + print("base_image2: " + base_image2) storage_initializer_volume = models.V1Volume( name=constants.STORAGE_INITIALIZER,