separate e2e test for train API
Signed-off-by: helenxie-bit <[email protected]>
helenxie-bit committed Sep 3, 2024
1 parent 85fd8e6 commit 1a0c455
Showing 4 changed files with 156 additions and 81 deletions.
59 changes: 59 additions & 0 deletions .github/workflows/e2e-test-train-api.yaml
@@ -0,0 +1,59 @@
name: E2E Test with train API
on:
- pull_request

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
e2e-test:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
kubernetes-version: ["v1.27.11", "v1.28.7", "v1.29.2"]
python-version: ["3.9", "3.10", "3.11"]
steps:
- name: Checkout
uses: actions/checkout@v4

- name: Free-Up Disk Space
uses: ./.github/workflows/free-up-disk-space

- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}

- name: Setup Go
uses: actions/setup-go@v5
with:
go-version-file: go.mod

- name: Create k8s Kind Cluster
        uses: helm/kind-action@v1.10.0
with:
node_image: kindest/node:${{ matrix.kubernetes-version }}
cluster_name: training-operator-cluster
kubectl_version: ${{ matrix.kubernetes-version }}

- name: Build training-operator
run: |
./scripts/gha/build-image.sh
env:
TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test

- name: Deploy training operator
run: |
./scripts/gha/setup-training-operator.sh
env:
KIND_CLUSTER: training-operator-cluster
TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test
GANG_SCHEDULER_NAME: "none"
KUBERNETES_VERSION: ${{ matrix.kubernetes-version }}

- name: Run tests
run: |
pip install pytest
python3 -m pip install -e sdk/python -r sdk/python/kubeflow/trainer/requirements.txt; pytest -s sdk/python/test_train_api/test_e2e_train_api.py --log-cli-level=debug
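
For local debugging, the same suite can be run outside of CI through pytest's Python entry point. A minimal sketch, assuming the SDK and trainer requirements from the step above are already installed and the current kubeconfig points at a cluster where the training operator is deployed:

import sys

import pytest

# Hypothetical local runner, mirroring the "Run tests" step of the workflow.
# It executes the new e2e test module against whatever cluster the active
# kubeconfig context points to.
if __name__ == "__main__":
    sys.exit(
        pytest.main(
            [
                "-s",
                "sdk/python/test_train_api/test_e2e_train_api.py",
                "--log-cli-level=debug",
            ]
        )
    )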
2 changes: 1 addition & 1 deletion .github/workflows/integration-tests.yaml
@@ -96,7 +96,7 @@ jobs:
- name: Run tests
run: |
pip install pytest
python3 -m pip install -e sdk/python -r sdk/python/kubeflow/trainer/requirements.txt; pytest -s sdk/python/test --log-cli-level=debug --namespace=default
python3 -m pip install -e sdk/python; pytest -s sdk/python/test --log-cli-level=debug --namespace=default
env:
GANG_SCHEDULER_NAME: ${{ matrix.gang-scheduler-name }}

80 changes: 0 additions & 80 deletions sdk/python/test/e2e/test_e2e_pytorchjob.py
@@ -13,10 +13,8 @@
# limitations under the License.

import os
import json
import logging
import pytest
import subprocess
from typing import Optional

from kubernetes.client import V1PodTemplateSpec
@@ -25,10 +23,6 @@
from kubernetes.client import V1Container
from kubernetes.client import V1ResourceRequirements

from kubeflow.storage_initializer.hugging_face import HuggingFaceDatasetParams
from kubeflow.storage_initializer.hugging_face import HuggingFaceModelParams
from kubeflow.storage_initializer.hugging_face import HuggingFaceTrainerParams

from kubeflow.training import TrainingClient
from kubeflow.training import KubeflowOrgV1ReplicaSpec
from kubeflow.training import KubeflowOrgV1PyTorchJob
@@ -37,9 +31,6 @@
from kubeflow.training import KubeflowOrgV1SchedulingPolicy
from kubeflow.training import constants

from peft import LoraConfig
import transformers

import test.e2e.utils as utils
from test.e2e.constants import TEST_GANG_SCHEDULER_NAME_ENV_KEY
from test.e2e.constants import GANG_SCHEDULERS, NONE_GANG_SCHEDULERS
@@ -249,77 +240,6 @@ def test_sdk_e2e_create_from_image(job_namespace):
TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace)


@pytest.mark.skipif(
GANG_SCHEDULER_NAME in GANG_SCHEDULERS,
reason="For plain scheduling",
)
def test_sdk_e2e_create_from_train_api(job_namespace):
JOB_NAME = "pytorchjob-from-train-api"

# Use test case from fine-tuning API tutorial.
# https://www.kubeflow.org/docs/components/training/user-guides/fine-tuning/
TRAINING_CLIENT.train(
name=JOB_NAME,
namespace=job_namespace,
# BERT model URI and type of Transformer to train it.
model_provider_parameters=HuggingFaceModelParams(
model_uri="hf://google-bert/bert-base-cased",
transformer_type=transformers.AutoModelForSequenceClassification,
num_labels=5,
),
# In order to save test time, use 8 samples from Yelp dataset.
dataset_provider_parameters=HuggingFaceDatasetParams(
repo_id="yelp_review_full",
split="train[:8]",
),
# Specify HuggingFace Trainer parameters.
trainer_parameters=HuggingFaceTrainerParams(
training_parameters=transformers.TrainingArguments(
output_dir="test_trainer",
save_strategy="no",
evaluation_strategy="no",
do_eval=False,
disable_tqdm=True,
log_level="info",
num_train_epochs=1,
),
# Set LoRA config to reduce number of trainable model parameters.
lora_config=LoraConfig(
r=8,
lora_alpha=8,
lora_dropout=0.1,
bias="none",
),
),
num_workers=1, # nodes parameter for torchrun command.
num_procs_per_worker=1, # nproc-per-node parameter for torchrun command.
resources_per_worker={
"gpu": 0,
"cpu": 1,
"memory": "5G",
},
storage_config={
"size": "5Gi",
"access_modes": ["ReadWriteOnce"],
},
)

logging.info(f"List of created {TRAINING_CLIENT.job_kind}s")
logging.info(TRAINING_CLIENT.list_jobs(job_namespace))

try:
utils.verify_job_e2e(
TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=300
)
except Exception as e:
utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace)
TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace)
raise Exception(f"PyTorchJob create from function E2E fails. Exception: {e}")

utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace)
TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace)


def generate_pytorchjob(
job_namespace: str,
job_name: str,
96 changes: 96 additions & 0 deletions sdk/python/test_train_api/test_e2e_train_api.py
@@ -0,0 +1,96 @@
# Copyright 2024 kubeflow.org.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import test.e2e.utils as utils

import transformers
from kubeflow.storage_initializer.hugging_face import (
HuggingFaceDatasetParams,
HuggingFaceModelParams,
HuggingFaceTrainerParams,
)
from kubeflow.training import TrainingClient, constants
from peft import LoraConfig

logging.basicConfig(format="%(message)s")
logging.getLogger("kubeflow.training.api.training_client").setLevel(logging.DEBUG)

TRAINING_CLIENT = TrainingClient(job_kind=constants.PYTORCHJOB_KIND)
CONTAINER_NAME = "pytorch"


def test_sdk_e2e_create_from_train_api(job_namespace="default"):
JOB_NAME = "pytorchjob-from-train-api"

# Use test case from fine-tuning API tutorial.
# https://www.kubeflow.org/docs/components/training/user-guides/fine-tuning/
TRAINING_CLIENT.train(
name=JOB_NAME,
namespace=job_namespace,
# BERT model URI and type of Transformer to train it.
model_provider_parameters=HuggingFaceModelParams(
model_uri="hf://google-bert/bert-base-cased",
transformer_type=transformers.AutoModelForSequenceClassification,
num_labels=5,
),
# In order to save test time, use 8 samples from Yelp dataset.
dataset_provider_parameters=HuggingFaceDatasetParams(
repo_id="yelp_review_full",
split="train[:8]",
),
# Specify HuggingFace Trainer parameters.
trainer_parameters=HuggingFaceTrainerParams(
training_parameters=transformers.TrainingArguments(
output_dir="test_trainer",
save_strategy="no",
evaluation_strategy="no",
do_eval=False,
disable_tqdm=True,
log_level="info",
num_train_epochs=1,
),
# Set LoRA config to reduce number of trainable model parameters.
lora_config=LoraConfig(
r=8,
lora_alpha=8,
lora_dropout=0.1,
bias="none",
),
),
num_workers=1, # nodes parameter for torchrun command.
num_procs_per_worker=1, # nproc-per-node parameter for torchrun command.
resources_per_worker={
"gpu": 0,
"cpu": 2,
"memory": "10G",
},
storage_config={
"size": "10Gi",
"access_modes": ["ReadWriteOnce"],
},
)

logging.info(f"List of created {TRAINING_CLIENT.job_kind}s")
logging.info(TRAINING_CLIENT.list_jobs(job_namespace))

try:
utils.verify_job_e2e(TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=900)
except Exception as e:
utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace)
TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace)
        raise Exception(f"PyTorchJob created from train API E2E test failed. Exception: {e}")

utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace)
TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace)
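
Success verification is delegated to test.e2e.utils.verify_job_e2e. The real helper lives in sdk/python/test/e2e/utils.py and may differ, but a rough sketch of the idea, built only on TrainingClient status queries (is_job_succeeded and is_job_failed in recent SDK versions), polls the job until it succeeds, fails, or exceeds the timeout:

import time

from kubeflow.training import TrainingClient


def wait_for_job_success(
    client: TrainingClient, name: str, namespace: str, wait_timeout: int = 900
) -> None:
    # Poll the job status until a terminal condition appears or we time out.
    # Illustrative only; the actual verify_job_e2e implementation may differ.
    deadline = time.time() + wait_timeout
    while time.time() < deadline:
        if client.is_job_succeeded(name=name, namespace=namespace):
            return
        if client.is_job_failed(name=name, namespace=namespace):
            raise RuntimeError(f"Job {namespace}/{name} reported a Failed condition")
        time.sleep(10)
    raise TimeoutError(
        f"Job {namespace}/{name} did not succeed within {wait_timeout} seconds"
    )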
