separate e2e test for train API
Signed-off-by: helenxie-bit <[email protected]>
helenxie-bit committed Sep 3, 2024
1 parent 85fd8e6 commit 1a0c455
Showing 4 changed files with 156 additions and 81 deletions.
59 changes: 59 additions & 0 deletions .github/workflows/e2e-test-train-api.yaml
@@ -0,0 +1,59 @@
name: E2E Test with train API
on:
- pull_request

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
e2e-test:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
kubernetes-version: ["v1.27.11", "v1.28.7", "v1.29.2"]
python-version: ["3.9", "3.10", "3.11"]
steps:
- name: Checkout
uses: actions/checkout@v4

- name: Free-Up Disk Space
uses: ./.github/workflows/free-up-disk-space

- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}

- name: Setup Go
uses: actions/setup-go@v5
with:
go-version-file: go.mod

- name: Create k8s Kind Cluster
        uses: helm/kind-action@v1.10.0
with:
node_image: kindest/node:${{ matrix.kubernetes-version }}
cluster_name: training-operator-cluster
kubectl_version: ${{ matrix.kubernetes-version }}

- name: Build training-operator
run: |
./scripts/gha/build-image.sh
env:
TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test

- name: Deploy training operator
run: |
./scripts/gha/setup-training-operator.sh
env:
KIND_CLUSTER: training-operator-cluster
TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test
GANG_SCHEDULER_NAME: "none"
KUBERNETES_VERSION: ${{ matrix.kubernetes-version }}

- name: Run tests
run: |
pip install pytest
python3 -m pip install -e sdk/python -r sdk/python/kubeflow/trainer/requirements.txt; pytest -s sdk/python/test_train_api/test_e2e_train_api.py --log-cli-level=debug
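
For local debugging, the same suite can be run outside of CI through pytest's Python entry point. A minimal sketch, assuming the SDK and trainer requirements from the step above are already installed and the current kubeconfig points at a cluster where the training operator is deployed:

import sys

import pytest

# Hypothetical local runner, mirroring the "Run tests" step of the workflow.
# It executes the new e2e test module against whatever cluster the active
# kubeconfig context points to.
if __name__ == "__main__":
    sys.exit(
        pytest.main(
            [
                "-s",
                "sdk/python/test_train_api/test_e2e_train_api.py",
                "--log-cli-level=debug",
            ]
        )
    )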
2 changes: 1 addition & 1 deletion .github/workflows/integration-tests.yaml
@@ -96,7 +96,7 @@ jobs:
- name: Run tests
run: |
pip install pytest
python3 -m pip install -e sdk/python -r sdk/python/kubeflow/trainer/requirements.txt; pytest -s sdk/python/test --log-cli-level=debug --namespace=default
python3 -m pip install -e sdk/python; pytest -s sdk/python/test --log-cli-level=debug --namespace=default
env:
GANG_SCHEDULER_NAME: ${{ matrix.gang-scheduler-name }}

80 changes: 0 additions & 80 deletions sdk/python/test/e2e/test_e2e_pytorchjob.py
@@ -13,10 +13,8 @@
# limitations under the License.

import os
import json
import logging
import pytest
import subprocess
from typing import Optional

from kubernetes.client import V1PodTemplateSpec
@@ -25,10 +23,6 @@
from kubernetes.client import V1Container
from kubernetes.client import V1ResourceRequirements

from kubeflow.storage_initializer.hugging_face import HuggingFaceDatasetParams
from kubeflow.storage_initializer.hugging_face import HuggingFaceModelParams
from kubeflow.storage_initializer.hugging_face import HuggingFaceTrainerParams

from kubeflow.training import TrainingClient
from kubeflow.training import KubeflowOrgV1ReplicaSpec
from kubeflow.training import KubeflowOrgV1PyTorchJob
@@ -37,9 +31,6 @@
from kubeflow.training import KubeflowOrgV1SchedulingPolicy
from kubeflow.training import constants

from peft import LoraConfig
import transformers

import test.e2e.utils as utils
from test.e2e.constants import TEST_GANG_SCHEDULER_NAME_ENV_KEY
from test.e2e.constants import GANG_SCHEDULERS, NONE_GANG_SCHEDULERS
@@ -249,77 +240,6 @@ def test_sdk_e2e_create_from_image(job_namespace):
TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace)


@pytest.mark.skipif(
GANG_SCHEDULER_NAME in GANG_SCHEDULERS,
reason="For plain scheduling",
)
def test_sdk_e2e_create_from_train_api(job_namespace):
JOB_NAME = "pytorchjob-from-train-api"

# Use test case from fine-tuning API tutorial.
# https://www.kubeflow.org/docs/components/training/user-guides/fine-tuning/
TRAINING_CLIENT.train(
name=JOB_NAME,
namespace=job_namespace,
# BERT model URI and type of Transformer to train it.
model_provider_parameters=HuggingFaceModelParams(
model_uri="hf://google-bert/bert-base-cased",
transformer_type=transformers.AutoModelForSequenceClassification,
num_labels=5,
),
# In order to save test time, use 8 samples from Yelp dataset.
dataset_provider_parameters=HuggingFaceDatasetParams(
repo_id="yelp_review_full",
split="train[:8]",
),
# Specify HuggingFace Trainer parameters.
trainer_parameters=HuggingFaceTrainerParams(
training_parameters=transformers.TrainingArguments(
output_dir="test_trainer",
save_strategy="no",
evaluation_strategy="no",
do_eval=False,
disable_tqdm=True,
log_level="info",
num_train_epochs=1,
),
# Set LoRA config to reduce number of trainable model parameters.
lora_config=LoraConfig(
r=8,
lora_alpha=8,
lora_dropout=0.1,
bias="none",
),
),
num_workers=1, # nodes parameter for torchrun command.
num_procs_per_worker=1, # nproc-per-node parameter for torchrun command.
resources_per_worker={
"gpu": 0,
"cpu": 1,
"memory": "5G",
},
storage_config={
"size": "5Gi",
"access_modes": ["ReadWriteOnce"],
},
)

logging.info(f"List of created {TRAINING_CLIENT.job_kind}s")
logging.info(TRAINING_CLIENT.list_jobs(job_namespace))

try:
utils.verify_job_e2e(
TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=300
)
except Exception as e:
utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace)
TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace)
raise Exception(f"PyTorchJob create from function E2E fails. Exception: {e}")

utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace)
TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace)


def generate_pytorchjob(
job_namespace: str,
job_name: str,
96 changes: 96 additions & 0 deletions sdk/python/test_train_api/test_e2e_train_api.py
@@ -0,0 +1,96 @@
# Copyright 2024 kubeflow.org.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import test.e2e.utils as utils

import transformers
from kubeflow.storage_initializer.hugging_face import (
HuggingFaceDatasetParams,
HuggingFaceModelParams,
HuggingFaceTrainerParams,
)
from kubeflow.training import TrainingClient, constants
from peft import LoraConfig

logging.basicConfig(format="%(message)s")
logging.getLogger("kubeflow.training.api.training_client").setLevel(logging.DEBUG)

TRAINING_CLIENT = TrainingClient(job_kind=constants.PYTORCHJOB_KIND)
CONTAINER_NAME = "pytorch"


def test_sdk_e2e_create_from_train_api(job_namespace="default"):
JOB_NAME = "pytorchjob-from-train-api"

# Use test case from fine-tuning API tutorial.
# https://www.kubeflow.org/docs/components/training/user-guides/fine-tuning/
TRAINING_CLIENT.train(
name=JOB_NAME,
namespace=job_namespace,
# BERT model URI and type of Transformer to train it.
model_provider_parameters=HuggingFaceModelParams(
model_uri="hf://google-bert/bert-base-cased",
transformer_type=transformers.AutoModelForSequenceClassification,
num_labels=5,
),
# In order to save test time, use 8 samples from Yelp dataset.
dataset_provider_parameters=HuggingFaceDatasetParams(
repo_id="yelp_review_full",
split="train[:8]",
),
# Specify HuggingFace Trainer parameters.
trainer_parameters=HuggingFaceTrainerParams(
training_parameters=transformers.TrainingArguments(
output_dir="test_trainer",
save_strategy="no",
evaluation_strategy="no",
do_eval=False,
disable_tqdm=True,
log_level="info",
num_train_epochs=1,
),
# Set LoRA config to reduce number of trainable model parameters.
lora_config=LoraConfig(
r=8,
lora_alpha=8,
lora_dropout=0.1,
bias="none",
),
),
num_workers=1, # nodes parameter for torchrun command.
num_procs_per_worker=1, # nproc-per-node parameter for torchrun command.
resources_per_worker={
"gpu": 0,
"cpu": 2,
"memory": "10G",
},
storage_config={
"size": "10Gi",
"access_modes": ["ReadWriteOnce"],
},
)

logging.info(f"List of created {TRAINING_CLIENT.job_kind}s")
logging.info(TRAINING_CLIENT.list_jobs(job_namespace))

try:
utils.verify_job_e2e(TRAINING_CLIENT, JOB_NAME, job_namespace, wait_timeout=900)
except Exception as e:
utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace)
TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace)
        raise Exception(f"PyTorchJob created from train API E2E test failed. Exception: {e}")

utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace)
TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace)
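
Success verification is delegated to test.e2e.utils.verify_job_e2e. The real helper lives in sdk/python/test/e2e/utils.py and may differ, but a rough sketch of the idea, built only on TrainingClient status queries (is_job_succeeded and is_job_failed in recent SDK versions), polls the job until it succeeds, fails, or exceeds the timeout:

import time

from kubeflow.training import TrainingClient


def wait_for_job_success(
    client: TrainingClient, name: str, namespace: str, wait_timeout: int = 900
) -> None:
    # Poll the job status until a terminal condition appears or we time out.
    # Illustrative only; the actual verify_job_e2e implementation may differ.
    deadline = time.time() + wait_timeout
    while time.time() < deadline:
        if client.is_job_succeeded(name=name, namespace=namespace):
            return
        if client.is_job_failed(name=name, namespace=namespace):
            raise RuntimeError(f"Job {namespace}/{name} reported a Failed condition")
        time.sleep(10)
    raise TimeoutError(
        f"Job {namespace}/{name} did not succeed within {wait_timeout} seconds"
    )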
