-
Notifications
You must be signed in to change notification settings - Fork 698
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Signed-off-by: helenxie-bit <[email protected]>
- Loading branch information
1 parent
85fd8e6
commit 1a0c455
Showing
4 changed files
with
156 additions
and
81 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
# End-to-end test of the SDK train API against a Kind cluster.
# Runs on every pull request, once per Kubernetes/Python combination below.
name: E2E Test with train API
on:
  - pull_request

# Cancel a still-running workflow for the same workflow/ref pair when a
# newer run starts.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  e2e-test:
    runs-on: ubuntu-latest
    strategy:
      # Let the remaining matrix combinations finish when one of them fails.
      fail-fast: false
      matrix:
        kubernetes-version: ["v1.27.11", "v1.28.7", "v1.29.2"]
        python-version: ["3.9", "3.10", "3.11"]
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Free-Up Disk Space
        uses: ./.github/workflows/free-up-disk-space

      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Setup Go
        uses: actions/setup-go@v5
        with:
          go-version-file: go.mod

      # NOTE(review): the action ref below was mangled by e-mail obfuscation
      # in this copy of the file; restore the pinned helm/kind-action version
      # from the repository before using this workflow.
      - name: Create k8s Kind Cluster
        uses: helm/[email protected]
        with:
          node_image: kindest/node:${{ matrix.kubernetes-version }}
          cluster_name: training-operator-cluster
          kubectl_version: ${{ matrix.kubernetes-version }}

      - name: Build training-operator
        run: |
          ./scripts/gha/build-image.sh
        env:
          TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test

      # KIND_CLUSTER and TRAINING_CI_IMAGE repeat the cluster name and image
      # tag used by the two steps above.
      - name: Deploy training operator
        run: |
          ./scripts/gha/setup-training-operator.sh
        env:
          KIND_CLUSTER: training-operator-cluster
          TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test
          GANG_SCHEDULER_NAME: "none"
          KUBERNETES_VERSION: ${{ matrix.kubernetes-version }}

      # One command per line, and pip always invoked through the same
      # interpreter, so a failed install is easy to spot in the step log.
      - name: Run tests
        run: |
          python3 -m pip install pytest
          python3 -m pip install -e sdk/python -r sdk/python/kubeflow/trainer/requirements.txt
          pytest -s sdk/python/test_train_api/test_e2e_train_api.py --log-cli-level=debug
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
# Copyright 2024 kubeflow.org. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
import logging | ||
import test.e2e.utils as utils | ||
|
||
import transformers | ||
from kubeflow.storage_initializer.hugging_face import ( | ||
HuggingFaceDatasetParams, | ||
HuggingFaceModelParams, | ||
HuggingFaceTrainerParams, | ||
) | ||
from kubeflow.training import TrainingClient, constants | ||
from peft import LoraConfig | ||
|
||
# Print bare log messages and turn on debug output for the SDK training
# client's logger.
_SDK_CLIENT_LOGGER = "kubeflow.training.api.training_client"
logging.basicConfig(format="%(message)s")
logging.getLogger(_SDK_CLIENT_LOGGER).setLevel(logging.DEBUG)

# Client used by the test below, bound to the PyTorchJob kind.
TRAINING_CLIENT = TrainingClient(job_kind=constants.PYTORCHJOB_KIND)
# Presumably the name of the training container in the job's pods; it is not
# referenced anywhere in the visible part of this module (TODO confirm usage).
CONTAINER_NAME = "pytorch"
|
||
|
||
def test_sdk_e2e_create_from_train_api(job_namespace: str = "default") -> None:
    """Create a PyTorchJob through ``TrainingClient.train`` and verify it end to end.

    The job fine-tunes ``google-bert/bert-base-cased`` with LoRA on eight
    samples of the Yelp review dataset, following the fine-tuning API
    tutorial. After submission the job is checked with
    ``utils.verify_job_e2e``; its results are printed and the job is deleted
    whether or not verification succeeds.

    Args:
        job_namespace: Namespace in which the job is created, verified and
            deleted.

    Raises:
        Exception: If ``utils.verify_job_e2e`` raises; the original error is
            attached as the cause.
    """
    job_name = "pytorchjob-from-train-api"

    # Use test case from fine-tuning API tutorial.
    # https://www.kubeflow.org/docs/components/training/user-guides/fine-tuning/
    TRAINING_CLIENT.train(
        name=job_name,
        namespace=job_namespace,
        # BERT model URI and type of Transformer to train it.
        model_provider_parameters=HuggingFaceModelParams(
            model_uri="hf://google-bert/bert-base-cased",
            transformer_type=transformers.AutoModelForSequenceClassification,
            num_labels=5,
        ),
        # In order to save test time, use 8 samples from Yelp dataset.
        dataset_provider_parameters=HuggingFaceDatasetParams(
            repo_id="yelp_review_full",
            split="train[:8]",
        ),
        # Specify HuggingFace Trainer parameters.
        trainer_parameters=HuggingFaceTrainerParams(
            training_parameters=transformers.TrainingArguments(
                output_dir="test_trainer",
                save_strategy="no",
                evaluation_strategy="no",
                do_eval=False,
                disable_tqdm=True,
                log_level="info",
                num_train_epochs=1,
            ),
            # Set LoRA config to reduce number of trainable model parameters.
            lora_config=LoraConfig(
                r=8,
                lora_alpha=8,
                lora_dropout=0.1,
                bias="none",
            ),
        ),
        num_workers=1,  # nodes parameter for torchrun command.
        num_procs_per_worker=1,  # nproc-per-node parameter for torchrun command.
        resources_per_worker={
            "gpu": 0,
            "cpu": 2,
            "memory": "10G",
        },
        storage_config={
            "size": "10Gi",
            "access_modes": ["ReadWriteOnce"],
        },
    )

    logging.info(f"List of created {TRAINING_CLIENT.job_kind}s")
    logging.info(TRAINING_CLIENT.list_jobs(job_namespace))

    try:
        utils.verify_job_e2e(TRAINING_CLIENT, job_name, job_namespace, wait_timeout=900)
    except Exception as e:
        # Chain the cause so the original traceback stays attached to the
        # error that pytest reports.
        raise Exception(
            f"PyTorchJob create from train API E2E fails. Exception: {e}"
        ) from e
    finally:
        # Single cleanup path for success and failure. The inner ``finally``
        # ensures the job is deleted even if printing its results raises.
        try:
            utils.print_job_results(TRAINING_CLIENT, job_name, job_namespace)
        finally:
            TRAINING_CLIENT.delete_job(job_name, job_namespace)