From ebebb469ccdb16093c5084b13f8c033e94fd55bd Mon Sep 17 00:00:00 2001 From: droctothorpe Date: Thu, 25 Jul 2024 15:44:18 -0400 Subject: [PATCH] Implement pre-commit hooks Signed-off-by: droctothorpe --- .github/workflows/pre-commit.yaml | 14 ++++++ .github/workflows/test-python.yaml | 9 ---- .pre-commit-config.yaml | 44 +++++++++++++++++++ docs/development/developer_guide.md | 40 ++++++++++++----- docs/diagrams/tfjob_k8s_resources.svg | 1 - docs/release/changelog.py | 3 +- examples/paddlepaddle/simple-gpu.yaml | 1 - examples/pytorch/README.md | 8 ++-- examples/pytorch/elastic/echo/echo.yaml | 1 - examples/pytorch/elastic/etcd.yaml | 2 +- .../pytorch/elastic/imagenet/.dockerignore | 2 +- examples/pytorch/elastic/imagenet/imagenet.py | 14 +++--- .../Train-CNN-with-FashionMNIST.ipynb | 2 +- examples/pytorch/mnist/Makefile | 6 +-- examples/pytorch/mnist/mnist.py | 5 ++- .../mnist/v1/pytorch_job_mnist_gloo.yaml | 2 +- .../mnist/v1/pytorch_job_mnist_mpi.yaml | 8 ++-- .../mnist/v1/pytorch_job_mnist_nccl.yaml | 6 +-- examples/pytorch/smoke-dist/README.md | 2 +- examples/pytorch/smoke-dist/dist_sendrecv.py | 8 ++-- examples/tensorflow/dist-mnist/README.md | 2 +- .../distribution_strategy/keras-API/README.md | 10 ++--- .../multi_worker_strategy-with-keras.py | 9 ++-- .../tensorflow/mnist_with_summaries/README.md | 2 +- .../mnist_with_summaries.py | 1 - .../mnist_with_summaries/tf_job_mnist.yaml | 6 +-- .../tfevent-volume/tfevent-pv.yaml | 2 +- .../tfevent-volume/tfevent-pvc.yaml | 2 +- examples/tensorflow/tf_sample/setup.py | 3 +- examples/tensorflow/tf_sample/tf_smoke.py | 2 +- examples/xgboost/lightgbm-dist/README.md | 2 +- examples/xgboost/lightgbm-dist/main.py | 3 +- .../xgboostjob_v1_lightgbm_dist_training.yaml | 1 - examples/xgboost/smoke-dist/README.md | 3 -- examples/xgboost/smoke-dist/tracker.py | 4 +- .../xgboost/smoke-dist/xgboost_smoke_test.py | 1 - .../smoke-dist/xgboostjob_v1_rabit_test.yaml | 1 - .../xgboostjob_v1alpha1_rabit_test.yaml | 1 - 
examples/xgboost/xgboost-dist/README.md | 16 +++---- examples/xgboost/xgboost-dist/local_test.py | 6 ++- examples/xgboost/xgboost-dist/predict.py | 4 +- examples/xgboost/xgboost-dist/tracker.py | 4 +- examples/xgboost/xgboost-dist/train.py | 4 +- examples/xgboost/xgboost-dist/utils.py | 8 ++-- .../xgboostjob_v1_iris_predict.yaml | 2 - .../xgboostjob_v1_iris_train.yaml | 2 - hack/python-sdk/post_gen.py | 2 +- sdk/python/Dockerfile.conformance | 2 +- sdk/python/conformance/run.sh | 2 +- .../abstract_dataset_provider.py | 3 +- .../abstract_model_provider.py | 3 +- .../storage_initializer/hugging_face.py | 17 +++---- sdk/python/kubeflow/storage_initializer/s3.py | 4 +- .../kubeflow/storage_initializer/storage.py | 4 +- .../kubeflow/trainer/hf_llm_training.py | 25 +++++------ .../kubeflow/training/api/training_client.py | 32 ++++++++------ .../training/api/training_client_test.py | 18 ++++---- .../kubeflow/training/constants/constants.py | 5 ++- sdk/python/kubeflow/training/utils/utils.py | 13 +++--- test_job/README.md | 6 +-- third_party/library/license.txt | 1 - 61 files changed, 242 insertions(+), 174 deletions(-) create mode 100644 .github/workflows/pre-commit.yaml create mode 100644 .pre-commit-config.yaml diff --git a/.github/workflows/pre-commit.yaml b/.github/workflows/pre-commit.yaml new file mode 100644 index 0000000000..2b11178bf9 --- /dev/null +++ b/.github/workflows/pre-commit.yaml @@ -0,0 +1,14 @@ +name: pre-commit + +on: + pull_request: + push: + branches: [main] + +jobs: + pre-commit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v3 + - uses: pre-commit/action@v3.0.1 diff --git a/.github/workflows/test-python.yaml b/.github/workflows/test-python.yaml index 3d6faae951..9a706461b7 100644 --- a/.github/workflows/test-python.yaml +++ b/.github/workflows/test-python.yaml @@ -26,15 +26,6 @@ jobs: with: python-version: ${{ matrix.python-version }} - # TODO (andreyvelich): We need to replace this action with 
script to do - # linting and formatting for Training Operator SDK. - - name: Check Python code with Black - uses: psf/black@stable - with: - version: 24.2.0 - options: --check --exclude '/*kubeflow_org_v1*|__init__.py|api_client.py|configuration.py|exceptions.py|rest.py' - src: sdk/ - - name: Install dependencies run: | pip install pytest python-dateutil urllib3 kubernetes diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000000..89a40e2ea5 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,44 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v2.3.0 + hooks: + - id: check-yaml + args: [--allow-multiple-documents] + - id: check-json + - id: end-of-file-fixer + - id: trailing-whitespace + - repo: https://github.com/pycqa/isort + rev: 5.11.5 + hooks: + - id: isort + name: isort + entry: isort --profile google + - repo: https://github.com/psf/black + rev: 24.2.0 + hooks: + - id: black + files: sdk/.* + exclude: | + (?x)^( + /*kubeflow_org_v1*| + __init__.py| + api_client.py| + configuration.py| + exceptions.py| + rest.py + )$ +exclude: | + (?x)^( + pkg/apis/kubeflow.org/v1/openapi_generated.go| + pkg/apis/kubeflow.org/v1/zz_.*| + pkg/client/.*| + test_job/apis/test_job/v1/.*generated.*.go| + test_job/client/.*| + sdk/python/kubeflow/training/[^/]*.py| + sdk/python/kubeflow/training/models/.*| + sdk/python/test/.*| + docs/api/.*| + sdk/python/docs/.*| + sdk/python/.openapi-generator/VERSION| + sdk/python/kubeflow/__init__.py + )$ diff --git a/docs/development/developer_guide.md b/docs/development/developer_guide.md index 4b5808c6b7..3b48d898c7 100644 --- a/docs/development/developer_guide.md +++ b/docs/development/developer_guide.md @@ -5,7 +5,7 @@ Kubeflow Training Operator is currently at v1. 
## Requirements - [Go](https://golang.org/) (1.22 or later) -- [Docker](https://docs.docker.com/) +- [Docker](https://docs.docker.com/) - [Docker](https://docs.docker.com/) (20.10 or later) - [Docker Buildx](https://docs.docker.com/build/buildx/) (0.8.0 or later) - [Python](https://www.python.org/) (3.11 or later) @@ -49,12 +49,12 @@ First, you need to run a Kubernetes cluster locally. We recommend [Kind](https:/ You can create a `kind` cluster by running ```sh -kind create cluster +kind create cluster ``` -This will load your kubernetes config file with the new cluster. +This will load your kubernetes config file with the new cluster. -After creating the cluster, you can check the nodes with the code below which should show you the kind-control-plane. -```sh +After creating the cluster, you can check the nodes with the code below which should show you the kind-control-plane. +```sh kubectl get nodes ``` The output should look something like below: @@ -74,9 +74,9 @@ Then we can patch it with the latest operator image. ```sh kubectl patch -n kubeflow deployments training-operator --type json -p '[{"op": "replace", "path": "/spec/template/spec/containers/0/image", "value": "kubeflow/training-operator:latest"}]' ``` -Then we can run the job with the following command. +Then we can run the job with the following command. -```sh +```sh kubectl apply -f https://raw.githubusercontent.com/kubeflow/training-operator/master/examples/pytorch/simple.yaml ``` And we can see the output of the job from the logs, which may take some time to produce but should look something like below. @@ -116,10 +116,10 @@ make docker-build IMG=my-username/training-operator:my-pr-01 ``` You can swap `my-username/training-operator:my-pr-01` with whatever you would like. 
-## Load docker image +## Load docker image ```sh kind load docker-image my-username/training-operator:my-pr-01 -``` +``` ## Modify operator image with new one @@ -129,8 +129,8 @@ kustomize edit set image my-username/training-operator=my-username/training-oper ``` Update the `newTag` key in `./manifests/overlayes/standalone/kustimization.yaml` with the new image. -Deploy the operator with: -```sh +Deploy the operator with: +```sh kubectl apply -k ./manifests/overlays/standalone ``` And now we can submit jobs to the operator. @@ -140,7 +140,7 @@ kubectl apply -f https://raw.githubusercontent.com/kubeflow/training-operator/ma ``` You should be able to see a pod for your training operator running in your namespace using ``` -kubectl logs -n kubeflow -l training.kubeflow.org/job-name=pytorch-simple +kubectl logs -n kubeflow -l training.kubeflow.org/job-name=pytorch-simple ``` ## Go version @@ -187,3 +187,19 @@ sdk/python/kubeflow/training/api ```sh black --check --exclude '/*kubeflow_org_v1*|__init__.py|api_client.py|configuration.py|exceptions.py|rest.py' sdk/ ``` + +### pre-commit + +Make sure to install [pre-commit](https://pre-commit.com/) (`pip install +pre-commit`) and run `pre-commit install` from the root of the repository at +least once before creating git commits. + +The pre-commit [hooks](../../.pre-commit-config.yaml) ensure code quality and +consistency. They are executed in CI. PRs that fail to comply with the hooks +will not be able to pass the corresponding CI gate. The hooks are only executed +against staged files unless you run `pre-commit run --all-files`, in which case, +they'll be executed against every file in the repository. + +Specific programmatically generated files listed in the `exclude` field in +[.pre-commit-config.yaml](../../.pre-commit-config.yaml) are deliberately excluded +from the hooks.
diff --git a/docs/diagrams/tfjob_k8s_resources.svg b/docs/diagrams/tfjob_k8s_resources.svg index b0eda947bd..17dbe8a8cc 100644 --- a/docs/diagrams/tfjob_k8s_resources.svg +++ b/docs/diagrams/tfjob_k8s_resources.svg @@ -1,4 +1,3 @@ - diff --git a/docs/release/changelog.py b/docs/release/changelog.py index ac508d025f..b9eeeb7137 100644 --- a/docs/release/changelog.py +++ b/docs/release/changelog.py @@ -1,6 +1,7 @@ -from github import Github import argparse +from github import Github + REPO_NAME = "kubeflow/training-operator" CHANGELOG_FILE = "CHANGELOG.md" diff --git a/examples/paddlepaddle/simple-gpu.yaml b/examples/paddlepaddle/simple-gpu.yaml index a97191b05d..e726536bd7 100644 --- a/examples/paddlepaddle/simple-gpu.yaml +++ b/examples/paddlepaddle/simple-gpu.yaml @@ -33,4 +33,3 @@ spec: - name: dshm emptyDir: medium: Memory - diff --git a/examples/pytorch/README.md b/examples/pytorch/README.md index cd67e750f7..0db74af091 100644 --- a/examples/pytorch/README.md +++ b/examples/pytorch/README.md @@ -1,5 +1,5 @@ -## Installation & deployment tips -1. You need to configure your node to utilize GPU. This can be done the following way: +## Installation & deployment tips +1. You need to configure your node to utilize GPU. This can be done the following way: * Install [nvidia-docker2](https://github.com/NVIDIA/nvidia-docker) * Connect to your MasterNode and set nvidia as the default run in `/etc/docker/daemon.json`: ``` @@ -13,11 +13,11 @@ } } ``` - * After that deploy nvidia-daemon to kubernetes: + * After that deploy nvidia-daemon to kubernetes: ```bash kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v1.11/nvidia-device-plugin.yml ``` - + 2. 
NVIDIA GPUs can now be consumed via container level resource requirements using the resource name nvidia.com/gpu: ``` resources: diff --git a/examples/pytorch/elastic/echo/echo.yaml b/examples/pytorch/elastic/echo/echo.yaml index 6df8fe415f..ee51ec8418 100644 --- a/examples/pytorch/elastic/echo/echo.yaml +++ b/examples/pytorch/elastic/echo/echo.yaml @@ -26,4 +26,3 @@ spec: - torch.distributed.run - --rdzv_backend=c10d - ./echo.py - diff --git a/examples/pytorch/elastic/etcd.yaml b/examples/pytorch/elastic/etcd.yaml index a158187901..edb3bb1e9d 100644 --- a/examples/pytorch/elastic/etcd.yaml +++ b/examples/pytorch/elastic/etcd.yaml @@ -71,4 +71,4 @@ spec: protocol: TCP targetPort: 2380 selector: - etcd_node: etcd-server \ No newline at end of file + etcd_node: etcd-server diff --git a/examples/pytorch/elastic/imagenet/.dockerignore b/examples/pytorch/elastic/imagenet/.dockerignore index 6320cd248d..1269488f7f 100644 --- a/examples/pytorch/elastic/imagenet/.dockerignore +++ b/examples/pytorch/elastic/imagenet/.dockerignore @@ -1 +1 @@ -data \ No newline at end of file +data diff --git a/examples/pytorch/elastic/imagenet/imagenet.py b/examples/pytorch/elastic/imagenet/imagenet.py index dcab82c8ce..00faa73678 100644 --- a/examples/pytorch/elastic/imagenet/imagenet.py +++ b/examples/pytorch/elastic/imagenet/imagenet.py @@ -44,30 +44,30 @@ """ import argparse +from contextlib import contextmanager +from datetime import timedelta import io import os import shutil import time -from contextlib import contextmanager -from datetime import timedelta from typing import List, Tuple import numpy import torch import torch.distributed as dist +from torch.distributed.elastic.multiprocessing.errors import record +from torch.distributed.elastic.utils.data import ElasticDistributedSampler import torch.nn as nn import torch.nn.parallel +from torch.nn.parallel import DistributedDataParallel import torch.optim +from torch.optim import SGD import torch.utils.data +from torch.utils.data 
import DataLoader import torch.utils.data.distributed import torchvision.datasets as datasets import torchvision.models as models import torchvision.transforms as transforms -from torch.distributed.elastic.multiprocessing.errors import record -from torch.distributed.elastic.utils.data import ElasticDistributedSampler -from torch.nn.parallel import DistributedDataParallel -from torch.optim import SGD -from torch.utils.data import DataLoader model_names = sorted( name diff --git a/examples/pytorch/image-classification/Train-CNN-with-FashionMNIST.ipynb b/examples/pytorch/image-classification/Train-CNN-with-FashionMNIST.ipynb index 7c040e6a0d..7cb5e8fba9 100644 --- a/examples/pytorch/image-classification/Train-CNN-with-FashionMNIST.ipynb +++ b/examples/pytorch/image-classification/Train-CNN-with-FashionMNIST.ipynb @@ -696,4 +696,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} diff --git a/examples/pytorch/mnist/Makefile b/examples/pytorch/mnist/Makefile index c724caf0fb..4d1ffe50f6 100644 --- a/examples/pytorch/mnist/Makefile +++ b/examples/pytorch/mnist/Makefile @@ -19,7 +19,7 @@ IMG = gcr.io/kubeflow-examples/pytorch-dist-mnist PUBLIC = gcr.io/kubeflow-examples/pytorch-dist-mnist DIR := ${CURDIR} -# List any changed files. +# List any changed files. CHANGED_FILES := $(shell git diff-files --relative=examples/dist-mnist) ifeq ($(strip $(CHANGED_FILES)),) @@ -43,9 +43,9 @@ build: # Build but don't attach the latest tag. This allows manual testing/inspection of the image # first. 
push: build - gcloud docker -- push $(IMG):$(TAG) + gcloud docker -- push $(IMG):$(TAG) @echo Pushed $(IMG) with :$(TAG) tags - + push-latest: push gcloud container images add-tag --quiet $(IMG):$(TAG) $(IMG):latest --verbosity=info echo created $(IMG):latest diff --git a/examples/pytorch/mnist/mnist.py b/examples/pytorch/mnist/mnist.py index 4ccd051999..3e8eed2ce2 100644 --- a/examples/pytorch/mnist/mnist.py +++ b/examples/pytorch/mnist/mnist.py @@ -3,14 +3,15 @@ import argparse import os +from tensorboardX import SummaryWriter import torch import torch.distributed as dist import torch.nn as nn import torch.nn.functional as F import torch.optim as optim -from tensorboardX import SummaryWriter from torch.utils.data import DistributedSampler -from torchvision import datasets, transforms +from torchvision import datasets +from torchvision import transforms class Net(nn.Module): diff --git a/examples/pytorch/mnist/v1/pytorch_job_mnist_gloo.yaml b/examples/pytorch/mnist/v1/pytorch_job_mnist_gloo.yaml index eddfa2d9cc..3e42a2685f 100644 --- a/examples/pytorch/mnist/v1/pytorch_job_mnist_gloo.yaml +++ b/examples/pytorch/mnist/v1/pytorch_job_mnist_gloo.yaml @@ -22,7 +22,7 @@ spec: restartPolicy: OnFailure template: spec: - containers: + containers: - name: pytorch image: kubeflow/pytorch-dist-mnist:latest args: ["--backend", "gloo"] diff --git a/examples/pytorch/mnist/v1/pytorch_job_mnist_mpi.yaml b/examples/pytorch/mnist/v1/pytorch_job_mnist_mpi.yaml index fdc090fc53..53b8da80ea 100644 --- a/examples/pytorch/mnist/v1/pytorch_job_mnist_mpi.yaml +++ b/examples/pytorch/mnist/v1/pytorch_job_mnist_mpi.yaml @@ -14,19 +14,19 @@ spec: image: kubeflow/pytorch-dist-mnist:latest args: ["--backend", "mpi"] # Comment out the below resources to use the CPU. 
- resources: + resources: limits: nvidia.com/gpu: 1 Worker: replicas: 1 - restartPolicy: OnFailure + restartPolicy: OnFailure template: spec: - containers: + containers: - name: pytorch image: kubeflow/pytorch-dist-mnist:latest args: ["--backend", "mpi"] # Comment out the below resources to use the CPU. - resources: + resources: limits: nvidia.com/gpu: 1 diff --git a/examples/pytorch/mnist/v1/pytorch_job_mnist_nccl.yaml b/examples/pytorch/mnist/v1/pytorch_job_mnist_nccl.yaml index e3b263902b..0807abe32f 100644 --- a/examples/pytorch/mnist/v1/pytorch_job_mnist_nccl.yaml +++ b/examples/pytorch/mnist/v1/pytorch_job_mnist_nccl.yaml @@ -13,7 +13,7 @@ spec: - name: pytorch image: kubeflow/pytorch-dist-mnist:latest args: ["--backend", "nccl"] - resources: + resources: limits: nvidia.com/gpu: 1 Worker: @@ -21,10 +21,10 @@ spec: restartPolicy: OnFailure template: spec: - containers: + containers: - name: pytorch image: kubeflow/pytorch-dist-mnist:latest args: ["--backend", "nccl"] - resources: + resources: limits: nvidia.com/gpu: 1 diff --git a/examples/pytorch/smoke-dist/README.md b/examples/pytorch/smoke-dist/README.md index 60d77ae4aa..ccfc1c928c 100644 --- a/examples/pytorch/smoke-dist/README.md +++ b/examples/pytorch/smoke-dist/README.md @@ -1,4 +1,4 @@ -### Distributed send/recv e2e test +### Distributed send/recv e2e test This folder containers Dockerfile and distributed send/recv test. 
diff --git a/examples/pytorch/smoke-dist/dist_sendrecv.py b/examples/pytorch/smoke-dist/dist_sendrecv.py index e4c7e35eae..e78d798e0d 100644 --- a/examples/pytorch/smoke-dist/dist_sendrecv.py +++ b/examples/pytorch/smoke-dist/dist_sendrecv.py @@ -33,19 +33,19 @@ def init_processes(fn, backend='gloo'): def main(): logging.info("Torch version: %s", torch.__version__) - + port = os.environ.get("MASTER_PORT", "{}") logging.info("MASTER_PORT: %s", port) - + addr = os.environ.get("MASTER_ADDR", "{}") logging.info("MASTER_ADDR: %s", addr) world_size = os.environ.get("WORLD_SIZE", "{}") logging.info("WORLD_SIZE: %s", world_size) - + rank = os.environ.get("RANK", "{}") logging.info("RANK: %s", rank) - + init_processes(run) diff --git a/examples/tensorflow/dist-mnist/README.md b/examples/tensorflow/dist-mnist/README.md index 80e37de649..4d3f842850 100644 --- a/examples/tensorflow/dist-mnist/README.md +++ b/examples/tensorflow/dist-mnist/README.md @@ -20,4 +20,4 @@ docker build -f Dockerfile.ppc64le -t kubeflow123/tf-dist-mnist-test:1.0 ./ ``` kubectl create -f ./tf_job_mnist.yaml ``` - * If on ppc64le, please update tf_job_mnist.yaml to use the image of ppc64le firstly. \ No newline at end of file + * If on ppc64le, please update tf_job_mnist.yaml to use the image of ppc64le firstly. diff --git a/examples/tensorflow/distribution_strategy/keras-API/README.md b/examples/tensorflow/distribution_strategy/keras-API/README.md index 70b58421e0..25d1ddbf09 100644 --- a/examples/tensorflow/distribution_strategy/keras-API/README.md +++ b/examples/tensorflow/distribution_strategy/keras-API/README.md @@ -1,12 +1,12 @@ # Multi-worker training with Keras -This directory contains a example for running multi-worker distributed training -using Tensorflow 2.1 keras API on Kubeflow. For more information about the +This directory contains an example for running multi-worker distributed training +using Tensorflow 2.1 keras API on Kubeflow.
For more information about the source code, please see TensorFlow tutorials [here](https://www.tensorflow.org/tutorials/distribute/keras) and [here](https://www.tensorflow.org/tutorials/distribute/multi_worker_with_keras) ## Prerequisite -Your cluster must be configured to use Multiple GPUs, +Your cluster must be configured to use Multiple GPUs, please follow the [instructions](https://www.kubeflow.org/docs/components/training/tftraining/#using-gpus) ## Steps @@ -16,13 +16,13 @@ please follow the [instructions](https://www.kubeflow.org/docs/components/traini docker build -f Dockerfile -t kubeflow/multi_worker_strategy:v1.0 . ``` -2. Specify your storageClassName and create a persistent volume claim to save +2. Specify your storageClassName and create a persistent volume claim to save models and checkpoints ``` kubectl -n ${NAMESPACE} create -f pvc.yaml ``` -3. Create a TFJob, if you use some GPUs other than NVIDIA, please replace +3. Create a TFJob, if you use some GPUs other than NVIDIA, please replace `nvidia.com/gpu` with your GPU vendor in the `limits` section. 
``` kubectl -n ${NAMESPACE} create -f multi_worker_tfjob.yaml diff --git a/examples/tensorflow/distribution_strategy/keras-API/multi_worker_strategy-with-keras.py b/examples/tensorflow/distribution_strategy/keras-API/multi_worker_strategy-with-keras.py index 9a1b9a71cb..f7160d9350 100644 --- a/examples/tensorflow/distribution_strategy/keras-API/multi_worker_strategy-with-keras.py +++ b/examples/tensorflow/distribution_strategy/keras-API/multi_worker_strategy-with-keras.py @@ -14,15 +14,18 @@ # ============================================================================== """An example of multi-worker training with Keras model using Strategy API.""" -from __future__ import absolute_import, division, print_function +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function import argparse import json import os -import tensorflow_datasets as tfds import tensorflow as tf -from tensorflow.keras import layers, models +from tensorflow.keras import layers +from tensorflow.keras import models +import tensorflow_datasets as tfds def make_datasets_unbatched(): diff --git a/examples/tensorflow/mnist_with_summaries/README.md b/examples/tensorflow/mnist_with_summaries/README.md index 075e87ce60..ddef953fc1 100644 --- a/examples/tensorflow/mnist_with_summaries/README.md +++ b/examples/tensorflow/mnist_with_summaries/README.md @@ -18,4 +18,4 @@ docker build -f Dockerfile.ppc64le -t kubeflow123/tf-mnist-with-summaries:1.0 ./ Usage: 1. Add the persistent volume and claim: `kubectl apply -f tfevent-volume/.` 1. Deploy the TFJob: `kubectl apply -f tf_job_mnist.yaml` - * If on ppc64le, please update tf_job_mnist.yaml to use the image of ppc64le firstly. \ No newline at end of file + * If on ppc64le, please update tf_job_mnist.yaml to use the image of ppc64le firstly. 
diff --git a/examples/tensorflow/mnist_with_summaries/mnist_with_summaries.py b/examples/tensorflow/mnist_with_summaries/mnist_with_summaries.py index 65d3b12233..85ca03c986 100644 --- a/examples/tensorflow/mnist_with_summaries/mnist_with_summaries.py +++ b/examples/tensorflow/mnist_with_summaries/mnist_with_summaries.py @@ -27,7 +27,6 @@ import sys import tensorflow as tf - from tensorflow.examples.tutorials.mnist import input_data FLAGS = None diff --git a/examples/tensorflow/mnist_with_summaries/tf_job_mnist.yaml b/examples/tensorflow/mnist_with_summaries/tf_job_mnist.yaml index 88e0e94848..3c1884a05c 100644 --- a/examples/tensorflow/mnist_with_summaries/tf_job_mnist.yaml +++ b/examples/tensorflow/mnist_with_summaries/tf_job_mnist.yaml @@ -2,13 +2,13 @@ apiVersion: "kubeflow.org/v1" kind: "TFJob" metadata: name: "mnist" - namespace: kubeflow + namespace: kubeflow spec: runPolicy: cleanPodPolicy: None tfReplicaSpecs: Worker: - replicas: 1 + replicas: 1 restartPolicy: Never template: spec: @@ -27,4 +27,4 @@ spec: volumes: - name: "training" persistentVolumeClaim: - claimName: "tfevent-volume" + claimName: "tfevent-volume" diff --git a/examples/tensorflow/mnist_with_summaries/tfevent-volume/tfevent-pv.yaml b/examples/tensorflow/mnist_with_summaries/tfevent-volume/tfevent-pv.yaml index a450c6a492..cf41c6f982 100644 --- a/examples/tensorflow/mnist_with_summaries/tfevent-volume/tfevent-pv.yaml +++ b/examples/tensorflow/mnist_with_summaries/tfevent-volume/tfevent-pv.yaml @@ -8,7 +8,7 @@ metadata: spec: capacity: storage: 10Gi - storageClassName: standard + storageClassName: standard accessModes: - ReadWriteMany hostPath: diff --git a/examples/tensorflow/mnist_with_summaries/tfevent-volume/tfevent-pvc.yaml b/examples/tensorflow/mnist_with_summaries/tfevent-volume/tfevent-pvc.yaml index 7d7f8487a1..6bab17d803 100644 --- a/examples/tensorflow/mnist_with_summaries/tfevent-volume/tfevent-pvc.yaml +++ b/examples/tensorflow/mnist_with_summaries/tfevent-volume/tfevent-pvc.yaml 
@@ -2,7 +2,7 @@ apiVersion: v1 kind: PersistentVolumeClaim metadata: name: tfevent-volume - namespace: kubeflow + namespace: kubeflow labels: type: local app: tfjob diff --git a/examples/tensorflow/tf_sample/setup.py b/examples/tensorflow/tf_sample/setup.py index bde533c8bd..9cd150e665 100644 --- a/examples/tensorflow/tf_sample/setup.py +++ b/examples/tensorflow/tf_sample/setup.py @@ -10,7 +10,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """A setup.py file for the tf_sample package.""" -from setuptools import find_packages, setup +from setuptools import find_packages +from setuptools import setup REQUIRED_PACKAGES = [] diff --git a/examples/tensorflow/tf_sample/tf_smoke.py b/examples/tensorflow/tf_sample/tf_smoke.py index d0b5eeb60d..eaec04f8ed 100644 --- a/examples/tensorflow/tf_sample/tf_smoke.py +++ b/examples/tensorflow/tf_sample/tf_smoke.py @@ -11,8 +11,8 @@ import json import logging import os -import retrying +import retrying import tensorflow as tf diff --git a/examples/xgboost/lightgbm-dist/README.md b/examples/xgboost/lightgbm-dist/README.md index 79d9334f16..616425f3cf 100644 --- a/examples/xgboost/lightgbm-dist/README.md +++ b/examples/xgboost/lightgbm-dist/README.md @@ -200,4 +200,4 @@ status: succeeded: 1 Worker: succeeded: 2 -``` \ No newline at end of file +``` diff --git a/examples/xgboost/lightgbm-dist/main.py b/examples/xgboost/lightgbm-dist/main.py index 25a96055f5..e9ba951957 100644 --- a/examples/xgboost/lightgbm-dist/main.py +++ b/examples/xgboost/lightgbm-dist/main.py @@ -15,7 +15,8 @@ import os from train import train -from utils import generate_machine_list_file, generate_train_conf_file +from utils import generate_machine_list_file +from utils import generate_train_conf_file logger = logging.getLogger(__name__) diff --git a/examples/xgboost/lightgbm-dist/xgboostjob_v1_lightgbm_dist_training.yaml b/examples/xgboost/lightgbm-dist/xgboostjob_v1_lightgbm_dist_training.yaml index 
c34a7cbf92..35487b1511 100644 --- a/examples/xgboost/lightgbm-dist/xgboostjob_v1_lightgbm_dist_training.yaml +++ b/examples/xgboost/lightgbm-dist/xgboostjob_v1_lightgbm_dist_training.yaml @@ -72,4 +72,3 @@ spec: - --is_enable_sparse=true - --use_two_round_loading=false - --is_save_binary_file=false - diff --git a/examples/xgboost/smoke-dist/README.md b/examples/xgboost/smoke-dist/README.md index e04afb5a5c..c0ce8e6cc2 100644 --- a/examples/xgboost/smoke-dist/README.md +++ b/examples/xgboost/smoke-dist/README.md @@ -85,6 +85,3 @@ status: Worker: succeeded: 2 ``` - - - diff --git a/examples/xgboost/smoke-dist/tracker.py b/examples/xgboost/smoke-dist/tracker.py index 1074f594ae..73d4e06b55 100644 --- a/examples/xgboost/smoke-dist/tracker.py +++ b/examples/xgboost/smoke-dist/tracker.py @@ -24,8 +24,8 @@ import struct import subprocess import sys -import time from threading import Thread +import time class ExSocket(object): @@ -503,4 +503,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/examples/xgboost/smoke-dist/xgboost_smoke_test.py b/examples/xgboost/smoke-dist/xgboost_smoke_test.py index 8ca4476b13..5f7a0071da 100644 --- a/examples/xgboost/smoke-dist/xgboost_smoke_test.py +++ b/examples/xgboost/smoke-dist/xgboost_smoke_test.py @@ -18,7 +18,6 @@ import traceback from tracker import RabitTracker - import xgboost as xgb logger = logging.getLogger(__name__) diff --git a/examples/xgboost/smoke-dist/xgboostjob_v1_rabit_test.yaml b/examples/xgboost/smoke-dist/xgboostjob_v1_rabit_test.yaml index 15f55a01b1..80ce3adce1 100644 --- a/examples/xgboost/smoke-dist/xgboostjob_v1_rabit_test.yaml +++ b/examples/xgboost/smoke-dist/xgboostjob_v1_rabit_test.yaml @@ -28,4 +28,3 @@ spec: - containerPort: 9991 name: xgboostjob-port imagePullPolicy: Always - diff --git a/examples/xgboost/smoke-dist/xgboostjob_v1alpha1_rabit_test.yaml b/examples/xgboost/smoke-dist/xgboostjob_v1alpha1_rabit_test.yaml index 24fe7526fc..5364eb07cd 100644 
--- a/examples/xgboost/smoke-dist/xgboostjob_v1alpha1_rabit_test.yaml +++ b/examples/xgboost/smoke-dist/xgboostjob_v1alpha1_rabit_test.yaml @@ -32,4 +32,3 @@ spec: - containerPort: 9991 name: xgboostjob-port imagePullPolicy: Always - diff --git a/examples/xgboost/xgboost-dist/README.md b/examples/xgboost/xgboost-dist/README.md index 506f6230ec..4234e3ebbc 100644 --- a/examples/xgboost/xgboost-dist/README.md +++ b/examples/xgboost/xgboost-dist/README.md @@ -9,10 +9,10 @@ User can extend provided data reader to read data from distributed data storage **Configure the job runtime via Yaml file** The following files are available to setup distributed XGBoost computation runtime - + To store the model in OSS: -* xgboostjob_v1_iris_train.yaml +* xgboostjob_v1_iris_train.yaml * xgboostjob_v1_iris_predict.yaml To store the model in local path: @@ -160,7 +160,7 @@ Kind: XGBoostJob Metadata: Creation Timestamp: 2019-06-27T06:06:53Z Generation: 8 - Resource Version: 394523 + Resource Version: 394523 UID: c2a04cbc-98a1-11e9-bbab-080027dfbfe2 Spec: Run Policy: @@ -249,8 +249,8 @@ Events: **Start the distributed XGBoost train to store the model locally** Before proceeding with training we will create a PVC to store the model trained. -Creating pvc : -create a yaml file with the below content +Creating pvc : +create a yaml file with the below content pvc.yaml ``` apiVersion: v1 @@ -268,13 +268,13 @@ spec: ``` kubectl create -f pvc.yaml ``` -Note: +Note: * Please use the storage class which supports ReadWriteMany. The example yaml above uses glusterfs * Mention model_storage_type=local and model_path accordingly( In the example /tmp/xgboost_model/2 is used ) in xgboostjob_v1_iris_train_local.yaml and xgboostjob_v1_iris_predict_local.yaml" -Now start the distributed XGBoost train. +Now start the distributed XGBoost train. 
``` kubectl create -f xgboostjob_v1_iris_train_local.yaml ``` @@ -382,7 +382,7 @@ status: Master: succeeded: 1 Worker: - succeeded: 2 + succeeded: 2 ``` **Start the distributed XGBoost job predict** ``` diff --git a/examples/xgboost/xgboost-dist/local_test.py b/examples/xgboost/xgboost-dist/local_test.py index 17a80f2c61..4d4a321cb2 100644 --- a/examples/xgboost/xgboost-dist/local_test.py +++ b/examples/xgboost/xgboost-dist/local_test.py @@ -18,8 +18,10 @@ import numpy as np from sklearn.metrics import precision_score -from utils import dump_model, read_model, read_predict_data, read_train_data - +from utils import dump_model +from utils import read_model +from utils import read_predict_data +from utils import read_train_data import xgboost as xgb logger = logging.getLogger(__name__) diff --git a/examples/xgboost/xgboost-dist/predict.py b/examples/xgboost/xgboost-dist/predict.py index 91dd29cc11..24cafeefe0 100644 --- a/examples/xgboost/xgboost-dist/predict.py +++ b/examples/xgboost/xgboost-dist/predict.py @@ -14,7 +14,9 @@ import numpy as np from sklearn.metrics import precision_score -from utils import extract_xgbooost_cluster_env, read_model, read_predict_data +from utils import extract_xgbooost_cluster_env +from utils import read_model +from utils import read_predict_data def predict(args): diff --git a/examples/xgboost/xgboost-dist/tracker.py b/examples/xgboost/xgboost-dist/tracker.py index 1074f594ae..73d4e06b55 100644 --- a/examples/xgboost/xgboost-dist/tracker.py +++ b/examples/xgboost/xgboost-dist/tracker.py @@ -24,8 +24,8 @@ import struct import subprocess import sys -import time from threading import Thread +import time class ExSocket(object): @@ -503,4 +503,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/examples/xgboost/xgboost-dist/train.py b/examples/xgboost/xgboost-dist/train.py index f1da6a2cf5..896959517b 100644 --- a/examples/xgboost/xgboost-dist/train.py +++ 
b/examples/xgboost/xgboost-dist/train.py @@ -15,8 +15,8 @@ import traceback from tracker import RabitTracker -from utils import extract_xgbooost_cluster_env, read_train_data - +from utils import extract_xgbooost_cluster_env +from utils import read_train_data import xgboost as xgb logger = logging.getLogger(__name__) diff --git a/examples/xgboost/xgboost-dist/utils.py b/examples/xgboost/xgboost-dist/utils.py index c9ac7ddfa2..d605cca93f 100644 --- a/examples/xgboost/xgboost-dist/utils.py +++ b/examples/xgboost/xgboost-dist/utils.py @@ -19,7 +19,6 @@ import oss2 import pandas as pd from sklearn import datasets - import xgboost as xgb logger = logging.getLogger(__name__) @@ -142,7 +141,7 @@ def dump_model(model, type, model_path, args): if oss_param is None: raise Exception("Please config oss parameter to store model") - oss_param['path'] = args.model_path + oss_param['path'] = args.model_path dump_model_to_oss(oss_param, model) logging.info("Dump model into oss place %s", args.model_path) @@ -168,7 +167,7 @@ def read_model(type, model_path, args): raise Exception("Please config oss to read model") return False - oss_param['path'] = args.model_path + oss_param['path'] = args.model_path model = read_model_from_oss(oss_param) logging.info("read model from oss place %s", model_path) @@ -283,10 +282,9 @@ def parse_parameters(input, splitter_between, splitter_in): conf = kv.split(splitter_in) key = conf[0].strip(" ") if key == "objective" or key == "endpoint": - value = conf[1].strip("'") + ":" + conf[2].strip("'") + value = conf[1].strip("'") + ":" + conf[2].strip("'") else: value = conf[1] confs[key] = value return confs - diff --git a/examples/xgboost/xgboost-dist/xgboostjob_v1_iris_predict.yaml b/examples/xgboost/xgboost-dist/xgboostjob_v1_iris_predict.yaml index 218219c818..9f0c773b1f 100644 --- a/examples/xgboost/xgboost-dist/xgboostjob_v1_iris_predict.yaml +++ b/examples/xgboost/xgboost-dist/xgboostjob_v1_iris_predict.yaml @@ -38,5 +38,3 @@ spec: - 
--model_path=autoAI/xgb-opt/2 - --model_storage_type=oss - --oss_param=unknown - - diff --git a/examples/xgboost/xgboost-dist/xgboostjob_v1_iris_train.yaml b/examples/xgboost/xgboost-dist/xgboostjob_v1_iris_train.yaml index 3600a0a7f2..5824ec056d 100644 --- a/examples/xgboost/xgboost-dist/xgboostjob_v1_iris_train.yaml +++ b/examples/xgboost/xgboost-dist/xgboostjob_v1_iris_train.yaml @@ -40,5 +40,3 @@ spec: - --xgboost_parameter="objective:multi:softprob,num_class:3" - --n_estimators=10 - --learning_rate=0.1 - - diff --git a/hack/python-sdk/post_gen.py b/hack/python-sdk/post_gen.py index 07b15a4468..c54767ba79 100755 --- a/hack/python-sdk/post_gen.py +++ b/hack/python-sdk/post_gen.py @@ -18,8 +18,8 @@ This script is used for updating generated SDK files. """ -import os import fileinput +import os import re __replacements = [ diff --git a/sdk/python/Dockerfile.conformance b/sdk/python/Dockerfile.conformance index 4cdc6f3bd6..2af8a23fbe 100644 --- a/sdk/python/Dockerfile.conformance +++ b/sdk/python/Dockerfile.conformance @@ -27,4 +27,4 @@ RUN chmod +x run.sh RUN pip install pytest RUN python -m pip install -e . -ENTRYPOINT [ "./run.sh" ] \ No newline at end of file +ENTRYPOINT [ "./run.sh" ] diff --git a/sdk/python/conformance/run.sh b/sdk/python/conformance/run.sh index 815720a19f..a73d998a9b 100644 --- a/sdk/python/conformance/run.sh +++ b/sdk/python/conformance/run.sh @@ -8,4 +8,4 @@ touch /tmp/training-operator-conformance.done echo "Done..." # Keep the container running so the test logs can be downloaded. 
-while true; do sleep 10000; done \ No newline at end of file +while true; do sleep 10000; done diff --git a/sdk/python/kubeflow/storage_initializer/abstract_dataset_provider.py b/sdk/python/kubeflow/storage_initializer/abstract_dataset_provider.py index 3f75faf0a2..bdae751205 100644 --- a/sdk/python/kubeflow/storage_initializer/abstract_dataset_provider.py +++ b/sdk/python/kubeflow/storage_initializer/abstract_dataset_provider.py @@ -1,4 +1,5 @@ -from abc import ABC, abstractmethod +from abc import ABC +from abc import abstractmethod class datasetProvider(ABC): diff --git a/sdk/python/kubeflow/storage_initializer/abstract_model_provider.py b/sdk/python/kubeflow/storage_initializer/abstract_model_provider.py index 392478a346..88c9c2af4d 100644 --- a/sdk/python/kubeflow/storage_initializer/abstract_model_provider.py +++ b/sdk/python/kubeflow/storage_initializer/abstract_model_provider.py @@ -1,4 +1,5 @@ -from abc import ABC, abstractmethod +from abc import ABC +from abc import abstractmethod class modelProvider(ABC): diff --git a/sdk/python/kubeflow/storage_initializer/hugging_face.py b/sdk/python/kubeflow/storage_initializer/hugging_face.py index 4b5b0794a9..0fbf511350 100644 --- a/sdk/python/kubeflow/storage_initializer/hugging_face.py +++ b/sdk/python/kubeflow/storage_initializer/hugging_face.py @@ -1,16 +1,17 @@ -import logging +from dataclasses import dataclass +from dataclasses import field import json -from typing import Union, Optional -from dataclasses import dataclass, field +import logging +from typing import Optional, Union from urllib.parse import urlparse -import transformers from peft import LoraConfig +import transformers -from .constants import VOLUME_PATH_DATASET, VOLUME_PATH_MODEL -from .abstract_model_provider import modelProvider from .abstract_dataset_provider import datasetProvider - +from .abstract_model_provider import modelProvider +from .constants import VOLUME_PATH_DATASET +from .constants import VOLUME_PATH_MODEL TRANSFORMER_TYPES = 
Union[ transformers.AutoModelForSequenceClassification, @@ -96,8 +97,8 @@ def load_config(self, serialised_args): def download_dataset(self): logger.info("Downloading dataset") logger.info("-" * 40) - import huggingface_hub from datasets import load_dataset + import huggingface_hub if self.config.access_token: huggingface_hub.login(self.config.access_token) diff --git a/sdk/python/kubeflow/storage_initializer/s3.py b/sdk/python/kubeflow/storage_initializer/s3.py index 5f60bbc72d..8dbdc7ef4c 100644 --- a/sdk/python/kubeflow/storage_initializer/s3.py +++ b/sdk/python/kubeflow/storage_initializer/s3.py @@ -1,7 +1,9 @@ -from dataclasses import dataclass, field +from dataclasses import dataclass +from dataclasses import field import json import os from urllib.parse import urlparse + from .abstract_dataset_provider import datasetProvider from .constants import VOLUME_PATH_DATASET diff --git a/sdk/python/kubeflow/storage_initializer/storage.py b/sdk/python/kubeflow/storage_initializer/storage.py index f65d9d324c..b1d59f662b 100644 --- a/sdk/python/kubeflow/storage_initializer/storage.py +++ b/sdk/python/kubeflow/storage_initializer/storage.py @@ -1,5 +1,7 @@ import argparse -from .hugging_face import HuggingFace, HuggingFaceDataset + +from .hugging_face import HuggingFace +from .hugging_face import HuggingFaceDataset from .s3 import S3 diff --git a/sdk/python/kubeflow/trainer/hf_llm_training.py b/sdk/python/kubeflow/trainer/hf_llm_training.py index 26dd4fbe0e..e7ad3a0c5f 100644 --- a/sdk/python/kubeflow/trainer/hf_llm_training.py +++ b/sdk/python/kubeflow/trainer/hf_llm_training.py @@ -1,22 +1,21 @@ import argparse -import logging -from urllib.parse import urlparse import json +import logging import os +from urllib.parse import urlparse -from datasets import load_from_disk, Dataset +from datasets import Dataset +from datasets import load_from_disk from datasets.distributed import split_dataset_by_node -from peft import LoraConfig, get_peft_model +from peft import 
get_peft_model +from peft import LoraConfig import transformers -from transformers import ( - AutoModelForCausalLM, - AutoTokenizer, - AutoModelForImageClassification, - TrainingArguments, - DataCollatorForLanguageModeling, - Trainer, -) - +from transformers import AutoModelForCausalLM +from transformers import AutoModelForImageClassification +from transformers import AutoTokenizer +from transformers import DataCollatorForLanguageModeling +from transformers import Trainer +from transformers import TrainingArguments # Configure logger. log_formatter = logging.Formatter( diff --git a/sdk/python/kubeflow/training/api/training_client.py b/sdk/python/kubeflow/training/api/training_client.py index 972b26829e..dbf27c6f3e 100644 --- a/sdk/python/kubeflow/training/api/training_client.py +++ b/sdk/python/kubeflow/training/api/training_client.py @@ -12,23 +12,22 @@ # See the License for the specific language governing permissions and # limitations under the License. -import multiprocessing -import logging -import time import json -from typing import Optional, Callable, Tuple, List, Dict, Any, Set, Union +import logging +import multiprocessing import queue -from kubernetes import client, config, watch +import time +from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union +from kubeflow.storage_initializer.constants import VOLUME_PATH_DATASET +from kubeflow.storage_initializer.constants import VOLUME_PATH_MODEL from kubeflow.training import models from kubeflow.training.api_client import ApiClient from kubeflow.training.constants import constants from kubeflow.training.utils import utils -from kubeflow.storage_initializer.constants import ( - VOLUME_PATH_DATASET, - VOLUME_PATH_MODEL, -) - +from kubernetes import client +from kubernetes import config +from kubernetes import watch logger = logging.getLogger(__name__) @@ -176,11 +175,16 @@ def train( "Train API dependencies not installed. 
" + "Run: pip install -U 'kubeflow-training[huggingface]' " ) + + # fmt: off + + from kubeflow.storage_initializer.hugging_face import \ + HuggingFaceDatasetParams + from kubeflow.storage_initializer.hugging_face import \ + HuggingFaceModelParams from kubeflow.storage_initializer.s3 import S3DatasetParams - from kubeflow.storage_initializer.hugging_face import ( - HuggingFaceModelParams, - HuggingFaceDatasetParams, - ) + + # fmt: on print( "Thank you for using `train` API for LLMs fine-tuning. This feature is in alpha stage " diff --git a/sdk/python/kubeflow/training/api/training_client_test.py b/sdk/python/kubeflow/training/api/training_client_test.py index 04187ac20c..90ae04637f 100644 --- a/sdk/python/kubeflow/training/api/training_client_test.py +++ b/sdk/python/kubeflow/training/api/training_client_test.py @@ -1,21 +1,21 @@ import multiprocessing -import pytest -from unittest.mock import patch, Mock - from typing import Optional -from kubeflow.training import TrainingClient -from kubeflow.training import KubeflowOrgV1ReplicaSpec +from unittest.mock import Mock +from unittest.mock import patch + +from kubeflow.training import constants from kubeflow.training import KubeflowOrgV1PyTorchJob from kubeflow.training import KubeflowOrgV1PyTorchJobSpec +from kubeflow.training import KubeflowOrgV1ReplicaSpec from kubeflow.training import KubeflowOrgV1RunPolicy from kubeflow.training import KubeflowOrgV1SchedulingPolicy -from kubeflow.training import constants - -from kubernetes.client import V1PodTemplateSpec +from kubeflow.training import TrainingClient +from kubernetes.client import V1Container from kubernetes.client import V1ObjectMeta from kubernetes.client import V1PodSpec -from kubernetes.client import V1Container +from kubernetes.client import V1PodTemplateSpec from kubernetes.client import V1ResourceRequirements +import pytest LIST_RESPONSE = [{"metadata": {"name": "Dummy V1PodList"}}] TEST_NAME = "test" diff --git 
a/sdk/python/kubeflow/training/constants/constants.py b/sdk/python/kubeflow/training/constants/constants.py index 506edd267a..e258105e0f 100644 --- a/sdk/python/kubeflow/training/constants/constants.py +++ b/sdk/python/kubeflow/training/constants/constants.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -from kubeflow.training import models -from typing import Union, Dict +from typing import Dict, Union + from kubeflow.storage_initializer.constants import INIT_CONTAINER_MOUNT_PATH +from kubeflow.training import models # How long to wait in seconds for requests to the Kubernetes API Server. DEFAULT_TIMEOUT = 120 diff --git a/sdk/python/kubeflow/training/utils/utils.py b/sdk/python/kubeflow/training/utils/utils.py index 04665951de..0c112c9003 100644 --- a/sdk/python/kubeflow/training/utils/utils.py +++ b/sdk/python/kubeflow/training/utils/utils.py @@ -13,18 +13,17 @@ # limitations under the License. from datetime import datetime -import os -import logging -import textwrap import inspect -from typing import Optional, Callable, List, Dict, Any, Tuple, Union import json -import threading +import logging +import os import queue +import textwrap +import threading +from typing import Any, Callable, Dict, List, Optional, Tuple, Union -from kubeflow.training.constants import constants from kubeflow.training import models - +from kubeflow.training.constants import constants logger = logging.getLogger(__name__) diff --git a/test_job/README.md b/test_job/README.md index e1055a97b6..e92ab334d2 100644 --- a/test_job/README.md +++ b/test_job/README.md @@ -1,9 +1,9 @@ ## Test Job Controller -This is a Test Job Controller example. As you can see, we have job crd definition under `apis/test_job/v1`. +This is a Test Job Controller example. As you can see, we have job crd definition under `apis/test_job/v1`. 
[code-generator](https://github.com/kubernetes/code-generator) generate deepcopy, clientset and other libraries. -`controler.v1/test_job/test_job_controller` defines a struct `TestJobController` which implements [commonv1.ControllerInterface](../pkg/apis/common/v1/interface.go) +`controler.v1/test_job/test_job_controller` defines a struct `TestJobController` which implements [commonv1.ControllerInterface](../pkg/apis/common/v1/interface.go) ```yaml ├── README.md @@ -26,4 +26,4 @@ This is a Test Job Controller example. As you can see, we have job crd definitio │   └── test_job │   └── test_job_controller.go └── test_util -``` \ No newline at end of file +``` diff --git a/third_party/library/license.txt b/third_party/library/license.txt index 5623527697..865d93aab1 100644 --- a/third_party/library/license.txt +++ b/third_party/library/license.txt @@ -7037,4 +7037,3 @@ kubernetes/kubernetes Apache License 2.0 https://github.com/kubernetes/kuberne WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -