From 11c0f4d2b3c89490b8b422295549fc976dab95ec Mon Sep 17 00:00:00 2001 From: Andrey Velichkevich Date: Thu, 11 Jan 2024 17:53:06 +0000 Subject: [PATCH] Add Kubeflow Website links to README --- README.md | 23 +++++++----- docs/quick-start-v1.md | 82 ------------------------------------------ 2 files changed, 15 insertions(+), 90 deletions(-) delete mode 100644 docs/quick-start-v1.md diff --git a/README.md b/README.md index 2dee21f2b1..9fc735ef54 100644 --- a/README.md +++ b/README.md @@ -6,8 +6,9 @@ ## Overview -Starting from v1.3, this training operator provides Kubernetes custom resources that makes it easy to -run distributed or non-distributed TensorFlow/PyTorch/Apache MXNet/XGBoost/MPI jobs on Kubernetes. +Kubeflow Training Operator is a Kubernetes-native project for fine-tuning and +scalable distributed training of machine learning (ML) models created with various ML frameworks +such as PyTorch, TensorFlow, XGBoost, MPI, Paddle and others. > Note: Before v1.2 release, Kubeflow Training Operator only supports TFJob on Kubernetes. @@ -23,7 +24,7 @@ run distributed or non-distributed TensorFlow/PyTorch/Apache MXNet/XGBoost/MPI j ## Prerequisites -- Version >= 1.23 of Kubernetes cluster and `kubectl` +- Version >= 1.25 of Kubernetes cluster and `kubectl` ## Installation @@ -36,7 +37,7 @@ kubectl apply -k "github.com/kubeflow/training-operator/manifests/overlays/stand ### Stable Release ```bash -kubectl apply -k "github.com/kubeflow/training-operator/manifests/overlays/standalone?ref=v1.5.0" +kubectl apply -k "github.com/kubeflow/training-operator/manifests/overlays/standalone?ref=v1.7.0" ``` ### TensorFlow Release Only @@ -49,7 +50,8 @@ kubectl apply -k "github.com/kubeflow/training-operator/manifests/overlays/stand ### Python SDK for Kubeflow Training Operator -Training Operator provides Python SDK for the custom resources. More docs are available in [sdk/python](sdk/python) folder. +Training Operator provides Python SDK for the custom resources.
For the API reference, see +[the SDK `TrainingClient`](sdk/python/kubeflow/training/api/training_client.py). Use `pip install` command to install the latest release of the SDK: @@ -57,9 +59,12 @@ Use `pip install` command to install the latest release of the SDK: pip install kubeflow-training ``` -## Quick Start +Training Operator controller and Python SDK have the same release versions. -Please refer to the [quick-start-v1.md](docs/quick-start-v1.md) and [Kubeflow Training User Guide](https://www.kubeflow.org/docs/guides/components/tftraining/) for more information. +## Quickstart + +Please refer to the [getting started guide](https://www.kubeflow.org/docs/components/training/overview/#getting-started) +to quickly create your first Training Operator Job. ## API Documentation @@ -97,7 +102,9 @@ The following table lists the most recent few versions of the operator. | `v1.3.x` | `v1` | 1.18+ | | `v1.4.x` | `v1` | 1.23+ | | `v1.5.x` | `v1` | 1.23+ | -| `latest` (master HEAD) | `v1` | 1.23+ | +| `v1.6.x` | `v1` | 1.23+ | +| `v1.7.x` | `v1` | 1.25+ | +| `latest` (master HEAD) | `v1` | 1.25+ | ## Acknowledgement diff --git a/docs/quick-start-v1.md b/docs/quick-start-v1.md deleted file mode 100644 index 17f687c4cb..0000000000 --- a/docs/quick-start-v1.md +++ /dev/null @@ -1,82 +0,0 @@ -# Testing v1 - -TFJob is currently in v1. The quick start shows an example of TFJob. -For more details please refer to [developer_guide.md](development/developer_guide.md). - -## Create a TFJob - -Please see the [example](../examples/tensorflow/dist-mnist/README.md) to create a TFJob.
- -## Monitor your job - -To get the status of your job - -``` -kubectl get -o yaml tfjobs $JOB -``` - -Here is sample output for an example job - -```yaml -apiVersion: kubeflow.org/v1 -kind: TFJob -metadata: - creationTimestamp: 2019-03-06T09:50:49Z - generation: 1 - name: dist-mnist-for-e2e-test - namespace: kubeflow - resourceVersion: "16575458" - selfLink: /apis/kubeflow.org/v1/namespaces/kubeflow/tfjobs/dist-mnist-for-e2e-test - uid: 526545f8-3ff5-11e9-a818-0016ac101ba4 -spec: - cleanPodPolicy: Running - tfReplicaSpecs: - PS: - replicas: 2 - restartPolicy: Never - template: - metadata: - creationTimestamp: null - spec: - containers: - - image: kubeflow/tf-dist-mnist-test:1.0 - name: tensorflow - ports: - - containerPort: 2222 - name: tfjob-port - resources: {} - Worker: - replicas: 4 - restartPolicy: Never - template: - metadata: - creationTimestamp: null - spec: - containers: - - image: kubeflow/tf-dist-mnist-test:1.0 - name: tensorflow - ports: - - containerPort: 2222 - name: tfjob-port - resources: {} -status: - conditions: - - lastTransitionTime: 2019-03-06T09:50:36Z - lastUpdateTime: 2019-03-06T09:50:36Z - message: TFJob dist-mnist-for-e2e-test is created. - reason: TFJobCreated - status: "True" - type: Created - - lastTransitionTime: 2019-03-06T09:50:57Z - lastUpdateTime: 2019-03-06T09:50:57Z - message: TFJob dist-mnist-for-e2e-test is running. - reason: TFJobRunning - status: "True" - type: Running - replicaStatuses: - PS: - active: 2 - Worker: - active: 4 - startTime: 2019-03-06T09:50:48Z -```