diff --git a/docs/proposals/2170-kubeflow-training-v2/README.md b/docs/proposals/2170-kubeflow-training-v2/README.md
index 494795f7cb..7f34927153 100644
--- a/docs/proposals/2170-kubeflow-training-v2/README.md
+++ b/docs/proposals/2170-kubeflow-training-v2/README.md
@@ -271,7 +271,7 @@ type TrainJobSpec struct {
TrainingRuntimeRef *TrainingRuntimeRef `json:"trainingRuntimeRef"`
// Parameters that data scientists can override
- TrainerConfig *TrainerConfig `json:"trainerConfig,omitempty"`
+ Trainer *Trainer `json:"trainer,omitempty"`
// Configuration for training dataset
DatasetConfig *DatasetConfig `json:"datasetConfig,omitempty"`
@@ -315,7 +315,7 @@ This table explain rationale for each `TrainJob` parameter:
- TrainerConfig
+ | Trainer
|
Configuration for the Trainer such as image, number of nodes, accelerators.
|
@@ -360,7 +360,7 @@ metadata:
spec:
trainingRuntimeRef:
name: torch-distributed-multi-node
- trainerConfig:
+ trainer:
image: docker.io/custom-training
command:
- torchrun train.py
@@ -418,17 +418,22 @@ spec:
trainingRuntimeRef:
name: torch-tune-llama-7b
datasetConfig:
- s3:
- bucket: datasets
- path: custom-datasets/yelp-review
+ storageUri: s3://dataset/custom-dataset/yelp-review
+ parameters:
+ split: train[:5000]
+ modelConfig:
+ input:
+ storageUri: hf://yelp-review-full
+ output:
+ storageUri: s3://trained-model
```
-### The Trainer Config API
+### The Trainer API
-The `TrainerConfig` represents the APIs that data scientists can use to configure trainer settings:
+The `Trainer` represents the APIs that data scientists can use to configure trainer settings:
```golang
-type TrainerConfig struct {
+type Trainer struct {
// Docker image for the Trainer.
Image string `json:"image,omitempty"`
@@ -455,11 +460,11 @@ type TrainerConfig struct {
}
```
-The following table explains how `TrainingRuntime` parameters will be overridden with `TrainerConfig`.
+The following table explains how `TrainingRuntime` parameters will be overridden with `Trainer`.
- TrainerConfig Parameter
+ | Trainer Parameter
|
TrainingRuntime Parameter
|
@@ -509,110 +514,53 @@ The `DatasetConfig` represents the APIs that data scientists can use to configur
```golang
type DatasetConfig struct {
- // One of the following can be set.
- HFDatasetProvider *kubeflowv1.HFDatasetProvider `json:"huggingface,omitempty"`
+ // Storage uri for the dataset provider.
+ StorageUri string `json:"storageUri"`
- S3DatasetProvider *kubeflowv1.S3DatasetProvider `json:"s3,omitempty"`
+ // Custom parameters for the dataset initializer.
+ Parameters map[string]string `json:"parameters,omitempty"`
- // (Optional). We can support the Iceberg dataset using PyIceberg.
- IcebergDatasetProvider *kubeflowv1.IcebergDatasetProvider `json:"iceberg,omitempty"`
-}
-
-type HFDatasetProvider struct {
- // Path to the HF dataset. For example: yelp-review-full
- RepoId string `json:"repoId"`
-
- // Whether the dataset needs to be splitted: train/val. E.g. split=train[:50000]
- Split string `json:"split,omitempty"`
-
- // Secret must contain HF_TOKEN secret.
- // If the secret object contains more than one secret, all secrets are passed.
- AccessTokenSecretRef corev1.SecretReference `json:"accessToken,omitempty"`
+ // Reference to the secrets to access dataset.
+ SecretRef corev1.SecretReference `json:"secretRef,omitempty"`
}
+```
+Initially we will support the following dataset providers:
-type S3DatasetProvider struct {
- // S3 endpoint.
- EndpointUrl string `json:"endpointUrl,omitempty"`
+- **S3:** `storageUri: s3://bucket-name/path/dataset`
+- **HuggingFace:** `storageUri: hf://repo-id`
- // Name of the S3 bucket.
- BucketName string `json:bucketName`
+Parameters will be converted to the environment variables for the `dataset-initializer` container
+in the `Initializer` Job.
- // Path to the dataset. All files will be downloaded in that path.
- Path string `json:path`
+For example:
- // Secret must contain AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY secret.
- // If the secret object contains more than one secret, all secrets are passed.
- // Otherwise, IRSA can be used to access S3.
- AccessTokenSecretRef corev1.SecretReference `json:"accessToken,omitempty"`
-}
+```yaml
+datasetConfig:
+ storageUri: s3://datasets/yelp-review
+ parameters:
+ endpointUrl: s3.custom.com
```
-The following tables explains how `TrainingRuntime` parameters will be overridden with the
-`DatasetConfig`.
-
-All parameters will be set for this container:
+Will be converted to:
```yaml
-.spec.replicatedJobs[name='Initializer'].template.spec.template.spec.containers[name=dataset-initializer]
+replicatedJobs:
+ - name: Initializer
+ template:
+ spec:
+ template:
+ spec:
+ containers:
+ - name: dataset-initializer
+ image: docker.io/kubeflow/dataset-initializer
+ env:
+ - name: STORAGE_URI
+ value: s3://datasets/yelp-review
+ - name: ENDPOINT_URL
+ value: s3.custom.com
```
-#### The HuggingFace Provider
-
-For the HuggingFace provider environment variable `PROVIDER=hf`.
-
-
-
- DatasetConfig Parameter
- |
- TrainingRuntime Parameter
- |
-
-
- .huggingface.repoId
- |
- .env[REPO_ID]
- |
-
-
- .huggingface.split
- |
- .env[SPLIT]
- |
-
-
-
-#### The S3 Provider
-
-For the S3 provider environment variable `PROVIDER=s3`.
-
-
-
- DatasetConfig Parameter
- |
- TrainingRuntime Parameter
- |
-
-
- .s3.endpointUrl
- |
- .env[ENDPOINT_URL]
- |
-
-
- .s3.bucketName
- |
- .env[BUCKET_NAME]
- |
-
-
- .s3.path
- |
- .env[PATH]
- |
-
-
-
### The Model Config API
The `ModelConfig` represents the APIs that data scientists can use to configure pre-trained model
@@ -620,59 +568,171 @@ input and output location.
```golang
type ModelConfig struct {
- // One of the following can be set.
- HFModelProvider *kubeflowv1.HFModelProvider `json:"huggingface,omitempty"`
+ // Configuration for pre-trained model.
+ Input *InputModel `json:"input,omitempty"`
- // Potential output location for fine-tuned/trained model.
- OutputArtifact string
+ // Configuration for trained model.
+ Output *OutputModel `json:"output,omitempty"`
}
+type InputModel struct {
+ // Storage uri for the model provider.
+ StorageUri string `json:"storageUri"`
-type HFModelProvider struct {
- // Path to the pre-trained model. google-bert/bert-base-uncased
- RepoID string `json:"repoID"`
+ // Custom parameters for the model initializer.
+ Parameters map[string]string `json:"parameters,omitempty"`
- // TODO (andreyvelich): Do we want to support any Transformer ?
- TransformerType string `json:"transformerType,omitempty"`
+ // Reference to the secrets to access model.
+ SecretRef corev1.SecretReference `json:"secretRef,omitempty"`
+}
- // Secret must contain HF_TOKEN secret.
- // If the secret object contains more than one secret, all secrets are passed.
- AccessTokenSecretRef corev1.SecretReference `json:"accessToken,omitempty"`
+type OutputModel struct {
+ // Storage uri for the exported model.
+ StorageUri string `json:"storageUri"`
+
+ // Custom parameters for the model exporter.
+ Parameters map[string]string `json:"parameters,omitempty"`
+
+ // Reference to the secrets to export model.
+ SecretRef corev1.SecretReference `json:"secretRef,omitempty"`
}
```
-The following table explains how `TrainingRuntime` parameters will be overridden with `ModelConfig`.
+#### The Input Model API
+
+Initially we will support the following model providers:
+
+- **HuggingFace:** `storageUri: hf://model-name`
+
+Parameters will be converted to the environment variables for the `model-initializer` container
+in the `Initializer` Job.
-All parameters will be set for this container:
+For example:
```yaml
-.spec.replicatedJobs[name='Initializer'].template.spec.template.spec.containers[name=model-initializer]
+modelConfig:
+ storageUri: hf://bert-base-cased
+ parameters:
+ transformerType: AutoModelForCausalLM
```
-#### The HuggingFace Provider
+Will be converted to:
-For the HuggingFace provider environment variable `PROVIDER=hf`.
+```yaml
+replicatedJobs:
+ - name: Initializer
+ template:
+ spec:
+ template:
+ spec:
+ containers:
+ - name: model-initializer
+ image: docker.io/kubeflow/model-initializer
+ env:
+ - name: STORAGE_URI
+ value: hf://bert-base-cased
+ - name: TRANSFORMER_TYPE
+ value: AutoModelForCausalLM
+```
-
-
- DatasetConfig Parameter
- |
- TrainingRuntime Parameter
- |
-
-
- .huggingface.repoId
- |
- .env[REPO_ID]
- |
-
-
- .huggingface.split
- |
- .env[SPLIT]
- |
-
-
+#### The Output Model API
+
+After initial implementation of `TrainJob` and `TrainingRuntime`, we will support the ability to export
+the trained model. The following runtime can be implemented:
+
+```yaml
+apiVersion: kubeflow.org/v2alpha1
+kind: ClusterTrainingRuntime
+metadata:
+ name: torch-tune-llama-7b-export
+spec:
+ numNodes: 1
+ startupPolicy:
+ startupPolicyOrder: InOrder
+ replicatedJobs:
+ - name: Initializer
+ template:
+ spec:
+ template:
+ spec:
+ containers:
+ - name: dataset-initializer
+ image: docker.io/kubeflow/dataset-initializer
+ env:
+ - name: STORAGE_URI
+ value: hf://tatsu-lab/alpaca
+ volumeMounts:
+ - mountPath: /workspace/dataset
+ name: dataset-initializer
+ - name: model-initializer
+ image: docker.io/kubeflow/model-initializer
+ env:
+ - name: STORAGE_URI
+ value: hf://meta-llama/Llama-2-7b
+ volumeMounts:
+ - mountPath: /workspace/model
+ name: model-initializer
+ volumes:
+ - name: dataset-initializer
+ persistentVolumeClaim:
+ claimName: dataset-initializer
+ - name: model-initializer
+ persistentVolumeClaim:
+ claimName: model-initializer
+ - name: Node
+ template:
+ spec:
+ template:
+ spec:
+ containers:
+ - name: trainer
+ image: docker.io/kubeflow/llm-trainer
+ env:
+ - name: MASTER_ADDR
+ value: "pytorch-node-0-0.pytorch"
+ - name: MASTER_PORT
+ value: "29400"
+ - name: LORA_CONFIG
+ value: |
+ {"peft_type": "LORA", "r": 8, "lora_alpha": 16}
+ command:
+ - torchrun hf_llm_training.py
+ resources:
+ limits:
+ nvidia.com/gpu: 2
+ volumeMounts:
+ - mountPath: /workspace/dataset
+ name: dataset-initializer
+ - mountPath: /workspace/pre-trained-model
+ name: model-initializer
+ - mountPath: /workspace/adapters
+ name: model-exporter
+ volumes:
+ - name: dataset-initializer
+ persistentVolumeClaim:
+ claimName: dataset-initializer
+ - name: model-initializer
+ persistentVolumeClaim:
+ claimName: model-initializer
+ - name: model-exporter
+ persistentVolumeClaim:
+ claimName: model-exporter
+ - name: Exporter
+ template:
+ spec:
+ template:
+ spec:
+ containers:
+ - name: model-exporter
+ image: docker.io/kubeflow/model-exporter
+ volumeMounts:
+ - mountPath: /workspace/adapters
+ name: model-exporter
+ volumes:
+ - name: model-exporter
+ persistentVolumeClaim:
+ claimName: model-exporter
+```
### The Pod Spec Overrides APIs
@@ -703,7 +763,7 @@ type PodSpecOverride struct {
}
// Override for each container.
-// Parameters from TrainerConfig, DatasetConfig, and ModelConfig will take precedence.
+// Parameters from Trainer, DatasetConfig, and ModelConfig will take precedence.
type Container struct {
// Name for the container.
@@ -740,7 +800,7 @@ metadata:
spec:
trainingRuntimeRef:
name: pytorch-distributed-gpu
- trainerConfig:
+ trainer:
image: docker.io/custom-training
podSpecOverrides:
- targetReplicatedJobs:
@@ -805,7 +865,7 @@ type TrainingRuntime struct {
JobSetSpec *batchv1.JobSetSpec `json:",inline"`
// For gang-scheduling using volcano or scheduler plugins, supported for all frameworks.
- GangScheduler *kubeflowv1.GangScheduler `json:"gangScheduler,omitempty"`
+ GangScheduler *GangScheduler `json:"gangScheduler,omitempty"`
}
// One of the specs can be selected.
@@ -836,7 +896,7 @@ type GangSchedulerPlugin string
const (
GangSchedulerPluginVolcano GangSchedulerPlugin = "volcano"
- GangSchedulerPluginSP GangSchedulerPlugin = "scheduler-plugins"
+ GangSchedulerPlugins GangSchedulerPlugin = "scheduler-plugins"
)
```
@@ -971,7 +1031,7 @@ metadata:
spec:
trainingRuntimeRef:
name: torch-distributed-multi-node
- trainerConfig:
+ trainer:
resourcesPerNode:
requests:
nvidia.com/gpu: 1
@@ -1102,20 +1162,16 @@ spec:
- name: dataset-initializer
image: docker.io/kubeflow/dataset-initializer
env:
- - name: DATASET_PROVIDER
- value: hf
- - name: REPO_ID
- value: tatsu-lab/alpaca
+ - name: STORAGE_URI
+ value: hf://tatsu-lab/alpaca
volumeMounts:
- mountPath: /workspace/dataset
name: dataset-initializer
- name: model-initializer
image: docker.io/kubeflow/model-initializer
env:
- - name: MODEL_PROVIDER
- value: hf
- - name: REPO_ID
- value: meta-llama/Llama-2-7b
+ - name: STORAGE_URI
+ value: hf://meta-llama/Llama-2-7b
- name: TRANSFORMER_TYPE
value: AutoModelForCausalLM
volumeMounts:
@@ -1188,20 +1244,16 @@ spec:
- name: dataset-initializer
image: docker.io/kubeflow/dataset-initializer
env:
- - name: DATASET_PROVIDER
- value: hf
- - name: REPO_ID
- value: tatsu-lab/alpaca
+ - name: STORAGE_URI
+ value: hf://tatsu-lab/alpaca
volumeMounts:
- mountPath: /workspace/dataset
name: dataset-initializer
- name: model-initializer
image: docker.io/kubeflow/model-initializer
env:
- - name: MODEL_PROVIDER
- value: hf
- - name: REPO_ID
- value: google/gemma-7b
+ - name: STORAGE_URI
+ value: hf://google/gemma-7b
- name: TRANSFORMER_TYPE
value: AutoModelForCausalLM
volumeMounts:
@@ -1277,6 +1329,8 @@ spec:
containers:
- name: mpi-launcher
image: docker.io/mpi-launch
+ command:
+ - mpirun -np 5 --host mpi-simple.default.svc
- name: Node
template:
spec: