[SDK] Add docstring for Train API
Signed-off-by: Andrey Velichkevich <[email protected]>
andreyvelich committed Apr 19, 2024
1 parent 7345e33 commit 9b46305
Showing 2 changed files with 77 additions and 17 deletions.
6 changes: 3 additions & 3 deletions sdk/python/kubeflow/storage_initializer/hugging_face.py
@@ -46,7 +46,7 @@ def __post_init__(self):


@dataclass
class HuggingFaceTrainParams:
class HuggingFaceTrainerParams:
training_parameters: transformers.TrainingArguments = field(
default_factory=transformers.TrainingArguments
)
@@ -77,7 +77,7 @@ def download_model_and_tokenizer(self):


@dataclass
class HfDatasetParams:
class HuggingFaceDatasetParams:
repo_id: str
access_token: Optional[str] = None
# TODO (andreyvelich): Discuss where we should specify dataset preprocess parameters.
@@ -91,7 +91,7 @@ def __post_init__(self):

class HuggingFaceDataset(datasetProvider):
def load_config(self, serialised_args):
self.config = HfDatasetParams(**json.loads(serialised_args))
self.config = HuggingFaceDatasetParams(**json.loads(serialised_args))

def download_dataset(self):
logger.info("Downloading dataset")
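For context, a minimal sketch of how the renamed dataset provider classes fit together, based on the `load_config` implementation in the diff above; constructing the provider directly like this is illustrative only, since in practice the Storage Initializer container passes the serialized arguments itself.

```
import json
from dataclasses import asdict

from kubeflow.storage_initializer.hugging_face import (
    HuggingFaceDataset,
    HuggingFaceDatasetParams,
)

# The Storage Initializer receives dataset arguments as a JSON string and
# rebuilds the dataclass inside `load_config`, as shown in the diff above.
params = HuggingFaceDatasetParams(repo_id="yelp_review_full")
provider = HuggingFaceDataset()
provider.load_config(json.dumps(asdict(params)))
# provider.download_dataset() would then fetch the dataset to the shared volume.
```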
88 changes: 74 additions & 14 deletions sdk/python/kubeflow/training/api/training_client.py
@@ -99,21 +99,73 @@ def train(
namespace: Optional[str] = None,
num_workers: int = 1,
num_procs_per_worker: int = 1,
resources_per_worker: Union[dict, client.V1ResourceRequirements, None] = None,
model_provider_parameters=None,
dataset_provider_parameters=None,
trainer_parameters=None,
storage_config: Dict[str, Optional[Union[str, List[str]]]] = {
"size": constants.PVC_DEFAULT_SIZE,
"storage_class": None,
"access_modes": constants.PVC_DEFAULT_ACCESS_MODES,
},
model_provider_parameters=None,
dataset_provider_parameters=None,
train_parameters=None,
resources_per_worker: Union[dict, client.V1ResourceRequirements, None] = None,
):
"""
Higher level train api
model_provider_parameters: It can be of type HuggingFaceModelParams
dataset_provider_parameters: It can be of type HfDatasetParams or S3DatasetParams
train_parameters: It can be of type HuggingFaceTrainParams
"""High level API to fine-tune LLMs with distributed PyTorchJob. Follow this guide
for more information about this feature: TODO (andreyvelich): Add link.
It uses the pre-created Storage Initializer to download pre-trained model and dataset, and
Trainer to fine-tune LLM. Your cluster should support PVC with ReadOnlyMany access mode
to distribute data across PyTorchJob workers.
It uses `torchrun` CLI to fine-tune model in distributed mode across multiple PyTorchJob
workers. Follow this guide to know more about `torchrun`: https://pytorch.org/docs/stable/elastic/run.html
This feature is in alpha stage and Kubeflow community is looking for your feedback.
Please use #kubeflow-training-operator Slack channel or Kubeflow Training Operator GitHub
for your questions or suggestions.
Args:
name: Name of the PyTorchJob.
namespace: Namespace for the Job. By default, the namespace is taken from the
`TrainingClient` object.
num_workers: Number of PyTorchJob worker replicas for the Job.
num_procs_per_worker: Number of processes per PyTorchJob worker for `torchrun` CLI.
You can use this parameter if you use more than 1 GPU per PyTorchJob worker.
resources_per_worker: A parameter that lets you specify how many
resources each Worker container should have. You can either specify a
kubernetes.client.V1ResourceRequirements object (documented here:
https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1ResourceRequirements.md)
or a dictionary that includes one or more of the following keys:
`cpu`, `memory`, or `gpu` (other keys will be ignored). Appropriate
values for these keys are documented here:
https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/.
For example:
```
{
"cpu": "1",
"memory": "2Gi",
"gpu": "1",
}
```
Please note, `gpu` specifies a resource request with a key of
`nvidia.com/gpu`, i.e. an NVIDIA GPU. If you need a different type
of GPU, pass in a V1ResourceRequirements instance instead, since it's
more flexible. This parameter is optional and defaults to None.
model_provider_parameters: Parameters for the model provider in the Storage Initializer.
For example, the HuggingFace model name and the Transformers model type to load it with,
such as AutoModelForSequenceClassification. This parameter must be of type
`kubeflow.storage_initializer.hugging_face.HuggingFaceModelParams`.
dataset_provider_parameters: Parameters for the dataset provider in the
Storage Initializer. For example, the name of the HuggingFace dataset or
the AWS S3 configuration. This parameter must be of type
`kubeflow.storage_initializer.hugging_face.HuggingFaceDatasetParams` or
`kubeflow.storage_initializer.s3.S3DatasetParams`.
trainer_parameters: Parameters for the LLM Trainer that will fine-tune the pre-trained
model with the given dataset. For example, the LoRA config for parameter-efficient
fine-tuning and HuggingFace training arguments like the optimizer or the number of
training epochs. This parameter must be of type
`kubeflow.storage_initializer.hugging_face.HuggingFaceTrainerParams`.
storage_config: Configuration for the Storage Initializer PVC to download the pre-trained
model and dataset.
"""
try:
import peft
@@ -126,14 +178,20 @@ def train(
from kubeflow.storage_initializer.s3 import S3DatasetParams
from kubeflow.storage_initializer.hugging_face import (
HuggingFaceModelParams,
HfDatasetParams,
HuggingFaceDatasetParams,
)

print(
"Thank you for using `train` API for LLMs fine-tuning. This feature is in alpha stage "
"Kubeflow community is looking for your feedback. Please share your experience "
"via #kubeflow-training-operator Slack channel or Kubeflow Training Operator GitHub."
)

if (
not name
or not model_provider_parameters
or not dataset_provider_parameters
or not train_parameters
or not trainer_parameters
):
raise ValueError("One of the required parameters is None")

@@ -172,7 +230,7 @@

if isinstance(dataset_provider_parameters, S3DatasetParams):
dp = "s3"
elif isinstance(dataset_provider_parameters, HfDatasetParams):
elif isinstance(dataset_provider_parameters, HuggingFaceDatasetParams):
dp = "hf"
else:
raise ValueError(
@@ -210,9 +268,11 @@
"--dataset_dir",
VOLUME_PATH_DATASET,
"--lora_config",
json.dumps(train_parameters.lora_config.__dict__, cls=utils.SetEncoder),
json.dumps(
trainer_parameters.lora_config.__dict__, cls=utils.SetEncoder
),
"--training_parameters",
json.dumps(train_parameters.training_parameters.to_dict()),
json.dumps(trainer_parameters.training_parameters.to_dict()),
],
volume_mounts=[constants.STORAGE_INITIALIZER_VOLUME_MOUNT],
resources=resources_per_worker,
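For reference, a hedged end-to-end sketch of calling the `train` API documented in this commit, using the renamed parameter classes. Field names that do not appear in this diff, such as `model_uri`, `transformer_type`, and `split`, are assumptions and may differ in your SDK version.

```
import transformers
from peft import LoraConfig
from kubeflow.training import TrainingClient
from kubeflow.storage_initializer.hugging_face import (
    HuggingFaceModelParams,
    HuggingFaceDatasetParams,
    HuggingFaceTrainerParams,
)

# Fine-tune a small model on two PyTorchJob workers with one GPU each.
TrainingClient().train(
    name="fine-tune-bert",
    num_workers=2,
    num_procs_per_worker=1,
    # Assumed fields: model_uri and transformer_type.
    model_provider_parameters=HuggingFaceModelParams(
        model_uri="hf://google-bert/bert-base-cased",
        transformer_type=transformers.AutoModelForSequenceClassification,
    ),
    # repo_id appears in the diff above; split is an assumed field.
    dataset_provider_parameters=HuggingFaceDatasetParams(
        repo_id="yelp_review_full",
        split="train[:1000]",
    ),
    trainer_parameters=HuggingFaceTrainerParams(
        lora_config=LoraConfig(r=8, lora_alpha=8, bias="none"),
        training_parameters=transformers.TrainingArguments(
            output_dir="runs/fine-tune-bert",
            num_train_epochs=1,
            per_device_train_batch_size=8,
        ),
    ),
    resources_per_worker={"cpu": "2", "memory": "10Gi", "gpu": "1"},
)
```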

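The new docstring notes that the `gpu` dict key always maps to `nvidia.com/gpu`. A short sketch of requesting a different accelerator through `V1ResourceRequirements` instead; the `amd.com/gpu` key is only an example of a device plugin resource your cluster might expose.

```
from kubernetes import client

# Explicit requests/limits give full control over the resource keys, unlike the
# `cpu`/`memory`/`gpu` dict shorthand, which is limited to NVIDIA GPUs.
resources = client.V1ResourceRequirements(
    requests={"cpu": "4", "memory": "16Gi", "amd.com/gpu": "1"},
    limits={"cpu": "4", "memory": "16Gi", "amd.com/gpu": "1"},
)

# Then pass it to train(..., resources_per_worker=resources).
```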