diff --git a/sdk/python/kubeflow/storage_initializer/hugging_face.py b/sdk/python/kubeflow/storage_initializer/hugging_face.py
index 23cdbf69eb..bb6eb6a1c0 100644
--- a/sdk/python/kubeflow/storage_initializer/hugging_face.py
+++ b/sdk/python/kubeflow/storage_initializer/hugging_face.py
@@ -37,6 +37,7 @@ class HuggingFaceModelParams:
     model_uri: str
     transformer_type: TRANSFORMER_TYPES
     access_token: str = None
+    num_labels: Optional[int] = None
 
     def __post_init__(self):
         # Custom checks or validations can be added here
diff --git a/sdk/python/kubeflow/trainer/Dockerfile b/sdk/python/kubeflow/trainer/Dockerfile
index d0ebee4aa3..6b98e3de31 100644
--- a/sdk/python/kubeflow/trainer/Dockerfile
+++ b/sdk/python/kubeflow/trainer/Dockerfile
@@ -1,5 +1,5 @@
 # Use an official Pytorch runtime as a parent image
-FROM nvcr.io/nvidia/pytorch:23.10-py3
+FROM nvcr.io/nvidia/pytorch:24.06-py3
 
 # Set the working directory in the container
 WORKDIR /app
diff --git a/sdk/python/kubeflow/trainer/hf_llm_training.py b/sdk/python/kubeflow/trainer/hf_llm_training.py
index 48f41d642f..a79445bae2 100644
--- a/sdk/python/kubeflow/trainer/hf_llm_training.py
+++ b/sdk/python/kubeflow/trainer/hf_llm_training.py
@@ -28,17 +28,26 @@
 logger.setLevel(logging.INFO)
 
 
-def setup_model_and_tokenizer(model_uri, transformer_type, model_dir):
+def setup_model_and_tokenizer(model_uri, transformer_type, model_dir, num_labels):
     # Set up the model and tokenizer
     parsed_uri = urlparse(model_uri)
     model_name = parsed_uri.netloc + parsed_uri.path
 
-    model = transformer_type.from_pretrained(
-        pretrained_model_name_or_path=model_name,
-        cache_dir=model_dir,
-        local_files_only=True,
-        trust_remote_code=True,
-    )
+    if num_labels != "None":
+        model = transformer_type.from_pretrained(
+            pretrained_model_name_or_path=model_name,
+            cache_dir=model_dir,
+            local_files_only=True,
+            trust_remote_code=True,
+            num_labels=int(num_labels),
+        )
+    else:
+        model = transformer_type.from_pretrained(
+            pretrained_model_name_or_path=model_name,
+            cache_dir=model_dir,
+            local_files_only=True,
+            trust_remote_code=True,
+        )
 
     tokenizer = AutoTokenizer.from_pretrained(
         pretrained_model_name_or_path=model_name,
@@ -151,6 +160,7 @@ def parse_arguments():
 
     parser.add_argument("--model_uri", help="model uri")
     parser.add_argument("--transformer_type", help="model transformer type")
+    parser.add_argument("--num_labels", default="None", help="number of classes")
     parser.add_argument("--model_dir", help="directory containing model")
     parser.add_argument("--dataset_dir", help="directory containing dataset")
     parser.add_argument("--lora_config", help="lora_config")
@@ -178,7 +188,7 @@
 
     logger.info("Setup model and tokenizer")
     model, tokenizer = setup_model_and_tokenizer(
-        args.model_uri, transformer_type, args.model_dir
+        args.model_uri, transformer_type, args.model_dir, args.num_labels
     )
 
     logger.info("Preprocess dataset")
diff --git a/sdk/python/kubeflow/training/api/training_client.py b/sdk/python/kubeflow/training/api/training_client.py
index c136130858..78bf0df7f1 100644
--- a/sdk/python/kubeflow/training/api/training_client.py
+++ b/sdk/python/kubeflow/training/api/training_client.py
@@ -265,6 +265,8 @@ def train(
                 model_provider_parameters.model_uri,
                 "--transformer_type",
                 model_provider_parameters.transformer_type.__name__,
+                "--num_labels",
+                str(model_provider_parameters.num_labels),
                 "--model_dir",
                 VOLUME_PATH_MODEL,
                 "--dataset_dir",
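
A note on how the new `num_labels` option travels end to end: `TrainingClient.train()` serializes every container argument through `str()`, so an unset `Optional[int]` field reaches `hf_llm_training.py` as the literal string `"None"`. That is why the trainer compares against the string `"None"` rather than `None`, and why the argparse default is the string `"None"`. The following self-contained sketch illustrates that round trip; the helper names `num_labels_cli_args` and `resolve_num_labels` are illustrative only and not part of this patch.

```python
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class HuggingFaceModelParams:
    # Trimmed to the fields relevant here; the real dataclass has more.
    model_uri: str
    num_labels: Optional[int] = None


def num_labels_cli_args(params: HuggingFaceModelParams) -> List[str]:
    # The client calls str() on the value, so an unset field is
    # serialized as the literal string "None".
    return ["--num_labels", str(params.num_labels)]


def resolve_num_labels(raw: str) -> Optional[int]:
    # Mirrors the trainer's check: the sentinel string "None" means
    # "not supplied"; anything else is parsed as an integer.
    return None if raw == "None" else int(raw)


if __name__ == "__main__":
    with_labels = HuggingFaceModelParams(model_uri="hf://bert-base-cased", num_labels=5)
    without_labels = HuggingFaceModelParams(model_uri="hf://bert-base-cased")
    assert resolve_num_labels(num_labels_cli_args(with_labels)[1]) == 5
    assert resolve_num_labels(num_labels_cli_args(without_labels)[1]) is None
```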
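The two `from_pretrained()` branches in the patched `setup_model_and_tokenizer` differ only by the extra `num_labels` kwarg. An alternative shape, not what the patch does, is to build the kwargs once and add `num_labels` conditionally; a minimal sketch under the same string-sentinel contract (`setup_model` is a hypothetical name):

```python
def setup_model(transformer_type, model_name, model_dir, num_labels="None"):
    # Build the from_pretrained() kwargs once, adding num_labels only
    # when the caller actually supplied one.
    kwargs = dict(
        pretrained_model_name_or_path=model_name,
        cache_dir=model_dir,
        local_files_only=True,
        trust_remote_code=True,
    )
    if num_labels != "None":
        kwargs["num_labels"] = int(num_labels)
    return transformer_type.from_pretrained(**kwargs)
```

Either way, forwarding `num_labels` only when it was set matters: passing it explicitly overrides the label count in the model config, which classification heads (e.g. `AutoModelForSequenceClassification`) use to size their output layer, while omitting it preserves whatever the checkpoint's config specifies.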