Skip to content

Commit

Permalink
[SDK] Fix trainer error: Update the version of base image and add "num_labels" for downloading pretrained models (kubeflow#2230)
Browse files Browse the repository at this point in the history

* fix trainer error

Signed-off-by: helenxie-bit <[email protected]>

* rerun tests

Signed-off-by: helenxie-bit <[email protected]>

* update how num_labels is processed in the trainer

Signed-off-by: helenxie-bit <[email protected]>

* rerun tests

Signed-off-by: helenxie-bit <[email protected]>

* adjust the default value of 'num_labels'

Signed-off-by: helenxie-bit <[email protected]>

---------

Signed-off-by: helenxie-bit <[email protected]>
  • Loading branch information
helenxie-bit authored Aug 28, 2024
1 parent ea5272f commit e9766d1
Show file tree
Hide file tree
Showing 4 changed files with 22 additions and 9 deletions.
1 change: 1 addition & 0 deletions sdk/python/kubeflow/storage_initializer/hugging_face.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ class HuggingFaceModelParams:
model_uri: str
transformer_type: TRANSFORMER_TYPES
access_token: str = None
num_labels: Optional[int] = None

def __post_init__(self):
# Custom checks or validations can be added here
Expand Down
2 changes: 1 addition & 1 deletion sdk/python/kubeflow/trainer/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Use an official Pytorch runtime as a parent image
FROM nvcr.io/nvidia/pytorch:23.10-py3
FROM nvcr.io/nvidia/pytorch:24.06-py3

# Set the working directory in the container
WORKDIR /app
Expand Down
26 changes: 18 additions & 8 deletions sdk/python/kubeflow/trainer/hf_llm_training.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,17 +28,26 @@
logger.setLevel(logging.INFO)


def setup_model_and_tokenizer(model_uri, transformer_type, model_dir):
def setup_model_and_tokenizer(model_uri, transformer_type, model_dir, num_labels):
# Set up the model and tokenizer
parsed_uri = urlparse(model_uri)
model_name = parsed_uri.netloc + parsed_uri.path

model = transformer_type.from_pretrained(
pretrained_model_name_or_path=model_name,
cache_dir=model_dir,
local_files_only=True,
trust_remote_code=True,
)
if num_labels != "None":
model = transformer_type.from_pretrained(
pretrained_model_name_or_path=model_name,
cache_dir=model_dir,
local_files_only=True,
trust_remote_code=True,
num_labels=int(num_labels),
)
else:
model = transformer_type.from_pretrained(
pretrained_model_name_or_path=model_name,
cache_dir=model_dir,
local_files_only=True,
trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained(
pretrained_model_name_or_path=model_name,
Expand Down Expand Up @@ -151,6 +160,7 @@ def parse_arguments():

parser.add_argument("--model_uri", help="model uri")
parser.add_argument("--transformer_type", help="model transformer type")
parser.add_argument("--num_labels", default="None", help="number of classes")
parser.add_argument("--model_dir", help="directory containing model")
parser.add_argument("--dataset_dir", help="directory containing dataset")
parser.add_argument("--lora_config", help="lora_config")
Expand Down Expand Up @@ -178,7 +188,7 @@ def parse_arguments():

logger.info("Setup model and tokenizer")
model, tokenizer = setup_model_and_tokenizer(
args.model_uri, transformer_type, args.model_dir
args.model_uri, transformer_type, args.model_dir, args.num_labels
)

logger.info("Preprocess dataset")
Expand Down
2 changes: 2 additions & 0 deletions sdk/python/kubeflow/training/api/training_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,8 @@ def train(
model_provider_parameters.model_uri,
"--transformer_type",
model_provider_parameters.transformer_type.__name__,
"--num_labels",
str(model_provider_parameters.num_labels),
"--model_dir",
VOLUME_PATH_MODEL,
"--dataset_dir",
Expand Down

0 comments on commit e9766d1

Please sign in to comment.