From 1e80e62bb2f141ddb0e6a04710ce9e8802f11b32 Mon Sep 17 00:00:00 2001 From: Fiona Waters Date: Thu, 23 May 2024 16:49:29 +0100 Subject: [PATCH] updating training script in basic_interactive notebook --- .../additional-demos/hf_interactive.ipynb | 125 ++++++++++-------- .../guided-demos/2_basic_interactive.ipynb | 119 +++++++++-------- .../2_basic_interactive.ipynb | 115 +++++++++------- .../preview_nbs/2_basic_interactive.ipynb | 115 +++++++++------- tests/unit_test.py | 4 + 5 files changed, 269 insertions(+), 209 deletions(-) diff --git a/demo-notebooks/additional-demos/hf_interactive.ipynb b/demo-notebooks/additional-demos/hf_interactive.ipynb index 9a362321b..a7b004cbb 100644 --- a/demo-notebooks/additional-demos/hf_interactive.ipynb +++ b/demo-notebooks/additional-demos/hf_interactive.ipynb @@ -13,7 +13,7 @@ "id": "d4acfb10-1aa1-445d-947e-396ea5ebed1a", "metadata": {}, "source": [ - "In this notebook you will learn how to leverage the **[huggingface](https://huggingface.co/)** support in ray ecosystem to carry out a text classification task using transfer learning. We will be referencing the example **[here](https://huggingface.co/docs/transformers/tasks/sequence_classification)**" + "In this notebook you will learn how to leverage the **[huggingface](https://huggingface.co/)** support in ray ecosystem to carry out a text classification task using transfer learning. We will be referencing the examples **[here](https://huggingface.co/docs/transformers/tasks/sequence_classification)** and **[here](https://docs.ray.io/en/latest/train/getting-started-transformers.html)**." ] }, { @@ -21,9 +21,7 @@ "id": "70b77929-e96c-434e-ada3-8b14795bfbb1", "metadata": {}, "source": [ - "The example carries out a text classification task on **[imdb dataset](https://huggingface.co/datasets/imdb)** and tries to classify the movie reviews as positive or negative. Huggingface library provides an easy way to build a model and the dataset to carry out this classification task. In this case we will be using **distilbert-base-uncased** model which is a **BERT** based model.\n", - "\n", - "Huggingface has a **[built in support for ray ecosystem](https://docs.ray.io/en/releases-1.13.0/_modules/ray/ml/train/integrations/huggingface/huggingface_trainer.html)** which allows the huggingface trainer to scale on CodeFlare and can scale the training as we add additional gpus and can run distributed training across multiple GPUs that will help scale out the training.\n" + "The example carries out a text classification task on **[imdb dataset](https://huggingface.co/datasets/imdb)** and tries to classify the movie reviews as positive or negative. Huggingface library provides an easy way to build a model and the dataset to carry out this classification task. In this case we will be using **distilbert-base-uncased** model which is a **BERT** based model." ] }, { @@ -317,14 +315,13 @@ "assert ray_cluster_uri, \"Ray cluster needs to be started and set before proceeding\"\n", "\n", "import ray\n", - "from ray.air.config import ScalingConfig\n", "\n", "# reset the ray context in case there's already one. 
\n", "ray.shutdown()\n", "# establish connection to ray cluster\n", "\n", "#install additional libraries that will be required for this training\n", - "runtime_env = {\"pip\": [\"transformers\", \"datasets\", \"evaluate\", \"pyarrow<7.0.0\", \"accelerate\"]}\n", + "runtime_env = {\"pip\": [\"transformers==4.41.2\", \"datasets==2.17.0\", \"accelerate==0.31.0\", \"scikit-learn==1.5.0\"]}\n", "\n", "# NOTE: This will work for in-cluster notebook servers (RHODS/ODH), but not for local machines\n", "# To see how to connect from your laptop, go to demo-notebooks/additional-demos/local_interactive.ipynb\n", @@ -354,7 +351,7 @@ "id": "8bdbe888-4f38-4e9a-ae43-67ce89ff9d42", "metadata": {}, "source": [ - "We are using the code based on the example **[here](https://huggingface.co/docs/transformers/tasks/sequence_classification)** . " + "We are using the code based on the examples **[here](https://huggingface.co/docs/transformers/tasks/sequence_classification)** and **[here](https://docs.ray.io/en/latest/train/getting-started-transformers.html)**. " ] }, { @@ -366,66 +363,83 @@ "source": [ "@ray.remote\n", "def train_fn():\n", - " from datasets import load_dataset\n", - " import transformers\n", - " from transformers import AutoTokenizer, TrainingArguments\n", - " from transformers import AutoModelForSequenceClassification\n", + " import os\n", " import numpy as np\n", - " from datasets import load_metric\n", - " import ray\n", - " from ray import tune\n", - " from ray.train.huggingface import HuggingFaceTrainer\n", - "\n", - " dataset = load_dataset(\"imdb\")\n", - " tokenizer = AutoTokenizer.from_pretrained(\"distilbert-base-uncased\")\n", - "\n", - " def tokenize_function(examples):\n", - " return tokenizer(examples[\"text\"], padding=\"max_length\", truncation=True)\n", + " from datasets import load_dataset, load_metric\n", + " import transformers\n", + " from transformers import (\n", + " Trainer,\n", + " TrainingArguments,\n", + " AutoTokenizer,\n", + " AutoModelForSequenceClassification,\n", + " )\n", + " import ray.train.huggingface.transformers\n", + " from ray.train import ScalingConfig\n", + " from ray.train.torch import TorchTrainer\n", "\n", - " tokenized_datasets = dataset.map(tokenize_function, batched=True)\n", + " # When running in a multi-node cluster you will need persistent storage that is accessible across all worker nodes. 
\n", + " # See www.github.com/project-codeflare/codeflare-sdk/tree/main/docs/s3-compatible-storage.md for more information.\n", + " \n", + " def train_func():\n", + " # Datasets\n", + " dataset = load_dataset(\"imdb\")\n", + " tokenizer = AutoTokenizer.from_pretrained(\"distilbert-base-uncased\")\n", "\n", - " #using a fraction of dataset but you can run with the full dataset\n", - " small_train_dataset = tokenized_datasets[\"train\"].shuffle(seed=42).select(range(100))\n", - " small_eval_dataset = tokenized_datasets[\"test\"].shuffle(seed=42).select(range(100))\n", + " def tokenize_function(examples):\n", + " return tokenizer(examples[\"text\"], padding=\"max_length\", truncation=True)\n", "\n", - " print(f\"len of train {small_train_dataset} and test {small_eval_dataset}\")\n", + " small_train_dataset = (\n", + " dataset[\"train\"].select(range(100)).map(tokenize_function, batched=True)\n", + " )\n", + " small_eval_dataset = (\n", + " dataset[\"test\"].select(range(100)).map(tokenize_function, batched=True)\n", + " )\n", "\n", - " ray_train_ds = ray.data.from_huggingface(small_train_dataset)\n", - " ray_evaluation_ds = ray.data.from_huggingface(small_eval_dataset)\n", + " # Model\n", + " model = AutoModelForSequenceClassification.from_pretrained(\n", + " \"distilbert-base-uncased\", num_labels=2\n", + " )\n", "\n", - " def compute_metrics(eval_pred):\n", - " metric = load_metric(\"accuracy\")\n", - " logits, labels = eval_pred\n", - " predictions = np.argmax(logits, axis=-1)\n", - " return metric.compute(predictions=predictions, references=labels)\n", + " def compute_metrics(eval_pred):\n", + " metric = load_metric(\"accuracy\")\n", + " logits, labels = eval_pred\n", + " predictions = np.argmax(logits, axis=-1)\n", + " return metric.compute(predictions=predictions, references=labels)\n", "\n", - " def trainer_init_per_worker(train_dataset, eval_dataset, **config):\n", - " model = AutoModelForSequenceClassification.from_pretrained(\"distilbert-base-uncased\", num_labels=2)\n", + " # Hugging Face Trainer\n", + " training_args = TrainingArguments(\n", + " output_dir=\"test_trainer\",\n", + " evaluation_strategy=\"epoch\",\n", + " save_strategy=\"epoch\",\n", + " report_to=\"none\",\n", + " )\n", "\n", - " training_args = TrainingArguments(\"/tmp/hf_imdb/test\", eval_steps=1, disable_tqdm=True, \n", - " num_train_epochs=1, skip_memory_metrics=True,\n", - " learning_rate=2e-5,\n", - " per_device_train_batch_size=16,\n", - " per_device_eval_batch_size=16, \n", - " weight_decay=0.01,)\n", - " return transformers.Trainer(\n", + " trainer = Trainer(\n", " model=model,\n", " args=training_args,\n", - " train_dataset=train_dataset,\n", - " eval_dataset=eval_dataset,\n", - " compute_metrics=compute_metrics\n", + " train_dataset=small_train_dataset,\n", + " eval_dataset=small_eval_dataset,\n", + " compute_metrics=compute_metrics,\n", " )\n", "\n", - " scaling_config = ScalingConfig(num_workers=4, use_gpu=True) #num workers is the number of gpus\n", "\n", - " # we are using the ray native HuggingFaceTrainer, but you can swap out to use non ray Huggingface Trainer. Both have the same method signature. 
\n", - " # the ray native HFTrainer has built in support for scaling to multiple GPUs\n", - " trainer = HuggingFaceTrainer(\n", - " trainer_init_per_worker=trainer_init_per_worker,\n", - " scaling_config=scaling_config,\n", - " datasets={\"train\": ray_train_ds, \"evaluation\": ray_evaluation_ds},\n", + " callback = ray.train.huggingface.transformers.RayTrainReportCallback()\n", + " trainer.add_callback(callback)\n", + "\n", + " trainer = ray.train.huggingface.transformers.prepare_trainer(trainer)\n", + "\n", + " trainer.train()\n", + "\n", + "\n", + " ray_trainer = TorchTrainer(\n", + " train_func,\n", + " scaling_config=ScalingConfig(num_workers=3, use_gpu=True),\n", + " # Configure persistent storage that is accessible across \n", + " # all worker nodes.\n", + " # Uncomment and update the RunConfig below to include your storage details.\n", + " # run_config=ray.train.RunConfig(storage_path=\"storage path\"),\n", " )\n", - " result = trainer.fit()\n" + " result: ray.train.Result = ray_trainer.fit()" ] }, { @@ -1443,10 +1457,7 @@ "metadata": {}, "source": [ "## Conclusion\n", - "As shown in the above example, you can easily run your Huggingface transfer learning tasks easily and natively on CodeFlare. You can scale them from 1 to n GPUs without requiring you to make any significant code changes and leveraging the native Huggingface trainer. \n", - "\n", - "Also refer to additional notebooks that showcase other use cases\n", - "In our next notebook [./02_codeflare_workflows_encoding.ipynb ] shows an sklearn example and how you can leverage workflows to run experiment pipelines and explore multiple pipelines in parallel on CodeFlare cluster. \n" + "As shown in the above example, you can run your Huggingface transfer learning tasks easily and natively on CodeFlare. You can scale them from 1 to n GPUs without requiring you to make any significant code changes and leveraging the native Huggingface trainer. " ] }, { diff --git a/demo-notebooks/guided-demos/2_basic_interactive.ipynb b/demo-notebooks/guided-demos/2_basic_interactive.ipynb index 6cee82e1d..86142714e 100644 --- a/demo-notebooks/guided-demos/2_basic_interactive.ipynb +++ b/demo-notebooks/guided-demos/2_basic_interactive.ipynb @@ -148,19 +148,17 @@ "metadata": {}, "outputs": [], "source": [ - "#before proceeding make sure the cluster exists and the uri is not empty\n", + "# before proceeding make sure the cluster exists and the uri is not empty\n", "assert ray_cluster_uri, \"Ray cluster needs to be started and set before proceeding\"\n", "\n", "import ray\n", - "from ray.air.config import ScalingConfig\n", "\n", "# reset the ray context in case there's already one. 
\n", "ray.shutdown()\n", "# establish connection to ray cluster\n", "\n", - "#install additional libraries that will be required for model training\n", - "runtime_env = {\"pip\": [\"transformers\", \"datasets\", \"evaluate\", \"pyarrow<7.0.0\", \"accelerate\"]}\n", - "\n", + "# install additional libraries that will be required for model training\n", + "runtime_env = {\"pip\": [\"transformers==4.41.2\", \"datasets==2.17.0\", \"accelerate==0.31.0\", \"scikit-learn==1.5.0\"]}\n", "# NOTE: This will work for in-cluster notebook servers (RHODS/ODH), but not for local machines\n", "# To see how to connect from your laptop, go to demo-notebooks/additional-demos/local_interactive.ipynb\n", "ray.init(address=ray_cluster_uri, runtime_env=runtime_env)\n", @@ -173,7 +171,7 @@ "id": "9711030b", "metadata": {}, "source": [ - "Now that we are connected (and have passed in some package requirements), let's try writing some training code for a DistilBERT transformer model via HuggingFace (using IMDB dataset):" + "Now that we are connected (and have passed in some package requirements), let's try writing some training code:" ] }, { @@ -185,66 +183,83 @@ "source": [ "@ray.remote\n", "def train_fn():\n", - " from datasets import load_dataset\n", - " import transformers\n", - " from transformers import AutoTokenizer, TrainingArguments\n", - " from transformers import AutoModelForSequenceClassification\n", + " import os\n", " import numpy as np\n", - " from datasets import load_metric\n", - " import ray\n", - " from ray import tune\n", - " from ray.train.huggingface import HuggingFaceTrainer\n", - "\n", - " dataset = load_dataset(\"imdb\")\n", - " tokenizer = AutoTokenizer.from_pretrained(\"distilbert-base-uncased\")\n", - "\n", - " def tokenize_function(examples):\n", - " return tokenizer(examples[\"text\"], padding=\"max_length\", truncation=True)\n", + " from datasets import load_dataset, load_metric\n", + " import transformers\n", + " from transformers import (\n", + " Trainer,\n", + " TrainingArguments,\n", + " AutoTokenizer,\n", + " AutoModelForSequenceClassification,\n", + " )\n", + " import ray.train.huggingface.transformers\n", + " from ray.train import ScalingConfig\n", + " from ray.train.torch import TorchTrainer\n", "\n", - " tokenized_datasets = dataset.map(tokenize_function, batched=True)\n", + " # When running in a multi-node cluster you will need persistent storage that is accessible across all worker nodes. 
\n", + " # See www.github.com/project-codeflare/codeflare-sdk/tree/main/docs/s3-compatible-storage.md for more information.\n", + " \n", + " def train_func():\n", + " # Datasets\n", + " dataset = load_dataset(\"imdb\")\n", + " tokenizer = AutoTokenizer.from_pretrained(\"distilbert-base-uncased\")\n", "\n", - " #using a fraction of dataset but you can run with the full dataset\n", - " small_train_dataset = tokenized_datasets[\"train\"].shuffle(seed=42).select(range(100))\n", - " small_eval_dataset = tokenized_datasets[\"test\"].shuffle(seed=42).select(range(100))\n", + " def tokenize_function(examples):\n", + " return tokenizer(examples[\"text\"], padding=\"max_length\", truncation=True)\n", "\n", - " print(f\"len of train {small_train_dataset} and test {small_eval_dataset}\")\n", + " small_train_dataset = (\n", + " dataset[\"train\"].select(range(100)).map(tokenize_function, batched=True)\n", + " )\n", + " small_eval_dataset = (\n", + " dataset[\"test\"].select(range(100)).map(tokenize_function, batched=True)\n", + " )\n", "\n", - " ray_train_ds = ray.data.from_huggingface(small_train_dataset)\n", - " ray_evaluation_ds = ray.data.from_huggingface(small_eval_dataset)\n", + " # Model\n", + " model = AutoModelForSequenceClassification.from_pretrained(\n", + " \"distilbert-base-uncased\", num_labels=2\n", + " )\n", "\n", - " def compute_metrics(eval_pred):\n", - " metric = load_metric(\"accuracy\")\n", - " logits, labels = eval_pred\n", - " predictions = np.argmax(logits, axis=-1)\n", - " return metric.compute(predictions=predictions, references=labels)\n", + " def compute_metrics(eval_pred):\n", + " metric = load_metric(\"accuracy\")\n", + " logits, labels = eval_pred\n", + " predictions = np.argmax(logits, axis=-1)\n", + " return metric.compute(predictions=predictions, references=labels)\n", "\n", - " def trainer_init_per_worker(train_dataset, eval_dataset, **config):\n", - " model = AutoModelForSequenceClassification.from_pretrained(\"distilbert-base-uncased\", num_labels=2)\n", + " # Hugging Face Trainer\n", + " training_args = TrainingArguments(\n", + " output_dir=\"test_trainer\",\n", + " evaluation_strategy=\"epoch\",\n", + " save_strategy=\"epoch\",\n", + " report_to=\"none\",\n", + " )\n", "\n", - " training_args = TrainingArguments(\"/tmp/hf_imdb/test\", eval_steps=1, disable_tqdm=True, \n", - " num_train_epochs=1, skip_memory_metrics=True,\n", - " learning_rate=2e-5,\n", - " per_device_train_batch_size=16,\n", - " per_device_eval_batch_size=16, \n", - " weight_decay=0.01,)\n", - " return transformers.Trainer(\n", + " trainer = Trainer(\n", " model=model,\n", " args=training_args,\n", - " train_dataset=train_dataset,\n", - " eval_dataset=eval_dataset,\n", - " compute_metrics=compute_metrics\n", + " train_dataset=small_train_dataset,\n", + " eval_dataset=small_eval_dataset,\n", + " compute_metrics=compute_metrics,\n", " )\n", "\n", - " scaling_config = ScalingConfig(num_workers=2, use_gpu=True) #num workers is the number of gpus\n", "\n", - " # we are using the ray native HuggingFaceTrainer, but you can swap out to use non ray Huggingface Trainer. Both have the same method signature. 
\n", - " # the ray native HFTrainer has built in support for scaling to multiple GPUs\n", - " trainer = HuggingFaceTrainer(\n", - " trainer_init_per_worker=trainer_init_per_worker,\n", - " scaling_config=scaling_config,\n", - " datasets={\"train\": ray_train_ds, \"evaluation\": ray_evaluation_ds},\n", + " callback = ray.train.huggingface.transformers.RayTrainReportCallback()\n", + " trainer.add_callback(callback)\n", + "\n", + " trainer = ray.train.huggingface.transformers.prepare_trainer(trainer)\n", + "\n", + " trainer.train()\n", + "\n", + "\n", + " ray_trainer = TorchTrainer(\n", + " train_func,\n", + " scaling_config=ScalingConfig(num_workers=3, use_gpu=True),\n", + " # Configure persistent storage that is accessible across \n", + " # all worker nodes.\n", + " # Uncomment and update the RunConfig below to include your storage details.\n", + " # run_config=ray.train.RunConfig(storage_path=\"storage path\"),\n", " )\n", - " result = trainer.fit()" + " result: ray.train.Result = ray_trainer.fit()" ] }, { diff --git a/demo-notebooks/guided-demos/notebook-ex-outputs/2_basic_interactive.ipynb b/demo-notebooks/guided-demos/notebook-ex-outputs/2_basic_interactive.ipynb index ff88d6217..5ae66a0cf 100644 --- a/demo-notebooks/guided-demos/notebook-ex-outputs/2_basic_interactive.ipynb +++ b/demo-notebooks/guided-demos/notebook-ex-outputs/2_basic_interactive.ipynb @@ -244,15 +244,13 @@ "assert ray_cluster_uri, \"Ray cluster needs to be started and set before proceeding\"\n", "\n", "import ray\n", - "from ray.air.config import ScalingConfig\n", "\n", "# reset the ray context in case there's already one. \n", "ray.shutdown()\n", "# establish connection to ray cluster\n", "\n", "#install additional libraries that will be required for model training\n", - "runtime_env = {\"pip\": [\"transformers\", \"datasets\", \"evaluate\", \"pyarrow<7.0.0\", \"accelerate\"]}\n", - "\n", + "runtime_env = {\"pip\": [\"transformers==4.41.2\", \"datasets==2.17.0\", \"accelerate==0.31.0\", \"scikit-learn==1.5.0\"]}\n", "# NOTE: This will work for in-cluster notebook servers (RHODS/ODH), but not for local machines\n", "# To see how to connect from your laptop, go to demo-notebooks/additional-demos/local_interactive.ipynb\n", "ray.init(address=ray_cluster_uri, runtime_env=runtime_env)\n", @@ -265,7 +263,7 @@ "id": "9711030b", "metadata": {}, "source": [ - "Now that we are connected (and have passed in some package requirements), let's try writing some training code for a DistilBERT transformer model via HuggingFace (using IMDB dataset):" + "Now that we are connected (and have passed in some package requirements), let's try writing some training code:" ] }, { @@ -277,66 +275,83 @@ "source": [ "@ray.remote\n", "def train_fn():\n", - " from datasets import load_dataset\n", - " import transformers\n", - " from transformers import AutoTokenizer, TrainingArguments\n", - " from transformers import AutoModelForSequenceClassification\n", + " import os\n", " import numpy as np\n", - " from datasets import load_metric\n", - " import ray\n", - " from ray import tune\n", - " from ray.train.huggingface import HuggingFaceTrainer\n", - "\n", - " dataset = load_dataset(\"imdb\")\n", - " tokenizer = AutoTokenizer.from_pretrained(\"distilbert-base-uncased\")\n", - "\n", - " def tokenize_function(examples):\n", - " return tokenizer(examples[\"text\"], padding=\"max_length\", truncation=True)\n", + " from datasets import load_dataset, load_metric\n", + " import transformers\n", + " from transformers import (\n", + " Trainer,\n", + " 
TrainingArguments,\n", + " AutoTokenizer,\n", + " AutoModelForSequenceClassification,\n", + " )\n", + " import ray.train.huggingface.transformers\n", + " from ray.train import ScalingConfig\n", + " from ray.train.torch import TorchTrainer\n", "\n", - " tokenized_datasets = dataset.map(tokenize_function, batched=True)\n", + " # When running in a multi-node cluster you will need persistent storage that is accessible across all worker nodes. \n", + " # See www.github.com/project-codeflare/codeflare-sdk/tree/main/docs/s3-compatible-storage.md for more information.\n", + " \n", + " def train_func():\n", + " # Datasets\n", + " dataset = load_dataset(\"imdb\")\n", + " tokenizer = AutoTokenizer.from_pretrained(\"distilbert-base-uncased\")\n", "\n", - " #using a fraction of dataset but you can run with the full dataset\n", - " small_train_dataset = tokenized_datasets[\"train\"].shuffle(seed=42).select(range(100))\n", - " small_eval_dataset = tokenized_datasets[\"test\"].shuffle(seed=42).select(range(100))\n", + " def tokenize_function(examples):\n", + " return tokenizer(examples[\"text\"], padding=\"max_length\", truncation=True)\n", "\n", - " print(f\"len of train {small_train_dataset} and test {small_eval_dataset}\")\n", + " small_train_dataset = (\n", + " dataset[\"train\"].select(range(100)).map(tokenize_function, batched=True)\n", + " )\n", + " small_eval_dataset = (\n", + " dataset[\"test\"].select(range(100)).map(tokenize_function, batched=True)\n", + " )\n", "\n", - " ray_train_ds = ray.data.from_huggingface(small_train_dataset)\n", - " ray_evaluation_ds = ray.data.from_huggingface(small_eval_dataset)\n", + " # Model\n", + " model = AutoModelForSequenceClassification.from_pretrained(\n", + " \"distilbert-base-uncased\", num_labels=2\n", + " )\n", "\n", - " def compute_metrics(eval_pred):\n", - " metric = load_metric(\"accuracy\")\n", - " logits, labels = eval_pred\n", - " predictions = np.argmax(logits, axis=-1)\n", - " return metric.compute(predictions=predictions, references=labels)\n", + " def compute_metrics(eval_pred):\n", + " metric = load_metric(\"accuracy\")\n", + " logits, labels = eval_pred\n", + " predictions = np.argmax(logits, axis=-1)\n", + " return metric.compute(predictions=predictions, references=labels)\n", "\n", - " def trainer_init_per_worker(train_dataset, eval_dataset, **config):\n", - " model = AutoModelForSequenceClassification.from_pretrained(\"distilbert-base-uncased\", num_labels=2)\n", + " # Hugging Face Trainer\n", + " training_args = TrainingArguments(\n", + " output_dir=\"test_trainer\",\n", + " evaluation_strategy=\"epoch\",\n", + " save_strategy=\"epoch\",\n", + " report_to=\"none\",\n", + " )\n", "\n", - " training_args = TrainingArguments(\"/tmp/hf_imdb/test\", eval_steps=1, disable_tqdm=True, \n", - " num_train_epochs=1, skip_memory_metrics=True,\n", - " learning_rate=2e-5,\n", - " per_device_train_batch_size=16,\n", - " per_device_eval_batch_size=16, \n", - " weight_decay=0.01,)\n", - " return transformers.Trainer(\n", + " trainer = Trainer(\n", " model=model,\n", " args=training_args,\n", - " train_dataset=train_dataset,\n", - " eval_dataset=eval_dataset,\n", - " compute_metrics=compute_metrics\n", + " train_dataset=small_train_dataset,\n", + " eval_dataset=small_eval_dataset,\n", + " compute_metrics=compute_metrics,\n", " )\n", "\n", - " scaling_config = ScalingConfig(num_workers=2, use_gpu=True) #num workers is the number of gpus\n", "\n", - " # we are using the ray native HuggingFaceTrainer, but you can swap out to use non ray Huggingface Trainer. 
Both have the same method signature. \n", - " # the ray native HFTrainer has built in support for scaling to multiple GPUs\n", - " trainer = HuggingFaceTrainer(\n", - " trainer_init_per_worker=trainer_init_per_worker,\n", - " scaling_config=scaling_config,\n", - " datasets={\"train\": ray_train_ds, \"evaluation\": ray_evaluation_ds},\n", + " callback = ray.train.huggingface.transformers.RayTrainReportCallback()\n", + " trainer.add_callback(callback)\n", + "\n", + " trainer = ray.train.huggingface.transformers.prepare_trainer(trainer)\n", + "\n", + " trainer.train()\n", + "\n", + "\n", + " ray_trainer = TorchTrainer(\n", + " train_func,\n", + " scaling_config=ScalingConfig(num_workers=3, use_gpu=True),\n", + " # Configure persistent storage that is accessible across \n", + " # all worker nodes.\n", + " # Uncomment and update the RunConfig below to include your storage details.\n", + " # run_config=ray.train.RunConfig(storage_path=\"storage path\"),\n", " )\n", - " result = trainer.fit()" + " result: ray.train.Result = ray_trainer.fit()" ] }, { diff --git a/demo-notebooks/guided-demos/preview_nbs/2_basic_interactive.ipynb b/demo-notebooks/guided-demos/preview_nbs/2_basic_interactive.ipynb index 44846f612..d7d4d69dc 100644 --- a/demo-notebooks/guided-demos/preview_nbs/2_basic_interactive.ipynb +++ b/demo-notebooks/guided-demos/preview_nbs/2_basic_interactive.ipynb @@ -152,15 +152,13 @@ "assert ray_cluster_uri, \"Ray cluster needs to be started and set before proceeding\"\n", "\n", "import ray\n", - "from ray.air.config import ScalingConfig\n", "\n", "# reset the ray context in case there's already one. \n", "ray.shutdown()\n", "# establish connection to ray cluster\n", "\n", "#install additional libraries that will be required for model training\n", - "runtime_env = {\"pip\": [\"transformers\", \"datasets\", \"evaluate\", \"pyarrow<7.0.0\", \"accelerate\"]}\n", - "\n", + "runtime_env = {\"pip\": [\"transformers==4.41.2\", \"datasets==2.17.0\", \"accelerate==0.31.0\", \"scikit-learn==1.5.0\"]}\n", "# NOTE: This will work for in-cluster notebook servers (RHODS/ODH), but not for local machines\n", "# To see how to connect from your laptop, go to demo-notebooks/additional-demos/local_interactive.ipynb\n", "ray.init(address=ray_cluster_uri, runtime_env=runtime_env)\n", @@ -173,7 +171,7 @@ "id": "9711030b", "metadata": {}, "source": [ - "Now that we are connected (and have passed in some package requirements), let's try writing some training code for a DistilBERT transformer model via HuggingFace (using IMDB dataset):" + "Now that we are connected (and have passed in some package requirements), let's try writing some training code:" ] }, { @@ -185,66 +183,83 @@ "source": [ "@ray.remote\n", "def train_fn():\n", - " from datasets import load_dataset\n", - " import transformers\n", - " from transformers import AutoTokenizer, TrainingArguments\n", - " from transformers import AutoModelForSequenceClassification\n", + " import os\n", " import numpy as np\n", - " from datasets import load_metric\n", - " import ray\n", - " from ray import tune\n", - " from ray.train.huggingface import HuggingFaceTrainer\n", - "\n", - " dataset = load_dataset(\"imdb\")\n", - " tokenizer = AutoTokenizer.from_pretrained(\"distilbert-base-uncased\")\n", - "\n", - " def tokenize_function(examples):\n", - " return tokenizer(examples[\"text\"], padding=\"max_length\", truncation=True)\n", + " from datasets import load_dataset, load_metric\n", + " import transformers\n", + " from transformers import (\n", + " Trainer,\n", + " 
TrainingArguments,\n", + " AutoTokenizer,\n", + " AutoModelForSequenceClassification,\n", + " )\n", + " import ray.train.huggingface.transformers\n", + " from ray.train import ScalingConfig\n", + " from ray.train.torch import TorchTrainer\n", "\n", - " tokenized_datasets = dataset.map(tokenize_function, batched=True)\n", + " # When running in a multi-node cluster you will need persistent storage that is accessible across all worker nodes. \n", + " # See www.github.com/project-codeflare/codeflare-sdk/tree/main/docs/s3-compatible-storage.md for more information.\n", + " \n", + " def train_func():\n", + " # Datasets\n", + " dataset = load_dataset(\"imdb\")\n", + " tokenizer = AutoTokenizer.from_pretrained(\"distilbert-base-uncased\")\n", "\n", - " #using a fraction of dataset but you can run with the full dataset\n", - " small_train_dataset = tokenized_datasets[\"train\"].shuffle(seed=42).select(range(100))\n", - " small_eval_dataset = tokenized_datasets[\"test\"].shuffle(seed=42).select(range(100))\n", + " def tokenize_function(examples):\n", + " return tokenizer(examples[\"text\"], padding=\"max_length\", truncation=True)\n", "\n", - " print(f\"len of train {small_train_dataset} and test {small_eval_dataset}\")\n", + " small_train_dataset = (\n", + " dataset[\"train\"].select(range(100)).map(tokenize_function, batched=True)\n", + " )\n", + " small_eval_dataset = (\n", + " dataset[\"test\"].select(range(100)).map(tokenize_function, batched=True)\n", + " )\n", "\n", - " ray_train_ds = ray.data.from_huggingface(small_train_dataset)\n", - " ray_evaluation_ds = ray.data.from_huggingface(small_eval_dataset)\n", + " # Model\n", + " model = AutoModelForSequenceClassification.from_pretrained(\n", + " \"distilbert-base-uncased\", num_labels=2\n", + " )\n", "\n", - " def compute_metrics(eval_pred):\n", - " metric = load_metric(\"accuracy\")\n", - " logits, labels = eval_pred\n", - " predictions = np.argmax(logits, axis=-1)\n", - " return metric.compute(predictions=predictions, references=labels)\n", + " def compute_metrics(eval_pred):\n", + " metric = load_metric(\"accuracy\")\n", + " logits, labels = eval_pred\n", + " predictions = np.argmax(logits, axis=-1)\n", + " return metric.compute(predictions=predictions, references=labels)\n", "\n", - " def trainer_init_per_worker(train_dataset, eval_dataset, **config):\n", - " model = AutoModelForSequenceClassification.from_pretrained(\"distilbert-base-uncased\", num_labels=2)\n", + " # Hugging Face Trainer\n", + " training_args = TrainingArguments(\n", + " output_dir=\"test_trainer\",\n", + " evaluation_strategy=\"epoch\",\n", + " save_strategy=\"epoch\",\n", + " report_to=\"none\",\n", + " )\n", "\n", - " training_args = TrainingArguments(\"/tmp/hf_imdb/test\", eval_steps=1, disable_tqdm=True, \n", - " num_train_epochs=1, skip_memory_metrics=True,\n", - " learning_rate=2e-5,\n", - " per_device_train_batch_size=16,\n", - " per_device_eval_batch_size=16, \n", - " weight_decay=0.01,)\n", - " return transformers.Trainer(\n", + " trainer = Trainer(\n", " model=model,\n", " args=training_args,\n", - " train_dataset=train_dataset,\n", - " eval_dataset=eval_dataset,\n", - " compute_metrics=compute_metrics\n", + " train_dataset=small_train_dataset,\n", + " eval_dataset=small_eval_dataset,\n", + " compute_metrics=compute_metrics,\n", " )\n", "\n", - " scaling_config = ScalingConfig(num_workers=2, use_gpu=True) #num workers is the number of gpus\n", "\n", - " # we are using the ray native HuggingFaceTrainer, but you can swap out to use non ray Huggingface Trainer. 
Both have the same method signature. \n", - " # the ray native HFTrainer has built in support for scaling to multiple GPUs\n", - " trainer = HuggingFaceTrainer(\n", - " trainer_init_per_worker=trainer_init_per_worker,\n", - " scaling_config=scaling_config,\n", - " datasets={\"train\": ray_train_ds, \"evaluation\": ray_evaluation_ds},\n", + " callback = ray.train.huggingface.transformers.RayTrainReportCallback()\n", + " trainer.add_callback(callback)\n", + "\n", + " trainer = ray.train.huggingface.transformers.prepare_trainer(trainer)\n", + "\n", + " trainer.train()\n", + "\n", + "\n", + " ray_trainer = TorchTrainer(\n", + " train_func,\n", + " scaling_config=ScalingConfig(num_workers=3, use_gpu=True),\n", + " # Configure persistent storage that is accessible across \n", + " # all worker nodes.\n", + " # Uncomment and update the RunConfig below to include your storage details.\n", + " # run_config=ray.train.RunConfig(storage_path=\"storage path\"),\n", " )\n", - " result = trainer.fit()" + " result: ray.train.Result = ray_trainer.fit()" ] }, { diff --git a/tests/unit_test.py b/tests/unit_test.py index 4b48b173c..db908df60 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -2035,6 +2035,10 @@ def custom_side_effect(group, version, namespace, plural, **kwargs): "items" ], ) + mocker.patch( + "codeflare_sdk.utils.generate_yaml.local_queue_exists", + return_value="true", + ) cluster = get_cluster("quicktest") cluster_config = cluster.config
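A minimal usage sketch for the updated notebooks (not part of the patch): once the cells above the @ray.remote definition have connected to the cluster with ray.init(address=ray_cluster_uri, runtime_env=runtime_env), the refactored train_fn is still launched as an ordinary Ray remote task, and the commented-out RunConfig inside train_fn is where shared storage would be supplied for multi-node runs. The bucket path below is a placeholder.

import ray

# Assumes the notebook has already run ray.init(...) and defined the
# @ray.remote train_fn shown in the patch above.
ref = train_fn.remote()   # schedule the training task on the Ray cluster
ray.get(ref)              # block until TorchTrainer.fit() inside train_fn completes

# For multi-node clusters, uncomment the RunConfig inside train_fn and point it
# at storage reachable from every worker, e.g. a placeholder S3 bucket:
# run_config=ray.train.RunConfig(storage_path="s3://example-bucket/ray-results")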