From dc047768c1290d86afe8c4441094abe473ff5e42 Mon Sep 17 00:00:00 2001
From: Junjie Tang <87007858+JunjieTang-D1@users.noreply.github.com>
Date: Mon, 23 Sep 2024 12:47:40 +0200
Subject: [PATCH] Feat/fine tuning 6b (#243)

* fix the project name

* initial config for fine-tuning a 6B model

* update CHANGELOG.md

* add a newline at the end of the file

* remove the old readme; a blog post will be added later

* change the deployment name and add all the deployment groups

---
 CHANGELOG.md                                 |   1 +
 manifests/fine-tuning-6B/base-modules.yaml   |  26 ++
 manifests/fine-tuning-6B/core-modules.yaml   | 112 +++++++++
 manifests/fine-tuning-6B/deployment.yaml     |  30 +++
 manifests/fine-tuning-6B/images-modules.yaml |  10 +
 .../fine-tuning-6B/integration-modules.yaml  |  63 +++++
 .../fine-tuning-6B/ray-cluster-modules.yaml  |  82 +++++++
 .../fine-tuning-6B/ray-operator-modules.yaml |  60 +++++
 .../fine-tuning-6B/scripts/inference-6B.py   |  21 ++
 .../fine-tuning-6B/scripts/training-6B.py    | 230 ++++++++++++++++++
 10 files changed, 635 insertions(+)
 create mode 100644 manifests/fine-tuning-6B/base-modules.yaml
 create mode 100644 manifests/fine-tuning-6B/core-modules.yaml
 create mode 100644 manifests/fine-tuning-6B/deployment.yaml
 create mode 100644 manifests/fine-tuning-6B/images-modules.yaml
 create mode 100644 manifests/fine-tuning-6B/integration-modules.yaml
 create mode 100644 manifests/fine-tuning-6B/ray-cluster-modules.yaml
 create mode 100644 manifests/fine-tuning-6B/ray-operator-modules.yaml
 create mode 100644 manifests/fine-tuning-6B/scripts/inference-6B.py
 create mode 100644 manifests/fine-tuning-6B/scripts/training-6B.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7db33318..340ab526 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## UNRELEASED
 
 ### **Added**
+- added new manifest `manifests/fine-tuning-6B`
 
 ### **Changed**
 
diff --git a/manifests/fine-tuning-6B/base-modules.yaml b/manifests/fine-tuning-6B/base-modules.yaml
new file mode 100644
index 00000000..f9f2e8f5
--- /dev/null
+++ b/manifests/fine-tuning-6B/base-modules.yaml
@@ -0,0 +1,26 @@
+name: networking
+path: git::https://github.com/awslabs/idf-modules.git//modules/network/basic-cdk?ref=release/1.11.0&depth=1
+parameters:
+  - name: InternetAccessible
+    value: true
+---
+name: buckets
+path: git::https://github.com/awslabs/idf-modules.git//modules/storage/buckets?ref=release/1.11.0&depth=1
+parameters:
+  - name: EncryptionType
+    value: SSE
+  - name: RetentionType
+    value: DESTROY
+---
+name: ray-ecr
+path: git::https://github.com/awslabs/idf-modules.git//modules/storage/ecr?ref=release/1.11.0&depth=1
+targetAccount: primary
+parameters:
+  - name: ImageTagMutability
+    value: MUTABLE
+  - name: ImageScanOnPush
+    value: True
+  - name: Encryption
+    value: KMS_MANAGED
+  - name: RemovalPolicy
+    value: DESTROY
diff --git a/manifests/fine-tuning-6B/core-modules.yaml b/manifests/fine-tuning-6B/core-modules.yaml
new file mode 100644
index 00000000..ba3ac288
--- /dev/null
+++ b/manifests/fine-tuning-6B/core-modules.yaml
@@ -0,0 +1,112 @@
+name: eks
+path: git::https://github.com/awslabs/idf-modules.git//modules/compute/eks?ref=release/1.11.0&depth=1
+dataFiles:
+  - filePath: git::https://github.com/awslabs/idf-modules.git//data/eks_dockerimage-replication/versions/1.29.yaml?ref=release/1.11.0&depth=1
+  - filePath: git::https://github.com/awslabs/idf-modules.git//data/eks_dockerimage-replication/versions/default.yaml?ref=release/1.11.0&depth=1
+parameters:
+  - name: VpcId
+    valueFrom:
+      moduleMetadata:
+        group: base
+        name: networking
+        key: VpcId
+  - name: ControlplaneSubnetIds
+    valueFrom:
+      moduleMetadata:
+        group: base
+        name: networking
+        key: PrivateSubnetIds
+  - name: DataplaneSubnetIds
+    valueFrom:
+      moduleMetadata:
+        group: base
+        name: networking
+        key: PrivateSubnetIds
+  - name: EksAdminRoleName
+    value: Admin
+  - name: EksPoweruserRoleName
+    value: PowerUser
+  - name: EksReadOnlyRoleName
+    value: ReadOnly
+  - name: EksVersion
+    value: "1.29"
+    # valueFrom:
+    #   envVariable: GLOBAL_EKS_VERSION
+  - name: EksCompute
+    value:
+      eks_nodegroup_config:
+        - eks_ng_name: ng1
+          eks_node_quantity: 1
+          eks_node_max_quantity: 1
+          eks_node_min_quantity: 1
+          eks_node_disk_size: 400
+          eks_node_instance_type: "m5.xlarge"
+          eks_node_labels:
+            usage: core
+        - eks_ng_name: ng-gpu
+          eks_node_quantity: 6
+          eks_node_max_quantity: 15
+          eks_node_min_quantity: 6
+          eks_node_disk_size: 400
+          eks_node_instance_type: "g4dn.4xlarge"
+          eks_node_labels:
+            usage: gpu
+            nvidia.com/gpu.present: "true"
+          use_gpu_ami: True
+          eks_node_taints:
+            - key: "nvidia.com/gpu"
+              value: "true"
+              # operator: "Equal"
+              effect: "NoSchedule"
+          install_nvidia_device_plugin: True
+          eks_node_spot: False
+      eks_secrets_envelope_encryption: True
+      eks_api_endpoint_private: False
+  - name: EksAddons
+    value:
+      # Autoscaling
+      deploy_cluster_autoscaler: True
+      deploy_metrics_server: True
+      # Observability
+      deploy_cloudwatch_observability_addon: True
+      # Storage
+      deploy_aws_fsx_csi: True
+---
+name: fsx-lustre
+path: git::https://github.com/awslabs/idf-modules.git//modules/storage/fsx-lustre?ref=release/1.11.0&depth=1
+parameters:
+  - name: VpcId
+    valueFrom:
+      moduleMetadata:
+        group: base
+        name: networking
+        key: VpcId
+  - name: PrivateSubnetIds
+    valueFrom:
+      moduleMetadata:
+        group: base
+        name: networking
+        key: PrivateSubnetIds
+  - name: FsDeploymentType
+    value: SCRATCH_2
+  - name: StorageThroughput
+    value: 50
+  - name: DataBucketName
+    valueFrom:
+      moduleMetadata:
+        group: base
+        name: buckets
+        key: ArtifactsBucketName
+  - name: DraExportPath
+    valueFrom:
+      parameterValue: draExportPath
+  - name: DraImportPath
+    valueFrom:
+      parameterValue: draImportPath
+  - name: FsxVersion
+    value: "2.15"
+  - name: Namespace
+    valueFrom:
+      parameterValue: rayNamespaceName
+  - name: ImportPolicy
+    value: "NEW_CHANGED_DELETED"
diff --git a/manifests/fine-tuning-6B/deployment.yaml b/manifests/fine-tuning-6B/deployment.yaml
new file mode 100644
index 00000000..40672468
--- /dev/null
+++ b/manifests/fine-tuning-6B/deployment.yaml
@@ -0,0 +1,30 @@
+name: fine-tuning-6B
+forceDependencyRedeploy: True
+toolchainRegion: us-east-1
+groups:
+  - name: base
+    path: manifests/fine-tuning-6B/base-modules.yaml
+  - name: images
+    path: manifests/fine-tuning-6B/images-modules.yaml
+  - name: core
+    path: manifests/fine-tuning-6B/core-modules.yaml
+  - name: integration
+    path: manifests/fine-tuning-6B/integration-modules.yaml
+  - name: ray-operator
+    path: manifests/fine-tuning-6B/ray-operator-modules.yaml
+  - name: ray-cluster
+    path: manifests/fine-tuning-6B/ray-cluster-modules.yaml
+targetAccountMappings:
+  - alias: primary
+    accountId:
+      valueFrom:
+        envVariable: PRIMARY_ACCOUNT
+    default: true
+    codebuildImage: aws/codebuild/standard:7.0
+    parametersGlobal:
+      rayNamespaceName: ray
+      draImportPath: /ray/import/
+      draExportPath: /ray/export/
+    regionMappings:
+      - region: us-east-1
+        default: true
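
deployment.yaml above is the entry point for this example: the groups are deployed in the order they are listed, so `base`, `images`, and `core` are in place before the `integration`, `ray-operator`, and `ray-cluster` groups that consume their outputs, and `forceDependencyRedeploy` is meant to cascade redeployments through that chain. Assuming SeedFarmer is the orchestrator for these manifests (as the module sources suggest), the stack would be brought up with `seedfarmer apply manifests/fine-tuning-6B/deployment.yaml` after exporting `PRIMARY_ACCOUNT`.
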
diff --git a/manifests/fine-tuning-6B/images-modules.yaml b/manifests/fine-tuning-6B/images-modules.yaml
new file mode 100644
index 00000000..a4705241
--- /dev/null
+++ b/manifests/fine-tuning-6B/images-modules.yaml
@@ -0,0 +1,10 @@
+name: ray
+path: git::https://github.com/awslabs/aiops-modules.git//modules/eks/ray-image?ref=release/1.5.0&depth=1
+targetAccount: primary
+parameters:
+  - name: EcrRepoName
+    valueFrom:
+      moduleMetadata:
+        group: base
+        name: ray-ecr
+        key: EcrRepositoryName
diff --git a/manifests/fine-tuning-6B/integration-modules.yaml b/manifests/fine-tuning-6B/integration-modules.yaml
new file mode 100644
index 00000000..f4b1fb89
--- /dev/null
+++ b/manifests/fine-tuning-6B/integration-modules.yaml
@@ -0,0 +1,63 @@
+name: lustre-on-eks
+path: git::https://github.com/awslabs/idf-modules.git//modules/integration/fsx-lustre-on-eks?ref=release/1.11.0&depth=1
+parameters:
+  - name: EksClusterAdminRoleArn
+    valueFrom:
+      moduleMetadata:
+        group: core
+        name: eks
+        key: EksClusterMasterRoleArn
+  - name: EksHandlerRoleArn
+    valueFrom:
+      moduleMetadata:
+        group: core
+        name: eks
+        key: EksHandlerRoleArn
+  - name: EksClusterName
+    valueFrom:
+      moduleMetadata:
+        group: core
+        name: eks
+        key: EksClusterName
+  - name: EksOidcArn
+    valueFrom:
+      moduleMetadata:
+        group: core
+        name: eks
+        key: EksOidcArn
+  - name: EksClusterSecurityGroupId
+    valueFrom:
+      moduleMetadata:
+        group: core
+        name: eks
+        key: EksClusterSecurityGroupId
+  - name: Namespace
+    valueFrom:
+      parameterValue: rayNamespaceName
+  - name: FsxFileSystemId
+    valueFrom:
+      moduleMetadata:
+        group: core
+        name: fsx-lustre
+        key: FSxLustreFileSystemId
+  - name: FsxSecurityGroupId
+    valueFrom:
+      moduleMetadata:
+        group: core
+        name: fsx-lustre
+        key: FSxLustreSecurityGroup
+  - name: FsxMountName
+    valueFrom:
+      moduleMetadata:
+        group: core
+        name: fsx-lustre
+        key: FSxLustreMountName
+  - name: FsxDnsName
+    valueFrom:
+      moduleMetadata:
+        group: core
+        name: fsx-lustre
+        key: FSxLustreAttrDnsName
+  - name: DraExportPath
+    valueFrom:
+      parameterValue: draExportPath
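
The `lustre-on-eks` module above exposes the FSx for Lustre filesystem from the `core` group as a Kubernetes PersistentVolumeClaim in the `ray` namespace; the `ray-cluster` module below mounts that claim via its `PvcName` parameter. A minimal sanity check that the mount is visible from the running cluster might look like the sketch below; the `/ray/export` path is inferred from the DRA paths in deployment.yaml, so treat it as an assumption.

    import os

    import ray

    # Attach to the running Ray cluster (run this from inside a cluster pod).
    ray.init(address="auto")


    @ray.remote
    def list_export_dir() -> list:
        # DraExportPath is /ray/export/ in deployment.yaml; adjust this if
        # the PVC is mounted elsewhere in the Ray pods.
        return os.listdir("/ray/export")


    print(ray.get(list_export_dir.remote()))
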
"NoSchedule" + - name: WorkerLabels + value: # make sure to match w/ the labels on the GPU Nodegroup + usage: gpu + - name: PvcName + valueFrom: + moduleMetadata: + group: integration + name: lustre-on-eks + key: PersistentVolumeClaimName + - name: DraExportPath + valueFrom: + parameterValue: draExportPath diff --git a/manifests/fine-tuning-6B/ray-operator-modules.yaml b/manifests/fine-tuning-6B/ray-operator-modules.yaml new file mode 100644 index 00000000..45bfb2e6 --- /dev/null +++ b/manifests/fine-tuning-6B/ray-operator-modules.yaml @@ -0,0 +1,60 @@ +name: ray-operator +path: git::https://github.com/awslabs/aiops-modules.git//modules/eks/ray-operator?ref=release/1.5.0&depth=1 +parameters: + - name: EksClusterAdminRoleArn + valueFrom: + moduleMetadata: + group: core + name: eks + key: EksClusterMasterRoleArn + - name: EksHandlerRoleArn + valueFrom: + moduleMetadata: + group: core + name: eks + key: EksHandlerRoleArn + - name: EksClusterName + valueFrom: + moduleMetadata: + group: core + name: eks + key: EksClusterName + - name: EksClusterEndpoint + valueFrom: + moduleMetadata: + group: core + name: eks + key: EksClusterEndpoint + - name: EksOidcArn + valueFrom: + moduleMetadata: + group: core + name: eks + key: EksOidcArn + - name: EksOpenidIssuer + valueFrom: + moduleMetadata: + group: core + name: eks + key: EksClusterOpenIdConnectIssuer + - name: EksCertAuthData + valueFrom: + moduleMetadata: + group: core + name: eks + key: EksClusterCertAuthData + - name: EksClusterSecurityGroupId + valueFrom: + moduleMetadata: + group: core + name: eks + key: EksClusterSecurityGroupId + - name: Namespace + valueFrom: + parameterValue: rayNamespaceName + - name: DataBucketName + valueFrom: + moduleMetadata: + group: base + name: buckets + key: ArtifactsBucketName diff --git a/manifests/fine-tuning-6B/scripts/inference-6B.py b/manifests/fine-tuning-6B/scripts/inference-6B.py new file mode 100644 index 00000000..b33a8546 --- /dev/null +++ b/manifests/fine-tuning-6B/scripts/inference-6B.py @@ -0,0 +1,21 @@ +import torch +import torchvision + +from transformers import pipeline, AutoTokenizer, GPTJForCausalLM + +model = GPTJForCausalLM.from_pretrained("/ray/export/.../checkpoint") +tokenizer = AutoTokenizer.from_pretrained("/ray/export/.../checkpoint") + +pipe = pipeline( + model=model, + tokenizer=tokenizer, + task="text-generation", + torch_dtype=torch.float16, + device_map="auto", +) + +# Generate from prompts! 
diff --git a/manifests/fine-tuning-6B/scripts/inference-6B.py b/manifests/fine-tuning-6B/scripts/inference-6B.py
new file mode 100644
index 00000000..b33a8546
--- /dev/null
+++ b/manifests/fine-tuning-6B/scripts/inference-6B.py
@@ -0,0 +1,21 @@
+import torch
+import torchvision
+
+from transformers import pipeline, AutoTokenizer, GPTJForCausalLM
+
+model = GPTJForCausalLM.from_pretrained("/ray/export/.../checkpoint")
+tokenizer = AutoTokenizer.from_pretrained("/ray/export/.../checkpoint")
+
+pipe = pipeline(
+    model=model,
+    tokenizer=tokenizer,
+    task="text-generation",
+    torch_dtype=torch.float16,
+    device_map="auto",
+)
+
+# Generate from prompts!
+for sentence in pipe(
+    ["Romeo and Juliet", "war", "blood"], do_sample=True, min_length=20
+):
+    print(sentence)
\ No newline at end of file
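
Note that the checkpoint path in inference-6B.py deliberately leaves a `...` placeholder: the exact directory depends on the Ray Train run name that training-6B.py produces under its `storage_path` of /ray/export. A hypothetical way to resolve the placeholder, assuming checkpoints land somewhere under that export path:

    import glob
    import os

    # Find the most recently written checkpoint directory under the FSx
    # export path; the glob pattern is a guess at the Ray Train layout.
    candidates = glob.glob("/ray/export/**/checkpoint*", recursive=True)
    latest = max(candidates, key=os.path.getmtime)
    print(latest)  # use this in place of /ray/export/.../checkpoint
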
diff --git a/manifests/fine-tuning-6B/scripts/training-6B.py b/manifests/fine-tuning-6B/scripts/training-6B.py
new file mode 100644
index 00000000..e04ee198
--- /dev/null
+++ b/manifests/fine-tuning-6B/scripts/training-6B.py
@@ -0,0 +1,230 @@
+import numpy as np
+import pandas as pd
+import os
+
+import ray
+import ray.data
+from datasets import load_dataset
+import evaluate
+import torch
+from transformers import (
+    Trainer,
+    TrainingArguments,
+    GPTJForCausalLM,
+    AutoTokenizer,
+    default_data_collator,
+)
+from transformers.utils.logging import disable_progress_bar, enable_progress_bar
+from ray import train
+from ray.train.huggingface.transformers import prepare_trainer, RayTrainReportCallback
+from ray.train.torch import TorchTrainer
+from ray.train import RunConfig, ScalingConfig
+
+
+model_name = "EleutherAI/gpt-j-6B"
+use_gpu = True
+num_workers = 5
+cpus_per_worker = 12
+block_size = 512
+storage_path = "/ray/export"
+
+
+ray.init(
+    runtime_env={
+        "pip": [
+            "datasets",
+            "evaluate",
+            # The latest combination accelerate==0.25.0, transformers==4.36.0, deepspeed==0.12.4
+            # has issues with DeepSpeed process group initialization,
+            # and will result in a batch_size validation problem.
+            # TODO(ml-team): get rid of the pins once the issue is fixed.
+            "accelerate==0.18.0",
+            "transformers==4.26.0",
+            "torch>=1.12.0",
+            "deepspeed==0.12.3",
+        ],
+    },
+)
+
+print("Loading tiny_shakespeare dataset")
+current_dataset = load_dataset("tiny_shakespeare")
+
+ray_datasets = {
+    "train": ray.data.from_huggingface(current_dataset["train"]),
+    "validation": ray.data.from_huggingface(current_dataset["validation"]),
+}
+
+
+def split_text(batch: pd.DataFrame) -> pd.DataFrame:
+    text = list(batch["text"])
+    flat_text = "".join(text)
+    split_text = [
+        x.strip()
+        for x in flat_text.split("\n")
+        if x.strip() and not x.strip()[-1] == ":"
+    ]
+    return pd.DataFrame(split_text, columns=["text"])
+
+
+def tokenize(batch: pd.DataFrame) -> dict:
+    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
+    tokenizer.pad_token = tokenizer.eos_token
+    ret = tokenizer(
+        list(batch["text"]),
+        truncation=True,
+        max_length=block_size,
+        padding="max_length",
+        return_tensors="np",
+    )
+    ret["labels"] = ret["input_ids"].copy()
+    return dict(ret)
+
+
+processed_datasets = {
+    key: (
+        ds.map_batches(split_text, batch_format="pandas")
+        .map_batches(tokenize, batch_format="pandas")
+    )
+    for key, ds in ray_datasets.items()
+}
+
+
+def train_func(config):
+    # Use the actual number of CPUs assigned by Ray
+    os.environ["OMP_NUM_THREADS"] = str(
+        train.get_context().get_trial_resources().bundles[-1].get("CPU", 1)
+    )
+    # Enable tf32 for better performance
+    torch.backends.cuda.matmul.allow_tf32 = True
+
+    batch_size = config.get("batch_size", 4)
+    epochs = config.get("epochs", 2)
+    warmup_steps = config.get("warmup_steps", 0)
+    learning_rate = config.get("learning_rate", 0.00002)
+    weight_decay = config.get("weight_decay", 0.01)
+    steps_per_epoch = config.get("steps_per_epoch")
+
+    deepspeed = {
+        "fp16": {
+            "enabled": "auto",
+            "initial_scale_power": 8,
+            "hysteresis": 4,
+            "consecutive_hysteresis": True,
+        },
+        "bf16": {"enabled": "auto"},
+        "optimizer": {
+            "type": "AdamW",
+            "params": {
+                "lr": "auto",
+                "betas": "auto",
+                "eps": "auto",
+            },
+        },
+        "zero_optimization": {
+            "stage": 3,
+            "offload_optimizer": {
+                "device": "cpu",
+                "pin_memory": False,  # pinned memory ran out of memory
+            },
+            "overlap_comm": False,  # otherwise runs out of GPU RAM
+            "contiguous_gradients": True,
+            "reduce_bucket_size": "auto",
+            "stage3_prefetch_bucket_size": "auto",
+            "stage3_param_persistence_threshold": "auto",
+            "gather_16bit_weights_on_model_save": True,
+            "round_robin_gradients": True,
+        },
+        "gradient_accumulation_steps": "auto",
+        "gradient_clipping": "auto",
+        "steps_per_print": 10,
+        "train_batch_size": "auto",
+        "train_micro_batch_size_per_gpu": "auto",
+        "wall_clock_breakdown": False,
+    }
+
+    print("Preparing training arguments")
+    training_args = TrainingArguments(
+        "output",
+        logging_steps=1,
+        save_strategy="steps",
+        save_steps=steps_per_epoch,
+        max_steps=steps_per_epoch * epochs,
+        per_device_train_batch_size=batch_size,
+        gradient_accumulation_steps=1,
+        learning_rate=learning_rate,
+        weight_decay=weight_decay,
+        warmup_steps=warmup_steps,
+        label_names=["input_ids", "attention_mask"],
+        push_to_hub=False,
+        report_to="none",
+        disable_tqdm=True,  # declutter the output a little
+        fp16=True,
+        gradient_checkpointing=True,
+        deepspeed=deepspeed,
+    )
+    disable_progress_bar()
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    tokenizer.pad_token = tokenizer.eos_token
+
+    print("Loading model")
+
+    model = GPTJForCausalLM.from_pretrained(model_name, use_cache=False)
+    model.resize_token_embeddings(len(tokenizer))
+
+    print("Model loaded")
+
+    enable_progress_bar()
+
+    metric = evaluate.load("accuracy")
+
+    train_ds = train.get_dataset_shard("train")
+    eval_ds = train.get_dataset_shard("validation")
+
+    train_ds_iterable = train_ds.iter_torch_batches(
+        batch_size=batch_size,
+        local_shuffle_buffer_size=train.get_context().get_world_size() * batch_size,
+    )
+    eval_ds_iterable = eval_ds.iter_torch_batches(batch_size=batch_size)
+
+    def compute_metrics(eval_pred):
+        logits, labels = eval_pred
+        predictions = np.argmax(logits, axis=-1)
+        return metric.compute(predictions=predictions, references=labels)
+
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=train_ds_iterable,
+        eval_dataset=eval_ds_iterable,
+        compute_metrics=compute_metrics,
+        tokenizer=tokenizer,
+        data_collator=default_data_collator,
+    )
+
+    # Add callback to report checkpoints to Ray Train
+    trainer.add_callback(RayTrainReportCallback())
+    trainer = prepare_trainer(trainer)
+    trainer.train()
+
+batch_size = 12
+train_ds_size = processed_datasets["train"].count()
+steps_per_epoch = train_ds_size // (batch_size * num_workers)
+
+trainer = TorchTrainer(
+    train_loop_per_worker=train_func,
+    train_loop_config={
+        "epochs": 1,
+        "batch_size": batch_size,  # per device
+        "steps_per_epoch": steps_per_epoch,
+    },
+    scaling_config=ScalingConfig(
+        num_workers=num_workers,
+        use_gpu=use_gpu,
+        resources_per_worker={"GPU": 1, "CPU": cpus_per_worker},
+    ),
+    datasets=processed_datasets,
+    run_config=RunConfig(storage_path=storage_path),
+)
+
+results = trainer.fit()
\ No newline at end of file
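
For reference, the step arithmetic at the bottom of training-6B.py works out as follows; the dataset size here is illustrative, since the script computes the real count from processed_datasets["train"].count() at run time.

    batch_size = 12          # per-device batch size, as in training-6B.py
    num_workers = 5          # GPU workers in the ScalingConfig
    train_ds_size = 40_000   # hypothetical processed train-split row count

    global_batch = batch_size * num_workers          # 60 sequences per step
    steps_per_epoch = train_ds_size // global_batch  # 666 steps per epoch
    print(global_batch, steps_per_epoch)
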