From f247be87eb110cab27b2342ca609040dd318f8a5 Mon Sep 17 00:00:00 2001
From: Sebastian Raschka
Date: Fri, 5 Jul 2024 13:16:41 -0500
Subject: [PATCH] Add Phi-3 Configs (#1553)

---
 config_hub/finetune/README.md | 4 +
 config_hub/finetune/phi-2/full.yaml | 6 --
 config_hub/finetune/phi-3/full.yaml | 103 ++++++++++++++++++++
 config_hub/finetune/phi-3/lora.yaml | 134 +++++++++++++++++++++++++++
 config_hub/finetune/phi-3/qlora.yaml | 134 +++++++++++++++++++++++++++
 5 files changed, 375 insertions(+), 6 deletions(-)
 create mode 100644 config_hub/finetune/phi-3/full.yaml
 create mode 100644 config_hub/finetune/phi-3/lora.yaml
 create mode 100644 config_hub/finetune/phi-3/qlora.yaml

diff --git a/config_hub/finetune/README.md b/config_hub/finetune/README.md
index 55b3d8d286..91ba65df4e 100644
--- a/config_hub/finetune/README.md
+++ b/config_hub/finetune/README.md
@@ -53,6 +53,10 @@ All experiments were conducted using bfloat-16 precision on the Alpaca2k dataset
 | phi-2/qlora.yaml | phi-2 | 1 | 512 | 4 | 1xA10G | 4.51 min | $0.1 | 14.27 GB | 0.837 | 2.310 | 52.3% |
 | phi-2/qlora.yaml | phi-2 | 1 | 512 | 4 | 4xA10G | 4.52 min | $0.4 | 14.27 GB | 0.837 | 2.309 | 52.3% |
 | | | | | | | | | | | | |
+| phi-3/full.yaml | Phi-3-mini-4k-instruct | 1 | 512 | 4 | 1xA10G | 6.93 min | $0.2 | 17.01 GB | 0.714 | 2.043 | 69.81% |
+| phi-3/lora.yaml | Phi-3-mini-4k-instruct | 1 | 512 | 4 | 1xA10G | 6.46 min | $0.2 | 19.75 GB | 0.707 | 2.028 | 69.70% |
+| phi-3/qlora.yaml | Phi-3-mini-4k-instruct | 1 | 512 | 4 | 1xA10G | 7.47 min | $0.2 | 19.13 GB | 0.729 | 2.074 | 68.96% |
+| | | | | | | | | | | | |
 | stablelm-base-alpha-3b/full.yaml | stablelm-base-alpha-3b | 1 | 512 | 1 | 4xA10G | 70.13 min | $5.6 | 21.23 GB | 1.513 | 4.540 | 23.2% |
 | stablelm-base-alpha-3b/lora.yaml | stablelm-base-alpha-3b | 4 | 512 | 1 | 1xA10G | 13.07 min | $0.4 | 8.58 GB | 1.361 | 3.900 | 25.9% |
 | stablelm-base-alpha-3b/lora.yaml | stablelm-base-alpha-3b | 4 | 512 | 1 | 4xA10G | 13.16 min | $1.1 | 8.58 GB | 1.362 | 3.906 | 25.9% |
diff --git a/config_hub/finetune/phi-2/full.yaml b/config_hub/finetune/phi-2/full.yaml
index 161583737b..b49284cea3 100644
--- a/config_hub/finetune/phi-2/full.yaml
+++ b/config_hub/finetune/phi-2/full.yaml
@@ -14,12 +14,6 @@ devices: 2
 # How many nodes to use. (type: int, default: 1)
 num_nodes: 1
 
-# Path to a checkpoint directory to resume from in case training was interrupted, or ``True`` to resume
-# from the latest checkpoint in ``out_dir``. An error will be raised if no checkpoint is found. Passing
-# ``'auto'`` will resume from the latest checkpoint but not error if no checkpoint exists.
-# (type: Union[bool, Literal["auto"], Path], default: False)
-resume: false
-
 # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``.
 data:
   class_path: litgpt.data.Alpaca2k
diff --git a/config_hub/finetune/phi-3/full.yaml b/config_hub/finetune/phi-3/full.yaml
new file mode 100644
index 0000000000..01a8714584
--- /dev/null
+++ b/config_hub/finetune/phi-3/full.yaml
@@ -0,0 +1,103 @@
+
+# The path to the base model's checkpoint directory to load for finetuning. (type: <class 'Path'>, default: checkpoints/stabilityai/stablelm-base-alpha-3b)
+checkpoint_dir: checkpoints/microsoft/Phi-3-mini-4k-instruct
+
+# Directory in which to save checkpoints and logs. (type: <class 'Path'>, default: out/finetune/full)
+out_dir: out/finetune/full-phi-3
+
+# The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null)
+precision: bf16-true
+
+# How many devices/GPUs to use (type: Union[int, str], default: 1)
+devices: 1
+
+# Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``.
+data:
+  class_path: litgpt.data.Alpaca2k
+  init_args:
+    mask_prompt: false
+    prompt_style: alpaca
+    ignore_index: -100
+    seed: 42
+    num_workers: 4
+
+# Training-related arguments. See ``litgpt.args.TrainArgs`` for details
+train:
+
+  # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000)
+  save_interval: 200
+
+  # Number of iterations between logging calls (type: int, default: 1)
+  log_interval: 1
+
+  # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 64)
+  global_batch_size: 8
+
+  # Number of samples per data-parallel rank (type: int, default: 1)
+  micro_batch_size: 4
+
+  # Number of iterations with learning rate warmup active (type: int, default: 100)
+  lr_warmup_steps: 200
+
+  # Number of epochs to train on (type: Optional[int], default: 5)
+  epochs: 1
+
+  # Total number of tokens to train on (type: Optional[int], default: null)
+  max_tokens:
+
+  # Limits the number of optimizer steps to run. (type: Optional[int], default: null)
+  max_steps:
+
+  # Limits the length of samples. Off by default (type: Optional[int], default: null)
+  max_seq_length: 512
+
+  # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null)
+  tie_embeddings:
+
+  # (type: Optional[float], default: null)
+  max_norm:
+
+  # (type: float, default: 6e-05)
+  min_lr: 6.0e-05
+
+# Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details
+eval:
+
+  # Number of optimizer steps between evaluation calls (type: int, default: 600)
+  interval: 25
+
+  # Number of tokens to generate (type: Optional[int], default: 100)
+  max_new_tokens: 100
+
+  # Number of iterations (type: int, default: 100)
+  max_iters: 100
+
+  # Whether to evaluate on the validation set at the beginning of the training
+  initial_validation: false
+
+  # Whether to evaluate on the validation set at the end of the training
+  final_validation: true
+
+# The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv)
+logger_name: csv
+
+# The random seed to use for reproducibility. (type: int, default: 1337)
+seed: 1337
+
+# Optimizer-related arguments
+optimizer:
+
+  class_path: torch.optim.AdamW
+
+  init_args:
+
+    # (type: float, default: 0.001)
+    lr: 0.0002
+
+    # (type: float, default: 0.01)
+    weight_decay: 0.1
+
+    # (type: tuple, default: (0.9,0.999))
+    betas:
+      - 0.9
+      - 0.95
diff --git a/config_hub/finetune/phi-3/lora.yaml b/config_hub/finetune/phi-3/lora.yaml
new file mode 100644
index 0000000000..7c99c0443a
--- /dev/null
+++ b/config_hub/finetune/phi-3/lora.yaml
@@ -0,0 +1,134 @@
+
+# The path to the base model's checkpoint directory to load for finetuning. (type: <class 'Path'>, default: checkpoints/stabilityai/stablelm-base-alpha-3b)
+checkpoint_dir: checkpoints/microsoft/Phi-3-mini-4k-instruct
+
+# Directory in which to save checkpoints and logs. (type: <class 'Path'>, default: out/lora)
+out_dir: out/finetune/lora-phi-3
+
+# The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null)
+precision: bf16-true
+
+# If set, quantize the model with this algorithm. See ``tutorials/quantize.md`` for more information. (type: Optional[Literal['nf4', 'nf4-dq', 'fp4', 'fp4-dq', 'int8-training']], default: null)
+quantize:
+
+# How many devices/GPUs to use. (type: Union[int, str], default: 1)
+devices: 1
+
+# The LoRA rank. (type: int, default: 8)
+lora_r: 8
+
+# The LoRA alpha. (type: int, default: 16)
+lora_alpha: 16
+
+# The LoRA dropout value. (type: float, default: 0.05)
+lora_dropout: 0.05
+
+# Whether to apply LoRA to the query weights in attention. (type: bool, default: True)
+lora_query: true
+
+# Whether to apply LoRA to the key weights in attention. (type: bool, default: False)
+lora_key: true
+
+# Whether to apply LoRA to the value weights in attention. (type: bool, default: True)
+lora_value: true
+
+# Whether to apply LoRA to the output projection in the attention block. (type: bool, default: False)
+lora_projection: true
+
+# Whether to apply LoRA to the weights of the MLP in the attention block. (type: bool, default: False)
+lora_mlp: true
+
+# Whether to apply LoRA to the output head in GPT. (type: bool, default: False)
+lora_head: true
+
+# Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``.
+data:
+  class_path: litgpt.data.Alpaca2k
+  init_args:
+    mask_prompt: false
+    val_split_fraction: 0.03847
+    prompt_style: alpaca
+    ignore_index: -100
+    seed: 42
+    num_workers: 4
+
+# Training-related arguments. See ``litgpt.args.TrainArgs`` for details
+train:
+
+  # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000)
+  save_interval: 800
+
+  # Number of iterations between logging calls (type: int, default: 1)
+  log_interval: 1
+
+  # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128)
+  global_batch_size: 8
+
+  # Number of samples per data-parallel rank (type: int, default: 4)
+  micro_batch_size: 4
+
+  # Number of iterations with learning rate warmup active (type: int, default: 100)
+  lr_warmup_steps: 10
+
+  # Number of epochs to train on (type: Optional[int], default: 5)
+  epochs: 1
+
+  # Total number of tokens to train on (type: Optional[int], default: null)
+  max_tokens:
+
+  # Limits the number of optimizer steps to run. (type: Optional[int], default: null)
+  max_steps:
+
+  # Limits the length of samples. Off by default (type: Optional[int], default: null)
+  max_seq_length: 512
+
+  # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null)
+  tie_embeddings:
+
+  # (type: Optional[float], default: null)
+  max_norm:
+
+  # (type: float, default: 6e-05)
+  min_lr: 6.0e-05
+
+# Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details
+eval:
+
+  # Number of optimizer steps between evaluation calls (type: int, default: 100)
+  interval: 100
+
+  # Number of tokens to generate (type: Optional[int], default: 100)
+  max_new_tokens: 100
+
+  # Number of iterations (type: int, default: 100)
+  max_iters: 100
+
+  # Whether to evaluate on the validation set at the beginning of the training
+  initial_validation: false
+
+  # Whether to evaluate on the validation set at the end of the training
+  final_validation: true
+
+# The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv)
+logger_name: csv
+
+# The random seed to use for reproducibility. (type: int, default: 1337)
+seed: 1337
+
+# Optimizer-related arguments
+optimizer:
+
+  class_path: torch.optim.AdamW
+
+  init_args:
+
+    # (type: float, default: 0.001)
+    lr: 0.0002
+
+    # (type: float, default: 0.01)
+    weight_decay: 0.0
+
+    # (type: tuple, default: (0.9,0.999))
+    betas:
+      - 0.9
+      - 0.95
diff --git a/config_hub/finetune/phi-3/qlora.yaml b/config_hub/finetune/phi-3/qlora.yaml
new file mode 100644
index 0000000000..00b0a74002
--- /dev/null
+++ b/config_hub/finetune/phi-3/qlora.yaml
@@ -0,0 +1,134 @@
+
+# The path to the base model's checkpoint directory to load for finetuning. (type: <class 'Path'>, default: checkpoints/stabilityai/stablelm-base-alpha-3b)
+checkpoint_dir: checkpoints/microsoft/Phi-3-mini-4k-instruct
+
+# Directory in which to save checkpoints and logs. (type: <class 'Path'>, default: out/lora)
+out_dir: out/finetune/qlora-phi-3
+
+# The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null)
+precision: bf16-true
+
+# If set, quantize the model with this algorithm. See ``tutorials/quantize.md`` for more information. (type: Optional[Literal['nf4', 'nf4-dq', 'fp4', 'fp4-dq', 'int8-training']], default: null)
+quantize: bnb.nf4
+
+# How many devices/GPUs to use. (type: Union[int, str], default: 1)
+devices: 1
+
+# The LoRA rank. (type: int, default: 8)
+lora_r: 8
+
+# The LoRA alpha. (type: int, default: 16)
+lora_alpha: 16
+
+# The LoRA dropout value. (type: float, default: 0.05)
+lora_dropout: 0.05
+
+# Whether to apply LoRA to the query weights in attention. (type: bool, default: True)
+lora_query: true
+
+# Whether to apply LoRA to the key weights in attention. (type: bool, default: False)
+lora_key: true
+
+# Whether to apply LoRA to the value weights in attention. (type: bool, default: True)
+lora_value: true
+
+# Whether to apply LoRA to the output projection in the attention block. (type: bool, default: False)
+lora_projection: true
+
+# Whether to apply LoRA to the weights of the MLP in the attention block. (type: bool, default: False)
+lora_mlp: true
+
+# Whether to apply LoRA to the output head in GPT. (type: bool, default: False)
+lora_head: true
+
+# Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``.
+data:
+  class_path: litgpt.data.Alpaca2k
+  init_args:
+    mask_prompt: false
+    val_split_fraction: 0.03847
+    prompt_style: alpaca
+    ignore_index: -100
+    seed: 42
+    num_workers: 4
+
+# Training-related arguments. See ``litgpt.args.TrainArgs`` for details
+train:
+
+  # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000)
+  save_interval: 800
+
+  # Number of iterations between logging calls (type: int, default: 1)
+  log_interval: 1
+
+  # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128)
+  global_batch_size: 8
+
+  # Number of samples per data-parallel rank (type: int, default: 4)
+  micro_batch_size: 4
+
+  # Number of iterations with learning rate warmup active (type: int, default: 100)
+  lr_warmup_steps: 10
+
+  # Number of epochs to train on (type: Optional[int], default: 5)
+  epochs: 1
+
+  # Total number of tokens to train on (type: Optional[int], default: null)
+  max_tokens:
+
+  # Limits the number of optimizer steps to run. (type: Optional[int], default: null)
+  max_steps:
+
+  # Limits the length of samples. Off by default (type: Optional[int], default: null)
+  max_seq_length: 512
+
+  # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null)
+  tie_embeddings:
+
+  # (type: Optional[float], default: null)
+  max_norm:
+
+  # (type: float, default: 6e-05)
+  min_lr: 6.0e-05
+
+# Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details
+eval:
+
+  # Number of optimizer steps between evaluation calls (type: int, default: 100)
+  interval: 100
+
+  # Number of tokens to generate (type: Optional[int], default: 100)
+  max_new_tokens: 100
+
+  # Number of iterations (type: int, default: 100)
+  max_iters: 100
+
+  # Whether to evaluate on the validation set at the beginning of the training
+  initial_validation: false
+
+  # Whether to evaluate on the validation set at the end of the training
+  final_validation: true
+
+# The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv)
+logger_name: csv
+
+# The random seed to use for reproducibility. (type: int, default: 1337)
+seed: 1337
+
+# Optimizer-related arguments
+optimizer:
+
+  class_path: torch.optim.AdamW
+
+  init_args:
+
+    # (type: float, default: 0.001)
+    lr: 0.0002
+
+    # (type: float, default: 0.01)
+    weight_decay: 0.0
+
+    # (type: tuple, default: (0.9,0.999))
+    betas:
+      - 0.9
+      - 0.95
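
A minimal usage sketch for the new configs, assuming a litgpt release contemporary with this patch; the exact subcommand spellings (for example `finetune_lora` vs. `finetune lora`) and the `download` syntax vary across litgpt versions, so treat the commands below as illustrative rather than as the commit's documented interface.

```bash
# Fetch the base checkpoint referenced by checkpoint_dir in the new configs
# (download syntax is version-dependent; this form is an assumption).
litgpt download microsoft/Phi-3-mini-4k-instruct

# Full finetuning with the new config
litgpt finetune_full --config config_hub/finetune/phi-3/full.yaml

# LoRA finetuning; qlora.yaml differs mainly by setting `quantize: bnb.nf4`
# (which requires the bitsandbytes package) and by its out_dir.
litgpt finetune_lora --config config_hub/finetune/phi-3/lora.yaml
litgpt finetune_lora --config config_hub/finetune/phi-3/qlora.yaml
```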