From f247be87eb110cab27b2342ca609040dd318f8a5 Mon Sep 17 00:00:00 2001
From: Sebastian Raschka
Date: Fri, 5 Jul 2024 13:16:41 -0500
Subject: [PATCH] Add Phi-3 Configs (#1553)

---
 config_hub/finetune/README.md | 4 +
 config_hub/finetune/phi-2/full.yaml | 6 --
 config_hub/finetune/phi-3/full.yaml | 103 ++++++++++++++++++++
 config_hub/finetune/phi-3/lora.yaml | 134 +++++++++++++++++++++++++++
 config_hub/finetune/phi-3/qlora.yaml | 134 +++++++++++++++++++++++++++
 5 files changed, 375 insertions(+), 6 deletions(-)
 create mode 100644 config_hub/finetune/phi-3/full.yaml
 create mode 100644 config_hub/finetune/phi-3/lora.yaml
 create mode 100644 config_hub/finetune/phi-3/qlora.yaml

diff --git a/config_hub/finetune/README.md b/config_hub/finetune/README.md
index 55b3d8d286..91ba65df4e 100644
--- a/config_hub/finetune/README.md
+++ b/config_hub/finetune/README.md
@@ -53,6 +53,10 @@ All experiments were conducted using bfloat-16 precision on the Alpaca2k dataset
 | phi-2/qlora.yaml | phi-2 | 1 | 512 | 4 | 1xA10G | 4.51 min | $0.1 | 14.27 GB | 0.837 | 2.310 | 52.3% |
 | phi-2/qlora.yaml | phi-2 | 1 | 512 | 4 | 4xA10G | 4.52 min | $0.4 | 14.27 GB | 0.837 | 2.309 | 52.3% |
 | | | | | | | | | | | | |
+| phi-3/full.yaml | Phi-3-mini-4k-instruct | 1 | 512 | 4 | 1xA10G | 6.93 min | $0.2 | 17.01 GB | 0.714 | 2.043 | 69.81% |
+| phi-3/lora.yaml | Phi-3-mini-4k-instruct | 1 | 512 | 4 | 1xA10G | 6.46 min | $0.2 | 19.75 GB | 0.707 | 2.028 | 69.70% |
+| phi-3/qlora.yaml | Phi-3-mini-4k-instruct | 1 | 512 | 4 | 1xA10G | 7.47 min | $0.2 | 19.13 GB | 0.729 | 2.074 | 68.96% |
+| | | | | | | | | | | | |
 | stablelm-base-alpha-3b/full.yaml | stablelm-base-alpha-3b | 1 | 512 | 1 | 4xA10G | 70.13 min | $5.6 | 21.23 GB | 1.513 | 4.540 | 23.2% |
 | stablelm-base-alpha-3b/lora.yaml | stablelm-base-alpha-3b | 4 | 512 | 1 | 1xA10G | 13.07 min | $0.4 | 8.58 GB | 1.361 | 3.900 | 25.9% |
 | stablelm-base-alpha-3b/lora.yaml | stablelm-base-alpha-3b | 4 | 512 | 1 | 4xA10G | 13.16 min | $1.1 | 8.58 GB | 1.362 | 3.906 | 25.9% |
diff --git a/config_hub/finetune/phi-2/full.yaml b/config_hub/finetune/phi-2/full.yaml
index 161583737b..b49284cea3 100644
--- a/config_hub/finetune/phi-2/full.yaml
+++ b/config_hub/finetune/phi-2/full.yaml
@@ -14,12 +14,6 @@ devices: 2
 # How many nodes to use. (type: int, default: 1)
 num_nodes: 1
 
-# Path to a checkpoint directory to resume from in case training was interrupted, or ``True`` to resume
-# from the latest checkpoint in ``out_dir``. An error will be raised if no checkpoint is found. Passing
-# ``'auto'`` will resume from the latest checkpoint but not error if no checkpoint exists.
-# (type: Union[bool, Literal["auto"], Path], default: False)
-resume: false
-
 # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``.
 data:
   class_path: litgpt.data.Alpaca2k
diff --git a/config_hub/finetune/phi-3/full.yaml b/config_hub/finetune/phi-3/full.yaml
new file mode 100644
index 0000000000..01a8714584
--- /dev/null
+++ b/config_hub/finetune/phi-3/full.yaml
@@ -0,0 +1,103 @@
+
+# The path to the base model's checkpoint directory to load for finetuning. (type: <class 'Path'>, default: checkpoints/stabilityai/stablelm-base-alpha-3b)
+checkpoint_dir: checkpoints/microsoft/Phi-3-mini-4k-instruct
+
+# Directory in which to save checkpoints and logs. (type: <class 'Path'>, default: out/finetune/full)
+out_dir: out/finetune/full-phi-3
+
+# The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null)
+precision: bf16-true
+
+# How many devices/GPUs to use (type: Union[int, str], default: 1)
+devices: 1
+
+# Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``.
+data:
+  class_path: litgpt.data.Alpaca2k
+  init_args:
+    mask_prompt: false
+    prompt_style: alpaca
+    ignore_index: -100
+    seed: 42
+    num_workers: 4
+
+# Training-related arguments. See ``litgpt.args.TrainArgs`` for details
+train:
+
+  # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000)
+  save_interval: 200
+
+  # Number of iterations between logging calls (type: int, default: 1)
+  log_interval: 1
+
+  # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 64)
+  global_batch_size: 8
+
+  # Number of samples per data-parallel rank (type: int, default: 1)
+  micro_batch_size: 4
+
+  # Number of iterations with learning rate warmup active (type: int, default: 100)
+  lr_warmup_steps: 200
+
+  # Number of epochs to train on (type: Optional[int], default: 5)
+  epochs: 1
+
+  # Total number of tokens to train on (type: Optional[int], default: null)
+  max_tokens:
+
+  # Limits the number of optimizer steps to run. (type: Optional[int], default: null)
+  max_steps:
+
+  # Limits the length of samples. Off by default (type: Optional[int], default: null)
+  max_seq_length: 512
+
+  # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null)
+  tie_embeddings:
+
+  # (type: Optional[float], default: null)
+  max_norm:
+
+  # (type: float, default: 6e-05)
+  min_lr: 6.0e-05
+
+# Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details
+eval:
+
+  # Number of optimizer steps between evaluation calls (type: int, default: 600)
+  interval: 25
+
+  # Number of tokens to generate (type: Optional[int], default: 100)
+  max_new_tokens: 100
+
+  # Number of iterations (type: int, default: 100)
+  max_iters: 100
+
+  # Whether to evaluate on the validation set at the beginning of the training
+  initial_validation: false
+
+  # Whether to evaluate on the validation set at the end of the training
+  final_validation: true
+
+# The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv)
+logger_name: csv
+
+# The random seed to use for reproducibility. (type: int, default: 1337)
+seed: 1337
+
+# Optimizer-related arguments
+optimizer:
+
+  class_path: torch.optim.AdamW
+
+  init_args:
+
+    # (type: float, default: 0.001)
+    lr: 0.0002
+
+    # (type: float, default: 0.01)
+    weight_decay: 0.1
+
+    # (type: tuple, default: (0.9,0.999))
+    betas:
+      - 0.9
+      - 0.95
diff --git a/config_hub/finetune/phi-3/lora.yaml b/config_hub/finetune/phi-3/lora.yaml
new file mode 100644
index 0000000000..7c99c0443a
--- /dev/null
+++ b/config_hub/finetune/phi-3/lora.yaml
@@ -0,0 +1,134 @@
+
+# The path to the base model's checkpoint directory to load for finetuning. (type: <class 'Path'>, default: checkpoints/stabilityai/stablelm-base-alpha-3b)
+checkpoint_dir: checkpoints/microsoft/Phi-3-mini-4k-instruct
+
+# Directory in which to save checkpoints and logs. (type: <class 'Path'>, default: out/lora)
+out_dir: out/finetune/lora-phi-3
+
+# The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null)
+precision: bf16-true
+
+# If set, quantize the model with this algorithm. See ``tutorials/quantize.md`` for more information. (type: Optional[Literal['nf4', 'nf4-dq', 'fp4', 'fp4-dq', 'int8-training']], default: null)
+quantize:
+
+# How many devices/GPUs to use. (type: Union[int, str], default: 1)
+devices: 1
+
+# The LoRA rank. (type: int, default: 8)
+lora_r: 8
+
+# The LoRA alpha. (type: int, default: 16)
+lora_alpha: 16
+
+# The LoRA dropout value. (type: float, default: 0.05)
+lora_dropout: 0.05
+
+# Whether to apply LoRA to the query weights in attention. (type: bool, default: True)
+lora_query: true
+
+# Whether to apply LoRA to the key weights in attention. (type: bool, default: False)
+lora_key: true
+
+# Whether to apply LoRA to the value weights in attention. (type: bool, default: True)
+lora_value: true
+
+# Whether to apply LoRA to the output projection in the attention block. (type: bool, default: False)
+lora_projection: true
+
+# Whether to apply LoRA to the weights of the MLP in the attention block. (type: bool, default: False)
+lora_mlp: true
+
+# Whether to apply LoRA to the output head in GPT. (type: bool, default: False)
+lora_head: true
+
+# Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``.
+data:
+  class_path: litgpt.data.Alpaca2k
+  init_args:
+    mask_prompt: false
+    val_split_fraction: 0.03847
+    prompt_style: alpaca
+    ignore_index: -100
+    seed: 42
+    num_workers: 4
+
+# Training-related arguments. See ``litgpt.args.TrainArgs`` for details
+train:
+
+  # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000)
+  save_interval: 800
+
+  # Number of iterations between logging calls (type: int, default: 1)
+  log_interval: 1
+
+  # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128)
+  global_batch_size: 8
+
+  # Number of samples per data-parallel rank (type: int, default: 4)
+  micro_batch_size: 4
+
+  # Number of iterations with learning rate warmup active (type: int, default: 100)
+  lr_warmup_steps: 10
+
+  # Number of epochs to train on (type: Optional[int], default: 5)
+  epochs: 1
+
+  # Total number of tokens to train on (type: Optional[int], default: null)
+  max_tokens:
+
+  # Limits the number of optimizer steps to run. (type: Optional[int], default: null)
+  max_steps:
+
+  # Limits the length of samples. Off by default (type: Optional[int], default: null)
+  max_seq_length: 512
+
+  # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null)
+  tie_embeddings:
+
+  # (type: Optional[float], default: null)
+  max_norm:
+
+  # (type: float, default: 6e-05)
+  min_lr: 6.0e-05
+
+# Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details
+eval:
+
+  # Number of optimizer steps between evaluation calls (type: int, default: 100)
+  interval: 100
+
+  # Number of tokens to generate (type: Optional[int], default: 100)
+  max_new_tokens: 100
+
+  # Number of iterations (type: int, default: 100)
+  max_iters: 100
+
+  # Whether to evaluate on the validation set at the beginning of the training
+  initial_validation: false
+
+  # Whether to evaluate on the validation set at the end of the training
+  final_validation: true
+
+# The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv)
+logger_name: csv
+
+# The random seed to use for reproducibility. (type: int, default: 1337)
+seed: 1337
+
+# Optimizer-related arguments
+optimizer:
+
+  class_path: torch.optim.AdamW
+
+  init_args:
+
+    # (type: float, default: 0.001)
+    lr: 0.0002
+
+    # (type: float, default: 0.01)
+    weight_decay: 0.0
+
+    # (type: tuple, default: (0.9,0.999))
+    betas:
+      - 0.9
+      - 0.95
diff --git a/config_hub/finetune/phi-3/qlora.yaml b/config_hub/finetune/phi-3/qlora.yaml
new file mode 100644
index 0000000000..00b0a74002
--- /dev/null
+++ b/config_hub/finetune/phi-3/qlora.yaml
@@ -0,0 +1,134 @@
+
+# The path to the base model's checkpoint directory to load for finetuning. (type: <class 'Path'>, default: checkpoints/stabilityai/stablelm-base-alpha-3b)
+checkpoint_dir: checkpoints/microsoft/Phi-3-mini-4k-instruct
+
+# Directory in which to save checkpoints and logs. (type: <class 'Path'>, default: out/lora)
+out_dir: out/finetune/qlora-phi-3
+
+# The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null)
+precision: bf16-true
+
+# If set, quantize the model with this algorithm. See ``tutorials/quantize.md`` for more information. (type: Optional[Literal['nf4', 'nf4-dq', 'fp4', 'fp4-dq', 'int8-training']], default: null)
+quantize: bnb.nf4
+
+# How many devices/GPUs to use. (type: Union[int, str], default: 1)
+devices: 1
+
+# The LoRA rank. (type: int, default: 8)
+lora_r: 8
+
+# The LoRA alpha. (type: int, default: 16)
+lora_alpha: 16
+
+# The LoRA dropout value. (type: float, default: 0.05)
+lora_dropout: 0.05
+
+# Whether to apply LoRA to the query weights in attention. (type: bool, default: True)
+lora_query: true
+
+# Whether to apply LoRA to the key weights in attention. (type: bool, default: False)
+lora_key: true
+
+# Whether to apply LoRA to the value weights in attention. (type: bool, default: True)
+lora_value: true
+
+# Whether to apply LoRA to the output projection in the attention block. (type: bool, default: False)
+lora_projection: true
+
+# Whether to apply LoRA to the weights of the MLP in the attention block. (type: bool, default: False)
+lora_mlp: true
+
+# Whether to apply LoRA to the output head in GPT. (type: bool, default: False)
+lora_head: true
+
+# Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``.
+data:
+  class_path: litgpt.data.Alpaca2k
+  init_args:
+    mask_prompt: false
+    val_split_fraction: 0.03847
+    prompt_style: alpaca
+    ignore_index: -100
+    seed: 42
+    num_workers: 4
+
+# Training-related arguments. See ``litgpt.args.TrainArgs`` for details
+train:
+
+  # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000)
+  save_interval: 800
+
+  # Number of iterations between logging calls (type: int, default: 1)
+  log_interval: 1
+
+  # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128)
+  global_batch_size: 8
+
+  # Number of samples per data-parallel rank (type: int, default: 4)
+  micro_batch_size: 4
+
+  # Number of iterations with learning rate warmup active (type: int, default: 100)
+  lr_warmup_steps: 10
+
+  # Number of epochs to train on (type: Optional[int], default: 5)
+  epochs: 1
+
+  # Total number of tokens to train on (type: Optional[int], default: null)
+  max_tokens:
+
+  # Limits the number of optimizer steps to run. (type: Optional[int], default: null)
+  max_steps:
+
+  # Limits the length of samples. Off by default (type: Optional[int], default: null)
+  max_seq_length: 512
+
+  # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null)
+  tie_embeddings:
+
+  # (type: Optional[float], default: null)
+  max_norm:
+
+  # (type: float, default: 6e-05)
+  min_lr: 6.0e-05
+
+# Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details
+eval:
+
+  # Number of optimizer steps between evaluation calls (type: int, default: 100)
+  interval: 100
+
+  # Number of tokens to generate (type: Optional[int], default: 100)
+  max_new_tokens: 100
+
+  # Number of iterations (type: int, default: 100)
+  max_iters: 100
+
+  # Whether to evaluate on the validation set at the beginning of the training
+  initial_validation: false
+
+  # Whether to evaluate on the validation set at the end of the training
+  final_validation: true
+
+# The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv)
+logger_name: csv
+
+# The random seed to use for reproducibility. (type: int, default: 1337)
+seed: 1337
+
+# Optimizer-related arguments
+optimizer:
+
+  class_path: torch.optim.AdamW
+
+  init_args:
+
+    # (type: float, default: 0.001)
+    lr: 0.0002
+
+    # (type: float, default: 0.01)
+    weight_decay: 0.0
+
+    # (type: tuple, default: (0.9,0.999))
+    betas:
+      - 0.9
+      - 0.95
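
A minimal usage sketch for the new configs, assuming a litgpt release contemporary with this patch; the exact subcommand spellings (for example `finetune_lora` vs. `finetune lora`) and the `download` syntax vary across litgpt versions, so treat the commands below as illustrative rather than as the commit's documented interface.

```bash
# Fetch the base checkpoint referenced by checkpoint_dir in the new configs
# (download syntax is version-dependent; this form is an assumption).
litgpt download microsoft/Phi-3-mini-4k-instruct

# Full finetuning with the new config
litgpt finetune_full --config config_hub/finetune/phi-3/full.yaml

# LoRA finetuning; qlora.yaml differs mainly by setting `quantize: bnb.nf4`
# (which requires the bitsandbytes package) and by its out_dir.
litgpt finetune_lora --config config_hub/finetune/phi-3/lora.yaml
litgpt finetune_lora --config config_hub/finetune/phi-3/qlora.yaml
```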