changing to 256 nodes O.o
PicoCreator committed Sep 13, 2023
1 parent 53ee8f4 commit af4e955
Showing 6 changed files with 15 additions and 19 deletions.
@@ -4,7 +4,7 @@ trainer:
# Configure the number of GPUs available on your machine
accelerator: gpu
devices: auto
-num_nodes: 64
+num_nodes: 256
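
With devices: auto, the effective DeepSpeed world size is num_nodes multiplied by however many GPUs each node exposes. A minimal sketch of that arithmetic, assuming 8 GPUs per node (an assumption; the config does not state the node size):

# Sketch of the world-size arithmetic behind this change.
# GPUS_PER_NODE is an assumption; "devices: auto" detects the real count.
GPUS_PER_NODE = 8
num_nodes = 256  # value set by this commit (previously 64)

world_size = num_nodes * GPUS_PER_NODE
print(world_size)  # 2048 GPUs participating in the run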

#
# Configure the deepspeed strategy,
@@ -90,7 +90,7 @@ trainer:
#
# It is also recommended to configure this to a large enough number to fully
# utilize your GPU processing time and avoid idle time between batches
-target_batch_size: 512
+target_batch_size: 2048
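
Note that target_batch_size scales 4x alongside num_nodes (64 -> 256), which keeps each GPU's share of the global batch roughly constant. Trainers of this style typically reconcile the target with the world size through gradient accumulation; a rough sketch, assuming a micro-batch of 1 per GPU (an assumption, not stated in the config):

# Sketch: deriving gradient accumulation from a target global batch size.
# The trainer's actual logic may differ; micro-batch of 1 per GPU is assumed.
target_batch_size = 2048
world_size = 2048        # 256 nodes x 8 GPUs per node (assumed)
microbatch_per_gpu = 1   # assumed

accumulate_grad_batches = target_batch_size // (world_size * microbatch_per_gpu)
print(accumulate_grad_batches)  # 1 -> one micro-batch per GPU per optimizer step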

########################################
## Training model settings
@@ -113,13 +113,13 @@ model:
# Learning rate of the training process
# ---
# Initial learning rate of the process
-lr_init: 4e-4
+lr_init: 8e-4
# Final learning rate after the learning rate period
# the learning rate will stay at the final value from then onwards
#
# NOTE: lr_final / lr_period does not work with warmup_steps,
# and will be ignored (replaced by the warmup_steps logic) instead
-lr_final: 3e-4
+lr_final: 5e-4
# Number of epochs to reduce the learning rate from lr_init to lr_final
# 1 means a single epoch (so lr would be lr_final from epoch 2 onwards)
# 0 means lr_final will apply immediately
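
Taken with the comments above, the schedule moves the rate from lr_init to lr_final over lr_period epochs and holds it there. A minimal sketch of a linear-decay variant, assuming that is the interpolation used (the config comments alone do not pin down the decay shape):

def lr_at_epoch(epoch, lr_init=8e-4, lr_final=5e-4, lr_period=1):
    """Linearly interpolate lr over lr_period epochs, then hold at lr_final.

    lr_period == 0 applies lr_final immediately, matching the comment above.
    """
    if lr_period <= 0 or epoch >= lr_period:
        return lr_final
    return lr_init + (lr_final - lr_init) * (epoch / lr_period)

print(lr_at_epoch(0.5))  # halfway through the first epoch -> 0.00065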
@@ -4,7 +4,7 @@ trainer:
# Configure the number of GPUs available on your machine
accelerator: gpu
devices: auto
-num_nodes: 64
+num_nodes: 256

# Configure the deepspeed strategy,
strategy: deepspeed_stage_1
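
For readers less familiar with this layout: the trainer block mirrors PyTorch Lightning's Trainer arguments, and deepspeed_stage_1 is Lightning's registered name for DeepSpeed ZeRO stage 1, which shards optimizer state across ranks. A sketch of the equivalent imperative construction, assuming Lightning is the backend (the field names suggest it, though the repo's entry point may wire things differently):

import pytorch_lightning as pl

# Equivalent Trainer construction for the YAML block above (sketch only).
trainer = pl.Trainer(
    accelerator="gpu",
    devices="auto",                # detect the GPUs available on each node
    num_nodes=256,                 # value set by this commit
    strategy="deepspeed_stage_1",  # ZeRO stage 1: shard optimizer state
)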
@@ -88,7 +88,7 @@ trainer:
#
# It is also recommended to configure this to a large enough number to fully
# utilize your GPU processing time and avoid idle time between batches
-target_batch_size: 512
+target_batch_size: 2048

########################################
## Training model settings
@@ -4,7 +4,7 @@ trainer:
# Configure the number of GPUs available on your machine
accelerator: gpu
devices: auto
-num_nodes: 64
+num_nodes: 256

# Configure the deepspeed strategy
strategy: deepspeed_stage_1
@@ -88,7 +88,7 @@ trainer:
#
# It is also recommended to configure this to a large enough number to fully
# utilize your GPU processing time and avoid idle time between batches
-target_batch_size: 512
+target_batch_size: 2048

########################################
## Training model settings
@@ -117,7 +117,7 @@ model:
#
# NOTE: lr_final / lr_period does not work with warmup_steps,
# and will be ignored (replaced by the warmup_steps logic) instead
-lr_final: 4e-4
+lr_final: 2e-4

# Number of epochs to reduce the learning rate from lr_init to lr_final
# 1 means a single epoch (so lr would be lr_final from epoch 2 onwards)
@@ -4,7 +4,7 @@ trainer:
# Configure the number of GPUs available on your machine
accelerator: gpu
devices: auto
-num_nodes: 64
+num_nodes: 256

#
# Configure the deepspeed strategy,
@@ -86,7 +86,7 @@ trainer:
#
# It is also recommended to configure this to a large enough number to fully
# utilize your GPU processing time and avoid idle time between batches
-target_batch_size: 512
+target_batch_size: 2048

########################################
## Training model settings
@@ -109,7 +109,7 @@ model:
# Learning rate of the training process
# ---
# Initial learning rate of the process
-lr_init: 4e-4
+lr_init: 5e-4
# Final learning rate after the learning rate period
# the learning rate will stay at the final value from then onwards
#
@@ -4,7 +4,7 @@ trainer:
# Configure the number of GPUs available on your machine
accelerator: gpu
devices: auto
-num_nodes: 64
+num_nodes: 256

#
# Configure the deepspeed strategy,
@@ -86,7 +86,7 @@ trainer:
#
# It is also recommended to configure this to a large enough number to fully
# utilize your GPU processing time and avoid idle time between batches
-target_batch_size: 512
+target_batch_size: 2048

########################################
## Training model settings
@@ -109,7 +109,7 @@ model:
# Learning rate of the training process
# ---
# Initial learning rate of the process
-lr_init: 3e-4
+lr_init: 4e-4
# Final learning rate after the learning rate period
# the learning rate will stay at the final value from then onwards
#
@@ -316,10 +316,6 @@
" --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n",
" --trainer.devices=\"{GPU_DEVICES}\" \\\n",
" --trainer.callbacks.init_args.dirpath=\"../checkpoint/{FILENAME_PREFIX}-mem-ctx-512/\" \\\n",
" --model.lr_init=5e-4 \\\n",
" --model.lr_final=4e-4 \\\n",
" --data.max_token_size=512 \\\n",
" --model.ctx_len=512 \\\n",
" --model.bptt_learning_range=1 \\\n",
" --model.load_model=\"../model/{FILENAME_PREFIX}-mem-instruct.pth\" \\\n",
" --auto-resume-ckpt-dir \"auto\""