changing to 256 nodes O.o
PicoCreator committed Sep 13, 2023
1 parent 53ee8f4 commit af4e955
Showing 6 changed files with 15 additions and 19 deletions.
@@ -4,7 +4,7 @@ trainer:
# Configure the number of GPUs available on your machine
accelerator: gpu
devices: auto
-num_nodes: 64
+num_nodes: 256
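
With devices: auto, the effective DeepSpeed world size is num_nodes multiplied by however many GPUs each node exposes. A minimal sketch of that arithmetic, assuming 8 GPUs per node (an assumption; the config does not state the node size):

# Sketch of the world-size arithmetic behind this change.
# GPUS_PER_NODE is an assumption; "devices: auto" detects the real count.
GPUS_PER_NODE = 8
num_nodes = 256  # value set by this commit (previously 64)

world_size = num_nodes * GPUS_PER_NODE
print(world_size)  # 2048 GPUs participating in the run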

#
# Configure the deepspeed strategy,
@@ -90,7 +90,7 @@ trainer:
#
# It is also recommended to configure this to a large enough number to fully
# utilize your GPU processing time and avoid idle time between batches
-target_batch_size: 512
+target_batch_size: 2048
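
Note that target_batch_size scales 4x alongside num_nodes (64 -> 256), which keeps each GPU's share of the global batch roughly constant. Trainers of this style typically reconcile the target with the world size through gradient accumulation; a rough sketch, assuming a micro-batch of 1 per GPU (an assumption, not stated in the config):

# Sketch: deriving gradient accumulation from a target global batch size.
# The trainer's actual logic may differ; micro-batch of 1 per GPU is assumed.
target_batch_size = 2048
world_size = 2048        # 256 nodes x 8 GPUs per node (assumed)
microbatch_per_gpu = 1   # assumed

accumulate_grad_batches = target_batch_size // (world_size * microbatch_per_gpu)
print(accumulate_grad_batches)  # 1 -> one micro-batch per GPU per optimizer step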

########################################
## Training model settings
@@ -113,13 +113,13 @@ model:
# Learning rate of the training process
# ---
# Initial learning rate of the process
-lr_init: 4e-4
+lr_init: 8e-4
# Final learning rate after the learning rate period
# the learning rate will stay at the final value from then onwards
#
# NOTE: lr_final / lr_period does not work with warmup_steps,
# and will be ignored (replaced by the warmup_steps logic) instead
-lr_final: 3e-4
+lr_final: 5e-4
# Number of epochs to reduce the learning rate from lr_init to lr_final
# 1 means a single epoch (so lr would be lr_final from epoch 2 onwards)
# 0 means lr_final will apply immediately
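
Taken with the comments above, the schedule moves the rate from lr_init to lr_final over lr_period epochs and holds it there. A minimal sketch of a linear-decay variant, assuming that is the interpolation used (the config comments alone do not pin down the decay shape):

def lr_at_epoch(epoch, lr_init=8e-4, lr_final=5e-4, lr_period=1):
    """Linearly interpolate lr over lr_period epochs, then hold at lr_final.

    lr_period == 0 applies lr_final immediately, matching the comment above.
    """
    if lr_period <= 0 or epoch >= lr_period:
        return lr_final
    return lr_init + (lr_final - lr_init) * (epoch / lr_period)

print(lr_at_epoch(0.5))  # halfway through the first epoch -> 0.00065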
@@ -4,7 +4,7 @@ trainer:
# Configure the number of GPUs available on your machine
accelerator: gpu
devices: auto
-num_nodes: 64
+num_nodes: 256

# Configure the deepspeed strategy,
strategy: deepspeed_stage_1
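
For readers less familiar with this layout: the trainer block mirrors PyTorch Lightning's Trainer arguments, and deepspeed_stage_1 is Lightning's registered name for DeepSpeed ZeRO stage 1, which shards optimizer state across ranks. A sketch of the equivalent imperative construction, assuming Lightning is the backend (the field names suggest it, though the repo's entry point may wire things differently):

import pytorch_lightning as pl

# Equivalent Trainer construction for the YAML block above (sketch only).
trainer = pl.Trainer(
    accelerator="gpu",
    devices="auto",                # detect the GPUs available on each node
    num_nodes=256,                 # value set by this commit
    strategy="deepspeed_stage_1",  # ZeRO stage 1: shard optimizer state
)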
@@ -88,7 +88,7 @@ trainer:
#
# It is also recommended to configure this to a large enough number to fully
# utilize your GPU processing time and avoid idle time between batches
-target_batch_size: 512
+target_batch_size: 2048

########################################
## Training model settings
@@ -4,7 +4,7 @@ trainer:
# Configure the number of GPUs available on your machine
accelerator: gpu
devices: auto
-num_nodes: 64
+num_nodes: 256

# Configure the deepspeed strategy
strategy: deepspeed_stage_1
@@ -88,7 +88,7 @@ trainer:
#
# It is also recommended to configure this to a large enough number to fully
# utilize your GPU processing time and avoid idle time between batches
-target_batch_size: 512
+target_batch_size: 2048

########################################
## Training model settings
@@ -117,7 +117,7 @@ model:
#
# NOTE: lr_final / lr_period does not work with warmup_steps,
# and will be ignored (replaced by the warmup_steps logic) instead
-lr_final: 4e-4
+lr_final: 2e-4

# Number of epochs to reduce the learning rate from lr_init to lr_final
# 1 means a single epoch (so lr would be lr_final from epoch 2 onwards)
@@ -4,7 +4,7 @@ trainer:
# Configure the number of GPUs available on your machine
accelerator: gpu
devices: auto
-num_nodes: 64
+num_nodes: 256

#
# Configure the deepspeed strategy,
@@ -86,7 +86,7 @@ trainer:
#
# It is also recommended to configure this to a large enough number to fully
# utilize your GPU processing time and avoid idle time between batches
-target_batch_size: 512
+target_batch_size: 2048

########################################
## Training model settings
@@ -109,7 +109,7 @@ model:
# Learning rate of the training process
# ---
# Initial learning rate of the process
-lr_init: 4e-4
+lr_init: 5e-4
# Final learning rate after the learning rate period
# the learning rate will stay at the final value from then onwards
#
@@ -4,7 +4,7 @@ trainer:
# Configure the number of GPUs available on your machine
accelerator: gpu
devices: auto
-num_nodes: 64
+num_nodes: 256

#
# Configure the deepspeed strategy,
@@ -86,7 +86,7 @@ trainer:
#
# It is also recommended to configure this to a large enough number to fully
# utilize your GPU processing time and avoid idle time between batches
-target_batch_size: 512
+target_batch_size: 2048

########################################
## Training model settings
@@ -109,7 +109,7 @@ model:
# Learning rate of the training process
# ---
# Initial learning rate of the process
-lr_init: 3e-4
+lr_init: 4e-4
# Final learning rate after the learning rate period
# the learning rate will stay at the final value from then onwards
#
@@ -316,10 +316,6 @@
" --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n",
" --trainer.devices=\"{GPU_DEVICES}\" \\\n",
" --trainer.callbacks.init_args.dirpath=\"../checkpoint/{FILENAME_PREFIX}-mem-ctx-512/\" \\\n",
" --model.lr_init=5e-4 \\\n",
" --model.lr_final=4e-4 \\\n",
" --data.max_token_size=512 \\\n",
" --model.ctx_len=512 \\\n",
" --model.bptt_learning_range=1 \\\n",
" --model.load_model=\"../model/{FILENAME_PREFIX}-mem-instruct.pth\" \\\n",
" --auto-resume-ckpt-dir \"auto\""