
Commit

make tp yaml
ez2rok committed Sep 25, 2024
1 parent ff36f17 commit 1474a80
Showing 2 changed files with 137 additions and 15 deletions.
scripts/train/yamls/pretrain/mpt-125m.yaml: 19 changes (4 additions, 15 deletions)
@@ -35,7 +35,7 @@ train_loader:
   dataset:
     local: ${variables.data_local}
     remote: ${variables.data_remote}
-    split: train_small
+    split: train
     shuffle: true
     max_seq_len: ${variables.max_seq_len}
     shuffle_seed: ${variables.global_seed}
@@ -47,11 +47,10 @@ eval_loader:
   dataset:
     local: ${variables.data_local}
     remote: ${variables.data_remote}
-    split: val_small
+    split: val
     shuffle: false
     max_seq_len: ${variables.max_seq_len}
     shuffle_seed: ${variables.global_seed}
-    replication: 2
   drop_last: false
   num_workers: 8

@@ -75,10 +74,10 @@ algorithms:
     clipping_type: norm
     clipping_threshold: 1.0

-max_duration: 100ba
+max_duration: 4800ba # ~ 2.5B tokens
 eval_interval: 500ba
 eval_first: false
-eval_subset_num_batches: 0
+eval_subset_num_batches: -1
 global_train_batch_size: 256

 # System
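
A quick sanity check on the new duration, assuming every training sequence is packed to the full max_seq_len: 4800 batches x 256 sequences per batch x 2048 tokens per sequence is roughly 2.52B tokens, which is where the "~ 2.5B tokens" comment comes from. Switching eval_subset_num_batches from 0 to -1 should make Composer evaluate over the entire eval dataloader rather than a fixed-size subset.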
@@ -97,11 +96,6 @@ fsdp_config:
   activation_cpu_offload: false
   limit_all_gathers: true

-# TP
-tp_config:
-  strategy: ffn
-  tensor_parallel_degree: 2
-
 # Logging
 progress_bar: false
 log_to_console: true
@@ -114,11 +108,6 @@ callbacks:
   memory_monitor: {}
   runtime_estimator: {}

-loggers:
-  mlflow:
-    experiment_name: tp
-    # model_registry_prefix: datasets.eitanturok.${run_name}
-
 # loggers:
 #   wandb: {}

scripts/train/yamls/pretrain/tp-mpt-125m.yaml: 133 changes (133 additions, 0 deletions)
@@ -0,0 +1,133 @@
variables:
  data_local: ./my-copy-c4
  data_remote: # If blank, files must be present in data_local
  max_seq_len: 2048
  global_seed: 17

  # Run Name
  run_name: # If left blank, will be read from env var $RUN_NAME

max_seq_len: ${variables.max_seq_len}
run_name: ${variables.run_name}

# Model
model:
  name: mpt_causal_lm
  init_device: meta
  d_model: 768
  n_heads: 12
  n_layers: 12
  expansion_ratio: 4
  max_seq_len: ${variables.max_seq_len}
  vocab_size: 50368
  attn_config:
    attn_impl: flash

# Tokenizer
tokenizer:
  name: EleutherAI/gpt-neox-20b
  kwargs:
    model_max_length: ${variables.max_seq_len}

# Dataloaders
train_loader:
  name: text
  dataset:
    local: ${variables.data_local}
    remote: ${variables.data_remote}
    split: train_small
    shuffle: true
    max_seq_len: ${variables.max_seq_len}
    shuffle_seed: ${variables.global_seed}
  drop_last: true
  num_workers: 8

eval_loader:
  name: text
  dataset:
    local: ${variables.data_local}
    remote: ${variables.data_remote}
    split: val_small
    shuffle: false
    max_seq_len: ${variables.max_seq_len}
    shuffle_seed: ${variables.global_seed}
    replication: 2
  drop_last: false
  num_workers: 8

# Optimization
scheduler:
  name: cosine_with_warmup
  t_warmup: 100ba
  alpha_f: 0.1

optimizer:
  name: decoupled_adamw
  lr: 6.0e-4
  betas:
  - 0.9
  - 0.95
  eps: 1.0e-08
  weight_decay: 0.0

algorithms:
  gradient_clipping:
    clipping_type: norm
    clipping_threshold: 1.0

max_duration: 100ba
eval_interval: 500ba
eval_first: false
eval_subset_num_batches: 0
global_train_batch_size: 256

# System
seed: ${variables.global_seed}
device_eval_batch_size: 16
device_train_microbatch_size: 16
# device_train_microbatch_size: auto
precision: amp_bf16

# FSDP
fsdp_config:
  sharding_strategy: FULL_SHARD
  mixed_precision: PURE
  activation_checkpointing: false
  activation_checkpointing_reentrant: false
  activation_cpu_offload: false
  limit_all_gathers: true

# TP
tp_config:
  strategy: ffn
  tensor_parallel_degree: 2

# Logging
progress_bar: false
log_to_console: true
console_log_interval: 1ba

callbacks:
  speed_monitor:
    window_size: 10
  lr_monitor: {}
  memory_monitor: {}
  runtime_estimator: {}

loggers:
  mlflow:
    experiment_name: tp
    # model_registry_prefix: datasets.eitanturok.${run_name}

# loggers:
#   wandb: {}

# Checkpoint to local filesystem or remote object store
# save_interval: 500ba
# save_num_checkpoints_to_keep: 1 # Important, this cleans up checkpoints saved to DISK
# save_folder: ./{run_name}/checkpoints
# save_folder: s3://my-bucket/my-folder/{run_name}/checkpoints

# Load from local filesystem or remote object store
# load_path: ./gpt-125m/checkpoints/latest-rank{rank}.pt
# load_path: s3://my-bucket/my-folder/gpt-125m/checkpoints/latest-rank{rank}.pt
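
The new tp_config block is the point of this file: strategy: ffn with tensor_parallel_degree: 2 asks for the feed-forward (MLP) weights of each transformer block to be sharded across pairs of GPUs, while FSDP continues to shard parameters across the remaining data-parallel dimension. The eval dataset's replication: 2 presumably mirrors the TP degree so that both ranks in a tensor-parallel group see identical batches. As a rough, illustrative sketch only (not llm-foundry's actual implementation; the module paths transformer.blocks, ffn.up_proj, and ffn.down_proj and the 2D mesh layout are assumptions), an "ffn" strategy maps onto PyTorch's tensor-parallel API roughly like this:

# Hypothetical sketch of an "ffn" tensor-parallel plan with degree 2.
# Assumes torch.distributed is already initialized by the launcher.
import torch.distributed as dist
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor.parallel import (
    ColwiseParallel,
    RowwiseParallel,
    parallelize_module,
)

def apply_ffn_tp(model, tp_degree: int = 2):
    world_size = dist.get_world_size()
    # 2D mesh: data/FSDP parallelism on one axis, tensor parallelism on the other.
    mesh = init_device_mesh(
        "cuda",
        (world_size // tp_degree, tp_degree),
        mesh_dim_names=("dp", "tp"),
    )
    for block in model.transformer.blocks:  # hypothetical module path
        parallelize_module(
            block,
            mesh["tp"],
            {
                # Split the FFN up-projection column-wise and the down-projection
                # row-wise, so each FFN forward needs only one all-reduce.
                "ffn.up_proj": ColwiseParallel(),
                "ffn.down_proj": RowwiseParallel(),
            },
        )
    return model, mesh

To try the config itself, the usual llm-foundry launch pattern (run from the scripts/ directory; exact flags depend on your environment) would be something like:

composer train/train.py train/yamls/pretrain/tp-mpt-125m.yaml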
