Skip to content

Commit

Permalink
modify script
Browse files (browse the repository at this point in the history)
  • Loading branch information
Chris Rinard committed Aug 23, 2023
1 parent e857cfc commit fdfa2fa
Show file tree
Hide file tree
Showing 4 changed files with 160 additions and 86 deletions.
28 changes: 15 additions & 13 deletions scripts/train/benchmarking/act_ckpt_optimize.sh
Original file line number | Diff line number | Diff line change
@@ -1,19 +1,21 @@
#!/bin/bash

PROJECT="ackckpt"
PROJECT="ackckptqt"
TORCH_2_IMAGE="mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04"
CLUSTER_80GB=r1z1
GIT_COMMIT=v0.2.0
GIT_BRANCH=run_initial

for MB_SIZE in 1 2 4 8
for MB_SIZE in 1 2 4 6 8
do
for GATH_LMT in true false
do
for CPU_OFFLOAD in true false
do
python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size $MB_SIZE --accum 2 --image $TORCH_2_IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb -t bf16 --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_limit_all_gathers $GATH_LMT --activation_cpu_offload $CPU_OFFLOAD --fsdp_config_activation_checkpointing false
python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size $MB_SIZE --accum 2 --image $TORCH_2_IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb -t bf16 --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_limit_all_gathers $GATH_LMT --activation_cpu_offload $CPU_OFFLOAD --fsdp_config_activation_checkpointing false
python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size $MB_SIZE --accum 2 --image $TORCH_2_IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb -t bf16 --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_limit_all_gathers $GATH_LMT --activation_cpu_offload $CPU_OFFLOAD --fsdp_config_activation_checkpointing false
done
done
done
# # for GATH_LMT in true false
# # do
# # for CPU_OFFLOAD in true false
# # do
python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size $MB_SIZE --accum 2 --image $TORCH_2_IMAGE --git_branch $GIT_BRANCH --gpu_type a100_40gb -t bf16 --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false
python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size $MB_SIZE --accum 2 --image $TORCH_2_IMAGE --git_branch $GIT_BRANCH --gpu_type a100_80gb -t bf16 --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false
python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size $MB_SIZE --accum 2 --image $TORCH_2_IMAGE --git_branch $GIT_BRANCH --gpu_type a100_80gb -t bf16 --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false
# # done
# # done
done

# python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 12 --accum 2 --image $TORCH_2_IMAGE --git_branch $GIT_BRANCH --gpu_type a100_80gb -t bf16 --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_limit_all_gathers $GATH_LMT --activation_cpu_offload $CPU_OFFLOAD --fsdp_config_activation_checkpointing false
28 changes: 15 additions & 13 deletions scripts/train/benchmarking/submit_benchmarks.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -187,7 +187,7 @@ def parse_args():
parser.add_argument('--torch_compile_fullgraph', type=str_to_bool, default=None)
parser.add_argument('--torch_compile_dynamic', type=str_to_bool, default=None)
parser.add_argument('--torch_compile_mode', type=str, default=None)

parser.add_argument('--torch_compile', type=str_to_bool, default=False)
parser.add_argument('--RUN',
type=str_to_bool,
nargs='?',
Expand Down Expand Up @@ -283,7 +283,8 @@ def mod_parameters(parameters: Dict[str, Any],
pad_vocab_multiple: Optional[int] = None,
torch_compile_fullgraph: Optional[bool] = None,
torch_compile_dynamic: Optional[bool] = None,
torch_compile_mode: Optional[str] = None
torch_compile_mode: Optional[str] = None,
torch_compile: bool = False
):
if run_name:
parameters['run_name'] = run_name
Expand Down Expand Up @@ -349,16 +350,14 @@ def mod_parameters(parameters: Dict[str, Any],
parameters['fsdp_config']['backward_prefetch'] = fsdp_config_backward_prefetch
if activation_cpu_offload is not None:
parameters['fsdp_config']['activation_cpu_offload'] = activation_cpu_offload
parameters['fsdp_config']['verbose'] = True


parameters['compile_config'] = {}
if torch_compile_fullgraph is not None:
parameters['compile_config']['fullgraph'] = torch_compile_fullgraph
if torch_compile_dynamic is not None:
parameters['compile_config']['dynamic'] = torch_compile_dynamic
if torch_compile_mode is not None:
parameters['compile_config']['mode'] = torch_compile_mode
# parameters['fsdp_config']['verbose'] = True
parameters['compile_config'] = {} if torch_compile else None
# if torch_compile_fullgraph is not None:
# parameters['compile_config']['fullgraph'] = torch_compile_fullgraph
# if torch_compile_dynamic is not None:
# parameters['compile_config']['dynamic'] = torch_compile_dynamic
# if torch_compile_mode is not None:
# parameters['compile_config']['mode'] = torch_compile_mode

if wandb:
# add wandb
Expand Down Expand Up @@ -471,8 +470,11 @@ def run_config(config: Tuple[str, int, int, str, str, int, str],
pad_vocab_multiple=args.pad_vocab_multiple,
torch_compile_fullgraph = args.torch_compile_fullgraph,
torch_compile_dynamic = args.torch_compile_dynamic,
torch_compile_mode = args.torch_compile_mode
torch_compile_mode = args.torch_compile_mode,
torch_compile = args.torch_compile
)
if args.torch_compile:
assert(parameters['model']['attn_config']['attn_impl'] != 'triton')
if gpu_type == 'h100_80gb' and precision == 'fp8':
parameters['model']['fc_type'] = 'te'
# Create run config mcli sdk/api
Expand Down
Loading

0 comments on commit fdfa2fa

Please sign in to comment.