diff --git a/scripts/train/benchmarking/act_ckpt_optimize.sh b/scripts/train/benchmarking/act_ckpt_optimize.sh index b05f22f728..52ff209e8a 100755 --- a/scripts/train/benchmarking/act_ckpt_optimize.sh +++ b/scripts/train/benchmarking/act_ckpt_optimize.sh @@ -1,19 +1,21 @@ #!/bin/bash -PROJECT="ackckpt" +PROJECT="ackckptqt" TORCH_2_IMAGE="mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04" CLUSTER_80GB=r1z1 -GIT_COMMIT=v0.2.0 +GIT_BRANCH=run_initial -for MB_SIZE in 1 2 4 8 +for MB_SIZE in 1 2 4 6 8 do - for GATH_LMT in true false - do - for CPU_OFFLOAD in true false - do - python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size $MB_SIZE --accum 2 --image $TORCH_2_IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb -t bf16 --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_limit_all_gathers $GATH_LMT --activation_cpu_offload $CPU_OFFLOAD --fsdp_config_activation_checkpointing false - python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size $MB_SIZE --accum 2 --image $TORCH_2_IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb -t bf16 --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_limit_all_gathers $GATH_LMT --activation_cpu_offload $CPU_OFFLOAD --fsdp_config_activation_checkpointing false - python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size $MB_SIZE --accum 2 --image $TORCH_2_IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb -t bf16 --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_limit_all_gathers $GATH_LMT --activation_cpu_offload $CPU_OFFLOAD --fsdp_config_activation_checkpointing false - done - done -done \ No newline at end of file +# # for GATH_LMT in true false +# # do +# # for CPU_OFFLOAD in true false +# # do + python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size $MB_SIZE --accum 2 --image $TORCH_2_IMAGE --git_branch $GIT_BRANCH --gpu_type a100_40gb -t bf16 --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing 
false + python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size $MB_SIZE --accum 2 --image $TORCH_2_IMAGE --git_branch $GIT_BRANCH --gpu_type a100_80gb -t bf16 --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false + python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size $MB_SIZE --accum 2 --image $TORCH_2_IMAGE --git_branch $GIT_BRANCH --gpu_type a100_80gb -t bf16 --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false +# # done +# # done +done + +# python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 12 --accum 2 --image $TORCH_2_IMAGE --git_branch $GIT_BRANCH --gpu_type a100_80gb -t bf16 --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_limit_all_gathers $GATH_LMT --activation_cpu_offload $CPU_OFFLOAD --fsdp_config_activation_checkpointing false \ No newline at end of file diff --git a/scripts/train/benchmarking/submit_benchmarks.py b/scripts/train/benchmarking/submit_benchmarks.py index 197aceab2e..6e1050cdf2 100644 --- a/scripts/train/benchmarking/submit_benchmarks.py +++ b/scripts/train/benchmarking/submit_benchmarks.py @@ -187,7 +187,7 @@ def parse_args(): parser.add_argument('--torch_compile_fullgraph', type=str_to_bool, default=None) parser.add_argument('--torch_compile_dynamic', type=str_to_bool, default=None) parser.add_argument('--torch_compile_mode', type=str, default=None) - + parser.add_argument('--torch_compile', type=str_to_bool, default=False) parser.add_argument('--RUN', type=str_to_bool, nargs='?', @@ -283,7 +283,8 @@ def mod_parameters(parameters: Dict[str, Any], pad_vocab_multiple: Optional[int] = None, torch_compile_fullgraph: Optional[bool] = None, torch_compile_dynamic: Optional[bool] = None, - torch_compile_mode: Optional[str] = None + torch_compile_mode: Optional[str] = None, + torch_compile: bool = False ): if run_name: parameters['run_name'] = run_name @@ -349,16 +350,14 @@ def 
mod_parameters(parameters: Dict[str, Any], parameters['fsdp_config']['backward_prefetch'] = fsdp_config_backward_prefetch if activation_cpu_offload is not None: parameters['fsdp_config']['activation_cpu_offload'] = activation_cpu_offload - parameters['fsdp_config']['verbose'] = True - - - parameters['compile_config'] = {} - if torch_compile_fullgraph is not None: - parameters['compile_config']['fullgraph'] = torch_compile_fullgraph - if torch_compile_dynamic is not None: - parameters['compile_config']['dynamic'] = torch_compile_dynamic - if torch_compile_mode is not None: - parameters['compile_config']['mode'] = torch_compile_mode + # parameters['fsdp_config']['verbose'] = True + parameters['compile_config'] = {} if torch_compile else None + # if torch_compile_fullgraph is not None: + # parameters['compile_config']['fullgraph'] = torch_compile_fullgraph + # if torch_compile_dynamic is not None: + # parameters['compile_config']['dynamic'] = torch_compile_dynamic + # if torch_compile_mode is not None: + # parameters['compile_config']['mode'] = torch_compile_mode if wandb: # add wandb @@ -471,8 +470,11 @@ def run_config(config: Tuple[str, int, int, str, str, int, str], pad_vocab_multiple=args.pad_vocab_multiple, torch_compile_fullgraph = args.torch_compile_fullgraph, torch_compile_dynamic = args.torch_compile_dynamic, - torch_compile_mode = args.torch_compile_mode + torch_compile_mode = args.torch_compile_mode, + torch_compile = args.torch_compile ) + if args.torch_compile: + assert(parameters['model']['attn_config']['attn_impl'] != 'triton') if gpu_type == 'h100_80gb' and precision == 'fp8': parameters['model']['fc_type'] = 'te' # Create run config mcli sdk/api diff --git a/scripts/train/benchmarking/sweep_fsdp.sh b/scripts/train/benchmarking/sweep_fsdp.sh index 51fa21db7a..358855cf98 100644 --- a/scripts/train/benchmarking/sweep_fsdp.sh +++ b/scripts/train/benchmarking/sweep_fsdp.sh @@ -25,71 +25,71 @@ python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 
--microbatch_si python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 12 --accum 8 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing true python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 16 --accum 8 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing true -python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 16 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --data_remote oci://mosaicml-internal-dataset-c4/preconcat-gpt_neox/ --fsdp_config_activation_checkpointing false -python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 12 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --data_remote oci://mosaicml-internal-dataset-c4/preconcat-gpt_neox/ +python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 16 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false +python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 12 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN # Replicate/understand any diffs using streaming data loader -python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 1 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 13 13 --RUN --data_remote oci://mosaicml-internal-dataset-c4/preconcat-gpt_neox/ --fsdp_config_activation_checkpointing true -python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 2 --accum 16 --image $IMAGE --git_commit 
$GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 12 12 --RUN --data_remote oci://mosaicml-internal-dataset-c4/preconcat-gpt_neox/ --fsdp_config_activation_checkpointing true +python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 1 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 13 13 --RUN --fsdp_config_activation_checkpointing true +python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 2 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 12 12 --RUN --fsdp_config_activation_checkpointing true # 80GB Test microbatch size w/ no act_ckpt -python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 1 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --data_remote oci://mosaicml-internal-dataset-c4/preconcat-gpt_neox/ --fsdp_config_activation_checkpointing false -python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 2 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --data_remote oci://mosaicml-internal-dataset-c4/preconcat-gpt_neox/ --fsdp_config_activation_checkpointing false -python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 4 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --data_remote oci://mosaicml-internal-dataset-c4/preconcat-gpt_neox/ --fsdp_config_activation_checkpointing false -python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 3 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --data_remote oci://mosaicml-internal-dataset-c4/preconcat-gpt_neox/ --fsdp_config_activation_checkpointing false -python submit_benchmarks.py --project $PROJECT -m 
3b.yaml -g 8 --microbatch_size 2 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --data_remote oci://mosaicml-internal-dataset-c4/preconcat-gpt_neox/ --fsdp_config_activation_checkpointing false -python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 1 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --data_remote oci://mosaicml-internal-dataset-c4/preconcat-gpt_neox/ --fsdp_config_activation_checkpointing false - -python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 1 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 10 10 --RUN --data_remote oci://mosaicml-internal-dataset-c4/preconcat-gpt_neox/ --fsdp_config_activation_checkpointing false -python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 2 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 10 10 --RUN --data_remote oci://mosaicml-internal-dataset-c4/preconcat-gpt_neox/ --fsdp_config_activation_checkpointing false -python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 4 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 10 10 --RUN --data_remote oci://mosaicml-internal-dataset-c4/preconcat-gpt_neox/ --fsdp_config_activation_checkpointing false -python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 3 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 10 10 --RUN --data_remote oci://mosaicml-internal-dataset-c4/preconcat-gpt_neox/ --fsdp_config_activation_checkpointing false -python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 2 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 10 10 --RUN 
--data_remote oci://mosaicml-internal-dataset-c4/preconcat-gpt_neox/ --fsdp_config_activation_checkpointing false -python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 1 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 10 10 --RUN --data_remote oci://mosaicml-internal-dataset-c4/preconcat-gpt_neox/ --fsdp_config_activation_checkpointing false - -python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 1 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 9 9 --RUN --data_remote oci://mosaicml-internal-dataset-c4/preconcat-gpt_neox/ --fsdp_config_activation_checkpointing false -python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 2 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 9 9 --RUN --data_remote oci://mosaicml-internal-dataset-c4/preconcat-gpt_neox/ --fsdp_config_activation_checkpointing false -python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 4 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 9 9 --RUN --data_remote oci://mosaicml-internal-dataset-c4/preconcat-gpt_neox/ --fsdp_config_activation_checkpointing false -python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 3 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 9 9 --RUN --data_remote oci://mosaicml-internal-dataset-c4/preconcat-gpt_neox/ --fsdp_config_activation_checkpointing false -python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 2 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 9 9 --RUN --data_remote oci://mosaicml-internal-dataset-c4/preconcat-gpt_neox/ --fsdp_config_activation_checkpointing false -python submit_benchmarks.py --project 
$PROJECT -m 3b.yaml -g 8 --microbatch_size 1 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 9 9 --RUN --data_remote oci://mosaicml-internal-dataset-c4/preconcat-gpt_neox/ --fsdp_config_activation_checkpointing false +python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 1 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false +python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 2 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false +python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 4 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false +python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 3 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false +python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 2 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false +python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 1 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false + +python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 1 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 10 10 --RUN --fsdp_config_activation_checkpointing false +python submit_benchmarks.py --project $PROJECT 
-m 7b.yaml -g 8 --microbatch_size 2 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 10 10 --RUN --fsdp_config_activation_checkpointing false +python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 4 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 10 10 --RUN --fsdp_config_activation_checkpointing false +python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 3 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 10 10 --RUN --fsdp_config_activation_checkpointing false +python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 2 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 10 10 --RUN --fsdp_config_activation_checkpointing false +python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 1 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 10 10 --RUN --fsdp_config_activation_checkpointing false + +python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 1 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 9 9 --RUN --fsdp_config_activation_checkpointing false +python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 2 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 9 9 --RUN --fsdp_config_activation_checkpointing false +python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 4 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 9 9 --RUN --fsdp_config_activation_checkpointing false +python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 3 --accum 16 --image $IMAGE --git_commit 
$GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 9 9 --RUN --fsdp_config_activation_checkpointing false +python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 2 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 9 9 --RUN --fsdp_config_activation_checkpointing false +python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 1 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 9 9 --RUN --fsdp_config_activation_checkpointing false # # Test ack_ckpt differences -# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 1 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 11 11 --RUN --data_remote oci://mosaicml-internal-dataset-c4/preconcat-gpt_neox/ --fsdp_config_activation_checkpointing false -# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 2 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 11 11 --RUN --data_remote oci://mosaicml-internal-dataset-c4/preconcat-gpt_neox/ --fsdp_config_activation_checkpointing false -# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 4 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 11 11 --RUN --data_remote oci://mosaicml-internal-dataset-c4/preconcat-gpt_neox/ --fsdp_config_activation_checkpointing false -# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 3 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 11 11 --RUN --data_remote oci://mosaicml-internal-dataset-c4/preconcat-gpt_neox/ --fsdp_config_activation_checkpointing false -# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 2 --accum 16 --image $IMAGE --git_commit 
$GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 11 11 --RUN --data_remote oci://mosaicml-internal-dataset-c4/preconcat-gpt_neox/ --fsdp_config_activation_checkpointing false -# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 1 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 11 11 --RUN --data_remote oci://mosaicml-internal-dataset-c4/preconcat-gpt_neox/ --fsdp_config_activation_checkpointing false - -# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 8 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 10 10 --RUN --data_remote oci://mosaicml-internal-dataset-c4/preconcat-gpt_neox/ --fsdp_config_activation_checkpointing false -# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 6 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 10 10 --RUN --data_remote oci://mosaicml-internal-dataset-c4/preconcat-gpt_neox/ --fsdp_config_activation_checkpointing false -# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 4 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 10 10 --RUN --data_remote oci://mosaicml-internal-dataset-c4/preconcat-gpt_neox/ --fsdp_config_activation_checkpointing false -# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 2 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 10 10 --RUN --data_remote oci://mosaicml-internal-dataset-c4/preconcat-gpt_neox/ --fsdp_config_activation_checkpointing false -# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 1 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 10 10 --RUN --data_remote 
oci://mosaicml-internal-dataset-c4/preconcat-gpt_neox/ --fsdp_config_activation_checkpointing false # PASSED -# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 1 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 10 10 --RUN --data_remote oci://mosaicml-internal-dataset-c4/preconcat-gpt_neox/ --fsdp_config_activation_checkpointing false # PASSED -# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 1 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 10 10 --RUN --data_remote oci://mosaicml-internal-dataset-c4/preconcat-gpt_neox/ --fsdp_config_activation_checkpointing false # PASSED -# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 2 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 10 10 --RUN --data_remote oci://mosaicml-internal-dataset-c4/preconcat-gpt_neox/ --fsdp_config_activation_checkpointing false -# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 4 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 10 10 --RUN --data_remote oci://mosaicml-internal-dataset-c4/preconcat-gpt_neox/ --fsdp_config_activation_checkpointing false - - -# python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 1 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 10 10 --RUN --data_remote oci://mosaicml-internal-dataset-c4/preconcat-gpt_neox/ --fsdp_config_activation_checkpointing false -# python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 2 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 10 10 --RUN --data_remote oci://mosaicml-internal-dataset-c4/preconcat-gpt_neox/ --fsdp_config_activation_checkpointing false -# 
python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 4 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 10 10 --RUN --data_remote oci://mosaicml-internal-dataset-c4/preconcat-gpt_neox/ --fsdp_config_activation_checkpointing false -# python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 8 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 10 10 --RUN --data_remote oci://mosaicml-internal-dataset-c4/preconcat-gpt_neox/ --fsdp_config_activation_checkpointing false - -# python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 1 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 10 10 --RUN --data_remote oci://mosaicml-internal-dataset-c4/preconcat-gpt_neox/ --fsdp_config_activation_checkpointing false -# python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 2 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 10 10 --RUN --data_remote oci://mosaicml-internal-dataset-c4/preconcat-gpt_neox/ --fsdp_config_activation_checkpointing false -# python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 4 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 10 10 --RUN --data_remote oci://mosaicml-internal-dataset-c4/preconcat-gpt_neox/ --fsdp_config_activation_checkpointing false -# python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 8 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 10 10 --RUN --data_remote oci://mosaicml-internal-dataset-c4/preconcat-gpt_neox/ --fsdp_config_activation_checkpointing false +# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 1 --accum 16 --image $IMAGE --git_commit 
$GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false +# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 2 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false +# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 4 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false +# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 3 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false +# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 2 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false +# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 1 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false + +# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 8 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 10 10 --RUN --fsdp_config_activation_checkpointing false +# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 6 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 10 10 --RUN --fsdp_config_activation_checkpointing false +# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 4 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster 
$CLUSTER_40GB -s 10 10 --RUN --fsdp_config_activation_checkpointing false +# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 2 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 10 10 --RUN --fsdp_config_activation_checkpointing false +# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 1 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 10 10 --RUN --fsdp_config_activation_checkpointing false # PASSED +# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 1 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 10 10 --RUN --fsdp_config_activation_checkpointing false # PASSED +# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 1 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 10 10 --RUN --fsdp_config_activation_checkpointing false # PASSED +# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 2 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 10 10 --RUN --fsdp_config_activation_checkpointing false +# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 4 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 10 10 --RUN --fsdp_config_activation_checkpointing false + + +# python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 1 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 10 10 --RUN --fsdp_config_activation_checkpointing false +# python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 2 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 10 10 --RUN 
--fsdp_config_activation_checkpointing false +# python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 4 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 10 10 --RUN --fsdp_config_activation_checkpointing false +# python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 8 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 10 10 --RUN --fsdp_config_activation_checkpointing false + +# python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 1 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 10 10 --RUN --fsdp_config_activation_checkpointing false +# python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 2 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 10 10 --RUN --fsdp_config_activation_checkpointing false +# python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 4 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 10 10 --RUN --fsdp_config_activation_checkpointing false +# python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 8 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 10 10 --RUN --fsdp_config_activation_checkpointing false # NOTE: Tried the commented ones last night, OOM'd -# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 14 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 9 9 --RUN --data_remote oci://mosaicml-internal-dataset-c4/preconcat-gpt_neox/ --fsdp_config_activation_checkpointing false -# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 12 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT 
--gpu_type a100_40gb --cluster $CLUSTER_40GB -s 9 9 --RUN --data_remote oci://mosaicml-internal-dataset-c4/preconcat-gpt_neox/ --fsdp_config_activation_checkpointing false -# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 10 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 9 9 --RUN --data_remote oci://mosaicml-internal-dataset-c4/preconcat-gpt_neox/ --fsdp_config_activation_checkpointing false -# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 8 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 9 9 --RUN --data_remote oci://mosaicml-internal-dataset-c4/preconcat-gpt_neox/ --fsdp_config_activation_checkpointing false -# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 4 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 9 9 --RUN --data_remote oci://mosaicml-internal-dataset-c4/preconcat-gpt_neox/ --fsdp_config_activation_checkpointing false -# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 4 --accum 8 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 9 9 --RUN --data_remote oci://mosaicml-internal-dataset-c4/preconcat-gpt_neox/ --fsdp_config_activation_checkpointing false -# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 4 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 9 9 --RUN --data_remote oci://mosaicml-internal-dataset-c4/preconcat-gpt_neox/ --fsdp_config_activation_checkpointing false -# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 2 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 9 9 --RUN --data_remote oci://mosaicml-internal-dataset-c4/preconcat-gpt_neox/ 
--fsdp_config_activation_checkpointing false # PASSED -# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 8 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 7 7 --RUN --data_remote oci://mosaicml-internal-dataset-c4/preconcat-gpt_neox/ --fsdp_config_activation_checkpointing false #PASSED \ No newline at end of file +# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 14 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 9 9 --RUN --fsdp_config_activation_checkpointing false +# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 12 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 9 9 --RUN --fsdp_config_activation_checkpointing false +# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 10 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 9 9 --RUN --fsdp_config_activation_checkpointing false +# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 8 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 9 9 --RUN --fsdp_config_activation_checkpointing false +# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 4 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 9 9 --RUN --fsdp_config_activation_checkpointing false +# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 4 --accum 8 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 9 9 --RUN --fsdp_config_activation_checkpointing false +# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 4 --accum 4 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type 
a100_40gb --cluster $CLUSTER_40GB -s 9 9 --RUN --fsdp_config_activation_checkpointing false
+# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 2 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 9 9 --RUN --fsdp_config_activation_checkpointing false # PASSED
+# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 8 --accum 16 --image $IMAGE --git_commit $GIT_COMMIT --gpu_type a100_40gb --cluster $CLUSTER_40GB -s 7 7 --RUN --fsdp_config_activation_checkpointing false #PASSED
\ No newline at end of file
diff --git a/scripts/train/benchmarking/torch_benchmarks_80gb.sh b/scripts/train/benchmarking/torch_benchmarks_80gb.sh
new file mode 100755
index 0000000000..75f3f52aab
--- /dev/null
+++ b/scripts/train/benchmarking/torch_benchmarks_80gb.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+# Submits benchmark runs through submit_benchmarks.py (-g 8, a100_80gb on $CLUSTER_80GB)
+# with the Torch 1.13 and Torch 2 images. Commented-out commands are not submitted.
+
+PROJECT="torches80gb"
+TORCH_2_IMAGE="mosaicml/pytorch:2.0.1_cu118-python3.10-ubuntu20.04"
+TORCH_113_IMAGE="mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04"
+CLUSTER_80GB=r1z1
+CLUSTER_40GB=r8z3
+GIT_COMMIT=v0.2.0
+# 30b test Torch Runs
+# python submit_benchmarks.py --project $PROJECT -m 30b.yaml -g 8 --microbatch_size 1 --accum 21 --image $TORCH_113_IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 12 13 --RUN --fsdp_config_activation_checkpointing true
+# python submit_benchmarks.py --project $PROJECT -m 30b.yaml -g 8 --microbatch_size 3 --accum 21 --image $TORCH_113_IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing true
+# python submit_benchmarks.py --project $PROJECT -m 30b.yaml -g 8 --microbatch_size 6 --accum 21 --image $TORCH_113_IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 10 10 --RUN --fsdp_config_activation_checkpointing true
+# python submit_benchmarks.py --project $PROJECT -m 30b.yaml -g 8 --microbatch_size 12 --accum 21 --image $TORCH_113_IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 9 9 --RUN --fsdp_config_activation_checkpointing true
+
+python submit_benchmarks.py --project $PROJECT -m 30b.yaml -g 8 --microbatch_size 1 --accum 21 --image $TORCH_2_IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 12 13 --RUN --fsdp_config_activation_checkpointing true
+python submit_benchmarks.py --project $PROJECT -m 30b.yaml -g 8 --microbatch_size 3 --accum 21 --image $TORCH_2_IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing true
+python submit_benchmarks.py --project $PROJECT -m 30b.yaml -g 8 --microbatch_size 6 --accum 21 --image $TORCH_2_IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 10 10 --RUN --fsdp_config_activation_checkpointing true
+python submit_benchmarks.py --project $PROJECT -m 30b.yaml -g 8 --microbatch_size 12 --accum 21 --image $TORCH_2_IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 9 9 --RUN --fsdp_config_activation_checkpointing true
+
+#13b test Torch runs -- separate Torch1.13 and torch2
+# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 1 --accum 3 --image $TORCH_113_IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 15 15 --RUN --fsdp_config_activation_checkpointing true
+# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 5 --accum 3 --image $TORCH_113_IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 13 13 --RUN --fsdp_config_activation_checkpointing true
+# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 20 --accum 3 --image $TORCH_113_IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing true
+# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 80 --accum 3 --image $TORCH_113_IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 9 9 --RUN --fsdp_config_activation_checkpointing true
+
+# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 1 --accum 3 --image $TORCH_2_IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 15 15 --RUN --fsdp_config_activation_checkpointing true
+# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 5 --accum 3 --image $TORCH_2_IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 13 13 --RUN --fsdp_config_activation_checkpointing true
+# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 20 --accum 3 --image $TORCH_2_IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing true
+# python submit_benchmarks.py --project $PROJECT -m 13b.yaml -g 8 --microbatch_size 80 --accum 3 --image $TORCH_2_IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 9 9 --RUN --fsdp_config_activation_checkpointing true
+
+# #7b test torch runs
+# python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 2 --accum 2 --image $TORCH_113_IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 15 15 --RUN --fsdp_config_activation_checkpointing true
+# python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 8 --accum 2 --image $TORCH_113_IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 13 13 --RUN --fsdp_config_activation_checkpointing true
+# python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 32 --accum 2 --image $TORCH_113_IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing true
+# python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 128 --accum 2 --image $TORCH_113_IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 9 9 --RUN --fsdp_config_activation_checkpointing true
+
+# python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 2 --accum 2 --image $TORCH_2_IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 15 15 --RUN --fsdp_config_activation_checkpointing true
+# python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 8 --accum 2 --image $TORCH_2_IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 13 13 --RUN --fsdp_config_activation_checkpointing true
+# python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 32 --accum 2 --image $TORCH_2_IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing true
+# python submit_benchmarks.py --project $PROJECT -m 7b.yaml -g 8 --microbatch_size 128 --accum 2 --image $TORCH_2_IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 9 9 --RUN --fsdp_config_activation_checkpointing true
+
+#3b test torch runs (Torch 1.13 image)
+python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 3 --accum 6 --image $TORCH_113_IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 15 15 --RUN --fsdp_config_activation_checkpointing true
+python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 3 --accum 6 --image $TORCH_113_IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 13 13 --RUN --fsdp_config_activation_checkpointing false
+python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 10 --accum 6 --image $TORCH_113_IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false
+python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 40 --accum 6 --image $TORCH_113_IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 9 9 --RUN --fsdp_config_activation_checkpointing false
+
+#3b test torch runs (Torch 2 image)
+python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 3 --accum 6 --image $TORCH_2_IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 15 15 --RUN --fsdp_config_activation_checkpointing true
+python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 3 --accum 6 --image $TORCH_2_IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 13 13 --RUN --fsdp_config_activation_checkpointing false
+python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 10 --accum 6 --image $TORCH_2_IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 11 11 --RUN --fsdp_config_activation_checkpointing false
+python submit_benchmarks.py --project $PROJECT -m 3b.yaml -g 8 --microbatch_size 40 --accum 6 --image $TORCH_2_IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 9 9 --RUN --fsdp_config_activation_checkpointing false
+
+#1b test torch runs
+python submit_benchmarks.py --project $PROJECT -m 1b.yaml -g 8 --microbatch_size 1 --accum 4 --image $TORCH_113_IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 15 15 --RUN --fsdp_config_activation_checkpointing false
+python submit_benchmarks.py --project $PROJECT -m 1b.yaml -g 8 --microbatch_size 2 --accum 4 --image $TORCH_113_IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 13 13 --RUN --fsdp_config_activation_checkpointing false
+python submit_benchmarks.py --project $PROJECT -m 1b.yaml -g 8 --microbatch_size 56 --accum 4 --image $TORCH_113_IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 9 9 --RUN --fsdp_config_activation_checkpointing false
+
+python submit_benchmarks.py --project $PROJECT -m 1b.yaml -g 8 --microbatch_size 1 --accum 4 --image $TORCH_2_IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 15 15 --RUN --fsdp_config_activation_checkpointing false
+python submit_benchmarks.py --project $PROJECT -m 1b.yaml -g 8 --microbatch_size 2 --accum 4 --image $TORCH_2_IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 13 13 --RUN --fsdp_config_activation_checkpointing false
+python submit_benchmarks.py --project $PROJECT -m 1b.yaml -g 8 --microbatch_size 56 --accum 4 --image $TORCH_2_IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 9 9 --RUN --fsdp_config_activation_checkpointing false
+
+#350m test torch runs (abbreviated)
+python submit_benchmarks.py --project $PROJECT -m 350m.yaml -g 8 --microbatch_size 2 --accum 2 --image $TORCH_113_IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 15 15 --RUN --fsdp_config_activation_checkpointing false
+python submit_benchmarks.py --project $PROJECT -m 350m.yaml -g 8 --microbatch_size 4 --accum 2 --image $TORCH_113_IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 13 13 --RUN --fsdp_config_activation_checkpointing false
+python submit_benchmarks.py --project $PROJECT -m 350m.yaml -g 8 --microbatch_size 6 --accum 2 --image $TORCH_113_IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 9 9 --RUN --fsdp_config_activation_checkpointing false
+
+python submit_benchmarks.py --project $PROJECT -m 350m.yaml -g 8 --microbatch_size 1 --accum 4 --image $TORCH_2_IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 15 15 --RUN --fsdp_config_activation_checkpointing false
+python submit_benchmarks.py --project $PROJECT -m 350m.yaml -g 8 --microbatch_size 2 --accum 4 --image $TORCH_2_IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 14 14 --RUN --fsdp_config_activation_checkpointing false
+python submit_benchmarks.py --project $PROJECT -m 350m.yaml -g 8 --microbatch_size 56 --accum 4 --image $TORCH_2_IMAGE --git_commit $GIT_COMMIT --gpu_type a100_80gb --cluster $CLUSTER_80GB -s 9 9 --RUN --fsdp_config_activation_checkpointing false