-
Notifications
You must be signed in to change notification settings - Fork 283
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'deepspeed' into deepspeed_inference
- Loading branch information
Showing
8 changed files
with
359 additions
and
101 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,42 +1,32 @@ | ||
#!/bin/bash | ||
#SBATCH --nodes 1 | ||
#SBATCH --ntasks-per-node=6 | ||
#SBATCH --ntasks-per-node=8 | ||
#SBATCH --gpus-per-task=1 | ||
#SBATCH --account=efml | ||
#SBATCH --partition=gpu | ||
#SBATCH --time=48:00:00 | ||
#SBATCH --job-name=flamingo | ||
|
||
export PYTHONFAULTHANDLER=1 | ||
export CUDA_LAUNCH_BLOCKING=0 | ||
export HOSTNAMES=`scontrol show hostnames "$SLURM_JOB_NODELIST"` | ||
export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1) | ||
export MASTER_PORT=15000 | ||
export COUNT_NODE=`scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l` | ||
export HF_DATASETS_CACHE="/gscratch/efml/anasa2/.huggingface" TRANSFORMERS_CACHE="/gscratch/efml/anasa2/.huggingface" | ||
|
||
export PYTHONPATH="$PYTHONPATH:open_flamingo" | ||
srun --cpu_bind=v --accel-bind=gn python | ||
|
||
|
||
|
||
deepspeed open_flamingo/open_flamingo/train/train.py \ | ||
--lm_path meta-llama/Llama-2-13b \ | ||
--tokenizer_path meta-llama/Llama-2-13b \ | ||
--cross_attn_every_n_layers 4 \ | ||
srun --cpu_bind=v --accel-bind=gn python open_flamingo/open_flamingo/train/train.py \ | ||
--lm_path anas-awadalla/mpt-1b-redpajama-200b \ | ||
--tokenizer_path anas-awadalla/mpt-1b-redpajama-200b \ | ||
--cross_attn_every_n_layers 1 \ | ||
--dataset_resampled \ | ||
--batch_size_mmc4 16 \ | ||
--batch_size_laion 32 \ | ||
--deepspeed \ | ||
--batch_size_mmc4 32 \ | ||
--batch_size_laion 64 \ | ||
--train_num_samples_mmc4 125000\ | ||
--train_num_samples_laion 250000 \ | ||
--loss_multiplier_laion 0.2 \ | ||
--workers=4 \ | ||
--run_name "deepspeed" \ | ||
--run_name OpenFlamingo-3B-vitl-mpt1b \ | ||
--num_epochs 480 \ | ||
--warmup_steps 0 \ | ||
--mmc4_textsim_threshold 0.0 \ | ||
--laion_shards "/mmfs1/gscratch/efml/anasa2/laion-samples/{000000..000001}.tar" \ | ||
--mmc4_shards "/mmfs1/gscratch/efml/anasa2/mmc4-samples/shard_{0..1}-000000000.tar" \ | ||
--warmup_steps 1875 \ | ||
--mmc4_textsim_threshold 0.24 \ | ||
--laion_shards "/path/to/shards/shard-{0000..0999}.tar" \ | ||
--mmc4_shards "/path/to/shards/shard-{0000..0999}.tar" \ | ||
--gradient_checkpointing \ | ||
--report_to_wandb \ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.