Adjust the L3 perf test threshold for H100 runners (#5606)
- adjusts RN50 and EfficientNet perf thresholds for L3 tests
- turns off SHARP for L3 tests

Signed-off-by: Janusz Lisiecki <[email protected]>
JanuszL authored Aug 23, 2024
1 parent 594a218 commit 65d8d8b
Showing 11 changed files with 48 additions and 10 deletions.
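Note: the recurring two-line addition in the diffs below disables NVLink SHARP (NVLS) collectives in NCCL before each training job starts. As a hedged aside (not part of this commit), one way to confirm the setting takes effect is to raise NCCL's log level and check that NVLS is no longer reported:

# the line added to each L3 script: turn off NVLink SHARP (NVLS) collectives
export NCCL_NVLS_ENABLE=0
# hypothetical verification only: NCCL_DEBUG=INFO makes NCCL log the algorithms
# it selects, so the training log should no longer mention NVLS
# NCCL_DEBUG=INFO python train.py ... 2>&1 | grep -i nvls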
2 changes: 2 additions & 0 deletions qa/TL3_EfficientDet_convergence/test_tensorflow.sh
@@ -9,6 +9,8 @@ pushd /opt/dali/docs/examples/use_cases/tensorflow/efficientdet
python -m pip install --upgrade pip
python -m pip install -r requirements.txt

# turn off SHARP to avoid NCCL errors
export NCCL_NVLS_ENABLE=0

python train.py \
--epochs 1 \
17 changes: 10 additions & 7 deletions qa/TL3_EfficientNet_benchmark/test_pytorch.sh
@@ -48,6 +48,9 @@ fi

popd

# turn off SHARP to avoid NCCL errors
export NCCL_NVLS_ENABLE=0

export PATH_TO_IMAGENET=/imagenet

export RESULT_WORKSPACE=./
@@ -77,12 +80,12 @@ python multiproc.py --nproc_per_node 8 ./main.py --amp --static-loss-scale 128 -
# grep "train.total_ips" <filename>.json | tail -1 | cut -c 5- | python3 -c "import sys, json; print(json.load(sys.stdin))"

# Actual results are about 500 samples/s more
-SYNTH_THRESHOLD=10800
-DALI_NONE_THRESHOLD=8900
-DALI_AA_THRESHOLD=9000
-DALI_TA_THRESHOLD=9000
-PYTORCH_NONE_THRESHOLD=7000
-PYTORCH_AA_THRESHOLD=6800
+SYNTH_THRESHOLD=32000
+DALI_NONE_THRESHOLD=27000
+DALI_AA_THRESHOLD=26000
+DALI_TA_THRESHOLD=26000
+PYTORCH_NONE_THRESHOLD=23000
+PYTORCH_AA_THRESHOLD=22000

function CHECK_PERF_THRESHOLD {
FILENAME=$1
@@ -106,7 +109,7 @@ CHECK_PERF_THRESHOLD "bench_report_pytorch.json" $PYTORCH_NONE_THRESHOLD
CHECK_PERF_THRESHOLD "bench_report_pytorch_aa.json" $PYTORCH_AA_THRESHOLD

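The body of CHECK_PERF_THRESHOLD is folded out of this diff. A minimal sketch of such a check, reusing the grep pipeline documented in the comment above, might look like the following; the actual function in the repository may differ:

function CHECK_PERF_THRESHOLD {
    FILENAME=$1
    THRESHOLD=$2
    # pull the last reported train.total_ips value out of the benchmark JSON
    PERF=$(grep "train.total_ips" "$FILENAME" | tail -1 | cut -c 5- | \
           python3 -c "import sys, json; print(json.load(sys.stdin))")
    # fail the run if measured throughput (samples/s) is below the threshold
    if ! awk -v p="$PERF" -v t="$THRESHOLD" 'BEGIN { exit !(p >= t) }'; then
        echo "Performance regression in $FILENAME: $PERF < $THRESHOLD samples/s"
        exit 1
    fi
}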

-# In the initial training we get siginificant increase in accuracy on the first few epochs,
+# In the initial training we get significant increase in accuracy on the first few epochs,
# after 10 epochs we typically cross 50%.
# Do an additional run of DALI + AA for 10 epochs and check against 48 top1 accuracy (with some
# safety margin).
3 changes: 3 additions & 0 deletions qa/TL3_JAX_multiprocess/test_jax.sh
@@ -4,6 +4,9 @@ function CLEAN_AND_EXIT {
exit $1
}

# turn off SHARP to avoid NCCL errors
export NCCL_NVLS_ENABLE=0

python -c "import jax; print(jax.devices()); assert jax.device_count() > 0"

echo "Test one GPU per process"
6 changes: 5 additions & 1 deletion qa/TL3_RN50_convergence/test_paddle.sh
@@ -25,6 +25,10 @@ LOG=dali.log

SECONDS=0
EPOCHS=25 # limiting to 25 epochs to save time

# turn off SHARP to avoid NCCL errors
export NCCL_NVLS_ENABLE=0

export FLAGS_fraction_of_gpu_memory_to_use=.80
export FLAGS_apply_pass_to_program=1

@@ -48,7 +52,7 @@ fi

MIN_TOP1=.45 # would be 75% if we run 90 epochs
MIN_TOP5=.70 # would be 92% if we run 90 epochs
-MIN_PERF=7000
+MIN_PERF=27000

function PRINT_THRESHOLD {
FILENAME=$1
5 changes: 4 additions & 1 deletion qa/TL3_RN50_convergence/test_pytorch.sh
@@ -12,6 +12,9 @@ cd /opt/dali/docs/examples/use_cases/pytorch/resnet50

NUM_GPUS=$(nvidia-smi -L | wc -l)

# turn off SHARP to avoid NCCL errors
export NCCL_NVLS_ENABLE=0

if [ ! -d "val" ]; then
ln -sf /data/imagenet/val-jpeg/ val
fi
@@ -33,7 +36,7 @@ fi

MIN_TOP1=75.0
MIN_TOP5=92.0
-MIN_PERF=5300
+MIN_PERF=13000

TOP1=$(grep "^##Top-1" $LOG | awk '{print $2}')
TOP5=$(grep "^##Top-5" $LOG | awk '{print $2}')
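The pass/fail logic of this script is folded out of the diff. A sketch of how the extracted values could be gated on the new minima, offered only as an assumption about the folded part:

# hypothetical gate, assuming the folded section compares against the minima above
if awk -v t1="$TOP1" -v m1="$MIN_TOP1" -v t5="$TOP5" -v m5="$MIN_TOP5" \
       'BEGIN { exit !(t1 >= m1 && t5 >= m5) }'; then
    echo "Accuracy OK: top-1 $TOP1 (>= $MIN_TOP1), top-5 $TOP5 (>= $MIN_TOP5)"
else
    echo "Accuracy below threshold"
    exit 1
fi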
6 changes: 5 additions & 1 deletion qa/TL3_RN50_convergence/test_tensorflow.sh
@@ -25,6 +25,10 @@ OUT=${LOG%.log}.dir
mkdir -p $OUT

SECONDS=0

# turn off SHARP to avoid NCCL errors
export NCCL_NVLS_ENABLE=0

export TF_XLA_FLAGS="--tf_xla_enable_lazy_compilation=false"

mpiexec --allow-run-as-root --bind-to none -np ${NUM_GPUS} \
@@ -44,7 +48,7 @@ fi

MIN_TOP1=0.75
MIN_TOP5=0.92
-MIN_PERF=7700
+MIN_PERF=23000

TOP1=$(grep "loss:" $LOG | awk '{print $18}' | tail -1)
TOP5=$(grep "loss:" $LOG | awk '{print $21}' | tail -1)
4 changes: 4 additions & 0 deletions qa/TL3_RN50_short/test_paddle.sh
@@ -26,6 +26,10 @@ LOG=dali.log

SECONDS=0
EPOCHS=25 # limiting to 25 epochs to save time

# turn off SHARP to avoid NCCL errors
export NCCL_NVLS_ENABLE=0

export FLAGS_fraction_of_gpu_memory_to_use=.80
python -m paddle.distributed.launch --selected_gpus $(echo $GPUS | tr ' ' ',') \
main.py -b 96 -j 4 --lr=0.3 --epochs ${EPOCHS} ./ 2>&1 | tee $LOG
4 changes: 4 additions & 0 deletions qa/TL3_RN50_short/test_pytorch.sh
@@ -22,6 +22,10 @@ fi
LOG=dali.log

SECONDS=0

# turn off SHARP to avoid NCCL errors
export NCCL_NVLS_ENABLE=0

torchrun --nproc_per_node=${NUM_GPUS} main.py -a resnet50 --b 256 --loss-scale 128.0 --workers 8 --lr=0.4 --fp16-mode --epochs 5 ./ 2>&1 | tee $LOG

RET=${PIPESTATUS[0]}
4 changes: 4 additions & 0 deletions qa/TL3_RN50_short/test_tensorflow.sh
@@ -25,6 +25,10 @@ OUT=${LOG%.log}.dir
mkdir -p $OUT

SECONDS=0

# turn off SHARP to avoid NCCL errors
export NCCL_NVLS_ENABLE=0

export TF_XLA_FLAGS="--tf_xla_enable_lazy_compilation=false"

mpiexec --allow-run-as-root --bind-to none -np ${NUM_GPUS} \
4 changes: 4 additions & 0 deletions qa/TL3_SSD_convergence/test_pytorch.sh
@@ -18,6 +18,10 @@ NUM_GPUS=$(nvidia-smi -L | wc -l)
LOG=dali.log

SECONDS=0

# turn off SHARP to avoid NCCL errors
export NCCL_NVLS_ENABLE=0

# Prevent OOM due to fragmentation on 16G machines
export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:4096
torchrun --nproc_per_node=${NUM_GPUS} main.py --backbone resnet50 --warmup 300 --bs 64 --eval-batch-size 8 --data /coco --data /data/coco/coco-2017/coco2017/ --data_pipeline dali --target 0.25 2>&1 | tee $LOG
3 changes: 3 additions & 0 deletions qa/TL3_YOLO_convergence/test_tensorflow.sh
@@ -11,6 +11,9 @@ apt update && apt install python3-opencv -y
python -m pip install --upgrade pip
python -m pip install -r requirements.txt

# turn off SHARP to avoid NCCL errors
export NCCL_NVLS_ENABLE=0

python src/main.py train \
/data/coco/coco-2017/coco2017/train2017 \
/data/coco/coco-2017/coco2017/annotations/instances_train2017.json \