Skip to content

Adjust the L3 perf test threshold for H100 runners #5606

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Aug 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions qa/TL3_EfficientDet_convergence/test_tensorflow.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ pushd /opt/dali/docs/examples/use_cases/tensorflow/efficientdet
python -m pip install --upgrade pip
python -m pip install -r requirements.txt

# turn off NVLink SHARP (NVLS) to avoid NCCL errors
export NCCL_NVLS_ENABLE=0

python train.py \
--epochs 1 \
Expand Down
17 changes: 10 additions & 7 deletions qa/TL3_EfficientNet_benchmark/test_pytorch.sh
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ fi

popd

# turn off NVLink SHARP (NVLS) to avoid NCCL errors
export NCCL_NVLS_ENABLE=0

export PATH_TO_IMAGENET=/imagenet

export RESULT_WORKSPACE=./
Expand Down Expand Up @@ -77,12 +80,12 @@ python multiproc.py --nproc_per_node 8 ./main.py --amp --static-loss-scale 128 -
# grep "train.total_ips" <filename>.json | tail -1 | cut -c 5- | python3 -c "import sys, json; print(json.load(sys.stdin))"

# Actual results are about 500 samples/s more
SYNTH_THRESHOLD=10800
DALI_NONE_THRESHOLD=8900
DALI_AA_THRESHOLD=9000
DALI_TA_THRESHOLD=9000
PYTORCH_NONE_THRESHOLD=7000
PYTORCH_AA_THRESHOLD=6800
SYNTH_THRESHOLD=32000
DALI_NONE_THRESHOLD=27000
DALI_AA_THRESHOLD=26000
DALI_TA_THRESHOLD=26000
PYTORCH_NONE_THRESHOLD=23000
PYTORCH_AA_THRESHOLD=22000

function CHECK_PERF_THRESHOLD {
FILENAME=$1
Expand All @@ -106,7 +109,7 @@ CHECK_PERF_THRESHOLD "bench_report_pytorch.json" $PYTORCH_NONE_THRESHOLD
CHECK_PERF_THRESHOLD "bench_report_pytorch_aa.json" $PYTORCH_AA_THRESHOLD


# In the initial training we get siginificant increase in accuracy on the first few epochs,
# In the initial training we get significant increase in accuracy on the first few epochs,
# after 10 epochs we typically cross 50%.
# Do an additional run of DALI + AA for 10 epochs and check against 48% top-1 accuracy (with some
# safety margin).
Expand Down
3 changes: 3 additions & 0 deletions qa/TL3_JAX_multiprocess/test_jax.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ function CLEAN_AND_EXIT {
exit $1
}

# turn off NVLink SHARP (NVLS) to avoid NCCL errors
export NCCL_NVLS_ENABLE=0

python -c "import jax; print(jax.devices()); assert jax.device_count() > 0"

echo "Test one GPU per process"
Expand Down
6 changes: 5 additions & 1 deletion qa/TL3_RN50_convergence/test_paddle.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@ LOG=dali.log

SECONDS=0
EPOCHS=25 # limiting to 25 epochs to save time

# turn off NVLink SHARP (NVLS) to avoid NCCL errors
export NCCL_NVLS_ENABLE=0

export FLAGS_fraction_of_gpu_memory_to_use=.80
export FLAGS_apply_pass_to_program=1

Expand All @@ -48,7 +52,7 @@ fi

MIN_TOP1=.45 # would be 75% if we run 90 epochs
MIN_TOP5=.70 # would be 92% if we run 90 epochs
MIN_PERF=7000
MIN_PERF=27000

function PRINT_THRESHOLD {
FILENAME=$1
Expand Down
5 changes: 4 additions & 1 deletion qa/TL3_RN50_convergence/test_pytorch.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ cd /opt/dali/docs/examples/use_cases/pytorch/resnet50

NUM_GPUS=$(nvidia-smi -L | wc -l)

# turn off NVLink SHARP (NVLS) to avoid NCCL errors
export NCCL_NVLS_ENABLE=0

if [ ! -d "val" ]; then
ln -sf /data/imagenet/val-jpeg/ val
fi
Expand All @@ -33,7 +36,7 @@ fi

MIN_TOP1=75.0
MIN_TOP5=92.0
MIN_PERF=5300
MIN_PERF=13000

TOP1=$(grep "^##Top-1" $LOG | awk '{print $2}')
TOP5=$(grep "^##Top-5" $LOG | awk '{print $2}')
Expand Down
6 changes: 5 additions & 1 deletion qa/TL3_RN50_convergence/test_tensorflow.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@ OUT=${LOG%.log}.dir
mkdir -p $OUT

SECONDS=0

# turn off NVLink SHARP (NVLS) to avoid NCCL errors
export NCCL_NVLS_ENABLE=0

export TF_XLA_FLAGS="--tf_xla_enable_lazy_compilation=false"

mpiexec --allow-run-as-root --bind-to none -np ${NUM_GPUS} \
Expand All @@ -44,7 +48,7 @@ fi

MIN_TOP1=0.75
MIN_TOP5=0.92
MIN_PERF=7700
MIN_PERF=23000

TOP1=$(grep "loss:" $LOG | awk '{print $18}' | tail -1)
TOP5=$(grep "loss:" $LOG | awk '{print $21}' | tail -1)
Expand Down
4 changes: 4 additions & 0 deletions qa/TL3_RN50_short/test_paddle.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@ LOG=dali.log

SECONDS=0
EPOCHS=25 # limiting to 25 epochs to save time

# turn off NVLink SHARP (NVLS) to avoid NCCL errors
export NCCL_NVLS_ENABLE=0

export FLAGS_fraction_of_gpu_memory_to_use=.80
python -m paddle.distributed.launch --selected_gpus $(echo $GPUS | tr ' ' ',') \
main.py -b 96 -j 4 --lr=0.3 --epochs ${EPOCHS} ./ 2>&1 | tee $LOG
Expand Down
4 changes: 4 additions & 0 deletions qa/TL3_RN50_short/test_pytorch.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@ fi
LOG=dali.log

SECONDS=0

# turn off NVLink SHARP (NVLS) to avoid NCCL errors
export NCCL_NVLS_ENABLE=0

torchrun --nproc_per_node=${NUM_GPUS} main.py -a resnet50 --b 256 --loss-scale 128.0 --workers 8 --lr=0.4 --fp16-mode --epochs 5 ./ 2>&1 | tee $LOG

RET=${PIPESTATUS[0]}
Expand Down
4 changes: 4 additions & 0 deletions qa/TL3_RN50_short/test_tensorflow.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@ OUT=${LOG%.log}.dir
mkdir -p $OUT

SECONDS=0

# turn off NVLink SHARP (NVLS) to avoid NCCL errors
export NCCL_NVLS_ENABLE=0

export TF_XLA_FLAGS="--tf_xla_enable_lazy_compilation=false"

mpiexec --allow-run-as-root --bind-to none -np ${NUM_GPUS} \
Expand Down
4 changes: 4 additions & 0 deletions qa/TL3_SSD_convergence/test_pytorch.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ NUM_GPUS=$(nvidia-smi -L | wc -l)
LOG=dali.log

SECONDS=0

# turn off NVLink SHARP (NVLS) to avoid NCCL errors
export NCCL_NVLS_ENABLE=0

# Prevent OOM due to fragmentation on 16G machines
export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:4096
torchrun --nproc_per_node=${NUM_GPUS} main.py --backbone resnet50 --warmup 300 --bs 64 --eval-batch-size 8 --data /coco --data /data/coco/coco-2017/coco2017/ --data_pipeline dali --target 0.25 2>&1 | tee $LOG
Expand Down
3 changes: 3 additions & 0 deletions qa/TL3_YOLO_convergence/test_tensorflow.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@ apt update && apt install python3-opencv -y
python -m pip install --upgrade pip
python -m pip install -r requirements.txt

# turn off NVLink SHARP (NVLS) to avoid NCCL errors
export NCCL_NVLS_ENABLE=0

python src/main.py train \
/data/coco/coco-2017/coco2017/train2017 \
/data/coco/coco-2017/coco2017/annotations/instances_train2017.json \
Expand Down
Loading