Skip to content

Commit a9745a7

Browse files
authored
Revert change of batch size in SSD LT3 to 64 due to convergence problem (#5846)
Signed-off-by: Janusz Lisiecki <[email protected]>
1 parent acb06c2 commit a9745a7

File tree

1 file changed

+2
-2
lines changed

1 file changed

+2
-2
lines changed

qa/TL3_SSD_convergence/test_pytorch.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ set -o errexit
 set -o pipefail
 
 function CLEAN_AND_EXIT {
+    ((IS_TMP_DIR)) && rm -rf ${DATA_DIR}
     exit $1
 }
 

@@ -42,8 +43,7 @@ export NCCL_NVLS_ENABLE=0
 
 # Prevent OOM due to fragmentation on 16G machines
 export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:4096
-torchrun --nproc_per_node=${NUM_GPUS} main.py --backbone resnet50 --warmup 300 --bs 256 --eval-batch-size 8 --data /coco --data ${DATA_DIR} --data_pipeline dali --target 0.25 2>&1 | tee $LOG
-((IS_TMP_DIR)) && rm -rf ${DATA_DIR}
+torchrun --nproc_per_node=${NUM_GPUS} main.py --backbone resnet50 --warmup 300 --bs 64 --eval-batch-size 8 --data /coco --data ${DATA_DIR} --data_pipeline dali --target 0.25 2>&1 | tee $LOG
 
 RET=${PIPESTATUS[0]}
 echo "Training ran in $SECONDS seconds"

0 commit comments

Comments
 (0)