1 file changed: +2 -2 lines changed
Columns: Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@ set -o errexit
5
5
set -o pipefail
6
6
7
7
function CLEAN_AND_EXIT {
8
+ (( IS_TMP_DIR)) && rm -rf ${DATA_DIR}
8
9
exit $1
9
10
}
10
11
@@ -42,8 +43,7 @@ export NCCL_NVLS_ENABLE=0
42
43
43
44
# Prevent OOM due to fragmentation on 16G machines
44
45
export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:4096
45
- torchrun --nproc_per_node=${NUM_GPUS} main.py --backbone resnet50 --warmup 300 --bs 256 --eval-batch-size 8 --data /coco --data ${DATA_DIR} --data_pipeline dali --target 0.25 2>&1 | tee $LOG
46
- (( IS_TMP_DIR)) && rm -rf ${DATA_DIR}
46
+ torchrun --nproc_per_node=${NUM_GPUS} main.py --backbone resnet50 --warmup 300 --bs 64 --eval-batch-size 8 --data /coco --data ${DATA_DIR} --data_pipeline dali --target 0.25 2>&1 | tee $LOG
47
47
48
48
RET=${PIPESTATUS[0]}
49
49
echo " Training ran in $SECONDS seconds"
You can’t perform that action at this time.
0 commit comments