#!/bin/bash
# densenet.lsf: distributed DenseNet-121 training on MIMIC-CXR, launched via LSF/jsrun
#BSUB -P CSC378
#BSUB -J mimiccxr_densenet
#BSUB -o output/job%J.o
#BSUB -e output/job%J.e
#BSUB -csm n
#BSUB -q killable
#BSUB -rn
#BSUB -W 0:20
#BSUB -nnodes 4
# determine number of compute nodes and their hostnames
COMPUTE_NODES=$(sort -u "$LSB_DJOB_HOSTFILE" | grep -v batch | grep -v login)
NNODES=$(echo "$COMPUTE_NODES" | wc -l)
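# On Summit the hostfile typically lists the batch/launch node plus one line per
# allocated slot on each compute host (host names below are purely illustrative):
#   batch2
#   a01n01
#   a01n01
#   ...
# hence the sort -u de-duplication and the batch/login filtering above.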
module load ibm-wml-ce/1.7.0-2
conda activate powerai170
# Be verbose but _after_ loading modules so we don't spam the logs
set -ex
# Alternative: derive the node count from LSB_HOSTS, again excluding the batch node
#NNODES=$(echo $LSB_HOSTS | tr ' ' '\n' | sort -u | grep -v batch | wc -l)
echo "Running job with ${NNODES} compute nodes"
# for saving model weights and so forth
export TORCH_HOME=$PROJWORK/csc378/4jh/torch_home
mkdir -p $TORCH_HOME
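# TORCH_HOME is where torch.hub / torchvision cache downloaded files, so the
# pretrained densenet121 weights presumably end up here rather than in the
# default ~/.cache/torch.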
SEED=0
FOLD=0
JOBDIR=output/job${LSB_JOBID}
FOLDDIR=$JOBDIR/seed${SEED}/fold${FOLD}
mkdir -p $FOLDDIR
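# e.g. output/job<jobid>/seed0/fold0 with the settings above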
# Batch sizes that fit in GPU memory for densenet121 at each image size:
# 256: 50 # JH 2020-05-11
# 512: 20 # JH 2020-05-11
# 1024: 6 # JH 2020-05-11
# 2048: 1 # JH 2020-05-11
BS=50
SZ=256
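# To train at a different resolution, swap in the matching pair from the table
# above, e.g. for 512x512 images:
#   SZ=512
#   BS=20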
# Scale the learning rate linearly with the batch size
LR=$(python -c "print($BS * 1e-6)")
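# With BS=50 this evaluates to 5e-05; doubling the batch size would double the
# learning rate under this rule.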
# set master for torch.distributed
# See Junqi's example here:
# https://code.ornl.gov/olcf-analytics/summit/distributed-deep-learning-examples/-/blob/master/examples/pytorch/pytorch_synthetic_benchmark.py#L49
#
export MASTER_ADDR=$(echo "$COMPUTE_NODES" | head -1)
export MASTER_PORT=23456
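# torch.distributed picks up MASTER_ADDR/MASTER_PORT when the process group is
# initialised with the env:// method; train_densenet.py presumably does
# something along the lines of
#   torch.distributed.init_process_group(backend="nccl", init_method="env://")
# with rank and world size coming from the MPI environment that mpienv.sh sets up.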
echo "n${NNODES} lr${LR} ${SZ}^2 bs${BS}" > $JOBDIR/description
jsrun \
-n$((NNODES*6)) -r6 -a1 -g1 -c7 \
--bind=proportional-packed:7 --launch_distribution=packed \
-E MASTER_ADDR \
-E MASTER_PORT \
-E TORCH_HOME \
./mpienv.sh \
$(which python) train_densenet.py \
--outputdir $FOLDDIR \
--learning-rate $LR \
--hide-progress \
--image-subdir files${SZ}x${SZ} \
--distributed-data-parallel \
--batch-size $BS
#--val-iters 200
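# Resource-set layout: -r6 -a1 -g1 -c7 gives six resource sets per node, each
# with one task, one GPU and seven cores, so the job launches NNODES*6 ranks
# (24 for the 4 nodes requested above), i.e. one rank per GPU.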