#!/bin/bash
# SLURM directives
#SBATCH --gres=gpu:RTX5000:1
#SBATCH --mem 64G
#SBATCH -c 8
#SBATCH -p gpu
#SBATCH -t 2-00:00:00
#SBATCH -o /usr/users/bhenne/projects/whisperseg/slurm_files/job-%J.out
# Check that exactly one non-empty argument (the config set) was passed
if [ "$#" -ne 1 ] || [ -z "$1" ]; then
    echo "Usage: $0 <config_set> ([1-7], determines which config of data is used)"
    echo "Error: Config-set argument is empty or missing."
    exit 1
fi
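# Illustrative submission (config set 3 chosen arbitrarily):
#   sbatch train_base.sh 3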
# Definitions
cfg="$1"
base_dir="/usr/users/bhenne/projects/whisperseg"
code_dir="$base_dir"
experiment_dir="labels_baseline"
script1="train.py"
script2="evaluate.py"
data_tar="$base_dir/data/lemur_tar/lemur_data_cfg${cfg}.tar"
label_tar="$base_dir/data/lemur_tar/$experiment_dir/lemur_labels_cfg${cfg}.tar"
model_dir_in="nccratliri/whisperseg-base-animal-vad"
model_dir_out="$base_dir/model/$(date +"%Y%m%d_%H%M%S")_j${SLURM_JOB_ID}_wseg-base"
output_dir="$base_dir/results"
output_identifier="base_j${SLURM_JOB_ID}"
work_dir="/local/eckerlab/wseg_data"
job_dir="$work_dir/$(date +"%Y%m%d_%H%M%S")_${SLURM_JOB_ID}_${script1%.*}"
wandb_dir=$job_dir
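# Example expansion of job_dir (hypothetical timestamp and job ID):
#   /local/eckerlab/wseg_data/20240101_120000_1234567_train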
# Model hyperparameters
project_name="wseg-lemur-results"
epochs=100
patience=10
val_ratio=0.2
wandb_notes="baseline cfg${cfg}, rtx5000:1, ep${epochs}, vratio${val_ratio}, pat${patience}"
# Prevent PyTorch from reserving excessive GPU memory; enables batch sizes > 1 on V100s
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
# Runs on script exit, on error, and on manual termination with Ctrl-C
cleanup() {
    status=$?  # preserve the script's exit status instead of always reporting failure
    if [ -z "$cleanup_done" ]; then # otherwise cleanup runs twice for SIGINT or ERR
        cleanup_done=true
        echo "[JOB] Cleaning up..."
        # Clean up: remove data and the "<time>_<id>_<job>/" directory, plus the parent working directory if empty
        rm -rf "$job_dir"
        if [ -z "$(ls -A "${job_dir%/*}")" ]; then
            rmdir "${job_dir%/*}"
        fi
        unset PYTORCH_CUDA_ALLOC_CONF
    fi
    exit "$status"
}
# Trap SIGINT signal (Ctrl+C), ERR signal (error), and script termination
trap cleanup SIGINT ERR EXIT
# Prepare compute node environment
echo "[JOB] Preparing environment..."
gpus=$(echo "$CUDA_VISIBLE_DEVICES" | tr ',' ' ')
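# e.g. CUDA_VISIBLE_DEVICES="0,1" yields gpus="0 1" (illustrative values); $gpus is
# deliberately left unquoted below, presumably so each GPU id is passed as its own argument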
module load anaconda3
source activate wseg
# Create temporary job directory and copy data
echo "[JOB] Moving data to cluster..."
mkdir -p "$job_dir"/{pretrain_ckpt,finetune_ckpt,wandb} # $job_dir itself + 3 others
# tarballs contain directory structure for pretrain/finetune/test split
tar -xf "$data_tar" -C "$job_dir"
tar -xf "$label_tar" -C "$job_dir"
# Pre-training, usually starting from the multi-species WhisperSeg model
echo "[JOB] Pretraining..."
python "$code_dir/$script1" \
--initial_model_path "$model_dir_in" \
--train_dataset_folder "$job_dir/pretrain" \
--model_folder "$job_dir/pretrain_ckpt" \
--gpu_list $gpus \
--max_num_epochs $epochs \
--project $project_name \
--run_name $SLURM_JOB_ID-0 \
--run_notes "$wandb_notes" \
--wandb_dir "$wandb_dir" \
--validate_per_epoch 1 \
--val_ratio $val_ratio \
--save_per_epoch 1 \
--patience $patience
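# train.py is assumed (from the path used in the next step) to write its final weights to
# "$job_dir/pretrain_ckpt/final_checkpoint", which fine-tuning consumes as its initial model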
# Fine-tuning
echo "[JOB] Finetuning..."
python "$code_dir/$script1" \
--initial_model_path "$job_dir/pretrain_ckpt/final_checkpoint" \
--train_dataset_folder "$job_dir/finetune" \
--model_folder "$job_dir/finetune_ckpt" \
--gpu_list $gpus \
--max_num_epochs $epochs \
--project $project_name \
--run_name $SLURM_JOB_ID-1 \
--run_notes "$wandb_notes" \
--wandb_dir "$wandb_dir" \
--validate_per_epoch 1 \
--val_ratio $val_ratio \
--save_per_epoch 1 \
--patience $patience
# Evaluation
echo "[JOB] Evaluating..."
python "$code_dir/$script2" \
-d "$job_dir/test" \
-m "$job_dir/finetune_ckpt/final_checkpoint_ct2" \
-o "$output_dir" \
-i "$output_identifier"
# Move finished model to its target directory
if [ -n "$(ls -A "$job_dir/finetune_ckpt")" ]; then
    echo "[JOB] Moving trained model..."
    mv "$job_dir/finetune_ckpt" "$model_dir_out"
fi
# Clean up (already handled by trap)