#!/bin/bash
# SLURM directives
#SBATCH --gres=gpu:RTX5000:1
#SBATCH --mem 64G
#SBATCH -c 8
#SBATCH -p gpu
#SBATCH -t 2-00:00:00
#SBATCH -o /usr/users/bhenne/projects/whisperseg/slurm_files/job-%J.out
# Definitions
base_dir="/usr/users/bhenne/projects/whisperseg"
model_name="20240702_012257_j537471_wseg-base"; cfg="3"
# model_name="20240702_012658_j537476_wseg-base"; cfg="4"
experiment_dir="labels_moan_other"
code_dir="$base_dir"
script="evaluate.py"
data_tar="$base_dir/data/lemur_tar/data_moan_other/lemur_data_cfg${cfg}_moan_other.tar"
label_tar="$base_dir/data/lemur_tar/labels_moan_other/lemur_labels_cfg${cfg}_moan_other.tar"
model_dir="$base_dir/model/$model_name/final_checkpoint_ct2"
output_dir="$base_dir/results"
output_identifier="base_j${SLURM_JOB_ID}_moan_other"
work_dir="/local/eckerlab/wseg_data"
job_dir="$work_dir/$(date +"%Y%m%d_%H%M%S")_${SLURM_JOB_ID}_${script%.*}"
# Prevents excessive GPU memory reservation by PyTorch; enables batch sizes > 1 on V100s
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
# Runs on script exit, on error, and on manual termination with Ctrl+C
cleanup() {
    status=$?  # preserve the original exit status instead of always exiting 1
    if [ -z "$cleanup_done" ]; then # otherwise cleanup runs twice for SIGINT or ERR
        cleanup_done=true
        echo "[JOB] Cleaning up..."
        # Remove the "<time>_<id>_<job>/" directory, then its parent working directory if empty
        rm -rf "$job_dir"
        if [ -d "${job_dir%/*}" ] && [ -z "$(ls -A "${job_dir%/*}")" ]; then
            rmdir "${job_dir%/*}"
        fi
        unset PYTORCH_CUDA_ALLOC_CONF
    fi
    exit "$status"
}
# Trap SIGINT signal (Ctrl+C), ERR signal (error), and script termination
trap cleanup SIGINT ERR EXIT
# Prepare compute node environment
echo "[JOB] Preparing environment..."
module load anaconda3
source activate wseg
# Create temporary job directory and copy data
echo "[JOB] Moving data to cluster..."
mkdir -p "$job_dir"
# tarballs contain directory structure for pretrain/finetune/test split
tar -xf "$data_tar" -C "$job_dir"
tar -xf "$label_tar" -C "$job_dir"
# Run evaluation on the test split
echo "[JOB] Evaluating checkpoint..."
python "$code_dir/$script" \
--dataset_path "$job_dir/test" \
--model_path "$model_dir" \
--output_dir "$output_dir" \
--identifier "$output_identifier"
# Clean up (already handled by trap)
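# Final log line so the SLURM output shows a clean finish (echoed before the EXIT trap fires)
echo "[JOB] Done."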