-
Notifications
You must be signed in to change notification settings - Fork 5
/
smultinode_apptainer.sh
69 lines (61 loc) · 2.29 KB
/
smultinode_apptainer.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#!/usr/bin/env bash
# Per-node preamble for the multi-node Apptainer launch: loads the Apptainer
# module, prints diagnostics about this node, works out this node's position
# in the host list, and exports the NCCL-related settings used by the job.
#sleep 30
#fi_info -p efa -t FI_EP_RDM
# HOSTNAMES MASTER_ADDR MASTER_PORT COUNT_NODE are coming from the main script

#######################################
# Print the zero-based position of a host name within a list of host names.
# Arguments: $1  - host name to look for
#            $2… - candidate host names, in order
# Outputs:   index of the first exact match on stdout (no trailing newline)
# Returns:   0 when a match was found, 1 otherwise (nothing is printed)
#######################################
node_index() {
  local wanted=$1 pos=0 candidate
  shift
  for candidate in "$@"; do
    if [[ "$candidate" == "$wanted" ]]; then
      printf '%s' "$pos"
      return 0
    fi
    pos=$((pos + 1))
  done
  return 1
}

module restore
module load Apptainer

# Split the host list on blanks/newlines into an array. Using `read -a`
# instead of an unquoted expansion avoids accidental glob expansion.
# `read -d ''` reports non-zero at end of input even on success, hence `|| true`.
hosts=()
read -r -d '' -a hosts <<< "${HOSTNAMES:-}" || true

H=$(hostname)

echo "myuser=$(whoami)"
echo "COUNT_NODE=${COUNT_NODE:-}"
echo "LD_LIBRARY_PATH = ${LD_LIBRARY_PATH:-}"
echo "PATH = ${PATH}"
echo "which mpicc $(command -v mpicc)"
echo "HOSTNAMES = ${hosts[*]}"
echo "hostname = ${H}"
echo "MASTER_ADDR= ${MASTER_ADDR:-}"
echo "MASTER_PORT= ${MASTER_PORT:-}"

# Position of this node in HOSTNAMES; stays empty when the local host name is
# not in the list. Informational only in this part of the script.
THEID=$(node_index "$H" "${hosts[@]}")
echo "THEID=${THEID}"
echo "SLURM_PROCID=${SLURM_PROCID:-}"

# NOTE(review): presumably consumed by the distributed backend of the
# training process started afterwards — confirm the variable names against
# the framework version shipped in the image.
export NCCL_TIMEOUT=3600000
export NCCL_BLOCKING_WAIT=0
# Launch the fine-tuning run on this node inside the Apptainer image.
#
# Required environment (provided by the calling script / the job step):
#   COUNT_NODE    number of nodes taking part in the job
#   SLURM_PROCID  rank of this task, used as the accelerate machine rank
#   MASTER_ADDR   address of the main process
#   MASTER_PORT   port of the main process
# Optional environment:
#   GPUS_PER_NODE processes to start per node (default: 4)
#
# Fail early with a clear message instead of handing empty option values to
# `accelerate launch`, which would silently shift the following arguments.
: "${COUNT_NODE:?COUNT_NODE must be exported by the calling script}"
: "${SLURM_PROCID:?SLURM_PROCID is not set}"
: "${MASTER_ADDR:?MASTER_ADDR must be exported by the calling script}"
: "${MASTER_PORT:?MASTER_PORT must be exported by the calling script}"

gpus_per_node=${GPUS_PER_NODE:-4}
num_machines=$(( COUNT_NODE ))
num_processes=$(( gpus_per_node * num_machines ))

# Host paths made visible inside the container (host:container).
bind_args=(
  -B "${PWD}:${PWD}"
  -B "${PWD}/src/llm_finetune:/app/src/llm_finetune"
  -B "${PWD}/checkpoint:/app/checkpoint"
  -B "${PWD}/deepspeed_config:/app/deepspeed_config"
  -B "${PWD}/scripts:/app/scripts"
  -B /project/lt900048-ai24tn/models:/project/lt900048-ai24tn/models
)

# Options for `accelerate launch`.
# NOTE(review): the launcher is told fp16 here while the training script is
# told `--bf16 True` below — confirm which precision is intended.
launch_args=(
  --num_processes "${num_processes}"
  --num_machines "${num_machines}"
  --multi_gpu
  --mixed_precision fp16
  --machine_rank "${SLURM_PROCID}"
  --main_process_ip "${MASTER_ADDR}"
  --main_process_port "${MASTER_PORT}"
)

# Options for scripts/train.py.
# NOTE(review): /app/example is not among the bind mounts above, so the sample
# data files presumably ship inside the image — confirm.
train_args=(
  --model_name_or_path /project/lt900048-ai24tn/models/new5558/tinyllama_1b-200000
  --train_data_path /app/example/sample_train_data.json
  --eval_data_path /app/example/sample_eval_data.json
  --data_seed 42
  --model_max_length 2048
  --bf16 True
  --output_dir /app/checkpoint/
  --num_train_epochs 1
  --per_device_train_batch_size 8
  --per_device_eval_batch_size 8
  --save_strategy "steps"
  --save_steps 700
  --save_total_limit 5
  --logging_strategy 'steps'
  --logging_steps 1
  --logging_first_step True
  --learning_rate 8e-5
  --weight_decay 0.
  --warmup_ratio 0.03
  --deepspeed /app/deepspeed_config/deepspeed_3.json
  --gradient_checkpointing True
  --tf32 True
  # --checkpoint ...
)

# Kept as the last command so the script's exit status is the launcher's.
apptainer exec --nv \
  "${bind_args[@]}" \
  ./llm-finetune.sif \
  accelerate launch \
  "${launch_args[@]}" \
  scripts/train.py \
  "${train_args[@]}"