From 2af4051d5a3f47652515bd8b2439607d81064be3 Mon Sep 17 00:00:00 2001
From: xysheng-baidu
Date: Wed, 29 Mar 2023 14:55:50 +0800
Subject: [PATCH] add N4C32 shell for PETRV2

---
 .../scripts/petrv2/N4C32/petrv2_bs1_fp16_DP.sh   | 17 +++++++++++++++++
 .../scripts/petrv2/N4C32/petrv2_bs1_fp32_DP.sh   | 17 +++++++++++++++++
 .../petrv2/benchmark_common/analysis_log.py      |  1 +
 .../petrv2/benchmark_common/run_benchmark.sh     | 15 +++++++++++++--
 4 files changed, 48 insertions(+), 2 deletions(-)
 create mode 100644 frame_benchmark/pytorch/dynamic/Paddle3D/scripts/petrv2/N4C32/petrv2_bs1_fp16_DP.sh
 create mode 100644 frame_benchmark/pytorch/dynamic/Paddle3D/scripts/petrv2/N4C32/petrv2_bs1_fp32_DP.sh

diff --git a/frame_benchmark/pytorch/dynamic/Paddle3D/scripts/petrv2/N4C32/petrv2_bs1_fp16_DP.sh b/frame_benchmark/pytorch/dynamic/Paddle3D/scripts/petrv2/N4C32/petrv2_bs1_fp16_DP.sh
new file mode 100644
index 0000000000..f42e28b750
--- /dev/null
+++ b/frame_benchmark/pytorch/dynamic/Paddle3D/scripts/petrv2/N4C32/petrv2_bs1_fp16_DP.sh
@@ -0,0 +1,17 @@
+model_item=petrv2
+bs_item=1
+fp_item=fp16
+run_process_type=MultiP
+run_mode=DP
+device_num=N4C32
+max_iter=-1
+num_workers=32
+
+node_num=${PADDLE_TRAINERS_NUM}
+node_rank=${PADDLE_TRAINER_ID}
+master_addr=${POD_0_IP}
+master_port=14333
+
+sed -i '/set\ -xe/d' run_benchmark.sh
+bash PrepareEnv.sh ${model_item};
+bash run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_iter} ${num_workers} ${node_num} ${node_rank} ${master_addr} ${master_port} 2>&1;
diff --git a/frame_benchmark/pytorch/dynamic/Paddle3D/scripts/petrv2/N4C32/petrv2_bs1_fp32_DP.sh b/frame_benchmark/pytorch/dynamic/Paddle3D/scripts/petrv2/N4C32/petrv2_bs1_fp32_DP.sh
new file mode 100644
index 0000000000..89a4deb976
--- /dev/null
+++ b/frame_benchmark/pytorch/dynamic/Paddle3D/scripts/petrv2/N4C32/petrv2_bs1_fp32_DP.sh
@@ -0,0 +1,17 @@
+model_item=petrv2
+bs_item=1
+fp_item=fp32
+run_process_type=MultiP
+run_mode=DP
+device_num=N4C32
+max_iter=-1
+num_workers=32
+
+node_num=${PADDLE_TRAINERS_NUM}
+node_rank=${PADDLE_TRAINER_ID}
+master_addr=${POD_0_IP}
+master_port=14333
+
+sed -i '/set\ -xe/d' run_benchmark.sh
+bash PrepareEnv.sh ${model_item};
+bash run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_iter} ${num_workers} ${node_num} ${node_rank} ${master_addr} ${master_port} 2>&1;
diff --git a/frame_benchmark/pytorch/dynamic/Paddle3D/scripts/petrv2/benchmark_common/analysis_log.py b/frame_benchmark/pytorch/dynamic/Paddle3D/scripts/petrv2/benchmark_common/analysis_log.py
index a94d20abbf..96ca5602cb 100644
--- a/frame_benchmark/pytorch/dynamic/Paddle3D/scripts/petrv2/benchmark_common/analysis_log.py
+++ b/frame_benchmark/pytorch/dynamic/Paddle3D/scripts/petrv2/benchmark_common/analysis_log.py
@@ -55,6 +55,7 @@ def analyze(model_name, batch_size, log_file, res_log_file, device_num):
         "frame_version": os.getenv('frame_version'),
     }
     json_info = json.dumps(info)
+    print(json_info)
     with open(res_log_file, "w") as of:
         of.write(json_info)
 
diff --git a/frame_benchmark/pytorch/dynamic/Paddle3D/scripts/petrv2/benchmark_common/run_benchmark.sh b/frame_benchmark/pytorch/dynamic/Paddle3D/scripts/petrv2/benchmark_common/run_benchmark.sh
index c3336bebc9..7fdeb485a0 100644
--- a/frame_benchmark/pytorch/dynamic/Paddle3D/scripts/petrv2/benchmark_common/run_benchmark.sh
+++ b/frame_benchmark/pytorch/dynamic/Paddle3D/scripts/petrv2/benchmark_common/run_benchmark.sh
@@ -17,6 +17,13 @@ function _set_params(){
     max_iter=${6:-"100"}                  # (optional) keep the model's run time under 5 minutes; if the code must be changed to stop early, submit a PR to the suite directly, or adjust max_epoch
     num_workers=${7:-"4"}                 # (optional)
+    # Added for distributed training
+    node_num=${8:-"2"}                    # (optional) number of nodes
+    node_rank=${9:-"0"}                   # (optional) node rank
+    master_addr=${10:-"127.0.0.1"}        # (optional) master node IP address
+    master_port=${11:-"1928"}             # (optional) master node port
+    # Added for distributed training
+
     # The lines below assemble the common log paths; no change needed in the usual case
     model_name=${model_item}_bs${base_batch_size}_${fp_item}_${run_mode}          # (required) do not change the format; it must match the platform dashboard display
     device=${CUDA_VISIBLE_DEVICES//,/ }
@@ -41,6 +48,7 @@ function _analysis_log(){
     python analysis_log.py ${model_item} ${base_batch_size} ${log_file} ${speed_log_file} ${device_num}
 }
 
+ #N4C32) train_cmd="PYTHONPATH="$(dirname $0)/..":$PYTHONPATH ; \
 function _train(){
     batch_size=${base_batch_size}
     echo "current ${model_name} CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES, gpus=${device_num}, batch_size=${batch_size}"
@@ -50,7 +58,10 @@ function _train(){
     case ${device_num} in
     N1C1) train_cmd="./tools/dist_train.sh ${train_config} 1 --work-dir ${train_options}" ;;
     N1C8) train_cmd="./tools/dist_train.sh ${train_config} 8 --work-dir ${train_options}" ;;
-    *) echo "choose device_num(N1C1, N1C8)"; exit 1;
+    N4C32) train_cmd="python3 -m torch.distributed.launch --nproc_per_node=8 --nnodes=$node_num \
+        --node_rank=$node_rank --master_addr=$master_addr --master_port=$master_port \
+        $(dirname "$0")/tools/train.py ${train_config} --launcher pytorch --work-dir ${train_options}" ;;
+    *) echo "choose device_num(N1C1, N1C8, N4C32)"; exit 1;
     esac
 
     timeout 5m ${train_cmd} > ${log_file} 2>&1
@@ -75,4 +86,4 @@ job_bt=`date '+%Y%m%d%H%M%S'`
 _train
 job_et=`date '+%Y%m%d%H%M%S'`
 export model_run_time=$((${job_et}-${job_bt}))
-_analysis_log
\ No newline at end of file
+_analysis_log
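
Note: the two N4C32 wrapper scripts read their topology from the Paddle cloud
scheduler's environment (PADDLE_TRAINERS_NUM, PADDLE_TRAINER_ID, POD_0_IP) and
pass it positionally into run_benchmark.sh, which falls back to placeholder
defaults when the arguments are omitted. Below is a hedged Python sketch of the
same resolution and of the arithmetic behind the "N4C32" name (4 nodes x 8
GPUs per node); the variable names mirror the shell scripts, and the snippet is
illustrative rather than part of the suite:

    import os

    # Fallbacks mirror run_benchmark.sh's ${8:-"2"} .. ${11:-"1928"} defaults.
    node_num = int(os.getenv("PADDLE_TRAINERS_NUM", "2"))
    node_rank = int(os.getenv("PADDLE_TRAINER_ID", "0"))
    master_addr = os.getenv("POD_0_IP", "127.0.0.1")
    master_port = 14333  # hard-coded in the N4C32 wrapper scripts

    nproc_per_node = 8                      # fixed by --nproc_per_node=8
    world_size = node_num * nproc_per_node  # 4 nodes * 8 GPUs = 32 -> "N4C32"

    # Each worker's global rank combines the node rank with its local rank.
    for local_rank in range(nproc_per_node):
        rank = node_rank * nproc_per_node + local_rank
        print(f"node {node_rank}, local_rank {local_rank} -> global rank {rank}")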
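
Note: in the new N4C32 case, torch.distributed.launch spawns 8 workers on each
of the $node_num machines and exports MASTER_ADDR, MASTER_PORT, RANK, and
WORLD_SIZE into every worker's environment; the --launcher pytorch flag then
tells the training entry point to join the process group from those variables.
Below is a minimal, hypothetical sketch of that handshake, modeled on the
mmcv-style "pytorch" launcher; init_pytorch_dist is illustrative and is not a
function in this suite or in tools/train.py:

    import os

    import torch
    import torch.distributed as dist

    def init_pytorch_dist(backend="nccl"):
        """Sketch of the env:// initialization implied by --launcher pytorch."""
        # torch.distributed.launch exports RANK/WORLD_SIZE/MASTER_ADDR/MASTER_PORT
        # for every worker it spawns, so the trainer only has to read them.
        rank = int(os.environ["RANK"])  # global rank, 0..31 for N4C32
        # With --nproc_per_node=8, the local GPU index is the global rank
        # modulo the number of GPUs visible on this node.
        torch.cuda.set_device(rank % torch.cuda.device_count())
        dist.init_process_group(backend=backend, init_method="env://")
        return rank

    if __name__ == "__main__":
        rank = init_pytorch_dist()
        # WORLD_SIZE is nnodes * nproc_per_node = 4 * 8 = 32 for N4C32.
        assert dist.get_world_size() == int(os.environ["WORLD_SIZE"])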