From 2af4051d5a3f47652515bd8b2439607d81064be3 Mon Sep 17 00:00:00 2001
From: xysheng-baidu
Date: Wed, 29 Mar 2023 14:55:50 +0800
Subject: [PATCH] add N4C32 shell for PETRV2

---
 .../scripts/petrv2/N4C32/petrv2_bs1_fp16_DP.sh   | 17 +++++++++++++++++
 .../scripts/petrv2/N4C32/petrv2_bs1_fp32_DP.sh   | 17 +++++++++++++++++
 .../petrv2/benchmark_common/analysis_log.py      |  1 +
 .../petrv2/benchmark_common/run_benchmark.sh     | 15 +++++++++++++--
 4 files changed, 48 insertions(+), 2 deletions(-)
 create mode 100644 frame_benchmark/pytorch/dynamic/Paddle3D/scripts/petrv2/N4C32/petrv2_bs1_fp16_DP.sh
 create mode 100644 frame_benchmark/pytorch/dynamic/Paddle3D/scripts/petrv2/N4C32/petrv2_bs1_fp32_DP.sh

diff --git a/frame_benchmark/pytorch/dynamic/Paddle3D/scripts/petrv2/N4C32/petrv2_bs1_fp16_DP.sh b/frame_benchmark/pytorch/dynamic/Paddle3D/scripts/petrv2/N4C32/petrv2_bs1_fp16_DP.sh
new file mode 100644
index 0000000000..f42e28b750
--- /dev/null
+++ b/frame_benchmark/pytorch/dynamic/Paddle3D/scripts/petrv2/N4C32/petrv2_bs1_fp16_DP.sh
@@ -0,0 +1,17 @@
+model_item=petrv2
+bs_item=1
+fp_item=fp16
+run_process_type=MultiP
+run_mode=DP
+device_num=N4C32
+max_iter=-1
+num_workers=32
+
+node_num=${PADDLE_TRAINERS_NUM}
+node_rank=${PADDLE_TRAINER_ID}
+master_addr=${POD_0_IP}
+master_port=14333
+
+sed -i '/set\ -xe/d' run_benchmark.sh
+bash PrepareEnv.sh ${model_item};
+bash run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_iter} ${num_workers} ${node_num} ${node_rank} ${master_addr} ${master_port} 2>&1;
diff --git a/frame_benchmark/pytorch/dynamic/Paddle3D/scripts/petrv2/N4C32/petrv2_bs1_fp32_DP.sh b/frame_benchmark/pytorch/dynamic/Paddle3D/scripts/petrv2/N4C32/petrv2_bs1_fp32_DP.sh
new file mode 100644
index 0000000000..89a4deb976
--- /dev/null
+++ b/frame_benchmark/pytorch/dynamic/Paddle3D/scripts/petrv2/N4C32/petrv2_bs1_fp32_DP.sh
@@ -0,0 +1,17 @@
+model_item=petrv2
+bs_item=1
+fp_item=fp32
+run_process_type=MultiP
+run_mode=DP
+device_num=N4C32
+max_iter=-1
+num_workers=32
+
+node_num=${PADDLE_TRAINERS_NUM}
+node_rank=${PADDLE_TRAINER_ID}
+master_addr=${POD_0_IP}
+master_port=14333
+
+sed -i '/set\ -xe/d' run_benchmark.sh
+bash PrepareEnv.sh ${model_item};
+bash run_benchmark.sh ${model_item} ${bs_item} ${fp_item} ${run_mode} ${device_num} ${max_iter} ${num_workers} ${node_num} ${node_rank} ${master_addr} ${master_port} 2>&1;
diff --git a/frame_benchmark/pytorch/dynamic/Paddle3D/scripts/petrv2/benchmark_common/analysis_log.py b/frame_benchmark/pytorch/dynamic/Paddle3D/scripts/petrv2/benchmark_common/analysis_log.py
index a94d20abbf..96ca5602cb 100644
--- a/frame_benchmark/pytorch/dynamic/Paddle3D/scripts/petrv2/benchmark_common/analysis_log.py
+++ b/frame_benchmark/pytorch/dynamic/Paddle3D/scripts/petrv2/benchmark_common/analysis_log.py
@@ -55,6 +55,7 @@ def analyze(model_name, batch_size, log_file, res_log_file, device_num):
         "frame_version": os.getenv('frame_version'),
     }
     json_info = json.dumps(info)
+    print(json_info)
     with open(res_log_file, "w") as of:
         of.write(json_info)
 
diff --git a/frame_benchmark/pytorch/dynamic/Paddle3D/scripts/petrv2/benchmark_common/run_benchmark.sh b/frame_benchmark/pytorch/dynamic/Paddle3D/scripts/petrv2/benchmark_common/run_benchmark.sh
index c3336bebc9..7fdeb485a0 100644
--- a/frame_benchmark/pytorch/dynamic/Paddle3D/scripts/petrv2/benchmark_common/run_benchmark.sh
+++ b/frame_benchmark/pytorch/dynamic/Paddle3D/scripts/petrv2/benchmark_common/run_benchmark.sh
@@ -17,6 +17,13 @@ function _set_params(){
     max_iter=${6:-"100"}                  # (optional) keep the model's run time under 5 minutes; if the code must be changed to stop early, submit a PR to the suite directly, or adjust max_epoch
     num_workers=${7:-"4"}                 # (optional)
+    # Added for distributed training
+    node_num=${8:-"2"}                    # (optional) number of nodes
+    node_rank=${9:-"0"}                   # (optional) node rank
+    master_addr=${10:-"127.0.0.1"}        # (optional) master node IP address
+    master_port=${11:-"1928"}             # (optional) master node port
+    # Added for distributed training
+
     # The lines below assemble the common log paths; no change needed in the usual case
     model_name=${model_item}_bs${base_batch_size}_${fp_item}_${run_mode}          # (required) do not change the format; it must match the platform dashboard display
     device=${CUDA_VISIBLE_DEVICES//,/ }
@@ -41,6 +48,7 @@ function _analysis_log(){
     python analysis_log.py ${model_item} ${base_batch_size} ${log_file} ${speed_log_file} ${device_num}
 }
 
+ #N4C32) train_cmd="PYTHONPATH="$(dirname $0)/..":$PYTHONPATH ; \
 function _train(){
     batch_size=${base_batch_size}
     echo "current ${model_name} CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES, gpus=${device_num}, batch_size=${batch_size}"
@@ -50,7 +58,10 @@ function _train(){
     case ${device_num} in
     N1C1) train_cmd="./tools/dist_train.sh ${train_config} 1 --work-dir ${train_options}" ;;
     N1C8) train_cmd="./tools/dist_train.sh ${train_config} 8 --work-dir ${train_options}" ;;
-    *) echo "choose device_num(N1C1, N1C8)"; exit 1;
+    N4C32) train_cmd="python3 -m torch.distributed.launch --nproc_per_node=8 --nnodes=$node_num \
+        --node_rank=$node_rank --master_addr=$master_addr --master_port=$master_port \
+        $(dirname "$0")/tools/train.py ${train_config} --launcher pytorch --work-dir ${train_options}" ;;
+    *) echo "choose device_num(N1C1, N1C8, N4C32)"; exit 1;
     esac
 
     timeout 5m ${train_cmd} > ${log_file} 2>&1
@@ -75,4 +86,4 @@ job_bt=`date '+%Y%m%d%H%M%S'`
 _train
 job_et=`date '+%Y%m%d%H%M%S'`
 export model_run_time=$((${job_et}-${job_bt}))
-_analysis_log
\ No newline at end of file
+_analysis_log
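
Note: the two N4C32 wrapper scripts read their topology from the Paddle cloud
scheduler's environment (PADDLE_TRAINERS_NUM, PADDLE_TRAINER_ID, POD_0_IP) and
pass it positionally into run_benchmark.sh, which falls back to placeholder
defaults when the arguments are omitted. Below is a hedged Python sketch of the
same resolution and of the arithmetic behind the "N4C32" name (4 nodes x 8
GPUs per node); the variable names mirror the shell scripts, and the snippet is
illustrative rather than part of the suite:

    import os

    # Fallbacks mirror run_benchmark.sh's ${8:-"2"} .. ${11:-"1928"} defaults.
    node_num = int(os.getenv("PADDLE_TRAINERS_NUM", "2"))
    node_rank = int(os.getenv("PADDLE_TRAINER_ID", "0"))
    master_addr = os.getenv("POD_0_IP", "127.0.0.1")
    master_port = 14333  # hard-coded in the N4C32 wrapper scripts

    nproc_per_node = 8                      # fixed by --nproc_per_node=8
    world_size = node_num * nproc_per_node  # 4 nodes * 8 GPUs = 32 -> "N4C32"

    # Each worker's global rank combines the node rank with its local rank.
    for local_rank in range(nproc_per_node):
        rank = node_rank * nproc_per_node + local_rank
        print(f"node {node_rank}, local_rank {local_rank} -> global rank {rank}")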
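
Note: in the new N4C32 case, torch.distributed.launch spawns 8 workers on each
of the $node_num machines and exports MASTER_ADDR, MASTER_PORT, RANK, and
WORLD_SIZE into every worker's environment; the --launcher pytorch flag then
tells the training entry point to join the process group from those variables.
Below is a minimal, hypothetical sketch of that handshake, modeled on the
mmcv-style "pytorch" launcher; init_pytorch_dist is illustrative and is not a
function in this suite or in tools/train.py:

    import os

    import torch
    import torch.distributed as dist

    def init_pytorch_dist(backend="nccl"):
        """Sketch of the env:// initialization implied by --launcher pytorch."""
        # torch.distributed.launch exports RANK/WORLD_SIZE/MASTER_ADDR/MASTER_PORT
        # for every worker it spawns, so the trainer only has to read them.
        rank = int(os.environ["RANK"])  # global rank, 0..31 for N4C32
        # With --nproc_per_node=8, the local GPU index is the global rank
        # modulo the number of GPUs visible on this node.
        torch.cuda.set_device(rank % torch.cuda.device_count())
        dist.init_process_group(backend=backend, init_method="env://")
        return rank

    if __name__ == "__main__":
        rank = init_pytorch_dist()
        # WORLD_SIZE is nnodes * nproc_per_node = 4 * 8 = 32 for N4C32.
        assert dist.get_world_size() == int(os.environ["WORLD_SIZE"])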