Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DO NOT MERGE] add scheduling evaluation in paddle dygraph mode #1240

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
160 changes: 159 additions & 1 deletion api/common/launch.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,143 @@ def _parse_gpu_time(self, line):
return gpu_time / percent


def launch(benchmark_script, benchmark_script_args, with_nvprof=False):
class NsightRunnerForDynamicScheduling(object):
    """Profiles a benchmark command with Nsight Systems (nsys) and parses
    the NVTX push-pop range statistics to estimate per-op scheduling
    overhead in paddle dygraph (dynamic) mode.
    """

    def run(self, cmd, op_type, nvprof_start_step, nvprof_end_step, backward):
        """Profile ``cmd`` under nsys and return averaged scheduling times.

        Args:
            cmd (str): full benchmark command line to profile.
            op_type (str): operator name used to match NVTX range labels.
            nvprof_start_step (int): steps strictly greater than this are
                counted into the step-time average.
            nvprof_end_step (int): steps strictly less than this are
                counted into the step-time average.
            backward (bool): whether the benchmark runs a backward pass.

        Returns:
            dict: scheduling time statistics, or an empty dict when
            profiling or parsing failed.
        """
        stdout, exit_code = self._nsight_for_dynamic_scheduling(cmd)
        if exit_code == 0:
            parse_status, scheduling_time_dict = self._parse_logs(
                stdout.split("\n"), op_type, nvprof_start_step,
                nvprof_end_step, backward)
            if parse_status:
                return scheduling_time_dict
        print("Running Error:\n {}".format(stdout))
        return {}

    def _nsight_for_dynamic_scheduling(self, cmd):
        # -t cuda,nvtx traces CUDA API calls and NVTX ranges;
        # --stats true prints the summary tables parsed by _parse_logs.
        return system.run_command(
            "nsys profile -t cuda,nvtx --stats true -o tmp.qdrep --force-overwrite true {}".
            format(cmd))

    def _to_float(self, s):
        # nsys prints numbers with thousands separators, e.g. "1,234.5".
        return float(s.replace(',', ''))

    def _calculate_avg_time(self, l):
        """Average duration of an NVTX range, excluding the slowest call
        (treated as warm-up).

        ``l`` is one whitespace-split statistics row: l[1] is the total
        time, l[2] the number of calls, l[5] the maximum single-call time.
        NOTE(review): assumes at least 2 calls; a single-call range would
        divide by zero — confirm against nsys output for short runs.
        """
        total_time = self._to_float(l[1])
        max_time = self._to_float(l[5])
        calls = self._to_float(l[2]) - 1
        return (total_time - max_time) / calls

    def _parse_logs(self, logs, op_type, nvprof_start_step, nvprof_end_step,
                    backward):
        """Parse ``nsys --stats`` output lines.

        Returns:
            (parse_status, scheduling_time_dict): ``parse_status`` is True
            once parsing completes; the dict maps '*_avg_time' and
            '*_call_time' keys to floats, or None for NVTX ranges that
            were not found in the log.
        """
        flag_nvtx_time = False
        total_step_time = 0.0
        step_count = 0
        parse_status = False

        # NVTX range labels of interest and the result keys they feed:
        # 0: imperative            -> imperative_avg_time
        # 1: op_type               -> fwd_trace_op_avg_time
        # 2: op_type compute       -> fwd_op_compute_avg_time
        # 3: op_type_grad          -> bwd_trace_op_avg_time
        # 4: op_type_grad compute  -> bwd_op_compute_avg_time
        _scheduling_list = [
            'imperative', op_type, op_type + ' compute', op_type + '_grad',
            op_type + '_grad compute'
        ]
        nvtx_meta_data_dict = {}
        scheduling_time_dict = {}

        for i in range(len(logs)):
            line = api_param.parse_string(logs[i])
            if flag_nvtx_time:
                infos = line.strip().split()
                if not infos:
                    continue
                nvtx_range_type = infos[-1]
                # 'compute'/'infer_shape' ranges are qualified by the op
                # name in the preceding column.
                if nvtx_range_type == 'compute' or nvtx_range_type == 'infer_shape':
                    nvtx_range_type = infos[-2] + ' ' + nvtx_range_type

                # Purely numeric labels are per-step markers; accumulate
                # the steps inside the profiled window.
                if nvtx_range_type.isdigit() and int(
                        nvtx_range_type) > nvprof_start_step and int(
                            nvtx_range_type) < nvprof_end_step:
                    step_count += 1
                    total_step_time += self._to_float(infos[1])

                if nvtx_range_type in _scheduling_list:
                    nvtx_meta_data_dict[
                        nvtx_range_type] = self._calculate_avg_time(infos)

            if 'NVTX Push-Pop Range Statistics:' in line:
                flag_nvtx_time = True

        # BUGFIX: the step average must be computed after the whole
        # statistics section has been scanned. Computing it when the
        # section header was first seen (as before) always observed
        # step_count == 0, so 'step' was never recorded.
        if step_count != 0:
            nvtx_meta_data_dict['step'] = total_step_time / step_count

        # Missing ranges map to None so downstream code can detect them.
        scheduling_time_dict['step_avg_time'] = nvtx_meta_data_dict.get('step')
        scheduling_time_dict['imperative_avg_time'] = nvtx_meta_data_dict.get(
            _scheduling_list[0])
        scheduling_time_dict['fwd_trace_op_avg_time'] = nvtx_meta_data_dict.get(
            _scheduling_list[1])
        scheduling_time_dict['fwd_op_compute_avg_time'] = nvtx_meta_data_dict.get(
            _scheduling_list[2])
        scheduling_time_dict['bwd_trace_op_avg_time'] = nvtx_meta_data_dict.get(
            _scheduling_list[3])
        scheduling_time_dict['bwd_op_compute_avg_time'] = nvtx_meta_data_dict.get(
            _scheduling_list[4])

        # Derived overheads: each difference is only meaningful when both
        # of its operands were measured.
        if scheduling_time_dict['step_avg_time'] and scheduling_time_dict[
                'imperative_avg_time']:
            if not backward:
                scheduling_time_dict[
                    'python_call_time'] = scheduling_time_dict[
                        'step_avg_time'] - scheduling_time_dict[
                            'imperative_avg_time']
            elif scheduling_time_dict['bwd_trace_op_avg_time']:
                scheduling_time_dict[
                    'python_call_time'] = scheduling_time_dict[
                        'step_avg_time'] - scheduling_time_dict[
                            'imperative_avg_time'] - scheduling_time_dict[
                                'bwd_trace_op_avg_time']
        if scheduling_time_dict[
                'imperative_avg_time'] and scheduling_time_dict[
                    'fwd_trace_op_avg_time']:
            scheduling_time_dict[
                'imperative_call_time'] = scheduling_time_dict[
                    'imperative_avg_time'] - scheduling_time_dict[
                        'fwd_trace_op_avg_time']
        if scheduling_time_dict[
                'fwd_trace_op_avg_time'] and scheduling_time_dict[
                    'fwd_op_compute_avg_time']:
            scheduling_time_dict[
                'fwd_trace_op_call_time'] = scheduling_time_dict[
                    'fwd_trace_op_avg_time'] - scheduling_time_dict[
                        'fwd_op_compute_avg_time']
        if scheduling_time_dict[
                'bwd_trace_op_avg_time'] and scheduling_time_dict[
                    'bwd_op_compute_avg_time']:
            scheduling_time_dict[
                'bwd_trace_op_call_time'] = scheduling_time_dict[
                    'bwd_trace_op_avg_time'] - scheduling_time_dict[
                        'bwd_op_compute_avg_time']

        parse_status = True

        print(scheduling_time_dict)
        return parse_status, scheduling_time_dict


def launch(benchmark_script,
benchmark_script_args,
with_nvprof=False,
with_dynamic_scheduling=False):
"""
If with_nvprof is True, it will launch the following command firstly to
get the gpu_time:
Expand All @@ -188,10 +324,23 @@ def _set_profiler(args, value):
args.append("--profiler")
args.append(value)

def _split_arg_str_value(cmd, arg_name):
if arg_name not in cmd:
return None
return cmd.split("--" + arg_name)[1].strip().split()[0]

if with_nvprof:
_set_profiler(benchmark_script_args, "nvprof")
cmd = "{} {} {}".format(sys.executable, benchmark_script,
" ".join(benchmark_script_args))
if with_dynamic_scheduling:
runner = NsightRunnerForDynamicScheduling()
nvprof_start_step = int(_split_arg_str_value(cmd, "nvprof_start_step"))
nvprof_end_step = int(_split_arg_str_value(cmd, "nvprof_end_step"))
op_type = _split_arg_str_value(cmd, "api_name")
backward = bool(_split_arg_str_value(cmd, "backward"))
scheduling_time_dict = runner.run(cmd, op_type, nvprof_start_step,
nvprof_end_step, backward)
if with_nvprof:
if is_ampere_gpu():
runner = NsightRunner()
Expand Down Expand Up @@ -242,6 +391,15 @@ def _args_list_to_dict(arg_list):

system.check_commit()

if use_gpu and task == "scheduling" and profiler == "none":
total_gpu_time = launch(
args.benchmark_script,
args.benchmark_script_args,
with_nvprof=False,
with_dynamic_scheduling=True)
args.benchmark_script_args.append(" --gpu_time ")
args.benchmark_script_args.append(str(total_gpu_time))

if use_gpu and task == "speed" and profiler == "none":
total_gpu_time = launch(
args.benchmark_script,
Expand Down
36 changes: 33 additions & 3 deletions api/common/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def parse_args():
'--task',
type=str,
default="speed",
help='Specify the task: [speed|accuracy]')
help='Specify the task: [speed|accuracy|scheduling]')
parser.add_argument(
'--testing_mode',
type=str,
Expand Down Expand Up @@ -99,6 +99,19 @@ def parse_args():
help='Total GPU kernel time parsed from nvprof')
parser.add_argument(
'--repeat', type=int, default=1, help='Iterations of Repeat running')
parser.add_argument(
'--is_dynamic_scheduling',
type=system.str2bool,
default=False,
help='Whether to calculate scheduling cost in dynamic mode [True|False]'
)
parser.add_argument(
'--nvprof_start_step',
type=int,
default=1,
help='Start step of profile')
parser.add_argument(
'--nvprof_end_step', type=int, default=100, help='End step of profile')
parser.add_argument(
'--allow_adaptive_repeat',
type=system.str2bool,
Expand All @@ -107,8 +120,8 @@ def parse_args():
parser.add_argument(
'--log_level', type=int, default=0, help='level of logging')
args = parser.parse_args()
if args.task not in ["speed", "accuracy"]:
raise ValueError("task should be speed, accuracy")
if args.task not in ["speed", "accuracy", "scheduling"]:
raise ValueError("task should be speed, accuracy, scheduling")
if args.framework not in [
"paddle", "tensorflow", "tf", "pytorch", "torch", "both"
]:
Expand Down Expand Up @@ -302,6 +315,23 @@ def test_main_without_json(pd_obj=None,
if pd_dy_outputs == False:
sys.exit(1)

if args.is_dynamic_scheduling and _is_paddle_enabled(
args, config) and args.testing_mode == "dynamic":
assert pd_dy_obj is not None, "Paddle dynamic object is None."
print(config)
pd_dy_outputs, pd_dy_stats = pd_dy_obj.run(config, args,
feeder_adapter)

if args.task == "speed":
pd_dy_stats["gpu_time"] = args.gpu_time
utils.print_benchmark_result(
pd_dy_stats,
log_level=args.log_level,
config_params=config.to_string())

if pd_dy_outputs == False:
sys.exit(1)

if args.task == "accuracy":
is_run_tf = config.run_tf and args.testing_mode == "static"
is_run_torch = config.run_torch and args.testing_mode == "dynamic"
Expand Down
Loading