Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DO NOT MERGE] add scheduling evaluation in paddle dygraph mode #1240

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
160 changes: 159 additions & 1 deletion api/common/launch.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,143 @@ def _parse_gpu_time(self, line):
return gpu_time / percent


def launch(benchmark_script, benchmark_script_args, with_nvprof=False):
class NsightRunnerForDynamicScheduling(object):
    """Profiles a benchmark command with Nsight Systems (nsys) and parses
    the NVTX push-pop range statistics to estimate per-op scheduling
    overhead in paddle dygraph (dynamic) mode.
    """

    def run(self, cmd, op_type, nvprof_start_step, nvprof_end_step, backward):
        """Profile ``cmd`` under nsys and return averaged scheduling times.

        Args:
            cmd (str): full benchmark command line to profile.
            op_type (str): operator name used to match NVTX range labels.
            nvprof_start_step (int): steps strictly greater than this are
                counted into the step-time average.
            nvprof_end_step (int): steps strictly less than this are
                counted into the step-time average.
            backward (bool): whether the benchmark runs a backward pass.

        Returns:
            dict: scheduling time statistics, or an empty dict when
            profiling or parsing failed.
        """
        stdout, exit_code = self._nsight_for_dynamic_scheduling(cmd)
        if exit_code == 0:
            parse_status, scheduling_time_dict = self._parse_logs(
                stdout.split("\n"), op_type, nvprof_start_step,
                nvprof_end_step, backward)
            if parse_status:
                return scheduling_time_dict
        print("Running Error:\n {}".format(stdout))
        return {}

    def _nsight_for_dynamic_scheduling(self, cmd):
        # -t cuda,nvtx traces CUDA API calls and NVTX ranges;
        # --stats true prints the summary tables parsed by _parse_logs.
        return system.run_command(
            "nsys profile -t cuda,nvtx --stats true -o tmp.qdrep --force-overwrite true {}".
            format(cmd))

    def _to_float(self, s):
        # nsys prints numbers with thousands separators, e.g. "1,234.5".
        return float(s.replace(',', ''))

    def _calculate_avg_time(self, l):
        """Average duration of an NVTX range, excluding the slowest call
        (treated as warm-up).

        ``l`` is one whitespace-split statistics row: l[1] is the total
        time, l[2] the number of calls, l[5] the maximum single-call time.
        NOTE(review): assumes at least 2 calls; a single-call range would
        divide by zero — confirm against nsys output for short runs.
        """
        total_time = self._to_float(l[1])
        max_time = self._to_float(l[5])
        calls = self._to_float(l[2]) - 1
        return (total_time - max_time) / calls

    def _parse_logs(self, logs, op_type, nvprof_start_step, nvprof_end_step,
                    backward):
        """Parse ``nsys --stats`` output lines.

        Returns:
            (parse_status, scheduling_time_dict): ``parse_status`` is True
            once parsing completes; the dict maps '*_avg_time' and
            '*_call_time' keys to floats, or None for NVTX ranges that
            were not found in the log.
        """
        flag_nvtx_time = False
        total_step_time = 0.0
        step_count = 0
        parse_status = False

        # NVTX range labels of interest and the result keys they feed:
        # 0: imperative            -> imperative_avg_time
        # 1: op_type               -> fwd_trace_op_avg_time
        # 2: op_type compute       -> fwd_op_compute_avg_time
        # 3: op_type_grad          -> bwd_trace_op_avg_time
        # 4: op_type_grad compute  -> bwd_op_compute_avg_time
        _scheduling_list = [
            'imperative', op_type, op_type + ' compute', op_type + '_grad',
            op_type + '_grad compute'
        ]
        nvtx_meta_data_dict = {}
        scheduling_time_dict = {}

        for i in range(len(logs)):
            line = api_param.parse_string(logs[i])
            if flag_nvtx_time:
                infos = line.strip().split()
                if not infos:
                    continue
                nvtx_range_type = infos[-1]
                # 'compute'/'infer_shape' ranges are qualified by the op
                # name in the preceding column.
                if nvtx_range_type == 'compute' or nvtx_range_type == 'infer_shape':
                    nvtx_range_type = infos[-2] + ' ' + nvtx_range_type

                # Purely numeric labels are per-step markers; accumulate
                # the steps inside the profiled window.
                if nvtx_range_type.isdigit() and int(
                        nvtx_range_type) > nvprof_start_step and int(
                            nvtx_range_type) < nvprof_end_step:
                    step_count += 1
                    total_step_time += self._to_float(infos[1])

                if nvtx_range_type in _scheduling_list:
                    nvtx_meta_data_dict[
                        nvtx_range_type] = self._calculate_avg_time(infos)

            if 'NVTX Push-Pop Range Statistics:' in line:
                flag_nvtx_time = True

        # BUGFIX: the step average must be computed after the whole
        # statistics section has been scanned. Computing it when the
        # section header was first seen (as before) always observed
        # step_count == 0, so 'step' was never recorded.
        if step_count != 0:
            nvtx_meta_data_dict['step'] = total_step_time / step_count

        # Missing ranges map to None so downstream code can detect them.
        scheduling_time_dict['step_avg_time'] = nvtx_meta_data_dict.get('step')
        scheduling_time_dict['imperative_avg_time'] = nvtx_meta_data_dict.get(
            _scheduling_list[0])
        scheduling_time_dict['fwd_trace_op_avg_time'] = nvtx_meta_data_dict.get(
            _scheduling_list[1])
        scheduling_time_dict['fwd_op_compute_avg_time'] = nvtx_meta_data_dict.get(
            _scheduling_list[2])
        scheduling_time_dict['bwd_trace_op_avg_time'] = nvtx_meta_data_dict.get(
            _scheduling_list[3])
        scheduling_time_dict['bwd_op_compute_avg_time'] = nvtx_meta_data_dict.get(
            _scheduling_list[4])

        # Derived overheads: each difference is only meaningful when both
        # of its operands were measured.
        if scheduling_time_dict['step_avg_time'] and scheduling_time_dict[
                'imperative_avg_time']:
            if not backward:
                scheduling_time_dict[
                    'python_call_time'] = scheduling_time_dict[
                        'step_avg_time'] - scheduling_time_dict[
                            'imperative_avg_time']
            elif scheduling_time_dict['bwd_trace_op_avg_time']:
                scheduling_time_dict[
                    'python_call_time'] = scheduling_time_dict[
                        'step_avg_time'] - scheduling_time_dict[
                            'imperative_avg_time'] - scheduling_time_dict[
                                'bwd_trace_op_avg_time']
        if scheduling_time_dict[
                'imperative_avg_time'] and scheduling_time_dict[
                    'fwd_trace_op_avg_time']:
            scheduling_time_dict[
                'imperative_call_time'] = scheduling_time_dict[
                    'imperative_avg_time'] - scheduling_time_dict[
                        'fwd_trace_op_avg_time']
        if scheduling_time_dict[
                'fwd_trace_op_avg_time'] and scheduling_time_dict[
                    'fwd_op_compute_avg_time']:
            scheduling_time_dict[
                'fwd_trace_op_call_time'] = scheduling_time_dict[
                    'fwd_trace_op_avg_time'] - scheduling_time_dict[
                        'fwd_op_compute_avg_time']
        if scheduling_time_dict[
                'bwd_trace_op_avg_time'] and scheduling_time_dict[
                    'bwd_op_compute_avg_time']:
            scheduling_time_dict[
                'bwd_trace_op_call_time'] = scheduling_time_dict[
                    'bwd_trace_op_avg_time'] - scheduling_time_dict[
                        'bwd_op_compute_avg_time']

        parse_status = True

        print(scheduling_time_dict)
        return parse_status, scheduling_time_dict


def launch(benchmark_script,
benchmark_script_args,
with_nvprof=False,
with_dynamic_scheduling=False):
"""
If with_nvprof is True, it will launch the following command firstly to
get the gpu_time:
Expand All @@ -188,10 +324,23 @@ def _set_profiler(args, value):
args.append("--profiler")
args.append(value)

def _split_arg_str_value(cmd, arg_name):
if arg_name not in cmd:
return None
return cmd.split("--" + arg_name)[1].strip().split()[0]

if with_nvprof:
_set_profiler(benchmark_script_args, "nvprof")
cmd = "{} {} {}".format(sys.executable, benchmark_script,
" ".join(benchmark_script_args))
if with_dynamic_scheduling:
runner = NsightRunnerForDynamicScheduling()
nvprof_start_step = int(_split_arg_str_value(cmd, "nvprof_start_step"))
nvprof_end_step = int(_split_arg_str_value(cmd, "nvprof_end_step"))
op_type = _split_arg_str_value(cmd, "api_name")
backward = bool(_split_arg_str_value(cmd, "backward"))
scheduling_time_dict = runner.run(cmd, op_type, nvprof_start_step,
nvprof_end_step, backward)
if with_nvprof:
if is_ampere_gpu():
runner = NsightRunner()
Expand Down Expand Up @@ -242,6 +391,15 @@ def _args_list_to_dict(arg_list):

system.check_commit()

if use_gpu and task == "scheduling" and profiler == "none":
total_gpu_time = launch(
args.benchmark_script,
args.benchmark_script_args,
with_nvprof=False,
with_dynamic_scheduling=True)
args.benchmark_script_args.append(" --gpu_time ")
args.benchmark_script_args.append(str(total_gpu_time))

if use_gpu and task == "speed" and profiler == "none":
total_gpu_time = launch(
args.benchmark_script,
Expand Down
36 changes: 33 additions & 3 deletions api/common/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def parse_args():
'--task',
type=str,
default="speed",
help='Specify the task: [speed|accuracy]')
help='Specify the task: [speed|accuracy|scheduling]')
parser.add_argument(
'--testing_mode',
type=str,
Expand Down Expand Up @@ -99,6 +99,19 @@ def parse_args():
help='Total GPU kernel time parsed from nvprof')
parser.add_argument(
'--repeat', type=int, default=1, help='Iterations of Repeat running')
parser.add_argument(
'--is_dynamic_scheduling',
type=system.str2bool,
default=False,
help='Whether to calculate scheduling cost in dynamic mode [True|False]'
)
parser.add_argument(
'--nvprof_start_step',
type=int,
default=1,
help='Start step of profile')
parser.add_argument(
'--nvprof_end_step', type=int, default=100, help='End step of profile')
parser.add_argument(
'--allow_adaptive_repeat',
type=system.str2bool,
Expand All @@ -107,8 +120,8 @@ def parse_args():
parser.add_argument(
'--log_level', type=int, default=0, help='level of logging')
args = parser.parse_args()
if args.task not in ["speed", "accuracy"]:
raise ValueError("task should be speed, accuracy")
if args.task not in ["speed", "accuracy", "scheduling"]:
raise ValueError("task should be speed, accuracy, scheduling")
if args.framework not in [
"paddle", "tensorflow", "tf", "pytorch", "torch", "both"
]:
Expand Down Expand Up @@ -302,6 +315,23 @@ def test_main_without_json(pd_obj=None,
if pd_dy_outputs == False:
sys.exit(1)

if args.is_dynamic_scheduling and _is_paddle_enabled(
args, config) and args.testing_mode == "dynamic":
assert pd_dy_obj is not None, "Paddle dynamic object is None."
print(config)
pd_dy_outputs, pd_dy_stats = pd_dy_obj.run(config, args,
feeder_adapter)

if args.task == "speed":
pd_dy_stats["gpu_time"] = args.gpu_time
utils.print_benchmark_result(
pd_dy_stats,
log_level=args.log_level,
config_params=config.to_string())

if pd_dy_outputs == False:
sys.exit(1)

if args.task == "accuracy":
is_run_tf = config.run_tf and args.testing_mode == "static"
is_run_torch = config.run_torch and args.testing_mode == "dynamic"
Expand Down
Loading