Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add timer for Paddle optimizer (#7563) (#9128) #9147

Open
wants to merge 2 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docs/trainer.md
Original file line number Diff line number Diff line change
Expand Up @@ -725,6 +725,11 @@ Trainer 是一个简单,但功能完整的 Paddle 训练和评估模块,并
async_save, enable asynchronous saving checkpoints to disk.
enable_all_options, enable all unified checkpoint optimization configs.

--enable_optimizer_timer
是否开启Optimizer的timer统计。(可选,默认为False,不开启)
Whether to enable the Optimizer's timer profiler.
(optional, default is False, not enabled)

--skip_memory_metrics
是否跳过内存profiler检测。(可选,默认为True,跳过)
Whether or not to skip adding of memory profiler reports
Expand Down
19 changes: 15 additions & 4 deletions paddlenlp/trainer/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -1291,11 +1291,9 @@
get_timers as paddle_get_timers,
)

paddle_pipeline_timers = paddle_get_timers()
for name, timer in paddle_pipeline_timers.timers.items():
elapsed_time = timer.elapsed(reset=False) * 1000.0
for name, timer in paddle_get_timers().timers.items():
elapsed_time = timer.elapsed(reset=True) * 1000.0

Check warning on line 1295 in paddlenlp/trainer/trainer.py

View check run for this annotation

Codecov / codecov/patch

paddlenlp/trainer/trainer.py#L1295

Added line #L1295 was not covered by tests
paddle_timer_info += f" | {name}: {elapsed_time:.2f}"
paddle_pipeline_timers.log(paddle_pipeline_timers.timers.keys(), reset=True)
except ImportError: # paddle version too old, timer not support
warnings.warn(f"paddle version:{paddle.__git_commit__} does not support pipeline timer")
except AssertionError: # paddle timer not enabled
Expand Down Expand Up @@ -2153,16 +2151,29 @@

model.train()
inputs = self._prepare_inputs(inputs)

# obtain current acc step
if not hasattr(self, "_cur_acc_step"):
self._cur_acc_step = 0

if self._cur_acc_step == self.args.gradient_accumulation_steps:
self._cur_acc_step = 0

Check warning on line 2160 in paddlenlp/trainer/trainer.py

View check run for this annotation

Codecov / codecov/patch

paddlenlp/trainer/trainer.py#L2160

Added line #L2160 was not covered by tests

self.timers and self.timers(f"forward-acc-{self._cur_acc_step}").start()
with self.autocast_smart_context_manager():
loss = self.compute_loss(model, inputs)

if self.args.gradient_accumulation_steps > 1 and not self._enable_delay_scale_loss():
loss = loss / self.args.gradient_accumulation_steps

self.timers and self.timers(f"forward-acc-{self._cur_acc_step}").stop()

self.timers and self.timers(f"backward-acc-{self._cur_acc_step}").start()
if self.do_grad_scaling:
self.scaler.scale(loss).backward()
else:
loss.backward()
self.timers and self.timers(f"backward-acc-{self._cur_acc_step}").stop()
return loss.detach()

def training_pipeline_step(self, model: nn.Layer, inputs: Dict[str, Union[paddle.Tensor, Any]]) -> paddle.Tensor:
Expand Down
13 changes: 13 additions & 0 deletions paddlenlp/trainer/training_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -844,6 +844,10 @@
)
},
)
enable_optimizer_timer: Optional[bool] = field(
default=False,
metadata={"help": "是否开启Optimzier的timer统计"},
)
ignore_load_lr_and_optim: Optional[bool] = field(
default=False,
metadata={"help": "whether to ignore load optimizer and scheduler."},
Expand Down Expand Up @@ -1267,6 +1271,15 @@
"order": order,
}

try:
if self.enable_optimizer_timer:
hybrid_configs["enable_optimizer_timer"] = True
except (KeyError, AttributeError):
warnings.warn(

Check warning on line 1278 in paddlenlp/trainer/training_args.py

View check run for this annotation

Codecov / codecov/patch

paddlenlp/trainer/training_args.py#L1274-L1278

Added lines #L1274 - L1278 were not covered by tests
"The enable_optimizer_timer is not supported "
"by current version of Paddle. Please try latest develop Paddle."
)

if self.pipeline_parallel_degree > 1:
hybrid_configs["pp_configs"] = dygraph_pp_configs
logger.info(f"using pipeline configs:{dygraph_pp_configs}")
Expand Down
Loading