From c24498b10900ce165a6242ce6f6fb1261c31a9cb Mon Sep 17 00:00:00 2001
From: HydrogenSulfate <490868991@qq.com>
Date: Sat, 21 Dec 2024 02:47:41 +0800
Subject: [PATCH] pd: fix learning rate setting when resume (#4480)

"When resuming training, there is no need to add `self.start_step` to the
step count, because Paddle uses `lr_sche.last_epoch` as the input for `step`,
which already records the `start_step` steps."

The learning rate is correct after the fix:

![22AD6874B74E437E9B133D75ABCC02FE](https://github.com/user-attachments/assets/1ad0ce71-6e1c-4de5-87dc-0daca1f6f038)

## Summary by CodeRabbit

- **New Features**
  - Enhanced training process with improved optimizer configuration and learning rate adjustments.
  - Refined logging of training and validation results for clarity.
  - Improved model saving logic to preserve the latest state during interruptions.
  - Enhanced tensorboard logging for detailed tracking of training metrics.

- **Bug Fixes**
  - Corrected the lambda function for the learning rate scheduler to reference warmup steps accurately.

- **Chores**
  - Streamlined data loading and handling for efficient training across different tasks.

---
 deepmd/pd/train/training.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/deepmd/pd/train/training.py b/deepmd/pd/train/training.py
index 65e35a1c4b..0f3c7a9732 100644
--- a/deepmd/pd/train/training.py
+++ b/deepmd/pd/train/training.py
@@ -588,15 +588,14 @@ def warm_up_linear(step, warmup_steps):
 
         if self.opt_type == "Adam":
             self.scheduler = paddle.optimizer.lr.LambdaDecay(
                 learning_rate=self.lr_exp.start_lr,
-                lr_lambda=lambda step: warm_up_linear(
-                    step + self.start_step, self.warmup_steps
-                ),
+                lr_lambda=lambda step: warm_up_linear(step, self.warmup_steps),
             )
             self.optimizer = paddle.optimizer.Adam(
                 learning_rate=self.scheduler, parameters=self.wrapper.parameters()
             )
             if optimizer_state_dict is not None and self.restart_training:
                 self.optimizer.set_state_dict(optimizer_state_dict)
+                self.scheduler.last_epoch -= 1
         else:
             raise ValueError(f"Not supported optimizer type '{self.opt_type}'")
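
For context, a minimal sketch (not part of the patch) of the scheduler behavior the fix relies on: Paddle's `paddle.optimizer.lr.LambdaDecay` keeps its step counter in `last_epoch`, saves and restores it through `state_dict()`/`set_state_dict()`, and passes that counter to `lr_lambda`, so a resumed scheduler already sees the global step and adding `self.start_step` on top would double-count it. The `warmup_steps` value and the flat post-warmup factor below are placeholder assumptions; the real code calls `self.lr_exp.value()` after warmup.

```python
# Minimal sketch (not part of the patch): why a resumed LambdaDecay must not
# have start_step added to the step it passes to lr_lambda.
# Assumes paddle is installed; warmup_steps and the post-warmup factor of 1.0
# are placeholders.
import paddle

warmup_steps = 100


def warm_up_linear(step, warmup_steps):
    # Linear warmup, then a constant factor to keep the sketch short.
    return step / warmup_steps if step < warmup_steps else 1.0


# First run: take a few scheduler steps, then checkpoint the scheduler state.
sched = paddle.optimizer.lr.LambdaDecay(
    learning_rate=1e-3,
    lr_lambda=lambda step: warm_up_linear(step, warmup_steps),
)
for _ in range(10):
    sched.step()
state = sched.state_dict()  # includes last_epoch == 10

# Resumed run: a fresh scheduler restored from the checkpoint.
resumed = paddle.optimizer.lr.LambdaDecay(
    learning_rate=1e-3,
    lr_lambda=lambda step: warm_up_linear(step, warmup_steps),
)
resumed.set_state_dict(state)

# last_epoch already records the steps taken before the restart, and
# LambdaDecay feeds exactly this counter to lr_lambda, so offsetting the
# lambda's argument by start_step would count those steps twice.
print(resumed.last_epoch)  # 10
print(resumed.get_lr())    # 1e-3 * warm_up_linear(10, 100) == 1e-4
```

In the patched code the same restore happens through `self.optimizer.set_state_dict(optimizer_state_dict)`, so the lambda in `warm_up_linear(step, self.warmup_steps)` receives the global step without any manual `start_step` offset.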