diff --git a/README.md b/README.md index d19b3728a..1f89c5d13 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ **pytorch-optimizer** is optimizer & lr scheduler collections in PyTorch. I just re-implemented (speed & memory tweaks, plug-ins) the algorithm while based on the original paper. Also, It includes useful and practical optimization ideas. -Currently, **69 optimizers (+ `bitsandbytes`)**, **16 lr schedulers**, and **13 loss functions** are supported! +Currently, **70 optimizers (+ `bitsandbytes`)**, **16 lr schedulers**, and **13 loss functions** are supported! Highly inspired by [pytorch-optimizer](https://github.com/jettify/pytorch-optimizer). @@ -93,79 +93,80 @@ from pytorch_optimizer import get_supported_optimizers supported_optimizers = get_supported_optimizers() ``` -| Optimizer | Description | Official Code | Paper | Citation | -|---------------|---------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------| -| AdaBelief | *Adapting Step-sizes by the Belief in Observed Gradients* | [github](https://github.com/juntang-zhuang/Adabelief-Optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201007468Z/exportcitation) | -| AdaBound | *Adaptive Gradient Methods with Dynamic Bound of Learning Rate* | [github](https://github.com/Luolc/AdaBound/blob/master/adabound/adabound.py) | | [cite](https://github.com/Luolc/AdaBound#citing) | -| AdaHessian | *An Adaptive Second Order Optimizer for Machine Learning* | [github](https://github.com/amirgholami/adahessian) | | [cite](https://github.com/amirgholami/adahessian#citation) | -| AdamD | *Improved bias-correction in Adam* | | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv211010828S/exportcitation) | -| AdamP | *Slowing Down the Slowdown for Momentum Optimizers on Scale-invariant Weights* | [github](https://github.com/clovaai/AdamP) | | [cite](https://github.com/clovaai/AdamP#how-to-cite) | -| diffGrad | *An Optimization Method for Convolutional Neural Networks* | [github](https://github.com/shivram1987/diffGrad) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190911015D/exportcitation) | -| MADGRAD | *A Momentumized, Adaptive, Dual Averaged Gradient Method for Stochastic* | [github](https://github.com/facebookresearch/madgrad) | | [cite](https://github.com/facebookresearch/madgrad#tech-report) | -| RAdam | *On the Variance of the Adaptive Learning Rate and Beyond* | [github](https://github.com/LiyuanLucasLiu/RAdam) | | [cite](https://github.com/LiyuanLucasLiu/RAdam#citation) | -| Ranger | *a synergistic optimizer combining RAdam and LookAhead, and now GC in one optimizer* | [github](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer) | | [cite](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer#citing-this-work) | -| Ranger21 | *a synergistic deep learning optimizer* | [github](https://github.com/lessw2020/Ranger21) | | [cite](https://github.com/lessw2020/Ranger21#referencing-this-work) | -| Lamb | *Large Batch Optimization for Deep Learning* | [github](https://github.com/cybertronai/pytorch-lamb) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190400962Y/exportcitation) | -| Shampoo | *Preconditioned Stochastic Tensor Optimization* | 
[github](https://github.com/moskomule/shampoo.pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180209568G/exportcitation) | -| Nero | *Learning by Turning: Neural Architecture Aware Optimisation* | [github](https://github.com/jxbz/nero) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210207227L/exportcitation) | -| Adan | *Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models* | [github](https://github.com/sail-sg/Adan) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220806677X/exportcitation) | -| Adai | *Disentangling the Effects of Adaptive Learning Rate and Momentum* | [github](https://github.com/zeke-xie/adaptive-inertia-adai) | | [cite](https://github.com/zeke-xie/adaptive-inertia-adai#citing) | -| SAM | *Sharpness-Aware Minimization* | [github](https://github.com/davda54/sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201001412F/exportcitation) | -| ASAM | *Adaptive Sharpness-Aware Minimization* | [github](https://github.com/davda54/sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210211600K/exportcitation) | -| GSAM | *Surrogate Gap Guided Sharpness-Aware Minimization* | [github](https://github.com/juntang-zhuang/GSAM) | | [cite](https://github.com/juntang-zhuang/GSAM#citation) | -| D-Adaptation | *Learning-Rate-Free Learning by D-Adaptation* | [github](https://github.com/facebookresearch/dadaptation) | | [cite](https://ui.adsabs.harvard.edu/abs/2023arXiv230107733D/exportcitation) | -| AdaFactor | *Adaptive Learning Rates with Sublinear Memory Cost* | [github](https://github.com/DeadAt0m/adafactor-pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180404235S/exportcitation) | -| Apollo | *An Adaptive Parameter-wise Diagonal Quasi-Newton Method for Nonconvex Stochastic Optimization* | [github](https://github.com/XuezheMax/apollo) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv200913586M/exportcitation) | -| NovoGrad | *Stochastic Gradient Methods with Layer-wise Adaptive Moments for Training of Deep Networks* | [github](https://github.com/lonePatient/NovoGrad-pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190511286G/exportcitation) | -| Lion | *Symbolic Discovery of Optimization Algorithms* | [github](https://github.com/google/automl/tree/master/lion) | | [cite](https://github.com/google/automl/tree/master/lion#citation) | -| Ali-G | *Adaptive Learning Rates for Interpolation with Gradients* | [github](https://github.com/oval-group/ali-g) | | [cite](https://github.com/oval-group/ali-g#adaptive-learning-rates-for-interpolation-with-gradients) | -| SM3 | *Memory-Efficient Adaptive Optimization* | [github](https://github.com/google-research/google-research/tree/master/sm3) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190111150A/exportcitation) | -| AdaNorm | *Adaptive Gradient Norm Correction based Optimizer for CNNs* | [github](https://github.com/shivram1987/AdaNorm) | | [cite](https://github.com/shivram1987/AdaNorm/tree/main#citation) | -| RotoGrad | *Gradient Homogenization in Multitask Learning* | [github](https://github.com/adrianjav/rotograd) | | [cite](https://github.com/adrianjav/rotograd#citing) | -| A2Grad | *Optimal Adaptive and Accelerated Stochastic Gradient Descent* | [github](https://github.com/severilov/A2Grad_optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv181000553D/exportcitation) | -| AccSGD | *Accelerating Stochastic Gradient Descent For Least Squares Regression* | [github](https://github.com/rahulkidambi/AccSGD) | | 
[cite](https://github.com/rahulkidambi/AccSGD#citation) | -| SGDW | *Decoupled Weight Decay Regularization* | [github](https://github.com/loshchil/AdamW-and-SGDW) | | [cite](https://github.com/loshchil/AdamW-and-SGDW#contact) | -| ASGD | *Adaptive Gradient Descent without Descent* | [github](https://github.com/ymalitsky/adaptive_GD) | | [cite](https://github.com/ymalitsky/adaptive_GD#reference) | -| Yogi | *Adaptive Methods for Nonconvex Optimization* | | [NIPS 2018](https://papers.nips.cc/paper/8186-adaptive-methods-for-nonconvex-optimization) | [cite](https://proceedings.neurips.cc/paper_files/paper/2018/hash/90365351ccc7437a1309dc64e4db32a3-Abstract.html) | -| SWATS | *Improving Generalization Performance by Switching from Adam to SGD* | | | [cite](https://ui.adsabs.harvard.edu/abs/2017arXiv171207628S/exportcitation) | -| Fromage | *On the distance between two neural networks and the stability of learning* | [github](https://github.com/jxbz/fromage) | | [cite](https://github.com/jxbz/fromage#citation) | -| MSVAG | *Dissecting Adam: The Sign, Magnitude and Variance of Stochastic Gradients* | [github](https://github.com/lballes/msvag) | | [cite](https://github.com/lballes/msvag#citation) | -| AdaMod | *An Adaptive and Momental Bound Method for Stochastic Learning* | [github](https://github.com/lancopku/AdaMod) | | [cite](https://github.com/lancopku/AdaMod#citation) | -| AggMo | *Aggregated Momentum: Stability Through Passive Damping* | [github](https://github.com/AtheMathmo/AggMo) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180400325L/exportcitation) | -| QHAdam | *Quasi-hyperbolic momentum and Adam for deep learning* | [github](https://github.com/facebookresearch/qhoptim) | | [cite](https://github.com/facebookresearch/qhoptim#reference) | -| PID | *A PID Controller Approach for Stochastic Optimization of Deep Networks* | [github](https://github.com/tensorboy/PIDOptimizer) | [CVPR 18](http://www4.comp.polyu.edu.hk/~cslzhang/paper/CVPR18_PID.pdf) | [cite](https://github.com/tensorboy/PIDOptimizer#citation) | -| Gravity | *a Kinematic Approach on Optimization in Deep Learning* | [github](https://github.com/dariush-bahrami/gravity.optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210109192B/exportcitation) | -| AdaSmooth | *An Adaptive Learning Rate Method based on Effective Ratio* | | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220400825L/exportcitation) | -| SRMM | *Stochastic regularized majorization-minimization with weakly convex and multi-convex surrogates* | [github](https://github.com/HanbaekLyu/SRMM) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220101652L/exportcitation) | -| AvaGrad | *Domain-independent Dominance of Adaptive Methods* | [github](https://github.com/lolemacs/avagrad) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv191201823S/exportcitation) | -| PCGrad | *Gradient Surgery for Multi-Task Learning* | [github](https://github.com/tianheyu927/PCGrad) | | [cite](https://github.com/tianheyu927/PCGrad#reference) | -| AMSGrad | *On the Convergence of Adam and Beyond* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190409237R/exportcitation) | -| Lookahead | *k steps forward, 1 step back* | [github](https://github.com/pytorch/examples/tree/main/imagenet) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190708610Z/exportcitation) | -| PNM | *Manipulating Stochastic Gradient Noise to Improve Generalization* | [github](https://github.com/zeke-xie/Positive-Negative-Momentum) | | 
[cite](https://github.com/zeke-xie/Positive-Negative-Momentum#citing) | -| GC | *Gradient Centralization* | [github](https://github.com/Yonghongwei/Gradient-Centralization) | | [cite](https://github.com/Yonghongwei/Gradient-Centralization#citation) | -| AGC | *Adaptive Gradient Clipping* | [github](https://github.com/deepmind/deepmind-research/tree/master/nfnets) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210206171B/exportcitation) | -| Stable WD | *Understanding and Scheduling Weight Decay* | [github](https://github.com/zeke-xie/stable-weight-decay-regularization) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201111152X/exportcitation) | -| Softplus T | *Calibrating the Adaptive Learning Rate to Improve Convergence of ADAM* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190800700T/exportcitation) | -| Un-tuned w/u | *On the adequacy of untuned warmup for adaptive optimization* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv191004209M/exportcitation) | -| Norm Loss | *An efficient yet effective regularization method for deep neural networks* | | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210306583G/exportcitation) | -| AdaShift | *Decorrelation and Convergence of Adaptive Learning Rate Methods* | [github](https://github.com/MichaelKonobeev/adashift) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv181000143Z/exportcitation) | -| AdaDelta | *An Adaptive Learning Rate Method* | | | [cite](https://ui.adsabs.harvard.edu/abs/2012arXiv1212.5701Z/exportcitation) | -| Amos | *An Adam-style Optimizer with Adaptive Weight Decay towards Model-Oriented Scale* | [github](https://github.com/google-research/jestimator) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv221011693T/exportcitation) | -| SignSGD | *Compressed Optimisation for Non-Convex Problems* | [github](https://github.com/jxbz/signSGD) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180204434B/exportcitation) | -| Sophia | *A Scalable Stochastic Second-order Optimizer for Language Model Pre-training* | [github](https://github.com/Liuhong99/Sophia) | | [cite](https://github.com/Liuhong99/Sophia) | -| Prodigy | *An Expeditiously Adaptive Parameter-Free Learner* | [github](https://github.com/konstmish/prodigy) | | [cite](https://github.com/konstmish/prodigy#how-to-cite) | -| PAdam | *Closing the Generalization Gap of Adaptive Gradient Methods in Training Deep Neural Networks* | [github](https://github.com/uclaml/Padam) | | [cite](https://github.com/uclaml/Padam#citation) | -| LOMO | *Full Parameter Fine-tuning for Large Language Models with Limited Resources* | [github](https://github.com/OpenLMLab/LOMO) | | [cite](https://github.com/OpenLMLab/LOMO#citation) | -| Tiger | *A Tight-fisted Optimizer, an optimizer that is extremely budget-conscious* | [github](https://github.com/bojone/tiger) | | [cite](https://github.com/bojone/tiger/blob/main/README_en.md#citation) | -| CAME | *Confidence-guided Adaptive Memory Efficient Optimization* | [github](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/CAME) | | [cite](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/CAME#citation) | -| WSAM | *Sharpness-Aware Minimization Revisited: Weighted Sharpness as a Regularization Term* | [github](https://github.com/intelligent-machine-learning/dlrover/blob/master/atorch/atorch/optimizers/wsam.py) | | [cite](https://github.com/intelligent-machine-learning/dlrover) | -| Aida | *A DNN Optimizer that Improves over AdaBelief by Suppression of the Adaptive Stepsize 
Range* | [github](https://github.com/guoqiang-zhang-x/Aida-Optimizer) | | [cite](https://github.com/guoqiang-zhang-x/Aida-Optimizer?tab=readme-ov-file#1-brief-description-of-aida) | -| GaLore | *Memory-Efficient LLM Training by Gradient Low-Rank Projection* | [github](https://github.com/jiaweizzhao/GaLore) | | [cite](https://github.com/jiaweizzhao/GaLore/tree/master?tab=readme-ov-file#citation) | -| Adalite | *Adalite optimizer* | [github](https://github.com/VatsaDev/adalite) | | [cite](https://github.com/VatsaDev/adalite) | -| bSAM | *SAM as an Optimal Relaxation of Bayes* | [github](https://github.com/team-approx-bayes/bayesian-sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv221001620M/exportcitation) | -| Schedule-Free | *Schedule-Free Optimizers* | [github](https://github.com/facebookresearch/schedule_free) | | [cite](https://github.com/facebookresearch/schedule_free) | -| FAdam | *Adam is a natural gradient optimizer using diagonal empirical Fisher information* | [github](https://github.com/lessw2020/fadam_pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2024arXiv240512807H/exportcitation) | -| Grokfast | *Accelerated Grokking by Amplifying Slow Gradients* | [github](https://github.com/ironjr/grokfast) | | [cite](https://github.com/ironjr/grokfast?tab=readme-ov-file#citation) | +| Optimizer | Description | Official Code | Paper | Citation | +|---------------|---------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------| +| AdaBelief | *Adapting Step-sizes by the Belief in Observed Gradients* | [github](https://github.com/juntang-zhuang/Adabelief-Optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201007468Z/exportcitation) | +| AdaBound | *Adaptive Gradient Methods with Dynamic Bound of Learning Rate* | [github](https://github.com/Luolc/AdaBound/blob/master/adabound/adabound.py) | | [cite](https://github.com/Luolc/AdaBound#citing) | +| AdaHessian | *An Adaptive Second Order Optimizer for Machine Learning* | [github](https://github.com/amirgholami/adahessian) | | [cite](https://github.com/amirgholami/adahessian#citation) | +| AdamD | *Improved bias-correction in Adam* | | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv211010828S/exportcitation) | +| AdamP | *Slowing Down the Slowdown for Momentum Optimizers on Scale-invariant Weights* | [github](https://github.com/clovaai/AdamP) | | [cite](https://github.com/clovaai/AdamP#how-to-cite) | +| diffGrad | *An Optimization Method for Convolutional Neural Networks* | [github](https://github.com/shivram1987/diffGrad) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190911015D/exportcitation) | +| MADGRAD | *A Momentumized, Adaptive, Dual Averaged Gradient Method for Stochastic* | [github](https://github.com/facebookresearch/madgrad) | | [cite](https://github.com/facebookresearch/madgrad#tech-report) | +| RAdam | *On the Variance of the Adaptive Learning Rate and Beyond* | [github](https://github.com/LiyuanLucasLiu/RAdam) | | [cite](https://github.com/LiyuanLucasLiu/RAdam#citation) | +| Ranger | *a synergistic optimizer combining RAdam and LookAhead, and now GC in one optimizer* | 
[github](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer) | | [cite](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer#citing-this-work) | +| Ranger21 | *a synergistic deep learning optimizer* | [github](https://github.com/lessw2020/Ranger21) | | [cite](https://github.com/lessw2020/Ranger21#referencing-this-work) | +| Lamb | *Large Batch Optimization for Deep Learning* | [github](https://github.com/cybertronai/pytorch-lamb) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190400962Y/exportcitation) | +| Shampoo | *Preconditioned Stochastic Tensor Optimization* | [github](https://github.com/moskomule/shampoo.pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180209568G/exportcitation) | +| Nero | *Learning by Turning: Neural Architecture Aware Optimisation* | [github](https://github.com/jxbz/nero) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210207227L/exportcitation) | +| Adan | *Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models* | [github](https://github.com/sail-sg/Adan) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220806677X/exportcitation) | +| Adai | *Disentangling the Effects of Adaptive Learning Rate and Momentum* | [github](https://github.com/zeke-xie/adaptive-inertia-adai) | | [cite](https://github.com/zeke-xie/adaptive-inertia-adai#citing) | +| SAM | *Sharpness-Aware Minimization* | [github](https://github.com/davda54/sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201001412F/exportcitation) | +| ASAM | *Adaptive Sharpness-Aware Minimization* | [github](https://github.com/davda54/sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210211600K/exportcitation) | +| GSAM | *Surrogate Gap Guided Sharpness-Aware Minimization* | [github](https://github.com/juntang-zhuang/GSAM) | | [cite](https://github.com/juntang-zhuang/GSAM#citation) | +| D-Adaptation | *Learning-Rate-Free Learning by D-Adaptation* | [github](https://github.com/facebookresearch/dadaptation) | | [cite](https://ui.adsabs.harvard.edu/abs/2023arXiv230107733D/exportcitation) | +| AdaFactor | *Adaptive Learning Rates with Sublinear Memory Cost* | [github](https://github.com/DeadAt0m/adafactor-pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180404235S/exportcitation) | +| Apollo | *An Adaptive Parameter-wise Diagonal Quasi-Newton Method for Nonconvex Stochastic Optimization* | [github](https://github.com/XuezheMax/apollo) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv200913586M/exportcitation) | +| NovoGrad | *Stochastic Gradient Methods with Layer-wise Adaptive Moments for Training of Deep Networks* | [github](https://github.com/lonePatient/NovoGrad-pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190511286G/exportcitation) | +| Lion | *Symbolic Discovery of Optimization Algorithms* | [github](https://github.com/google/automl/tree/master/lion) | | [cite](https://github.com/google/automl/tree/master/lion#citation) | +| Ali-G | *Adaptive Learning Rates for Interpolation with Gradients* | [github](https://github.com/oval-group/ali-g) | | [cite](https://github.com/oval-group/ali-g#adaptive-learning-rates-for-interpolation-with-gradients) | +| SM3 | *Memory-Efficient Adaptive Optimization* | [github](https://github.com/google-research/google-research/tree/master/sm3) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190111150A/exportcitation) | +| AdaNorm | *Adaptive Gradient Norm Correction based Optimizer for CNNs* | [github](https://github.com/shivram1987/AdaNorm) | | 
[cite](https://github.com/shivram1987/AdaNorm/tree/main#citation) | +| RotoGrad | *Gradient Homogenization in Multitask Learning* | [github](https://github.com/adrianjav/rotograd) | | [cite](https://github.com/adrianjav/rotograd#citing) | +| A2Grad | *Optimal Adaptive and Accelerated Stochastic Gradient Descent* | [github](https://github.com/severilov/A2Grad_optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv181000553D/exportcitation) | +| AccSGD | *Accelerating Stochastic Gradient Descent For Least Squares Regression* | [github](https://github.com/rahulkidambi/AccSGD) | | [cite](https://github.com/rahulkidambi/AccSGD#citation) | +| SGDW | *Decoupled Weight Decay Regularization* | [github](https://github.com/loshchil/AdamW-and-SGDW) | | [cite](https://github.com/loshchil/AdamW-and-SGDW#contact) | +| ASGD | *Adaptive Gradient Descent without Descent* | [github](https://github.com/ymalitsky/adaptive_GD) | | [cite](https://github.com/ymalitsky/adaptive_GD#reference) | +| Yogi | *Adaptive Methods for Nonconvex Optimization* | | [NIPS 2018](https://papers.nips.cc/paper/8186-adaptive-methods-for-nonconvex-optimization) | [cite](https://proceedings.neurips.cc/paper_files/paper/2018/hash/90365351ccc7437a1309dc64e4db32a3-Abstract.html) | +| SWATS | *Improving Generalization Performance by Switching from Adam to SGD* | | | [cite](https://ui.adsabs.harvard.edu/abs/2017arXiv171207628S/exportcitation) | +| Fromage | *On the distance between two neural networks and the stability of learning* | [github](https://github.com/jxbz/fromage) | | [cite](https://github.com/jxbz/fromage#citation) | +| MSVAG | *Dissecting Adam: The Sign, Magnitude and Variance of Stochastic Gradients* | [github](https://github.com/lballes/msvag) | | [cite](https://github.com/lballes/msvag#citation) | +| AdaMod | *An Adaptive and Momental Bound Method for Stochastic Learning* | [github](https://github.com/lancopku/AdaMod) | | [cite](https://github.com/lancopku/AdaMod#citation) | +| AggMo | *Aggregated Momentum: Stability Through Passive Damping* | [github](https://github.com/AtheMathmo/AggMo) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180400325L/exportcitation) | +| QHAdam | *Quasi-hyperbolic momentum and Adam for deep learning* | [github](https://github.com/facebookresearch/qhoptim) | | [cite](https://github.com/facebookresearch/qhoptim#reference) | +| PID | *A PID Controller Approach for Stochastic Optimization of Deep Networks* | [github](https://github.com/tensorboy/PIDOptimizer) | [CVPR 18](http://www4.comp.polyu.edu.hk/~cslzhang/paper/CVPR18_PID.pdf) | [cite](https://github.com/tensorboy/PIDOptimizer#citation) | +| Gravity | *a Kinematic Approach on Optimization in Deep Learning* | [github](https://github.com/dariush-bahrami/gravity.optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210109192B/exportcitation) | +| AdaSmooth | *An Adaptive Learning Rate Method based on Effective Ratio* | | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220400825L/exportcitation) | +| SRMM | *Stochastic regularized majorization-minimization with weakly convex and multi-convex surrogates* | [github](https://github.com/HanbaekLyu/SRMM) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220101652L/exportcitation) | +| AvaGrad | *Domain-independent Dominance of Adaptive Methods* | [github](https://github.com/lolemacs/avagrad) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv191201823S/exportcitation) | +| PCGrad | *Gradient Surgery for Multi-Task Learning* | 
[github](https://github.com/tianheyu927/PCGrad) | | [cite](https://github.com/tianheyu927/PCGrad#reference) | +| AMSGrad | *On the Convergence of Adam and Beyond* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190409237R/exportcitation) | +| Lookahead | *k steps forward, 1 step back* | [github](https://github.com/pytorch/examples/tree/main/imagenet) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190708610Z/exportcitation) | +| PNM | *Manipulating Stochastic Gradient Noise to Improve Generalization* | [github](https://github.com/zeke-xie/Positive-Negative-Momentum) | | [cite](https://github.com/zeke-xie/Positive-Negative-Momentum#citing) | +| GC | *Gradient Centralization* | [github](https://github.com/Yonghongwei/Gradient-Centralization) | | [cite](https://github.com/Yonghongwei/Gradient-Centralization#citation) | +| AGC | *Adaptive Gradient Clipping* | [github](https://github.com/deepmind/deepmind-research/tree/master/nfnets) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210206171B/exportcitation) | +| Stable WD | *Understanding and Scheduling Weight Decay* | [github](https://github.com/zeke-xie/stable-weight-decay-regularization) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201111152X/exportcitation) | +| Softplus T | *Calibrating the Adaptive Learning Rate to Improve Convergence of ADAM* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190800700T/exportcitation) | +| Un-tuned w/u | *On the adequacy of untuned warmup for adaptive optimization* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv191004209M/exportcitation) | +| Norm Loss | *An efficient yet effective regularization method for deep neural networks* | | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210306583G/exportcitation) | +| AdaShift | *Decorrelation and Convergence of Adaptive Learning Rate Methods* | [github](https://github.com/MichaelKonobeev/adashift) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv181000143Z/exportcitation) | +| AdaDelta | *An Adaptive Learning Rate Method* | | | [cite](https://ui.adsabs.harvard.edu/abs/2012arXiv1212.5701Z/exportcitation) | +| Amos | *An Adam-style Optimizer with Adaptive Weight Decay towards Model-Oriented Scale* | [github](https://github.com/google-research/jestimator) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv221011693T/exportcitation) | +| SignSGD | *Compressed Optimisation for Non-Convex Problems* | [github](https://github.com/jxbz/signSGD) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180204434B/exportcitation) | +| Sophia | *A Scalable Stochastic Second-order Optimizer for Language Model Pre-training* | [github](https://github.com/Liuhong99/Sophia) | | [cite](https://github.com/Liuhong99/Sophia) | +| Prodigy | *An Expeditiously Adaptive Parameter-Free Learner* | [github](https://github.com/konstmish/prodigy) | | [cite](https://github.com/konstmish/prodigy#how-to-cite) | +| PAdam | *Closing the Generalization Gap of Adaptive Gradient Methods in Training Deep Neural Networks* | [github](https://github.com/uclaml/Padam) | | [cite](https://github.com/uclaml/Padam#citation) | +| LOMO | *Full Parameter Fine-tuning for Large Language Models with Limited Resources* | [github](https://github.com/OpenLMLab/LOMO) | | [cite](https://github.com/OpenLMLab/LOMO#citation) | +| Tiger | *A Tight-fisted Optimizer, an optimizer that is extremely budget-conscious* | [github](https://github.com/bojone/tiger) | | [cite](https://github.com/bojone/tiger/blob/main/README_en.md#citation) | +| CAME | *Confidence-guided Adaptive 
Memory Efficient Optimization* | [github](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/CAME) | | [cite](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/CAME#citation) | +| WSAM | *Sharpness-Aware Minimization Revisited: Weighted Sharpness as a Regularization Term* | [github](https://github.com/intelligent-machine-learning/dlrover/blob/master/atorch/atorch/optimizers/wsam.py) | | [cite](https://github.com/intelligent-machine-learning/dlrover) | +| Aida | *A DNN Optimizer that Improves over AdaBelief by Suppression of the Adaptive Stepsize Range* | [github](https://github.com/guoqiang-zhang-x/Aida-Optimizer) | | [cite](https://github.com/guoqiang-zhang-x/Aida-Optimizer?tab=readme-ov-file#1-brief-description-of-aida) | +| GaLore | *Memory-Efficient LLM Training by Gradient Low-Rank Projection* | [github](https://github.com/jiaweizzhao/GaLore) | | [cite](https://github.com/jiaweizzhao/GaLore/tree/master?tab=readme-ov-file#citation) | +| Adalite | *Adalite optimizer* | [github](https://github.com/VatsaDev/adalite) | | [cite](https://github.com/VatsaDev/adalite) | +| bSAM | *SAM as an Optimal Relaxation of Bayes* | [github](https://github.com/team-approx-bayes/bayesian-sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv221001620M/exportcitation) | +| Schedule-Free | *Schedule-Free Optimizers* | [github](https://github.com/facebookresearch/schedule_free) | | [cite](https://github.com/facebookresearch/schedule_free) | +| FAdam | *Adam is a natural gradient optimizer using diagonal empirical Fisher information* | [github](https://github.com/lessw2020/fadam_pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2024arXiv240512807H/exportcitation) | +| Grokfast | *Accelerated Grokking by Amplifying Slow Gradients* | [github](https://github.com/ironjr/grokfast) | | [cite](https://github.com/ironjr/grokfast?tab=readme-ov-file#citation) | +| Kate | *Remove that Square Root: A New Efficient Scale-Invariant Version of AdaGrad* | [github](https://github.com/nazya/KATE) | | [cite](https://github.com/nazya/KATE?tab=readme-ov-file#remove-that-square-root-a-new-efficient-scale-invariant-version-of-adagrad) | ## Supported LR Scheduler diff --git a/docs/changelogs/v3.0.2.md b/docs/changelogs/v3.0.2.md index b2d1c8a2b..2b2fc7029 100644 --- a/docs/changelogs/v3.0.2.md +++ b/docs/changelogs/v3.0.2.md @@ -5,6 +5,8 @@ * Implement `WSD` LR Scheduler. (#247, #248) * [Warmup-Stable-Decay LR Scheduler](https://arxiv.org/abs/2404.06395) * Add more Pytorch built-in lr schedulers. (#248) +* Implement `Kate` optimizer. (#249, #251) + * [Remove that Square Root: A New Efficient Scale-Invariant Version of AdaGrad](https://arxiv.org/abs/2403.02648) ### Refactor diff --git a/docs/index.md b/docs/index.md index d19b3728a..1f89c5d13 100644 --- a/docs/index.md +++ b/docs/index.md @@ -10,7 +10,7 @@ **pytorch-optimizer** is optimizer & lr scheduler collections in PyTorch. I just re-implemented (speed & memory tweaks, plug-ins) the algorithm while based on the original paper. Also, It includes useful and practical optimization ideas. -Currently, **69 optimizers (+ `bitsandbytes`)**, **16 lr schedulers**, and **13 loss functions** are supported! +Currently, **70 optimizers (+ `bitsandbytes`)**, **16 lr schedulers**, and **13 loss functions** are supported! Highly inspired by [pytorch-optimizer](https://github.com/jettify/pytorch-optimizer). 
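The changelog hunk above records the new `Kate` optimizer (the scale-invariant AdaGrad variant from "Remove that Square Root"), and the `docs/optimizer.md` hunk later in this diff documents it as `pytorch_optimizer.Kate`. As a minimal, hedged usage sketch: the constructor arguments shown (`lr`, `weight_decay`) are assumptions modeled on the library's other optimizers, not taken from this diff.

```python
# Sketch only: assumes Kate is exported from pytorch_optimizer and accepts
# the usual params/lr/weight_decay arguments like the library's other optimizers.
import torch
from pytorch_optimizer import Kate

model = torch.nn.Linear(10, 2)
optimizer = Kate(model.parameters(), lr=1e-3, weight_decay=0.0)

for _ in range(10):
    optimizer.zero_grad()
    # toy objective: drive the linear layer's outputs toward zero
    loss = model(torch.randn(8, 10)).square().mean()
    loss.backward()
    optimizer.step()
```

The training loop itself is the standard PyTorch `zero_grad()` / `backward()` / `step()` pattern; only the optimizer class changes.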
@@ -93,79 +93,80 @@ from pytorch_optimizer import get_supported_optimizers supported_optimizers = get_supported_optimizers() ``` -| Optimizer | Description | Official Code | Paper | Citation | -|---------------|---------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------| -| AdaBelief | *Adapting Step-sizes by the Belief in Observed Gradients* | [github](https://github.com/juntang-zhuang/Adabelief-Optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201007468Z/exportcitation) | -| AdaBound | *Adaptive Gradient Methods with Dynamic Bound of Learning Rate* | [github](https://github.com/Luolc/AdaBound/blob/master/adabound/adabound.py) | | [cite](https://github.com/Luolc/AdaBound#citing) | -| AdaHessian | *An Adaptive Second Order Optimizer for Machine Learning* | [github](https://github.com/amirgholami/adahessian) | | [cite](https://github.com/amirgholami/adahessian#citation) | -| AdamD | *Improved bias-correction in Adam* | | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv211010828S/exportcitation) | -| AdamP | *Slowing Down the Slowdown for Momentum Optimizers on Scale-invariant Weights* | [github](https://github.com/clovaai/AdamP) | | [cite](https://github.com/clovaai/AdamP#how-to-cite) | -| diffGrad | *An Optimization Method for Convolutional Neural Networks* | [github](https://github.com/shivram1987/diffGrad) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190911015D/exportcitation) | -| MADGRAD | *A Momentumized, Adaptive, Dual Averaged Gradient Method for Stochastic* | [github](https://github.com/facebookresearch/madgrad) | | [cite](https://github.com/facebookresearch/madgrad#tech-report) | -| RAdam | *On the Variance of the Adaptive Learning Rate and Beyond* | [github](https://github.com/LiyuanLucasLiu/RAdam) | | [cite](https://github.com/LiyuanLucasLiu/RAdam#citation) | -| Ranger | *a synergistic optimizer combining RAdam and LookAhead, and now GC in one optimizer* | [github](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer) | | [cite](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer#citing-this-work) | -| Ranger21 | *a synergistic deep learning optimizer* | [github](https://github.com/lessw2020/Ranger21) | | [cite](https://github.com/lessw2020/Ranger21#referencing-this-work) | -| Lamb | *Large Batch Optimization for Deep Learning* | [github](https://github.com/cybertronai/pytorch-lamb) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190400962Y/exportcitation) | -| Shampoo | *Preconditioned Stochastic Tensor Optimization* | [github](https://github.com/moskomule/shampoo.pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180209568G/exportcitation) | -| Nero | *Learning by Turning: Neural Architecture Aware Optimisation* | [github](https://github.com/jxbz/nero) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210207227L/exportcitation) | -| Adan | *Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models* | [github](https://github.com/sail-sg/Adan) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220806677X/exportcitation) | -| Adai | *Disentangling the Effects of Adaptive Learning Rate and Momentum* | 
[github](https://github.com/zeke-xie/adaptive-inertia-adai) | | [cite](https://github.com/zeke-xie/adaptive-inertia-adai#citing) | -| SAM | *Sharpness-Aware Minimization* | [github](https://github.com/davda54/sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201001412F/exportcitation) | -| ASAM | *Adaptive Sharpness-Aware Minimization* | [github](https://github.com/davda54/sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210211600K/exportcitation) | -| GSAM | *Surrogate Gap Guided Sharpness-Aware Minimization* | [github](https://github.com/juntang-zhuang/GSAM) | | [cite](https://github.com/juntang-zhuang/GSAM#citation) | -| D-Adaptation | *Learning-Rate-Free Learning by D-Adaptation* | [github](https://github.com/facebookresearch/dadaptation) | | [cite](https://ui.adsabs.harvard.edu/abs/2023arXiv230107733D/exportcitation) | -| AdaFactor | *Adaptive Learning Rates with Sublinear Memory Cost* | [github](https://github.com/DeadAt0m/adafactor-pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180404235S/exportcitation) | -| Apollo | *An Adaptive Parameter-wise Diagonal Quasi-Newton Method for Nonconvex Stochastic Optimization* | [github](https://github.com/XuezheMax/apollo) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv200913586M/exportcitation) | -| NovoGrad | *Stochastic Gradient Methods with Layer-wise Adaptive Moments for Training of Deep Networks* | [github](https://github.com/lonePatient/NovoGrad-pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190511286G/exportcitation) | -| Lion | *Symbolic Discovery of Optimization Algorithms* | [github](https://github.com/google/automl/tree/master/lion) | | [cite](https://github.com/google/automl/tree/master/lion#citation) | -| Ali-G | *Adaptive Learning Rates for Interpolation with Gradients* | [github](https://github.com/oval-group/ali-g) | | [cite](https://github.com/oval-group/ali-g#adaptive-learning-rates-for-interpolation-with-gradients) | -| SM3 | *Memory-Efficient Adaptive Optimization* | [github](https://github.com/google-research/google-research/tree/master/sm3) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190111150A/exportcitation) | -| AdaNorm | *Adaptive Gradient Norm Correction based Optimizer for CNNs* | [github](https://github.com/shivram1987/AdaNorm) | | [cite](https://github.com/shivram1987/AdaNorm/tree/main#citation) | -| RotoGrad | *Gradient Homogenization in Multitask Learning* | [github](https://github.com/adrianjav/rotograd) | | [cite](https://github.com/adrianjav/rotograd#citing) | -| A2Grad | *Optimal Adaptive and Accelerated Stochastic Gradient Descent* | [github](https://github.com/severilov/A2Grad_optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv181000553D/exportcitation) | -| AccSGD | *Accelerating Stochastic Gradient Descent For Least Squares Regression* | [github](https://github.com/rahulkidambi/AccSGD) | | [cite](https://github.com/rahulkidambi/AccSGD#citation) | -| SGDW | *Decoupled Weight Decay Regularization* | [github](https://github.com/loshchil/AdamW-and-SGDW) | | [cite](https://github.com/loshchil/AdamW-and-SGDW#contact) | -| ASGD | *Adaptive Gradient Descent without Descent* | [github](https://github.com/ymalitsky/adaptive_GD) | | [cite](https://github.com/ymalitsky/adaptive_GD#reference) | -| Yogi | *Adaptive Methods for Nonconvex Optimization* | | [NIPS 2018](https://papers.nips.cc/paper/8186-adaptive-methods-for-nonconvex-optimization) | 
[cite](https://proceedings.neurips.cc/paper_files/paper/2018/hash/90365351ccc7437a1309dc64e4db32a3-Abstract.html) | -| SWATS | *Improving Generalization Performance by Switching from Adam to SGD* | | | [cite](https://ui.adsabs.harvard.edu/abs/2017arXiv171207628S/exportcitation) | -| Fromage | *On the distance between two neural networks and the stability of learning* | [github](https://github.com/jxbz/fromage) | | [cite](https://github.com/jxbz/fromage#citation) | -| MSVAG | *Dissecting Adam: The Sign, Magnitude and Variance of Stochastic Gradients* | [github](https://github.com/lballes/msvag) | | [cite](https://github.com/lballes/msvag#citation) | -| AdaMod | *An Adaptive and Momental Bound Method for Stochastic Learning* | [github](https://github.com/lancopku/AdaMod) | | [cite](https://github.com/lancopku/AdaMod#citation) | -| AggMo | *Aggregated Momentum: Stability Through Passive Damping* | [github](https://github.com/AtheMathmo/AggMo) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180400325L/exportcitation) | -| QHAdam | *Quasi-hyperbolic momentum and Adam for deep learning* | [github](https://github.com/facebookresearch/qhoptim) | | [cite](https://github.com/facebookresearch/qhoptim#reference) | -| PID | *A PID Controller Approach for Stochastic Optimization of Deep Networks* | [github](https://github.com/tensorboy/PIDOptimizer) | [CVPR 18](http://www4.comp.polyu.edu.hk/~cslzhang/paper/CVPR18_PID.pdf) | [cite](https://github.com/tensorboy/PIDOptimizer#citation) | -| Gravity | *a Kinematic Approach on Optimization in Deep Learning* | [github](https://github.com/dariush-bahrami/gravity.optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210109192B/exportcitation) | -| AdaSmooth | *An Adaptive Learning Rate Method based on Effective Ratio* | | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220400825L/exportcitation) | -| SRMM | *Stochastic regularized majorization-minimization with weakly convex and multi-convex surrogates* | [github](https://github.com/HanbaekLyu/SRMM) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220101652L/exportcitation) | -| AvaGrad | *Domain-independent Dominance of Adaptive Methods* | [github](https://github.com/lolemacs/avagrad) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv191201823S/exportcitation) | -| PCGrad | *Gradient Surgery for Multi-Task Learning* | [github](https://github.com/tianheyu927/PCGrad) | | [cite](https://github.com/tianheyu927/PCGrad#reference) | -| AMSGrad | *On the Convergence of Adam and Beyond* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190409237R/exportcitation) | -| Lookahead | *k steps forward, 1 step back* | [github](https://github.com/pytorch/examples/tree/main/imagenet) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190708610Z/exportcitation) | -| PNM | *Manipulating Stochastic Gradient Noise to Improve Generalization* | [github](https://github.com/zeke-xie/Positive-Negative-Momentum) | | [cite](https://github.com/zeke-xie/Positive-Negative-Momentum#citing) | -| GC | *Gradient Centralization* | [github](https://github.com/Yonghongwei/Gradient-Centralization) | | [cite](https://github.com/Yonghongwei/Gradient-Centralization#citation) | -| AGC | *Adaptive Gradient Clipping* | [github](https://github.com/deepmind/deepmind-research/tree/master/nfnets) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210206171B/exportcitation) | -| Stable WD | *Understanding and Scheduling Weight Decay* | [github](https://github.com/zeke-xie/stable-weight-decay-regularization) | | 
[cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201111152X/exportcitation) | -| Softplus T | *Calibrating the Adaptive Learning Rate to Improve Convergence of ADAM* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190800700T/exportcitation) | -| Un-tuned w/u | *On the adequacy of untuned warmup for adaptive optimization* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv191004209M/exportcitation) | -| Norm Loss | *An efficient yet effective regularization method for deep neural networks* | | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210306583G/exportcitation) | -| AdaShift | *Decorrelation and Convergence of Adaptive Learning Rate Methods* | [github](https://github.com/MichaelKonobeev/adashift) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv181000143Z/exportcitation) | -| AdaDelta | *An Adaptive Learning Rate Method* | | | [cite](https://ui.adsabs.harvard.edu/abs/2012arXiv1212.5701Z/exportcitation) | -| Amos | *An Adam-style Optimizer with Adaptive Weight Decay towards Model-Oriented Scale* | [github](https://github.com/google-research/jestimator) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv221011693T/exportcitation) | -| SignSGD | *Compressed Optimisation for Non-Convex Problems* | [github](https://github.com/jxbz/signSGD) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180204434B/exportcitation) | -| Sophia | *A Scalable Stochastic Second-order Optimizer for Language Model Pre-training* | [github](https://github.com/Liuhong99/Sophia) | | [cite](https://github.com/Liuhong99/Sophia) | -| Prodigy | *An Expeditiously Adaptive Parameter-Free Learner* | [github](https://github.com/konstmish/prodigy) | | [cite](https://github.com/konstmish/prodigy#how-to-cite) | -| PAdam | *Closing the Generalization Gap of Adaptive Gradient Methods in Training Deep Neural Networks* | [github](https://github.com/uclaml/Padam) | | [cite](https://github.com/uclaml/Padam#citation) | -| LOMO | *Full Parameter Fine-tuning for Large Language Models with Limited Resources* | [github](https://github.com/OpenLMLab/LOMO) | | [cite](https://github.com/OpenLMLab/LOMO#citation) | -| Tiger | *A Tight-fisted Optimizer, an optimizer that is extremely budget-conscious* | [github](https://github.com/bojone/tiger) | | [cite](https://github.com/bojone/tiger/blob/main/README_en.md#citation) | -| CAME | *Confidence-guided Adaptive Memory Efficient Optimization* | [github](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/CAME) | | [cite](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/CAME#citation) | -| WSAM | *Sharpness-Aware Minimization Revisited: Weighted Sharpness as a Regularization Term* | [github](https://github.com/intelligent-machine-learning/dlrover/blob/master/atorch/atorch/optimizers/wsam.py) | | [cite](https://github.com/intelligent-machine-learning/dlrover) | -| Aida | *A DNN Optimizer that Improves over AdaBelief by Suppression of the Adaptive Stepsize Range* | [github](https://github.com/guoqiang-zhang-x/Aida-Optimizer) | | [cite](https://github.com/guoqiang-zhang-x/Aida-Optimizer?tab=readme-ov-file#1-brief-description-of-aida) | -| GaLore | *Memory-Efficient LLM Training by Gradient Low-Rank Projection* | [github](https://github.com/jiaweizzhao/GaLore) | | [cite](https://github.com/jiaweizzhao/GaLore/tree/master?tab=readme-ov-file#citation) | -| Adalite | *Adalite optimizer* | [github](https://github.com/VatsaDev/adalite) | | [cite](https://github.com/VatsaDev/adalite) | -| bSAM | *SAM as an Optimal Relaxation of Bayes* | 
[github](https://github.com/team-approx-bayes/bayesian-sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv221001620M/exportcitation) | -| Schedule-Free | *Schedule-Free Optimizers* | [github](https://github.com/facebookresearch/schedule_free) | | [cite](https://github.com/facebookresearch/schedule_free) | -| FAdam | *Adam is a natural gradient optimizer using diagonal empirical Fisher information* | [github](https://github.com/lessw2020/fadam_pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2024arXiv240512807H/exportcitation) | -| Grokfast | *Accelerated Grokking by Amplifying Slow Gradients* | [github](https://github.com/ironjr/grokfast) | | [cite](https://github.com/ironjr/grokfast?tab=readme-ov-file#citation) | +| Optimizer | Description | Official Code | Paper | Citation | +|---------------|---------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------| +| AdaBelief | *Adapting Step-sizes by the Belief in Observed Gradients* | [github](https://github.com/juntang-zhuang/Adabelief-Optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201007468Z/exportcitation) | +| AdaBound | *Adaptive Gradient Methods with Dynamic Bound of Learning Rate* | [github](https://github.com/Luolc/AdaBound/blob/master/adabound/adabound.py) | | [cite](https://github.com/Luolc/AdaBound#citing) | +| AdaHessian | *An Adaptive Second Order Optimizer for Machine Learning* | [github](https://github.com/amirgholami/adahessian) | | [cite](https://github.com/amirgholami/adahessian#citation) | +| AdamD | *Improved bias-correction in Adam* | | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv211010828S/exportcitation) | +| AdamP | *Slowing Down the Slowdown for Momentum Optimizers on Scale-invariant Weights* | [github](https://github.com/clovaai/AdamP) | | [cite](https://github.com/clovaai/AdamP#how-to-cite) | +| diffGrad | *An Optimization Method for Convolutional Neural Networks* | [github](https://github.com/shivram1987/diffGrad) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190911015D/exportcitation) | +| MADGRAD | *A Momentumized, Adaptive, Dual Averaged Gradient Method for Stochastic* | [github](https://github.com/facebookresearch/madgrad) | | [cite](https://github.com/facebookresearch/madgrad#tech-report) | +| RAdam | *On the Variance of the Adaptive Learning Rate and Beyond* | [github](https://github.com/LiyuanLucasLiu/RAdam) | | [cite](https://github.com/LiyuanLucasLiu/RAdam#citation) | +| Ranger | *a synergistic optimizer combining RAdam and LookAhead, and now GC in one optimizer* | [github](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer) | | [cite](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer#citing-this-work) | +| Ranger21 | *a synergistic deep learning optimizer* | [github](https://github.com/lessw2020/Ranger21) | | [cite](https://github.com/lessw2020/Ranger21#referencing-this-work) | +| Lamb | *Large Batch Optimization for Deep Learning* | [github](https://github.com/cybertronai/pytorch-lamb) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190400962Y/exportcitation) | +| Shampoo | *Preconditioned Stochastic Tensor Optimization* | 
[github](https://github.com/moskomule/shampoo.pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180209568G/exportcitation) | +| Nero | *Learning by Turning: Neural Architecture Aware Optimisation* | [github](https://github.com/jxbz/nero) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210207227L/exportcitation) | +| Adan | *Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models* | [github](https://github.com/sail-sg/Adan) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220806677X/exportcitation) | +| Adai | *Disentangling the Effects of Adaptive Learning Rate and Momentum* | [github](https://github.com/zeke-xie/adaptive-inertia-adai) | | [cite](https://github.com/zeke-xie/adaptive-inertia-adai#citing) | +| SAM | *Sharpness-Aware Minimization* | [github](https://github.com/davda54/sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201001412F/exportcitation) | +| ASAM | *Adaptive Sharpness-Aware Minimization* | [github](https://github.com/davda54/sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210211600K/exportcitation) | +| GSAM | *Surrogate Gap Guided Sharpness-Aware Minimization* | [github](https://github.com/juntang-zhuang/GSAM) | | [cite](https://github.com/juntang-zhuang/GSAM#citation) | +| D-Adaptation | *Learning-Rate-Free Learning by D-Adaptation* | [github](https://github.com/facebookresearch/dadaptation) | | [cite](https://ui.adsabs.harvard.edu/abs/2023arXiv230107733D/exportcitation) | +| AdaFactor | *Adaptive Learning Rates with Sublinear Memory Cost* | [github](https://github.com/DeadAt0m/adafactor-pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180404235S/exportcitation) | +| Apollo | *An Adaptive Parameter-wise Diagonal Quasi-Newton Method for Nonconvex Stochastic Optimization* | [github](https://github.com/XuezheMax/apollo) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv200913586M/exportcitation) | +| NovoGrad | *Stochastic Gradient Methods with Layer-wise Adaptive Moments for Training of Deep Networks* | [github](https://github.com/lonePatient/NovoGrad-pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190511286G/exportcitation) | +| Lion | *Symbolic Discovery of Optimization Algorithms* | [github](https://github.com/google/automl/tree/master/lion) | | [cite](https://github.com/google/automl/tree/master/lion#citation) | +| Ali-G | *Adaptive Learning Rates for Interpolation with Gradients* | [github](https://github.com/oval-group/ali-g) | | [cite](https://github.com/oval-group/ali-g#adaptive-learning-rates-for-interpolation-with-gradients) | +| SM3 | *Memory-Efficient Adaptive Optimization* | [github](https://github.com/google-research/google-research/tree/master/sm3) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190111150A/exportcitation) | +| AdaNorm | *Adaptive Gradient Norm Correction based Optimizer for CNNs* | [github](https://github.com/shivram1987/AdaNorm) | | [cite](https://github.com/shivram1987/AdaNorm/tree/main#citation) | +| RotoGrad | *Gradient Homogenization in Multitask Learning* | [github](https://github.com/adrianjav/rotograd) | | [cite](https://github.com/adrianjav/rotograd#citing) | +| A2Grad | *Optimal Adaptive and Accelerated Stochastic Gradient Descent* | [github](https://github.com/severilov/A2Grad_optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv181000553D/exportcitation) | +| AccSGD | *Accelerating Stochastic Gradient Descent For Least Squares Regression* | [github](https://github.com/rahulkidambi/AccSGD) | | 
[cite](https://github.com/rahulkidambi/AccSGD#citation) | +| SGDW | *Decoupled Weight Decay Regularization* | [github](https://github.com/loshchil/AdamW-and-SGDW) | | [cite](https://github.com/loshchil/AdamW-and-SGDW#contact) | +| ASGD | *Adaptive Gradient Descent without Descent* | [github](https://github.com/ymalitsky/adaptive_GD) | | [cite](https://github.com/ymalitsky/adaptive_GD#reference) | +| Yogi | *Adaptive Methods for Nonconvex Optimization* | | [NIPS 2018](https://papers.nips.cc/paper/8186-adaptive-methods-for-nonconvex-optimization) | [cite](https://proceedings.neurips.cc/paper_files/paper/2018/hash/90365351ccc7437a1309dc64e4db32a3-Abstract.html) | +| SWATS | *Improving Generalization Performance by Switching from Adam to SGD* | | | [cite](https://ui.adsabs.harvard.edu/abs/2017arXiv171207628S/exportcitation) | +| Fromage | *On the distance between two neural networks and the stability of learning* | [github](https://github.com/jxbz/fromage) | | [cite](https://github.com/jxbz/fromage#citation) | +| MSVAG | *Dissecting Adam: The Sign, Magnitude and Variance of Stochastic Gradients* | [github](https://github.com/lballes/msvag) | | [cite](https://github.com/lballes/msvag#citation) | +| AdaMod | *An Adaptive and Momental Bound Method for Stochastic Learning* | [github](https://github.com/lancopku/AdaMod) | | [cite](https://github.com/lancopku/AdaMod#citation) | +| AggMo | *Aggregated Momentum: Stability Through Passive Damping* | [github](https://github.com/AtheMathmo/AggMo) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180400325L/exportcitation) | +| QHAdam | *Quasi-hyperbolic momentum and Adam for deep learning* | [github](https://github.com/facebookresearch/qhoptim) | | [cite](https://github.com/facebookresearch/qhoptim#reference) | +| PID | *A PID Controller Approach for Stochastic Optimization of Deep Networks* | [github](https://github.com/tensorboy/PIDOptimizer) | [CVPR 18](http://www4.comp.polyu.edu.hk/~cslzhang/paper/CVPR18_PID.pdf) | [cite](https://github.com/tensorboy/PIDOptimizer#citation) | +| Gravity | *a Kinematic Approach on Optimization in Deep Learning* | [github](https://github.com/dariush-bahrami/gravity.optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210109192B/exportcitation) | +| AdaSmooth | *An Adaptive Learning Rate Method based on Effective Ratio* | | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220400825L/exportcitation) | +| SRMM | *Stochastic regularized majorization-minimization with weakly convex and multi-convex surrogates* | [github](https://github.com/HanbaekLyu/SRMM) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220101652L/exportcitation) | +| AvaGrad | *Domain-independent Dominance of Adaptive Methods* | [github](https://github.com/lolemacs/avagrad) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv191201823S/exportcitation) | +| PCGrad | *Gradient Surgery for Multi-Task Learning* | [github](https://github.com/tianheyu927/PCGrad) | | [cite](https://github.com/tianheyu927/PCGrad#reference) | +| AMSGrad | *On the Convergence of Adam and Beyond* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190409237R/exportcitation) | +| Lookahead | *k steps forward, 1 step back* | [github](https://github.com/pytorch/examples/tree/main/imagenet) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190708610Z/exportcitation) | +| PNM | *Manipulating Stochastic Gradient Noise to Improve Generalization* | [github](https://github.com/zeke-xie/Positive-Negative-Momentum) | | 
[cite](https://github.com/zeke-xie/Positive-Negative-Momentum#citing) | +| GC | *Gradient Centralization* | [github](https://github.com/Yonghongwei/Gradient-Centralization) | | [cite](https://github.com/Yonghongwei/Gradient-Centralization#citation) | +| AGC | *Adaptive Gradient Clipping* | [github](https://github.com/deepmind/deepmind-research/tree/master/nfnets) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210206171B/exportcitation) | +| Stable WD | *Understanding and Scheduling Weight Decay* | [github](https://github.com/zeke-xie/stable-weight-decay-regularization) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201111152X/exportcitation) | +| Softplus T | *Calibrating the Adaptive Learning Rate to Improve Convergence of ADAM* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190800700T/exportcitation) | +| Un-tuned w/u | *On the adequacy of untuned warmup for adaptive optimization* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv191004209M/exportcitation) | +| Norm Loss | *An efficient yet effective regularization method for deep neural networks* | | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210306583G/exportcitation) | +| AdaShift | *Decorrelation and Convergence of Adaptive Learning Rate Methods* | [github](https://github.com/MichaelKonobeev/adashift) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv181000143Z/exportcitation) | +| AdaDelta | *An Adaptive Learning Rate Method* | | | [cite](https://ui.adsabs.harvard.edu/abs/2012arXiv1212.5701Z/exportcitation) | +| Amos | *An Adam-style Optimizer with Adaptive Weight Decay towards Model-Oriented Scale* | [github](https://github.com/google-research/jestimator) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv221011693T/exportcitation) | +| SignSGD | *Compressed Optimisation for Non-Convex Problems* | [github](https://github.com/jxbz/signSGD) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180204434B/exportcitation) | +| Sophia | *A Scalable Stochastic Second-order Optimizer for Language Model Pre-training* | [github](https://github.com/Liuhong99/Sophia) | | [cite](https://github.com/Liuhong99/Sophia) | +| Prodigy | *An Expeditiously Adaptive Parameter-Free Learner* | [github](https://github.com/konstmish/prodigy) | | [cite](https://github.com/konstmish/prodigy#how-to-cite) | +| PAdam | *Closing the Generalization Gap of Adaptive Gradient Methods in Training Deep Neural Networks* | [github](https://github.com/uclaml/Padam) | | [cite](https://github.com/uclaml/Padam#citation) | +| LOMO | *Full Parameter Fine-tuning for Large Language Models with Limited Resources* | [github](https://github.com/OpenLMLab/LOMO) | | [cite](https://github.com/OpenLMLab/LOMO#citation) | +| Tiger | *A Tight-fisted Optimizer, an optimizer that is extremely budget-conscious* | [github](https://github.com/bojone/tiger) | | [cite](https://github.com/bojone/tiger/blob/main/README_en.md#citation) | +| CAME | *Confidence-guided Adaptive Memory Efficient Optimization* | [github](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/CAME) | | [cite](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/CAME#citation) | +| WSAM | *Sharpness-Aware Minimization Revisited: Weighted Sharpness as a Regularization Term* | [github](https://github.com/intelligent-machine-learning/dlrover/blob/master/atorch/atorch/optimizers/wsam.py) | | [cite](https://github.com/intelligent-machine-learning/dlrover) | +| Aida | *A DNN Optimizer that Improves over AdaBelief by Suppression of the Adaptive Stepsize 
+| GaLore | *Memory-Efficient LLM Training by Gradient Low-Rank Projection* | [github](https://github.com/jiaweizzhao/GaLore) | | [cite](https://github.com/jiaweizzhao/GaLore/tree/master?tab=readme-ov-file#citation) |
+| Adalite | *Adalite optimizer* | [github](https://github.com/VatsaDev/adalite) | | [cite](https://github.com/VatsaDev/adalite) |
+| bSAM | *SAM as an Optimal Relaxation of Bayes* | [github](https://github.com/team-approx-bayes/bayesian-sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv221001620M/exportcitation) |
+| Schedule-Free | *Schedule-Free Optimizers* | [github](https://github.com/facebookresearch/schedule_free) | | [cite](https://github.com/facebookresearch/schedule_free) |
+| FAdam | *Adam is a natural gradient optimizer using diagonal empirical Fisher information* | [github](https://github.com/lessw2020/fadam_pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2024arXiv240512807H/exportcitation) |
+| Grokfast | *Accelerated Grokking by Amplifying Slow Gradients* | [github](https://github.com/ironjr/grokfast) | | [cite](https://github.com/ironjr/grokfast?tab=readme-ov-file#citation) |
+| Kate | *Remove that Square Root: A New Efficient Scale-Invariant Version of AdaGrad* | [github](https://github.com/nazya/KATE) | | [cite](https://github.com/nazya/KATE?tab=readme-ov-file#remove-that-square-root-a-new-efficient-scale-invariant-version-of-adagrad) |

 ## Supported LR Scheduler
diff --git a/docs/optimizer.md b/docs/optimizer.md
index 94f94f933..3d74a4365 100644
--- a/docs/optimizer.md
+++ b/docs/optimizer.md
@@ -172,6 +172,10 @@
     :docstring:
     :members:

+::: pytorch_optimizer.Kate
+    :docstring:
+    :members:
+
 ::: pytorch_optimizer.Lamb
     :docstring:
     :members:
diff --git a/poetry.lock b/poetry.lock
index de3c4df26..da0f176fa 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -547,29 +547,29 @@ testing = ["fields", "hunter", "process-tests", "pytest-xdist", "virtualenv"]

 [[package]]
 name = "ruff"
-version = "0.5.0"
+version = "0.5.1"
 description = "An extremely fast Python linter and code formatter, written in Rust."
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "ruff-0.5.0-py3-none-linux_armv6l.whl", hash = "sha256:ee770ea8ab38918f34e7560a597cc0a8c9a193aaa01bfbd879ef43cb06bd9c4c"},
-    {file = "ruff-0.5.0-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:38f3b8327b3cb43474559d435f5fa65dacf723351c159ed0dc567f7ab735d1b6"},
-    {file = "ruff-0.5.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:7594f8df5404a5c5c8f64b8311169879f6cf42142da644c7e0ba3c3f14130370"},
-    {file = "ruff-0.5.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:adc7012d6ec85032bc4e9065110df205752d64010bed5f958d25dbee9ce35de3"},
-    {file = "ruff-0.5.0-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d505fb93b0fabef974b168d9b27c3960714d2ecda24b6ffa6a87ac432905ea38"},
-    {file = "ruff-0.5.0-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9dc5cfd3558f14513ed0d5b70ce531e28ea81a8a3b1b07f0f48421a3d9e7d80a"},
-    {file = "ruff-0.5.0-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:db3ca35265de239a1176d56a464b51557fce41095c37d6c406e658cf80bbb362"},
-    {file = "ruff-0.5.0-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b1a321c4f68809fddd9b282fab6a8d8db796b270fff44722589a8b946925a2a8"},
-    {file = "ruff-0.5.0-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2c4dfcd8d34b143916994b3876b63d53f56724c03f8c1a33a253b7b1e6bf2a7d"},
-    {file = "ruff-0.5.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:81e5facfc9f4a674c6a78c64d38becfbd5e4f739c31fcd9ce44c849f1fad9e4c"},
-    {file = "ruff-0.5.0-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:e589e27971c2a3efff3fadafb16e5aef7ff93250f0134ec4b52052b673cf988d"},
-    {file = "ruff-0.5.0-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:d2ffbc3715a52b037bcb0f6ff524a9367f642cdc5817944f6af5479bbb2eb50e"},
-    {file = "ruff-0.5.0-py3-none-musllinux_1_2_i686.whl", hash = "sha256:cd096e23c6a4f9c819525a437fa0a99d1c67a1b6bb30948d46f33afbc53596cf"},
-    {file = "ruff-0.5.0-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:46e193b36f2255729ad34a49c9a997d506e58f08555366b2108783b3064a0e1e"},
-    {file = "ruff-0.5.0-py3-none-win32.whl", hash = "sha256:49141d267100f5ceff541b4e06552e98527870eafa1acc9dec9139c9ec5af64c"},
-    {file = "ruff-0.5.0-py3-none-win_amd64.whl", hash = "sha256:e9118f60091047444c1b90952736ee7b1792910cab56e9b9a9ac20af94cd0440"},
-    {file = "ruff-0.5.0-py3-none-win_arm64.whl", hash = "sha256:ed5c4df5c1fb4518abcb57725b576659542bdbe93366f4f329e8f398c4b71178"},
-    {file = "ruff-0.5.0.tar.gz", hash = "sha256:eb641b5873492cf9bd45bc9c5ae5320648218e04386a5f0c264ad6ccce8226a1"},
+    {file = "ruff-0.5.1-py3-none-linux_armv6l.whl", hash = "sha256:6ecf968fcf94d942d42b700af18ede94b07521bd188aaf2cd7bc898dd8cb63b6"},
+    {file = "ruff-0.5.1-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:204fb0a472f00f2e6280a7c8c7c066e11e20e23a37557d63045bf27a616ba61c"},
+    {file = "ruff-0.5.1-py3-none-macosx_11_0_arm64.whl", hash = "sha256:d235968460e8758d1e1297e1de59a38d94102f60cafb4d5382033c324404ee9d"},
+    {file = "ruff-0.5.1-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:38beace10b8d5f9b6bdc91619310af6d63dd2019f3fb2d17a2da26360d7962fa"},
+    {file = "ruff-0.5.1-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5e478d2f09cf06add143cf8c4540ef77b6599191e0c50ed976582f06e588c994"},
+    {file = "ruff-0.5.1-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f0368d765eec8247b8550251c49ebb20554cc4e812f383ff9f5bf0d5d94190b0"},
"sha256:f0368d765eec8247b8550251c49ebb20554cc4e812f383ff9f5bf0d5d94190b0"}, + {file = "ruff-0.5.1-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:3a9a9a1b582e37669b0138b7c1d9d60b9edac880b80eb2baba6d0e566bdeca4d"}, + {file = "ruff-0.5.1-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bdd9f723e16003623423affabcc0a807a66552ee6a29f90eddad87a40c750b78"}, + {file = "ruff-0.5.1-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:be9fd62c1e99539da05fcdc1e90d20f74aec1b7a1613463ed77870057cd6bd96"}, + {file = "ruff-0.5.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e216fc75a80ea1fbd96af94a6233d90190d5b65cc3d5dfacf2bd48c3e067d3e1"}, + {file = "ruff-0.5.1-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:c4c2112e9883a40967827d5c24803525145e7dab315497fae149764979ac7929"}, + {file = "ruff-0.5.1-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:dfaf11c8a116394da3b65cd4b36de30d8552fa45b8119b9ef5ca6638ab964fa3"}, + {file = "ruff-0.5.1-py3-none-musllinux_1_2_i686.whl", hash = "sha256:d7ceb9b2fe700ee09a0c6b192c5ef03c56eb82a0514218d8ff700f6ade004108"}, + {file = "ruff-0.5.1-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:bac6288e82f6296f82ed5285f597713acb2a6ae26618ffc6b429c597b392535c"}, + {file = "ruff-0.5.1-py3-none-win32.whl", hash = "sha256:5c441d9c24ec09e1cb190a04535c5379b36b73c4bc20aa180c54812c27d1cca4"}, + {file = "ruff-0.5.1-py3-none-win_amd64.whl", hash = "sha256:b1789bf2cd3d1b5a7d38397cac1398ddf3ad7f73f4de01b1e913e2abc7dfc51d"}, + {file = "ruff-0.5.1-py3-none-win_arm64.whl", hash = "sha256:2875b7596a740cbbd492f32d24be73e545a4ce0a3daf51e4f4e609962bfd3cd2"}, + {file = "ruff-0.5.1.tar.gz", hash = "sha256:3164488aebd89b1745b47fd00604fb4358d774465f20d1fcd907f9c0fc1b0655"}, ] [[package]] diff --git a/pyproject.toml b/pyproject.toml index f9f579bc6..f00a6429d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,8 +14,8 @@ keywords = [ "AdaDelta", "AdaFactor", "AdaMax", "AdaMod", "AdaNorm", "AdaPNM", "AdaSmooth", "AdaHessian", "Adai", "Adalite", "AdamP", "AdamS", "Adan", "AggMo", "Aida", "AliG", "Amos", "Apollo", "AvaGrad", "bSAM", "CAME", "DAdaptAdaGrad", "DAdaptAdam", "DAdaptAdan", "DAdaptSGD", "DAdaptLion", "DiffGrad", "FAdam", "Fromage", "GaLore", "Gravity", - "GrokFast", "GSAM", "LARS", "Lamb", "Lion", "LOMO", "Lookahead", "MADGRAD", "MSVAG", "Nero", "NovoGrad", "PAdam", - "PCGrad", "PID", "PNM", "Prodigy", "QHAdam", "QHM", "RAdam", "Ranger", "Ranger21", "RotoGrad", "SAM", + "GrokFast", "GSAM", "Kate", "Lamb", "LARS", "Lion", "LOMO", "Lookahead", "MADGRAD", "MSVAG", "Nero", "NovoGrad", + "PAdam", "PCGrad", "PID", "PNM", "Prodigy", "QHAdam", "QHM", "RAdam", "Ranger", "Ranger21", "RotoGrad", "SAM", "ScheduleFreeSGD", "ScheduleFreeAdamW", "SGDP", "Shampoo", "ScalableShampoo", "SGDW", "SignSGD", "SM3", "SopihaH", "SRMM", "SWATS", "Tiger", "WSAM", "Yogi", "BCE", "BCEFocal", "Focal", "FocalCosine", "SoftF1", "Dice", "LDAM", "Jaccard", "Bi-Tempered", "Tversky", "FocalTversky", "LovaszHinge", "bitsandbytes", "WSD", diff --git a/pytorch_optimizer/__init__.py b/pytorch_optimizer/__init__.py index 3dd59f58c..340232fe6 100644 --- a/pytorch_optimizer/__init__.py +++ b/pytorch_optimizer/__init__.py @@ -66,6 +66,7 @@ from pytorch_optimizer.optimizer.gc import centralize_gradient from pytorch_optimizer.optimizer.gravity import Gravity from pytorch_optimizer.optimizer.grokfast import GrokFastAdamW, gradfilter_ema, gradfilter_ma +from pytorch_optimizer.optimizer.kate import Kate from pytorch_optimizer.optimizer.lamb 
 from pytorch_optimizer.optimizer.lars import LARS
 from pytorch_optimizer.optimizer.lion import Lion
@@ -199,6 +200,7 @@
     ScheduleFreeAdamW,
     FAdam,
     GrokFastAdamW,
+    Kate,
 ]

 OPTIMIZERS: Dict[str, OPTIMIZER] = {str(optimizer.__name__).lower(): optimizer for optimizer in OPTIMIZER_LIST}
diff --git a/pytorch_optimizer/optimizer/kate.py b/pytorch_optimizer/optimizer/kate.py
new file mode 100644
index 000000000..5ea26c0f9
--- /dev/null
+++ b/pytorch_optimizer/optimizer/kate.py
@@ -0,0 +1,109 @@
+import torch
+from torch.optim.optimizer import Optimizer
+
+from pytorch_optimizer.base.exception import NoSparseGradientError
+from pytorch_optimizer.base.optimizer import BaseOptimizer
+from pytorch_optimizer.base.types import CLOSURE, DEFAULTS, LOSS, PARAMETERS
+
+
+class Kate(Optimizer, BaseOptimizer):
+    r"""Remove that Square Root: A New Efficient Scale-Invariant Version of AdaGrad.
+
+    :param params: PARAMETERS. iterable of parameters to optimize or dicts defining parameter groups.
+    :param lr: float. learning rate.
+    :param delta: float. delta. 0.0 or 1e-8.
+    :param weight_decay: float. weight decay (L2 penalty).
+    :param weight_decouple: bool. the optimizer uses decoupled weight decay as in AdamW.
+    :param fixed_decay: bool. fix weight decay.
+    :param eps: float. epsilon value.
+    """
+
+    def __init__(
+        self,
+        params: PARAMETERS,
+        lr: float = 1e-3,
+        delta: float = 0.0,
+        weight_decay: float = 0.0,
+        weight_decouple: bool = True,
+        fixed_decay: bool = False,
+        eps: float = 1e-8,
+    ):
+        self.validate_learning_rate(lr)
+        self.validate_range(delta, 'delta', 0.0, 1.0, '[)')
+        self.validate_non_negative(weight_decay, 'weight_decay')
+        self.validate_non_negative(eps, 'eps')
+
+        defaults: DEFAULTS = {
+            'lr': lr,
+            'delta': delta,
+            'weight_decay': weight_decay,
+            'weight_decouple': weight_decouple,
+            'fixed_decay': fixed_decay,
+            'eps': eps,
+        }
+
+        super().__init__(params, defaults)
+
+    def __str__(self) -> str:
+        return 'Kate'
+
+    @torch.no_grad()
+    def reset(self):
+        for group in self.param_groups:
+            group['step'] = 0
+            for p in group['params']:
+                state = self.state[p]
+
+                state['m'] = torch.zeros_like(p)
+                state['b'] = torch.zeros_like(p)
+
+    @torch.no_grad()
+    def step(self, closure: CLOSURE = None) -> LOSS:
+        loss: LOSS = None
+        if closure is not None:
+            with torch.enable_grad():
+                loss = closure()
+
+        for group in self.param_groups:
+            if 'step' in group:
+                group['step'] += 1
+            else:
+                group['step'] = 1
+
+            for p in group['params']:
+                if p.grad is None:
+                    continue
+
+                grad = p.grad
+                if grad.is_sparse:
+                    raise NoSparseGradientError(str(self))
+
+                state = self.state[p]
+
+                if len(state) == 0:
+                    state['m'] = torch.zeros_like(p)
+                    state['b'] = torch.zeros_like(p)
+
+                self.apply_weight_decay(
+                    p=p,
+                    grad=p.grad,
+                    lr=group['lr'],
+                    weight_decay=group['weight_decay'],
+                    weight_decouple=group['weight_decouple'],
+                    fixed_decay=group['fixed_decay'],
+                )
+
+                grad_p2 = torch.mul(grad, grad)
+
+                m, b = state['m'], state['b']
+                b.mul_(b).add_(grad_p2).add_(group['eps'])
+
+                m.mul_(m).add_(grad_p2, alpha=group['delta']).add_(grad_p2 / b).sqrt_()
+
+                update = m.mul(grad).div_(b)
+
+                p.add_(update, alpha=-group['lr'])
+
+                b.sqrt_()
+
+        return loss
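A quick way to exercise the new `Kate` class added above is shown below. This is a minimal sketch, not part of the change itself: the toy model and data are invented for illustration, and the `lr=5e-2` value simply mirrors the test constant added to `tests/constants.py` further down.

```python
import torch
from torch import nn

from pytorch_optimizer import Kate  # re-exported via pytorch_optimizer/__init__.py in this change

model = nn.Linear(10, 1)  # toy model, for illustration only
optimizer = Kate(model.parameters(), lr=5e-2, weight_decay=1e-3)

x, y = torch.randn(64, 10), torch.randn(64, 1)

for _ in range(10):
    optimizer.zero_grad()
    loss = nn.functional.mse_loss(model(x), y)
    loss.backward()  # Kate reads dense gradients in step(); sparse gradients raise NoSparseGradientError
    optimizer.step()
```

Note that `delta` defaults to `0.0`; per the docstring above, `0.0` or `1e-8` are the intended settings, and `weight_decouple=True` applies AdamW-style decoupled weight decay.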
"3.8" and python_full_version < "4.0.0" pytest==8.2.2 ; python_version >= "3.8" and python_full_version < "4.0.0" -ruff==0.5.0 ; python_version >= "3.8" and python_full_version < "4.0.0" +ruff==0.5.1 ; python_version >= "3.8" and python_full_version < "4.0.0" sympy==1.12.1 ; python_version >= "3.8" and python_full_version < "4.0.0" tbb==2021.13.0 ; python_version >= "3.8" and python_full_version < "4.0.0" and platform_system == "Windows" tomli==2.0.1 ; python_version >= "3.8" and python_full_version <= "3.11.0a6" diff --git a/tests/constants.py b/tests/constants.py index 4d3b1183a..7daf9d6a8 100644 --- a/tests/constants.py +++ b/tests/constants.py @@ -50,6 +50,7 @@ GaLore, Gravity, GrokFastAdamW, + Kate, Lamb, Lion, Nero, @@ -461,6 +462,7 @@ (ScheduleFreeAdamW, {'lr': 1e0, 'weight_decay': 1e-3}, 5), (FAdam, {'lr': 1e0, 'weight_decay': 1e-3}, 5), (GrokFastAdamW, {'lr': 1e0, 'weight_decay': 1e-3}, 10), + (Kate, {'lr': 5e-2}, 10), ] ADANORM_SUPPORTED_OPTIMIZERS: List[Tuple[Any, Dict[str, Union[float, bool, int]], int]] = [ (AdaBelief, {'lr': 5e-1, 'weight_decay': 1e-3, 'adanorm': True}, 10), diff --git a/tests/test_load_modules.py b/tests/test_load_modules.py index 3b1620a38..d13f0877b 100644 --- a/tests/test_load_modules.py +++ b/tests/test_load_modules.py @@ -38,7 +38,7 @@ def test_load_lr_scheduler_invalid(invalid_lr_scheduler_names): def test_get_supported_optimizers(): - assert len(get_supported_optimizers()) == 68 + assert len(get_supported_optimizers()) == 69 def test_get_supported_lr_schedulers():