diff --git a/README.md b/README.md index d19b3728a..1f89c5d13 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ **pytorch-optimizer** is optimizer & lr scheduler collections in PyTorch. I just re-implemented (speed & memory tweaks, plug-ins) the algorithm while based on the original paper. Also, It includes useful and practical optimization ideas. -Currently, **69 optimizers (+ `bitsandbytes`)**, **16 lr schedulers**, and **13 loss functions** are supported! +Currently, **70 optimizers (+ `bitsandbytes`)**, **16 lr schedulers**, and **13 loss functions** are supported! Highly inspired by [pytorch-optimizer](https://github.com/jettify/pytorch-optimizer). @@ -93,79 +93,80 @@ from pytorch_optimizer import get_supported_optimizers supported_optimizers = get_supported_optimizers() ``` -| Optimizer | Description | Official Code | Paper | Citation | -|---------------|---------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------| -| AdaBelief | *Adapting Step-sizes by the Belief in Observed Gradients* | [github](https://github.com/juntang-zhuang/Adabelief-Optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201007468Z/exportcitation) | -| AdaBound | *Adaptive Gradient Methods with Dynamic Bound of Learning Rate* | [github](https://github.com/Luolc/AdaBound/blob/master/adabound/adabound.py) | | [cite](https://github.com/Luolc/AdaBound#citing) | -| AdaHessian | *An Adaptive Second Order Optimizer for Machine Learning* | [github](https://github.com/amirgholami/adahessian) | | [cite](https://github.com/amirgholami/adahessian#citation) | -| AdamD | *Improved bias-correction in Adam* | | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv211010828S/exportcitation) | -| AdamP | *Slowing Down the Slowdown for Momentum Optimizers on Scale-invariant Weights* | [github](https://github.com/clovaai/AdamP) | | [cite](https://github.com/clovaai/AdamP#how-to-cite) | -| diffGrad | *An Optimization Method for Convolutional Neural Networks* | [github](https://github.com/shivram1987/diffGrad) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190911015D/exportcitation) | -| MADGRAD | *A Momentumized, Adaptive, Dual Averaged Gradient Method for Stochastic* | [github](https://github.com/facebookresearch/madgrad) | | [cite](https://github.com/facebookresearch/madgrad#tech-report) | -| RAdam | *On the Variance of the Adaptive Learning Rate and Beyond* | [github](https://github.com/LiyuanLucasLiu/RAdam) | | [cite](https://github.com/LiyuanLucasLiu/RAdam#citation) | -| Ranger | *a synergistic optimizer combining RAdam and LookAhead, and now GC in one optimizer* | [github](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer) | | [cite](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer#citing-this-work) | -| Ranger21 | *a synergistic deep learning optimizer* | [github](https://github.com/lessw2020/Ranger21) | | [cite](https://github.com/lessw2020/Ranger21#referencing-this-work) | -| Lamb | *Large Batch Optimization for Deep Learning* | [github](https://github.com/cybertronai/pytorch-lamb) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190400962Y/exportcitation) | -| Shampoo | *Preconditioned Stochastic Tensor Optimization* | 
[github](https://github.com/moskomule/shampoo.pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180209568G/exportcitation) | -| Nero | *Learning by Turning: Neural Architecture Aware Optimisation* | [github](https://github.com/jxbz/nero) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210207227L/exportcitation) | -| Adan | *Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models* | [github](https://github.com/sail-sg/Adan) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220806677X/exportcitation) | -| Adai | *Disentangling the Effects of Adaptive Learning Rate and Momentum* | [github](https://github.com/zeke-xie/adaptive-inertia-adai) | | [cite](https://github.com/zeke-xie/adaptive-inertia-adai#citing) | -| SAM | *Sharpness-Aware Minimization* | [github](https://github.com/davda54/sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201001412F/exportcitation) | -| ASAM | *Adaptive Sharpness-Aware Minimization* | [github](https://github.com/davda54/sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210211600K/exportcitation) | -| GSAM | *Surrogate Gap Guided Sharpness-Aware Minimization* | [github](https://github.com/juntang-zhuang/GSAM) | | [cite](https://github.com/juntang-zhuang/GSAM#citation) | -| D-Adaptation | *Learning-Rate-Free Learning by D-Adaptation* | [github](https://github.com/facebookresearch/dadaptation) | | [cite](https://ui.adsabs.harvard.edu/abs/2023arXiv230107733D/exportcitation) | -| AdaFactor | *Adaptive Learning Rates with Sublinear Memory Cost* | [github](https://github.com/DeadAt0m/adafactor-pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180404235S/exportcitation) | -| Apollo | *An Adaptive Parameter-wise Diagonal Quasi-Newton Method for Nonconvex Stochastic Optimization* | [github](https://github.com/XuezheMax/apollo) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv200913586M/exportcitation) | -| NovoGrad | *Stochastic Gradient Methods with Layer-wise Adaptive Moments for Training of Deep Networks* | [github](https://github.com/lonePatient/NovoGrad-pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190511286G/exportcitation) | -| Lion | *Symbolic Discovery of Optimization Algorithms* | [github](https://github.com/google/automl/tree/master/lion) | | [cite](https://github.com/google/automl/tree/master/lion#citation) | -| Ali-G | *Adaptive Learning Rates for Interpolation with Gradients* | [github](https://github.com/oval-group/ali-g) | | [cite](https://github.com/oval-group/ali-g#adaptive-learning-rates-for-interpolation-with-gradients) | -| SM3 | *Memory-Efficient Adaptive Optimization* | [github](https://github.com/google-research/google-research/tree/master/sm3) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190111150A/exportcitation) | -| AdaNorm | *Adaptive Gradient Norm Correction based Optimizer for CNNs* | [github](https://github.com/shivram1987/AdaNorm) | | [cite](https://github.com/shivram1987/AdaNorm/tree/main#citation) | -| RotoGrad | *Gradient Homogenization in Multitask Learning* | [github](https://github.com/adrianjav/rotograd) | | [cite](https://github.com/adrianjav/rotograd#citing) | -| A2Grad | *Optimal Adaptive and Accelerated Stochastic Gradient Descent* | [github](https://github.com/severilov/A2Grad_optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv181000553D/exportcitation) | -| AccSGD | *Accelerating Stochastic Gradient Descent For Least Squares Regression* | [github](https://github.com/rahulkidambi/AccSGD) | | 
[cite](https://github.com/rahulkidambi/AccSGD#citation) | -| SGDW | *Decoupled Weight Decay Regularization* | [github](https://github.com/loshchil/AdamW-and-SGDW) | | [cite](https://github.com/loshchil/AdamW-and-SGDW#contact) | -| ASGD | *Adaptive Gradient Descent without Descent* | [github](https://github.com/ymalitsky/adaptive_GD) | | [cite](https://github.com/ymalitsky/adaptive_GD#reference) | -| Yogi | *Adaptive Methods for Nonconvex Optimization* | | [NIPS 2018](https://papers.nips.cc/paper/8186-adaptive-methods-for-nonconvex-optimization) | [cite](https://proceedings.neurips.cc/paper_files/paper/2018/hash/90365351ccc7437a1309dc64e4db32a3-Abstract.html) | -| SWATS | *Improving Generalization Performance by Switching from Adam to SGD* | | | [cite](https://ui.adsabs.harvard.edu/abs/2017arXiv171207628S/exportcitation) | -| Fromage | *On the distance between two neural networks and the stability of learning* | [github](https://github.com/jxbz/fromage) | | [cite](https://github.com/jxbz/fromage#citation) | -| MSVAG | *Dissecting Adam: The Sign, Magnitude and Variance of Stochastic Gradients* | [github](https://github.com/lballes/msvag) | | [cite](https://github.com/lballes/msvag#citation) | -| AdaMod | *An Adaptive and Momental Bound Method for Stochastic Learning* | [github](https://github.com/lancopku/AdaMod) | | [cite](https://github.com/lancopku/AdaMod#citation) | -| AggMo | *Aggregated Momentum: Stability Through Passive Damping* | [github](https://github.com/AtheMathmo/AggMo) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180400325L/exportcitation) | -| QHAdam | *Quasi-hyperbolic momentum and Adam for deep learning* | [github](https://github.com/facebookresearch/qhoptim) | | [cite](https://github.com/facebookresearch/qhoptim#reference) | -| PID | *A PID Controller Approach for Stochastic Optimization of Deep Networks* | [github](https://github.com/tensorboy/PIDOptimizer) | [CVPR 18](http://www4.comp.polyu.edu.hk/~cslzhang/paper/CVPR18_PID.pdf) | [cite](https://github.com/tensorboy/PIDOptimizer#citation) | -| Gravity | *a Kinematic Approach on Optimization in Deep Learning* | [github](https://github.com/dariush-bahrami/gravity.optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210109192B/exportcitation) | -| AdaSmooth | *An Adaptive Learning Rate Method based on Effective Ratio* | | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220400825L/exportcitation) | -| SRMM | *Stochastic regularized majorization-minimization with weakly convex and multi-convex surrogates* | [github](https://github.com/HanbaekLyu/SRMM) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220101652L/exportcitation) | -| AvaGrad | *Domain-independent Dominance of Adaptive Methods* | [github](https://github.com/lolemacs/avagrad) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv191201823S/exportcitation) | -| PCGrad | *Gradient Surgery for Multi-Task Learning* | [github](https://github.com/tianheyu927/PCGrad) | | [cite](https://github.com/tianheyu927/PCGrad#reference) | -| AMSGrad | *On the Convergence of Adam and Beyond* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190409237R/exportcitation) | -| Lookahead | *k steps forward, 1 step back* | [github](https://github.com/pytorch/examples/tree/main/imagenet) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190708610Z/exportcitation) | -| PNM | *Manipulating Stochastic Gradient Noise to Improve Generalization* | [github](https://github.com/zeke-xie/Positive-Negative-Momentum) | | 
[cite](https://github.com/zeke-xie/Positive-Negative-Momentum#citing) | -| GC | *Gradient Centralization* | [github](https://github.com/Yonghongwei/Gradient-Centralization) | | [cite](https://github.com/Yonghongwei/Gradient-Centralization#citation) | -| AGC | *Adaptive Gradient Clipping* | [github](https://github.com/deepmind/deepmind-research/tree/master/nfnets) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210206171B/exportcitation) | -| Stable WD | *Understanding and Scheduling Weight Decay* | [github](https://github.com/zeke-xie/stable-weight-decay-regularization) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201111152X/exportcitation) | -| Softplus T | *Calibrating the Adaptive Learning Rate to Improve Convergence of ADAM* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190800700T/exportcitation) | -| Un-tuned w/u | *On the adequacy of untuned warmup for adaptive optimization* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv191004209M/exportcitation) | -| Norm Loss | *An efficient yet effective regularization method for deep neural networks* | | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210306583G/exportcitation) | -| AdaShift | *Decorrelation and Convergence of Adaptive Learning Rate Methods* | [github](https://github.com/MichaelKonobeev/adashift) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv181000143Z/exportcitation) | -| AdaDelta | *An Adaptive Learning Rate Method* | | | [cite](https://ui.adsabs.harvard.edu/abs/2012arXiv1212.5701Z/exportcitation) | -| Amos | *An Adam-style Optimizer with Adaptive Weight Decay towards Model-Oriented Scale* | [github](https://github.com/google-research/jestimator) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv221011693T/exportcitation) | -| SignSGD | *Compressed Optimisation for Non-Convex Problems* | [github](https://github.com/jxbz/signSGD) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180204434B/exportcitation) | -| Sophia | *A Scalable Stochastic Second-order Optimizer for Language Model Pre-training* | [github](https://github.com/Liuhong99/Sophia) | | [cite](https://github.com/Liuhong99/Sophia) | -| Prodigy | *An Expeditiously Adaptive Parameter-Free Learner* | [github](https://github.com/konstmish/prodigy) | | [cite](https://github.com/konstmish/prodigy#how-to-cite) | -| PAdam | *Closing the Generalization Gap of Adaptive Gradient Methods in Training Deep Neural Networks* | [github](https://github.com/uclaml/Padam) | | [cite](https://github.com/uclaml/Padam#citation) | -| LOMO | *Full Parameter Fine-tuning for Large Language Models with Limited Resources* | [github](https://github.com/OpenLMLab/LOMO) | | [cite](https://github.com/OpenLMLab/LOMO#citation) | -| Tiger | *A Tight-fisted Optimizer, an optimizer that is extremely budget-conscious* | [github](https://github.com/bojone/tiger) | | [cite](https://github.com/bojone/tiger/blob/main/README_en.md#citation) | -| CAME | *Confidence-guided Adaptive Memory Efficient Optimization* | [github](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/CAME) | | [cite](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/CAME#citation) | -| WSAM | *Sharpness-Aware Minimization Revisited: Weighted Sharpness as a Regularization Term* | [github](https://github.com/intelligent-machine-learning/dlrover/blob/master/atorch/atorch/optimizers/wsam.py) | | [cite](https://github.com/intelligent-machine-learning/dlrover) | -| Aida | *A DNN Optimizer that Improves over AdaBelief by Suppression of the Adaptive Stepsize 
Range* | [github](https://github.com/guoqiang-zhang-x/Aida-Optimizer) | | [cite](https://github.com/guoqiang-zhang-x/Aida-Optimizer?tab=readme-ov-file#1-brief-description-of-aida) | -| GaLore | *Memory-Efficient LLM Training by Gradient Low-Rank Projection* | [github](https://github.com/jiaweizzhao/GaLore) | | [cite](https://github.com/jiaweizzhao/GaLore/tree/master?tab=readme-ov-file#citation) | -| Adalite | *Adalite optimizer* | [github](https://github.com/VatsaDev/adalite) | | [cite](https://github.com/VatsaDev/adalite) | -| bSAM | *SAM as an Optimal Relaxation of Bayes* | [github](https://github.com/team-approx-bayes/bayesian-sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv221001620M/exportcitation) | -| Schedule-Free | *Schedule-Free Optimizers* | [github](https://github.com/facebookresearch/schedule_free) | | [cite](https://github.com/facebookresearch/schedule_free) | -| FAdam | *Adam is a natural gradient optimizer using diagonal empirical Fisher information* | [github](https://github.com/lessw2020/fadam_pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2024arXiv240512807H/exportcitation) | -| Grokfast | *Accelerated Grokking by Amplifying Slow Gradients* | [github](https://github.com/ironjr/grokfast) | | [cite](https://github.com/ironjr/grokfast?tab=readme-ov-file#citation) | +| Optimizer | Description | Official Code | Paper | Citation | +|---------------|---------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------| +| AdaBelief | *Adapting Step-sizes by the Belief in Observed Gradients* | [github](https://github.com/juntang-zhuang/Adabelief-Optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201007468Z/exportcitation) | +| AdaBound | *Adaptive Gradient Methods with Dynamic Bound of Learning Rate* | [github](https://github.com/Luolc/AdaBound/blob/master/adabound/adabound.py) | | [cite](https://github.com/Luolc/AdaBound#citing) | +| AdaHessian | *An Adaptive Second Order Optimizer for Machine Learning* | [github](https://github.com/amirgholami/adahessian) | | [cite](https://github.com/amirgholami/adahessian#citation) | +| AdamD | *Improved bias-correction in Adam* | | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv211010828S/exportcitation) | +| AdamP | *Slowing Down the Slowdown for Momentum Optimizers on Scale-invariant Weights* | [github](https://github.com/clovaai/AdamP) | | [cite](https://github.com/clovaai/AdamP#how-to-cite) | +| diffGrad | *An Optimization Method for Convolutional Neural Networks* | [github](https://github.com/shivram1987/diffGrad) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190911015D/exportcitation) | +| MADGRAD | *A Momentumized, Adaptive, Dual Averaged Gradient Method for Stochastic* | [github](https://github.com/facebookresearch/madgrad) | | [cite](https://github.com/facebookresearch/madgrad#tech-report) | +| RAdam | *On the Variance of the Adaptive Learning Rate and Beyond* | [github](https://github.com/LiyuanLucasLiu/RAdam) | | [cite](https://github.com/LiyuanLucasLiu/RAdam#citation) | +| Ranger | *a synergistic optimizer combining RAdam and LookAhead, and now GC in one optimizer* | 
[github](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer) | | [cite](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer#citing-this-work) | +| Ranger21 | *a synergistic deep learning optimizer* | [github](https://github.com/lessw2020/Ranger21) | | [cite](https://github.com/lessw2020/Ranger21#referencing-this-work) | +| Lamb | *Large Batch Optimization for Deep Learning* | [github](https://github.com/cybertronai/pytorch-lamb) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190400962Y/exportcitation) | +| Shampoo | *Preconditioned Stochastic Tensor Optimization* | [github](https://github.com/moskomule/shampoo.pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180209568G/exportcitation) | +| Nero | *Learning by Turning: Neural Architecture Aware Optimisation* | [github](https://github.com/jxbz/nero) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210207227L/exportcitation) | +| Adan | *Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models* | [github](https://github.com/sail-sg/Adan) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220806677X/exportcitation) | +| Adai | *Disentangling the Effects of Adaptive Learning Rate and Momentum* | [github](https://github.com/zeke-xie/adaptive-inertia-adai) | | [cite](https://github.com/zeke-xie/adaptive-inertia-adai#citing) | +| SAM | *Sharpness-Aware Minimization* | [github](https://github.com/davda54/sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201001412F/exportcitation) | +| ASAM | *Adaptive Sharpness-Aware Minimization* | [github](https://github.com/davda54/sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210211600K/exportcitation) | +| GSAM | *Surrogate Gap Guided Sharpness-Aware Minimization* | [github](https://github.com/juntang-zhuang/GSAM) | | [cite](https://github.com/juntang-zhuang/GSAM#citation) | +| D-Adaptation | *Learning-Rate-Free Learning by D-Adaptation* | [github](https://github.com/facebookresearch/dadaptation) | | [cite](https://ui.adsabs.harvard.edu/abs/2023arXiv230107733D/exportcitation) | +| AdaFactor | *Adaptive Learning Rates with Sublinear Memory Cost* | [github](https://github.com/DeadAt0m/adafactor-pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180404235S/exportcitation) | +| Apollo | *An Adaptive Parameter-wise Diagonal Quasi-Newton Method for Nonconvex Stochastic Optimization* | [github](https://github.com/XuezheMax/apollo) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv200913586M/exportcitation) | +| NovoGrad | *Stochastic Gradient Methods with Layer-wise Adaptive Moments for Training of Deep Networks* | [github](https://github.com/lonePatient/NovoGrad-pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190511286G/exportcitation) | +| Lion | *Symbolic Discovery of Optimization Algorithms* | [github](https://github.com/google/automl/tree/master/lion) | | [cite](https://github.com/google/automl/tree/master/lion#citation) | +| Ali-G | *Adaptive Learning Rates for Interpolation with Gradients* | [github](https://github.com/oval-group/ali-g) | | [cite](https://github.com/oval-group/ali-g#adaptive-learning-rates-for-interpolation-with-gradients) | +| SM3 | *Memory-Efficient Adaptive Optimization* | [github](https://github.com/google-research/google-research/tree/master/sm3) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190111150A/exportcitation) | +| AdaNorm | *Adaptive Gradient Norm Correction based Optimizer for CNNs* | [github](https://github.com/shivram1987/AdaNorm) | | 
[cite](https://github.com/shivram1987/AdaNorm/tree/main#citation) | +| RotoGrad | *Gradient Homogenization in Multitask Learning* | [github](https://github.com/adrianjav/rotograd) | | [cite](https://github.com/adrianjav/rotograd#citing) | +| A2Grad | *Optimal Adaptive and Accelerated Stochastic Gradient Descent* | [github](https://github.com/severilov/A2Grad_optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv181000553D/exportcitation) | +| AccSGD | *Accelerating Stochastic Gradient Descent For Least Squares Regression* | [github](https://github.com/rahulkidambi/AccSGD) | | [cite](https://github.com/rahulkidambi/AccSGD#citation) | +| SGDW | *Decoupled Weight Decay Regularization* | [github](https://github.com/loshchil/AdamW-and-SGDW) | | [cite](https://github.com/loshchil/AdamW-and-SGDW#contact) | +| ASGD | *Adaptive Gradient Descent without Descent* | [github](https://github.com/ymalitsky/adaptive_GD) | | [cite](https://github.com/ymalitsky/adaptive_GD#reference) | +| Yogi | *Adaptive Methods for Nonconvex Optimization* | | [NIPS 2018](https://papers.nips.cc/paper/8186-adaptive-methods-for-nonconvex-optimization) | [cite](https://proceedings.neurips.cc/paper_files/paper/2018/hash/90365351ccc7437a1309dc64e4db32a3-Abstract.html) | +| SWATS | *Improving Generalization Performance by Switching from Adam to SGD* | | | [cite](https://ui.adsabs.harvard.edu/abs/2017arXiv171207628S/exportcitation) | +| Fromage | *On the distance between two neural networks and the stability of learning* | [github](https://github.com/jxbz/fromage) | | [cite](https://github.com/jxbz/fromage#citation) | +| MSVAG | *Dissecting Adam: The Sign, Magnitude and Variance of Stochastic Gradients* | [github](https://github.com/lballes/msvag) | | [cite](https://github.com/lballes/msvag#citation) | +| AdaMod | *An Adaptive and Momental Bound Method for Stochastic Learning* | [github](https://github.com/lancopku/AdaMod) | | [cite](https://github.com/lancopku/AdaMod#citation) | +| AggMo | *Aggregated Momentum: Stability Through Passive Damping* | [github](https://github.com/AtheMathmo/AggMo) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180400325L/exportcitation) | +| QHAdam | *Quasi-hyperbolic momentum and Adam for deep learning* | [github](https://github.com/facebookresearch/qhoptim) | | [cite](https://github.com/facebookresearch/qhoptim#reference) | +| PID | *A PID Controller Approach for Stochastic Optimization of Deep Networks* | [github](https://github.com/tensorboy/PIDOptimizer) | [CVPR 18](http://www4.comp.polyu.edu.hk/~cslzhang/paper/CVPR18_PID.pdf) | [cite](https://github.com/tensorboy/PIDOptimizer#citation) | +| Gravity | *a Kinematic Approach on Optimization in Deep Learning* | [github](https://github.com/dariush-bahrami/gravity.optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210109192B/exportcitation) | +| AdaSmooth | *An Adaptive Learning Rate Method based on Effective Ratio* | | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220400825L/exportcitation) | +| SRMM | *Stochastic regularized majorization-minimization with weakly convex and multi-convex surrogates* | [github](https://github.com/HanbaekLyu/SRMM) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220101652L/exportcitation) | +| AvaGrad | *Domain-independent Dominance of Adaptive Methods* | [github](https://github.com/lolemacs/avagrad) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv191201823S/exportcitation) | +| PCGrad | *Gradient Surgery for Multi-Task Learning* | 
[github](https://github.com/tianheyu927/PCGrad) | | [cite](https://github.com/tianheyu927/PCGrad#reference) | +| AMSGrad | *On the Convergence of Adam and Beyond* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190409237R/exportcitation) | +| Lookahead | *k steps forward, 1 step back* | [github](https://github.com/pytorch/examples/tree/main/imagenet) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190708610Z/exportcitation) | +| PNM | *Manipulating Stochastic Gradient Noise to Improve Generalization* | [github](https://github.com/zeke-xie/Positive-Negative-Momentum) | | [cite](https://github.com/zeke-xie/Positive-Negative-Momentum#citing) | +| GC | *Gradient Centralization* | [github](https://github.com/Yonghongwei/Gradient-Centralization) | | [cite](https://github.com/Yonghongwei/Gradient-Centralization#citation) | +| AGC | *Adaptive Gradient Clipping* | [github](https://github.com/deepmind/deepmind-research/tree/master/nfnets) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210206171B/exportcitation) | +| Stable WD | *Understanding and Scheduling Weight Decay* | [github](https://github.com/zeke-xie/stable-weight-decay-regularization) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201111152X/exportcitation) | +| Softplus T | *Calibrating the Adaptive Learning Rate to Improve Convergence of ADAM* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190800700T/exportcitation) | +| Un-tuned w/u | *On the adequacy of untuned warmup for adaptive optimization* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv191004209M/exportcitation) | +| Norm Loss | *An efficient yet effective regularization method for deep neural networks* | | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210306583G/exportcitation) | +| AdaShift | *Decorrelation and Convergence of Adaptive Learning Rate Methods* | [github](https://github.com/MichaelKonobeev/adashift) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv181000143Z/exportcitation) | +| AdaDelta | *An Adaptive Learning Rate Method* | | | [cite](https://ui.adsabs.harvard.edu/abs/2012arXiv1212.5701Z/exportcitation) | +| Amos | *An Adam-style Optimizer with Adaptive Weight Decay towards Model-Oriented Scale* | [github](https://github.com/google-research/jestimator) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv221011693T/exportcitation) | +| SignSGD | *Compressed Optimisation for Non-Convex Problems* | [github](https://github.com/jxbz/signSGD) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180204434B/exportcitation) | +| Sophia | *A Scalable Stochastic Second-order Optimizer for Language Model Pre-training* | [github](https://github.com/Liuhong99/Sophia) | | [cite](https://github.com/Liuhong99/Sophia) | +| Prodigy | *An Expeditiously Adaptive Parameter-Free Learner* | [github](https://github.com/konstmish/prodigy) | | [cite](https://github.com/konstmish/prodigy#how-to-cite) | +| PAdam | *Closing the Generalization Gap of Adaptive Gradient Methods in Training Deep Neural Networks* | [github](https://github.com/uclaml/Padam) | | [cite](https://github.com/uclaml/Padam#citation) | +| LOMO | *Full Parameter Fine-tuning for Large Language Models with Limited Resources* | [github](https://github.com/OpenLMLab/LOMO) | | [cite](https://github.com/OpenLMLab/LOMO#citation) | +| Tiger | *A Tight-fisted Optimizer, an optimizer that is extremely budget-conscious* | [github](https://github.com/bojone/tiger) | | [cite](https://github.com/bojone/tiger/blob/main/README_en.md#citation) | +| CAME | *Confidence-guided Adaptive 
Memory Efficient Optimization* | [github](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/CAME) | | [cite](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/CAME#citation) | +| WSAM | *Sharpness-Aware Minimization Revisited: Weighted Sharpness as a Regularization Term* | [github](https://github.com/intelligent-machine-learning/dlrover/blob/master/atorch/atorch/optimizers/wsam.py) | | [cite](https://github.com/intelligent-machine-learning/dlrover) | +| Aida | *A DNN Optimizer that Improves over AdaBelief by Suppression of the Adaptive Stepsize Range* | [github](https://github.com/guoqiang-zhang-x/Aida-Optimizer) | | [cite](https://github.com/guoqiang-zhang-x/Aida-Optimizer?tab=readme-ov-file#1-brief-description-of-aida) | +| GaLore | *Memory-Efficient LLM Training by Gradient Low-Rank Projection* | [github](https://github.com/jiaweizzhao/GaLore) | | [cite](https://github.com/jiaweizzhao/GaLore/tree/master?tab=readme-ov-file#citation) | +| Adalite | *Adalite optimizer* | [github](https://github.com/VatsaDev/adalite) | | [cite](https://github.com/VatsaDev/adalite) | +| bSAM | *SAM as an Optimal Relaxation of Bayes* | [github](https://github.com/team-approx-bayes/bayesian-sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv221001620M/exportcitation) | +| Schedule-Free | *Schedule-Free Optimizers* | [github](https://github.com/facebookresearch/schedule_free) | | [cite](https://github.com/facebookresearch/schedule_free) | +| FAdam | *Adam is a natural gradient optimizer using diagonal empirical Fisher information* | [github](https://github.com/lessw2020/fadam_pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2024arXiv240512807H/exportcitation) | +| Grokfast | *Accelerated Grokking by Amplifying Slow Gradients* | [github](https://github.com/ironjr/grokfast) | | [cite](https://github.com/ironjr/grokfast?tab=readme-ov-file#citation) | +| Kate | *Remove that Square Root: A New Efficient Scale-Invariant Version of AdaGrad* | [github](https://github.com/nazya/KATE) | | [cite](https://github.com/nazya/KATE?tab=readme-ov-file#remove-that-square-root-a-new-efficient-scale-invariant-version-of-adagrad) | ## Supported LR Scheduler diff --git a/docs/changelogs/v3.0.2.md b/docs/changelogs/v3.0.2.md index b2d1c8a2b..2b2fc7029 100644 --- a/docs/changelogs/v3.0.2.md +++ b/docs/changelogs/v3.0.2.md @@ -5,6 +5,8 @@ * Implement `WSD` LR Scheduler. (#247, #248) * [Warmup-Stable-Decay LR Scheduler](https://arxiv.org/abs/2404.06395) * Add more Pytorch built-in lr schedulers. (#248) +* Implement `Kate` optimizer. (#249, #251) + * [Remove that Square Root: A New Efficient Scale-Invariant Version of AdaGrad](https://arxiv.org/abs/2403.02648) ### Refactor diff --git a/docs/index.md b/docs/index.md index d19b3728a..1f89c5d13 100644 --- a/docs/index.md +++ b/docs/index.md @@ -10,7 +10,7 @@ **pytorch-optimizer** is optimizer & lr scheduler collections in PyTorch. I just re-implemented (speed & memory tweaks, plug-ins) the algorithm while based on the original paper. Also, It includes useful and practical optimization ideas. -Currently, **69 optimizers (+ `bitsandbytes`)**, **16 lr schedulers**, and **13 loss functions** are supported! +Currently, **70 optimizers (+ `bitsandbytes`)**, **16 lr schedulers**, and **13 loss functions** are supported! Highly inspired by [pytorch-optimizer](https://github.com/jettify/pytorch-optimizer). 
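The changelog hunk above records the new `Kate` optimizer (the scale-invariant AdaGrad variant from "Remove that Square Root"), and the `docs/optimizer.md` hunk later in this diff documents it as `pytorch_optimizer.Kate`. As a minimal, hedged usage sketch: the constructor arguments shown (`lr`, `weight_decay`) are assumptions modeled on the library's other optimizers, not taken from this diff.

```python
# Sketch only: assumes Kate is exported from pytorch_optimizer and accepts
# the usual params/lr/weight_decay arguments like the library's other optimizers.
import torch
from pytorch_optimizer import Kate

model = torch.nn.Linear(10, 2)
optimizer = Kate(model.parameters(), lr=1e-3, weight_decay=0.0)

for _ in range(10):
    optimizer.zero_grad()
    # toy objective: drive the linear layer's outputs toward zero
    loss = model(torch.randn(8, 10)).square().mean()
    loss.backward()
    optimizer.step()
```

The training loop itself is the standard PyTorch `zero_grad()` / `backward()` / `step()` pattern; only the optimizer class changes.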
@@ -93,79 +93,80 @@ from pytorch_optimizer import get_supported_optimizers supported_optimizers = get_supported_optimizers() ``` -| Optimizer | Description | Official Code | Paper | Citation | -|---------------|---------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------| -| AdaBelief | *Adapting Step-sizes by the Belief in Observed Gradients* | [github](https://github.com/juntang-zhuang/Adabelief-Optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201007468Z/exportcitation) | -| AdaBound | *Adaptive Gradient Methods with Dynamic Bound of Learning Rate* | [github](https://github.com/Luolc/AdaBound/blob/master/adabound/adabound.py) | | [cite](https://github.com/Luolc/AdaBound#citing) | -| AdaHessian | *An Adaptive Second Order Optimizer for Machine Learning* | [github](https://github.com/amirgholami/adahessian) | | [cite](https://github.com/amirgholami/adahessian#citation) | -| AdamD | *Improved bias-correction in Adam* | | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv211010828S/exportcitation) | -| AdamP | *Slowing Down the Slowdown for Momentum Optimizers on Scale-invariant Weights* | [github](https://github.com/clovaai/AdamP) | | [cite](https://github.com/clovaai/AdamP#how-to-cite) | -| diffGrad | *An Optimization Method for Convolutional Neural Networks* | [github](https://github.com/shivram1987/diffGrad) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190911015D/exportcitation) | -| MADGRAD | *A Momentumized, Adaptive, Dual Averaged Gradient Method for Stochastic* | [github](https://github.com/facebookresearch/madgrad) | | [cite](https://github.com/facebookresearch/madgrad#tech-report) | -| RAdam | *On the Variance of the Adaptive Learning Rate and Beyond* | [github](https://github.com/LiyuanLucasLiu/RAdam) | | [cite](https://github.com/LiyuanLucasLiu/RAdam#citation) | -| Ranger | *a synergistic optimizer combining RAdam and LookAhead, and now GC in one optimizer* | [github](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer) | | [cite](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer#citing-this-work) | -| Ranger21 | *a synergistic deep learning optimizer* | [github](https://github.com/lessw2020/Ranger21) | | [cite](https://github.com/lessw2020/Ranger21#referencing-this-work) | -| Lamb | *Large Batch Optimization for Deep Learning* | [github](https://github.com/cybertronai/pytorch-lamb) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190400962Y/exportcitation) | -| Shampoo | *Preconditioned Stochastic Tensor Optimization* | [github](https://github.com/moskomule/shampoo.pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180209568G/exportcitation) | -| Nero | *Learning by Turning: Neural Architecture Aware Optimisation* | [github](https://github.com/jxbz/nero) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210207227L/exportcitation) | -| Adan | *Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models* | [github](https://github.com/sail-sg/Adan) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220806677X/exportcitation) | -| Adai | *Disentangling the Effects of Adaptive Learning Rate and Momentum* | 
[github](https://github.com/zeke-xie/adaptive-inertia-adai) | | [cite](https://github.com/zeke-xie/adaptive-inertia-adai#citing) | -| SAM | *Sharpness-Aware Minimization* | [github](https://github.com/davda54/sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201001412F/exportcitation) | -| ASAM | *Adaptive Sharpness-Aware Minimization* | [github](https://github.com/davda54/sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210211600K/exportcitation) | -| GSAM | *Surrogate Gap Guided Sharpness-Aware Minimization* | [github](https://github.com/juntang-zhuang/GSAM) | | [cite](https://github.com/juntang-zhuang/GSAM#citation) | -| D-Adaptation | *Learning-Rate-Free Learning by D-Adaptation* | [github](https://github.com/facebookresearch/dadaptation) | | [cite](https://ui.adsabs.harvard.edu/abs/2023arXiv230107733D/exportcitation) | -| AdaFactor | *Adaptive Learning Rates with Sublinear Memory Cost* | [github](https://github.com/DeadAt0m/adafactor-pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180404235S/exportcitation) | -| Apollo | *An Adaptive Parameter-wise Diagonal Quasi-Newton Method for Nonconvex Stochastic Optimization* | [github](https://github.com/XuezheMax/apollo) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv200913586M/exportcitation) | -| NovoGrad | *Stochastic Gradient Methods with Layer-wise Adaptive Moments for Training of Deep Networks* | [github](https://github.com/lonePatient/NovoGrad-pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190511286G/exportcitation) | -| Lion | *Symbolic Discovery of Optimization Algorithms* | [github](https://github.com/google/automl/tree/master/lion) | | [cite](https://github.com/google/automl/tree/master/lion#citation) | -| Ali-G | *Adaptive Learning Rates for Interpolation with Gradients* | [github](https://github.com/oval-group/ali-g) | | [cite](https://github.com/oval-group/ali-g#adaptive-learning-rates-for-interpolation-with-gradients) | -| SM3 | *Memory-Efficient Adaptive Optimization* | [github](https://github.com/google-research/google-research/tree/master/sm3) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190111150A/exportcitation) | -| AdaNorm | *Adaptive Gradient Norm Correction based Optimizer for CNNs* | [github](https://github.com/shivram1987/AdaNorm) | | [cite](https://github.com/shivram1987/AdaNorm/tree/main#citation) | -| RotoGrad | *Gradient Homogenization in Multitask Learning* | [github](https://github.com/adrianjav/rotograd) | | [cite](https://github.com/adrianjav/rotograd#citing) | -| A2Grad | *Optimal Adaptive and Accelerated Stochastic Gradient Descent* | [github](https://github.com/severilov/A2Grad_optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv181000553D/exportcitation) | -| AccSGD | *Accelerating Stochastic Gradient Descent For Least Squares Regression* | [github](https://github.com/rahulkidambi/AccSGD) | | [cite](https://github.com/rahulkidambi/AccSGD#citation) | -| SGDW | *Decoupled Weight Decay Regularization* | [github](https://github.com/loshchil/AdamW-and-SGDW) | | [cite](https://github.com/loshchil/AdamW-and-SGDW#contact) | -| ASGD | *Adaptive Gradient Descent without Descent* | [github](https://github.com/ymalitsky/adaptive_GD) | | [cite](https://github.com/ymalitsky/adaptive_GD#reference) | -| Yogi | *Adaptive Methods for Nonconvex Optimization* | | [NIPS 2018](https://papers.nips.cc/paper/8186-adaptive-methods-for-nonconvex-optimization) | 
[cite](https://proceedings.neurips.cc/paper_files/paper/2018/hash/90365351ccc7437a1309dc64e4db32a3-Abstract.html) | -| SWATS | *Improving Generalization Performance by Switching from Adam to SGD* | | | [cite](https://ui.adsabs.harvard.edu/abs/2017arXiv171207628S/exportcitation) | -| Fromage | *On the distance between two neural networks and the stability of learning* | [github](https://github.com/jxbz/fromage) | | [cite](https://github.com/jxbz/fromage#citation) | -| MSVAG | *Dissecting Adam: The Sign, Magnitude and Variance of Stochastic Gradients* | [github](https://github.com/lballes/msvag) | | [cite](https://github.com/lballes/msvag#citation) | -| AdaMod | *An Adaptive and Momental Bound Method for Stochastic Learning* | [github](https://github.com/lancopku/AdaMod) | | [cite](https://github.com/lancopku/AdaMod#citation) | -| AggMo | *Aggregated Momentum: Stability Through Passive Damping* | [github](https://github.com/AtheMathmo/AggMo) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180400325L/exportcitation) | -| QHAdam | *Quasi-hyperbolic momentum and Adam for deep learning* | [github](https://github.com/facebookresearch/qhoptim) | | [cite](https://github.com/facebookresearch/qhoptim#reference) | -| PID | *A PID Controller Approach for Stochastic Optimization of Deep Networks* | [github](https://github.com/tensorboy/PIDOptimizer) | [CVPR 18](http://www4.comp.polyu.edu.hk/~cslzhang/paper/CVPR18_PID.pdf) | [cite](https://github.com/tensorboy/PIDOptimizer#citation) | -| Gravity | *a Kinematic Approach on Optimization in Deep Learning* | [github](https://github.com/dariush-bahrami/gravity.optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210109192B/exportcitation) | -| AdaSmooth | *An Adaptive Learning Rate Method based on Effective Ratio* | | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220400825L/exportcitation) | -| SRMM | *Stochastic regularized majorization-minimization with weakly convex and multi-convex surrogates* | [github](https://github.com/HanbaekLyu/SRMM) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220101652L/exportcitation) | -| AvaGrad | *Domain-independent Dominance of Adaptive Methods* | [github](https://github.com/lolemacs/avagrad) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv191201823S/exportcitation) | -| PCGrad | *Gradient Surgery for Multi-Task Learning* | [github](https://github.com/tianheyu927/PCGrad) | | [cite](https://github.com/tianheyu927/PCGrad#reference) | -| AMSGrad | *On the Convergence of Adam and Beyond* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190409237R/exportcitation) | -| Lookahead | *k steps forward, 1 step back* | [github](https://github.com/pytorch/examples/tree/main/imagenet) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190708610Z/exportcitation) | -| PNM | *Manipulating Stochastic Gradient Noise to Improve Generalization* | [github](https://github.com/zeke-xie/Positive-Negative-Momentum) | | [cite](https://github.com/zeke-xie/Positive-Negative-Momentum#citing) | -| GC | *Gradient Centralization* | [github](https://github.com/Yonghongwei/Gradient-Centralization) | | [cite](https://github.com/Yonghongwei/Gradient-Centralization#citation) | -| AGC | *Adaptive Gradient Clipping* | [github](https://github.com/deepmind/deepmind-research/tree/master/nfnets) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210206171B/exportcitation) | -| Stable WD | *Understanding and Scheduling Weight Decay* | [github](https://github.com/zeke-xie/stable-weight-decay-regularization) | | 
[cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201111152X/exportcitation) | -| Softplus T | *Calibrating the Adaptive Learning Rate to Improve Convergence of ADAM* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190800700T/exportcitation) | -| Un-tuned w/u | *On the adequacy of untuned warmup for adaptive optimization* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv191004209M/exportcitation) | -| Norm Loss | *An efficient yet effective regularization method for deep neural networks* | | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210306583G/exportcitation) | -| AdaShift | *Decorrelation and Convergence of Adaptive Learning Rate Methods* | [github](https://github.com/MichaelKonobeev/adashift) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv181000143Z/exportcitation) | -| AdaDelta | *An Adaptive Learning Rate Method* | | | [cite](https://ui.adsabs.harvard.edu/abs/2012arXiv1212.5701Z/exportcitation) | -| Amos | *An Adam-style Optimizer with Adaptive Weight Decay towards Model-Oriented Scale* | [github](https://github.com/google-research/jestimator) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv221011693T/exportcitation) | -| SignSGD | *Compressed Optimisation for Non-Convex Problems* | [github](https://github.com/jxbz/signSGD) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180204434B/exportcitation) | -| Sophia | *A Scalable Stochastic Second-order Optimizer for Language Model Pre-training* | [github](https://github.com/Liuhong99/Sophia) | | [cite](https://github.com/Liuhong99/Sophia) | -| Prodigy | *An Expeditiously Adaptive Parameter-Free Learner* | [github](https://github.com/konstmish/prodigy) | | [cite](https://github.com/konstmish/prodigy#how-to-cite) | -| PAdam | *Closing the Generalization Gap of Adaptive Gradient Methods in Training Deep Neural Networks* | [github](https://github.com/uclaml/Padam) | | [cite](https://github.com/uclaml/Padam#citation) | -| LOMO | *Full Parameter Fine-tuning for Large Language Models with Limited Resources* | [github](https://github.com/OpenLMLab/LOMO) | | [cite](https://github.com/OpenLMLab/LOMO#citation) | -| Tiger | *A Tight-fisted Optimizer, an optimizer that is extremely budget-conscious* | [github](https://github.com/bojone/tiger) | | [cite](https://github.com/bojone/tiger/blob/main/README_en.md#citation) | -| CAME | *Confidence-guided Adaptive Memory Efficient Optimization* | [github](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/CAME) | | [cite](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/CAME#citation) | -| WSAM | *Sharpness-Aware Minimization Revisited: Weighted Sharpness as a Regularization Term* | [github](https://github.com/intelligent-machine-learning/dlrover/blob/master/atorch/atorch/optimizers/wsam.py) | | [cite](https://github.com/intelligent-machine-learning/dlrover) | -| Aida | *A DNN Optimizer that Improves over AdaBelief by Suppression of the Adaptive Stepsize Range* | [github](https://github.com/guoqiang-zhang-x/Aida-Optimizer) | | [cite](https://github.com/guoqiang-zhang-x/Aida-Optimizer?tab=readme-ov-file#1-brief-description-of-aida) | -| GaLore | *Memory-Efficient LLM Training by Gradient Low-Rank Projection* | [github](https://github.com/jiaweizzhao/GaLore) | | [cite](https://github.com/jiaweizzhao/GaLore/tree/master?tab=readme-ov-file#citation) | -| Adalite | *Adalite optimizer* | [github](https://github.com/VatsaDev/adalite) | | [cite](https://github.com/VatsaDev/adalite) | -| bSAM | *SAM as an Optimal Relaxation of Bayes* | 
[github](https://github.com/team-approx-bayes/bayesian-sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv221001620M/exportcitation) | -| Schedule-Free | *Schedule-Free Optimizers* | [github](https://github.com/facebookresearch/schedule_free) | | [cite](https://github.com/facebookresearch/schedule_free) | -| FAdam | *Adam is a natural gradient optimizer using diagonal empirical Fisher information* | [github](https://github.com/lessw2020/fadam_pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2024arXiv240512807H/exportcitation) | -| Grokfast | *Accelerated Grokking by Amplifying Slow Gradients* | [github](https://github.com/ironjr/grokfast) | | [cite](https://github.com/ironjr/grokfast?tab=readme-ov-file#citation) | +| Optimizer | Description | Official Code | Paper | Citation | +|---------------|---------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------| +| AdaBelief | *Adapting Step-sizes by the Belief in Observed Gradients* | [github](https://github.com/juntang-zhuang/Adabelief-Optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201007468Z/exportcitation) | +| AdaBound | *Adaptive Gradient Methods with Dynamic Bound of Learning Rate* | [github](https://github.com/Luolc/AdaBound/blob/master/adabound/adabound.py) | | [cite](https://github.com/Luolc/AdaBound#citing) | +| AdaHessian | *An Adaptive Second Order Optimizer for Machine Learning* | [github](https://github.com/amirgholami/adahessian) | | [cite](https://github.com/amirgholami/adahessian#citation) | +| AdamD | *Improved bias-correction in Adam* | | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv211010828S/exportcitation) | +| AdamP | *Slowing Down the Slowdown for Momentum Optimizers on Scale-invariant Weights* | [github](https://github.com/clovaai/AdamP) | | [cite](https://github.com/clovaai/AdamP#how-to-cite) | +| diffGrad | *An Optimization Method for Convolutional Neural Networks* | [github](https://github.com/shivram1987/diffGrad) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190911015D/exportcitation) | +| MADGRAD | *A Momentumized, Adaptive, Dual Averaged Gradient Method for Stochastic* | [github](https://github.com/facebookresearch/madgrad) | | [cite](https://github.com/facebookresearch/madgrad#tech-report) | +| RAdam | *On the Variance of the Adaptive Learning Rate and Beyond* | [github](https://github.com/LiyuanLucasLiu/RAdam) | | [cite](https://github.com/LiyuanLucasLiu/RAdam#citation) | +| Ranger | *a synergistic optimizer combining RAdam and LookAhead, and now GC in one optimizer* | [github](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer) | | [cite](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer#citing-this-work) | +| Ranger21 | *a synergistic deep learning optimizer* | [github](https://github.com/lessw2020/Ranger21) | | [cite](https://github.com/lessw2020/Ranger21#referencing-this-work) | +| Lamb | *Large Batch Optimization for Deep Learning* | [github](https://github.com/cybertronai/pytorch-lamb) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190400962Y/exportcitation) | +| Shampoo | *Preconditioned Stochastic Tensor Optimization* | 
[github](https://github.com/moskomule/shampoo.pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180209568G/exportcitation) | +| Nero | *Learning by Turning: Neural Architecture Aware Optimisation* | [github](https://github.com/jxbz/nero) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210207227L/exportcitation) | +| Adan | *Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models* | [github](https://github.com/sail-sg/Adan) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220806677X/exportcitation) | +| Adai | *Disentangling the Effects of Adaptive Learning Rate and Momentum* | [github](https://github.com/zeke-xie/adaptive-inertia-adai) | | [cite](https://github.com/zeke-xie/adaptive-inertia-adai#citing) | +| SAM | *Sharpness-Aware Minimization* | [github](https://github.com/davda54/sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201001412F/exportcitation) | +| ASAM | *Adaptive Sharpness-Aware Minimization* | [github](https://github.com/davda54/sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210211600K/exportcitation) | +| GSAM | *Surrogate Gap Guided Sharpness-Aware Minimization* | [github](https://github.com/juntang-zhuang/GSAM) | | [cite](https://github.com/juntang-zhuang/GSAM#citation) | +| D-Adaptation | *Learning-Rate-Free Learning by D-Adaptation* | [github](https://github.com/facebookresearch/dadaptation) | | [cite](https://ui.adsabs.harvard.edu/abs/2023arXiv230107733D/exportcitation) | +| AdaFactor | *Adaptive Learning Rates with Sublinear Memory Cost* | [github](https://github.com/DeadAt0m/adafactor-pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180404235S/exportcitation) | +| Apollo | *An Adaptive Parameter-wise Diagonal Quasi-Newton Method for Nonconvex Stochastic Optimization* | [github](https://github.com/XuezheMax/apollo) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv200913586M/exportcitation) | +| NovoGrad | *Stochastic Gradient Methods with Layer-wise Adaptive Moments for Training of Deep Networks* | [github](https://github.com/lonePatient/NovoGrad-pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190511286G/exportcitation) | +| Lion | *Symbolic Discovery of Optimization Algorithms* | [github](https://github.com/google/automl/tree/master/lion) | | [cite](https://github.com/google/automl/tree/master/lion#citation) | +| Ali-G | *Adaptive Learning Rates for Interpolation with Gradients* | [github](https://github.com/oval-group/ali-g) | | [cite](https://github.com/oval-group/ali-g#adaptive-learning-rates-for-interpolation-with-gradients) | +| SM3 | *Memory-Efficient Adaptive Optimization* | [github](https://github.com/google-research/google-research/tree/master/sm3) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190111150A/exportcitation) | +| AdaNorm | *Adaptive Gradient Norm Correction based Optimizer for CNNs* | [github](https://github.com/shivram1987/AdaNorm) | | [cite](https://github.com/shivram1987/AdaNorm/tree/main#citation) | +| RotoGrad | *Gradient Homogenization in Multitask Learning* | [github](https://github.com/adrianjav/rotograd) | | [cite](https://github.com/adrianjav/rotograd#citing) | +| A2Grad | *Optimal Adaptive and Accelerated Stochastic Gradient Descent* | [github](https://github.com/severilov/A2Grad_optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv181000553D/exportcitation) | +| AccSGD | *Accelerating Stochastic Gradient Descent For Least Squares Regression* | [github](https://github.com/rahulkidambi/AccSGD) | | 
[cite](https://github.com/rahulkidambi/AccSGD#citation) | +| SGDW | *Decoupled Weight Decay Regularization* | [github](https://github.com/loshchil/AdamW-and-SGDW) | | [cite](https://github.com/loshchil/AdamW-and-SGDW#contact) | +| ASGD | *Adaptive Gradient Descent without Descent* | [github](https://github.com/ymalitsky/adaptive_GD) | | [cite](https://github.com/ymalitsky/adaptive_GD#reference) | +| Yogi | *Adaptive Methods for Nonconvex Optimization* | | [NIPS 2018](https://papers.nips.cc/paper/8186-adaptive-methods-for-nonconvex-optimization) | [cite](https://proceedings.neurips.cc/paper_files/paper/2018/hash/90365351ccc7437a1309dc64e4db32a3-Abstract.html) | +| SWATS | *Improving Generalization Performance by Switching from Adam to SGD* | | | [cite](https://ui.adsabs.harvard.edu/abs/2017arXiv171207628S/exportcitation) | +| Fromage | *On the distance between two neural networks and the stability of learning* | [github](https://github.com/jxbz/fromage) | | [cite](https://github.com/jxbz/fromage#citation) | +| MSVAG | *Dissecting Adam: The Sign, Magnitude and Variance of Stochastic Gradients* | [github](https://github.com/lballes/msvag) | | [cite](https://github.com/lballes/msvag#citation) | +| AdaMod | *An Adaptive and Momental Bound Method for Stochastic Learning* | [github](https://github.com/lancopku/AdaMod) | | [cite](https://github.com/lancopku/AdaMod#citation) | +| AggMo | *Aggregated Momentum: Stability Through Passive Damping* | [github](https://github.com/AtheMathmo/AggMo) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180400325L/exportcitation) | +| QHAdam | *Quasi-hyperbolic momentum and Adam for deep learning* | [github](https://github.com/facebookresearch/qhoptim) | | [cite](https://github.com/facebookresearch/qhoptim#reference) | +| PID | *A PID Controller Approach for Stochastic Optimization of Deep Networks* | [github](https://github.com/tensorboy/PIDOptimizer) | [CVPR 18](http://www4.comp.polyu.edu.hk/~cslzhang/paper/CVPR18_PID.pdf) | [cite](https://github.com/tensorboy/PIDOptimizer#citation) | +| Gravity | *a Kinematic Approach on Optimization in Deep Learning* | [github](https://github.com/dariush-bahrami/gravity.optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210109192B/exportcitation) | +| AdaSmooth | *An Adaptive Learning Rate Method based on Effective Ratio* | | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220400825L/exportcitation) | +| SRMM | *Stochastic regularized majorization-minimization with weakly convex and multi-convex surrogates* | [github](https://github.com/HanbaekLyu/SRMM) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220101652L/exportcitation) | +| AvaGrad | *Domain-independent Dominance of Adaptive Methods* | [github](https://github.com/lolemacs/avagrad) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv191201823S/exportcitation) | +| PCGrad | *Gradient Surgery for Multi-Task Learning* | [github](https://github.com/tianheyu927/PCGrad) | | [cite](https://github.com/tianheyu927/PCGrad#reference) | +| AMSGrad | *On the Convergence of Adam and Beyond* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190409237R/exportcitation) | +| Lookahead | *k steps forward, 1 step back* | [github](https://github.com/pytorch/examples/tree/main/imagenet) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190708610Z/exportcitation) | +| PNM | *Manipulating Stochastic Gradient Noise to Improve Generalization* | [github](https://github.com/zeke-xie/Positive-Negative-Momentum) | | 
[cite](https://github.com/zeke-xie/Positive-Negative-Momentum#citing) | +| GC | *Gradient Centralization* | [github](https://github.com/Yonghongwei/Gradient-Centralization) | | [cite](https://github.com/Yonghongwei/Gradient-Centralization#citation) | +| AGC | *Adaptive Gradient Clipping* | [github](https://github.com/deepmind/deepmind-research/tree/master/nfnets) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210206171B/exportcitation) | +| Stable WD | *Understanding and Scheduling Weight Decay* | [github](https://github.com/zeke-xie/stable-weight-decay-regularization) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201111152X/exportcitation) | +| Softplus T | *Calibrating the Adaptive Learning Rate to Improve Convergence of ADAM* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190800700T/exportcitation) | +| Un-tuned w/u | *On the adequacy of untuned warmup for adaptive optimization* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv191004209M/exportcitation) | +| Norm Loss | *An efficient yet effective regularization method for deep neural networks* | | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210306583G/exportcitation) | +| AdaShift | *Decorrelation and Convergence of Adaptive Learning Rate Methods* | [github](https://github.com/MichaelKonobeev/adashift) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv181000143Z/exportcitation) | +| AdaDelta | *An Adaptive Learning Rate Method* | | | [cite](https://ui.adsabs.harvard.edu/abs/2012arXiv1212.5701Z/exportcitation) | +| Amos | *An Adam-style Optimizer with Adaptive Weight Decay towards Model-Oriented Scale* | [github](https://github.com/google-research/jestimator) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv221011693T/exportcitation) | +| SignSGD | *Compressed Optimisation for Non-Convex Problems* | [github](https://github.com/jxbz/signSGD) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180204434B/exportcitation) | +| Sophia | *A Scalable Stochastic Second-order Optimizer for Language Model Pre-training* | [github](https://github.com/Liuhong99/Sophia) | | [cite](https://github.com/Liuhong99/Sophia) | +| Prodigy | *An Expeditiously Adaptive Parameter-Free Learner* | [github](https://github.com/konstmish/prodigy) | | [cite](https://github.com/konstmish/prodigy#how-to-cite) | +| PAdam | *Closing the Generalization Gap of Adaptive Gradient Methods in Training Deep Neural Networks* | [github](https://github.com/uclaml/Padam) | | [cite](https://github.com/uclaml/Padam#citation) | +| LOMO | *Full Parameter Fine-tuning for Large Language Models with Limited Resources* | [github](https://github.com/OpenLMLab/LOMO) | | [cite](https://github.com/OpenLMLab/LOMO#citation) | +| Tiger | *A Tight-fisted Optimizer, an optimizer that is extremely budget-conscious* | [github](https://github.com/bojone/tiger) | | [cite](https://github.com/bojone/tiger/blob/main/README_en.md#citation) | +| CAME | *Confidence-guided Adaptive Memory Efficient Optimization* | [github](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/CAME) | | [cite](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/CAME#citation) | +| WSAM | *Sharpness-Aware Minimization Revisited: Weighted Sharpness as a Regularization Term* | [github](https://github.com/intelligent-machine-learning/dlrover/blob/master/atorch/atorch/optimizers/wsam.py) | | [cite](https://github.com/intelligent-machine-learning/dlrover) | +| Aida | *A DNN Optimizer that Improves over AdaBelief by Suppression of the Adaptive Stepsize 
+| GaLore | *Memory-Efficient LLM Training by Gradient Low-Rank Projection* | [github](https://github.com/jiaweizzhao/GaLore) | | [cite](https://github.com/jiaweizzhao/GaLore/tree/master?tab=readme-ov-file#citation) |
+| Adalite | *Adalite optimizer* | [github](https://github.com/VatsaDev/adalite) | | [cite](https://github.com/VatsaDev/adalite) |
+| bSAM | *SAM as an Optimal Relaxation of Bayes* | [github](https://github.com/team-approx-bayes/bayesian-sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv221001620M/exportcitation) |
+| Schedule-Free | *Schedule-Free Optimizers* | [github](https://github.com/facebookresearch/schedule_free) | | [cite](https://github.com/facebookresearch/schedule_free) |
+| FAdam | *Adam is a natural gradient optimizer using diagonal empirical Fisher information* | [github](https://github.com/lessw2020/fadam_pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2024arXiv240512807H/exportcitation) |
+| Grokfast | *Accelerated Grokking by Amplifying Slow Gradients* | [github](https://github.com/ironjr/grokfast) | | [cite](https://github.com/ironjr/grokfast?tab=readme-ov-file#citation) |
+| Kate | *Remove that Square Root: A New Efficient Scale-Invariant Version of AdaGrad* | [github](https://github.com/nazya/KATE) | | [cite](https://github.com/nazya/KATE?tab=readme-ov-file#remove-that-square-root-a-new-efficient-scale-invariant-version-of-adagrad) |

 ## Supported LR Scheduler
diff --git a/docs/optimizer.md b/docs/optimizer.md
index 94f94f933..3d74a4365 100644
--- a/docs/optimizer.md
+++ b/docs/optimizer.md
@@ -172,6 +172,10 @@
     :docstring:
     :members:

+::: pytorch_optimizer.Kate
+    :docstring:
+    :members:
+
 ::: pytorch_optimizer.Lamb
     :docstring:
     :members:
diff --git a/poetry.lock b/poetry.lock
index de3c4df26..da0f176fa 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -547,29 +547,29 @@ testing = ["fields", "hunter", "process-tests", "pytest-xdist", "virtualenv"]

 [[package]]
 name = "ruff"
-version = "0.5.0"
+version = "0.5.1"
 description = "An extremely fast Python linter and code formatter, written in Rust."
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "ruff-0.5.0-py3-none-linux_armv6l.whl", hash = "sha256:ee770ea8ab38918f34e7560a597cc0a8c9a193aaa01bfbd879ef43cb06bd9c4c"},
-    {file = "ruff-0.5.0-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:38f3b8327b3cb43474559d435f5fa65dacf723351c159ed0dc567f7ab735d1b6"},
-    {file = "ruff-0.5.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:7594f8df5404a5c5c8f64b8311169879f6cf42142da644c7e0ba3c3f14130370"},
-    {file = "ruff-0.5.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:adc7012d6ec85032bc4e9065110df205752d64010bed5f958d25dbee9ce35de3"},
-    {file = "ruff-0.5.0-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d505fb93b0fabef974b168d9b27c3960714d2ecda24b6ffa6a87ac432905ea38"},
-    {file = "ruff-0.5.0-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9dc5cfd3558f14513ed0d5b70ce531e28ea81a8a3b1b07f0f48421a3d9e7d80a"},
-    {file = "ruff-0.5.0-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:db3ca35265de239a1176d56a464b51557fce41095c37d6c406e658cf80bbb362"},
-    {file = "ruff-0.5.0-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b1a321c4f68809fddd9b282fab6a8d8db796b270fff44722589a8b946925a2a8"},
-    {file = "ruff-0.5.0-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2c4dfcd8d34b143916994b3876b63d53f56724c03f8c1a33a253b7b1e6bf2a7d"},
-    {file = "ruff-0.5.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:81e5facfc9f4a674c6a78c64d38becfbd5e4f739c31fcd9ce44c849f1fad9e4c"},
-    {file = "ruff-0.5.0-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:e589e27971c2a3efff3fadafb16e5aef7ff93250f0134ec4b52052b673cf988d"},
-    {file = "ruff-0.5.0-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:d2ffbc3715a52b037bcb0f6ff524a9367f642cdc5817944f6af5479bbb2eb50e"},
-    {file = "ruff-0.5.0-py3-none-musllinux_1_2_i686.whl", hash = "sha256:cd096e23c6a4f9c819525a437fa0a99d1c67a1b6bb30948d46f33afbc53596cf"},
-    {file = "ruff-0.5.0-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:46e193b36f2255729ad34a49c9a997d506e58f08555366b2108783b3064a0e1e"},
-    {file = "ruff-0.5.0-py3-none-win32.whl", hash = "sha256:49141d267100f5ceff541b4e06552e98527870eafa1acc9dec9139c9ec5af64c"},
-    {file = "ruff-0.5.0-py3-none-win_amd64.whl", hash = "sha256:e9118f60091047444c1b90952736ee7b1792910cab56e9b9a9ac20af94cd0440"},
-    {file = "ruff-0.5.0-py3-none-win_arm64.whl", hash = "sha256:ed5c4df5c1fb4518abcb57725b576659542bdbe93366f4f329e8f398c4b71178"},
-    {file = "ruff-0.5.0.tar.gz", hash = "sha256:eb641b5873492cf9bd45bc9c5ae5320648218e04386a5f0c264ad6ccce8226a1"},
+    {file = "ruff-0.5.1-py3-none-linux_armv6l.whl", hash = "sha256:6ecf968fcf94d942d42b700af18ede94b07521bd188aaf2cd7bc898dd8cb63b6"},
+    {file = "ruff-0.5.1-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:204fb0a472f00f2e6280a7c8c7c066e11e20e23a37557d63045bf27a616ba61c"},
+    {file = "ruff-0.5.1-py3-none-macosx_11_0_arm64.whl", hash = "sha256:d235968460e8758d1e1297e1de59a38d94102f60cafb4d5382033c324404ee9d"},
+    {file = "ruff-0.5.1-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:38beace10b8d5f9b6bdc91619310af6d63dd2019f3fb2d17a2da26360d7962fa"},
+    {file = "ruff-0.5.1-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5e478d2f09cf06add143cf8c4540ef77b6599191e0c50ed976582f06e588c994"},
+    {file = "ruff-0.5.1-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f0368d765eec8247b8550251c49ebb20554cc4e812f383ff9f5bf0d5d94190b0"},
"sha256:f0368d765eec8247b8550251c49ebb20554cc4e812f383ff9f5bf0d5d94190b0"}, + {file = "ruff-0.5.1-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:3a9a9a1b582e37669b0138b7c1d9d60b9edac880b80eb2baba6d0e566bdeca4d"}, + {file = "ruff-0.5.1-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bdd9f723e16003623423affabcc0a807a66552ee6a29f90eddad87a40c750b78"}, + {file = "ruff-0.5.1-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:be9fd62c1e99539da05fcdc1e90d20f74aec1b7a1613463ed77870057cd6bd96"}, + {file = "ruff-0.5.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e216fc75a80ea1fbd96af94a6233d90190d5b65cc3d5dfacf2bd48c3e067d3e1"}, + {file = "ruff-0.5.1-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:c4c2112e9883a40967827d5c24803525145e7dab315497fae149764979ac7929"}, + {file = "ruff-0.5.1-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:dfaf11c8a116394da3b65cd4b36de30d8552fa45b8119b9ef5ca6638ab964fa3"}, + {file = "ruff-0.5.1-py3-none-musllinux_1_2_i686.whl", hash = "sha256:d7ceb9b2fe700ee09a0c6b192c5ef03c56eb82a0514218d8ff700f6ade004108"}, + {file = "ruff-0.5.1-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:bac6288e82f6296f82ed5285f597713acb2a6ae26618ffc6b429c597b392535c"}, + {file = "ruff-0.5.1-py3-none-win32.whl", hash = "sha256:5c441d9c24ec09e1cb190a04535c5379b36b73c4bc20aa180c54812c27d1cca4"}, + {file = "ruff-0.5.1-py3-none-win_amd64.whl", hash = "sha256:b1789bf2cd3d1b5a7d38397cac1398ddf3ad7f73f4de01b1e913e2abc7dfc51d"}, + {file = "ruff-0.5.1-py3-none-win_arm64.whl", hash = "sha256:2875b7596a740cbbd492f32d24be73e545a4ce0a3daf51e4f4e609962bfd3cd2"}, + {file = "ruff-0.5.1.tar.gz", hash = "sha256:3164488aebd89b1745b47fd00604fb4358d774465f20d1fcd907f9c0fc1b0655"}, ] [[package]] diff --git a/pyproject.toml b/pyproject.toml index f9f579bc6..f00a6429d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,8 +14,8 @@ keywords = [ "AdaDelta", "AdaFactor", "AdaMax", "AdaMod", "AdaNorm", "AdaPNM", "AdaSmooth", "AdaHessian", "Adai", "Adalite", "AdamP", "AdamS", "Adan", "AggMo", "Aida", "AliG", "Amos", "Apollo", "AvaGrad", "bSAM", "CAME", "DAdaptAdaGrad", "DAdaptAdam", "DAdaptAdan", "DAdaptSGD", "DAdaptLion", "DiffGrad", "FAdam", "Fromage", "GaLore", "Gravity", - "GrokFast", "GSAM", "LARS", "Lamb", "Lion", "LOMO", "Lookahead", "MADGRAD", "MSVAG", "Nero", "NovoGrad", "PAdam", - "PCGrad", "PID", "PNM", "Prodigy", "QHAdam", "QHM", "RAdam", "Ranger", "Ranger21", "RotoGrad", "SAM", + "GrokFast", "GSAM", "Kate", "Lamb", "LARS", "Lion", "LOMO", "Lookahead", "MADGRAD", "MSVAG", "Nero", "NovoGrad", + "PAdam", "PCGrad", "PID", "PNM", "Prodigy", "QHAdam", "QHM", "RAdam", "Ranger", "Ranger21", "RotoGrad", "SAM", "ScheduleFreeSGD", "ScheduleFreeAdamW", "SGDP", "Shampoo", "ScalableShampoo", "SGDW", "SignSGD", "SM3", "SopihaH", "SRMM", "SWATS", "Tiger", "WSAM", "Yogi", "BCE", "BCEFocal", "Focal", "FocalCosine", "SoftF1", "Dice", "LDAM", "Jaccard", "Bi-Tempered", "Tversky", "FocalTversky", "LovaszHinge", "bitsandbytes", "WSD", diff --git a/pytorch_optimizer/__init__.py b/pytorch_optimizer/__init__.py index 3dd59f58c..340232fe6 100644 --- a/pytorch_optimizer/__init__.py +++ b/pytorch_optimizer/__init__.py @@ -66,6 +66,7 @@ from pytorch_optimizer.optimizer.gc import centralize_gradient from pytorch_optimizer.optimizer.gravity import Gravity from pytorch_optimizer.optimizer.grokfast import GrokFastAdamW, gradfilter_ema, gradfilter_ma +from pytorch_optimizer.optimizer.kate import Kate from pytorch_optimizer.optimizer.lamb 
 from pytorch_optimizer.optimizer.lars import LARS
 from pytorch_optimizer.optimizer.lion import Lion
@@ -199,6 +200,7 @@
     ScheduleFreeAdamW,
     FAdam,
     GrokFastAdamW,
+    Kate,
 ]

 OPTIMIZERS: Dict[str, OPTIMIZER] = {str(optimizer.__name__).lower(): optimizer for optimizer in OPTIMIZER_LIST}
diff --git a/pytorch_optimizer/optimizer/kate.py b/pytorch_optimizer/optimizer/kate.py
new file mode 100644
index 000000000..5ea26c0f9
--- /dev/null
+++ b/pytorch_optimizer/optimizer/kate.py
@@ -0,0 +1,109 @@
+import torch
+from torch.optim.optimizer import Optimizer
+
+from pytorch_optimizer.base.exception import NoSparseGradientError
+from pytorch_optimizer.base.optimizer import BaseOptimizer
+from pytorch_optimizer.base.types import CLOSURE, DEFAULTS, LOSS, PARAMETERS
+
+
+class Kate(Optimizer, BaseOptimizer):
+    r"""Remove that Square Root: A New Efficient Scale-Invariant Version of AdaGrad.
+
+    :param params: PARAMETERS. iterable of parameters to optimize or dicts defining parameter groups.
+    :param lr: float. learning rate.
+    :param delta: float. delta. 0.0 or 1e-8.
+    :param weight_decay: float. weight decay (L2 penalty).
+    :param weight_decouple: bool. the optimizer uses decoupled weight decay as in AdamW.
+    :param fixed_decay: bool. fix weight decay.
+    :param eps: float. epsilon value.
+    """
+
+    def __init__(
+        self,
+        params: PARAMETERS,
+        lr: float = 1e-3,
+        delta: float = 0.0,
+        weight_decay: float = 0.0,
+        weight_decouple: bool = True,
+        fixed_decay: bool = False,
+        eps: float = 1e-8,
+    ):
+        self.validate_learning_rate(lr)
+        self.validate_range(delta, 'delta', 0.0, 1.0, '[)')
+        self.validate_non_negative(weight_decay, 'weight_decay')
+        self.validate_non_negative(eps, 'eps')
+
+        defaults: DEFAULTS = {
+            'lr': lr,
+            'delta': delta,
+            'weight_decay': weight_decay,
+            'weight_decouple': weight_decouple,
+            'fixed_decay': fixed_decay,
+            'eps': eps,
+        }
+
+        super().__init__(params, defaults)
+
+    def __str__(self) -> str:
+        return 'Kate'
+
+    @torch.no_grad()
+    def reset(self):
+        for group in self.param_groups:
+            group['step'] = 0
+            for p in group['params']:
+                state = self.state[p]
+
+                state['m'] = torch.zeros_like(p)
+                state['b'] = torch.zeros_like(p)
+
+    @torch.no_grad()
+    def step(self, closure: CLOSURE = None) -> LOSS:
+        loss: LOSS = None
+        if closure is not None:
+            with torch.enable_grad():
+                loss = closure()
+
+        for group in self.param_groups:
+            if 'step' in group:
+                group['step'] += 1
+            else:
+                group['step'] = 1
+
+            for p in group['params']:
+                if p.grad is None:
+                    continue
+
+                grad = p.grad
+                if grad.is_sparse:
+                    raise NoSparseGradientError(str(self))
+
+                state = self.state[p]
+
+                if len(state) == 0:
+                    state['m'] = torch.zeros_like(p)
+                    state['b'] = torch.zeros_like(p)
+
+                self.apply_weight_decay(
+                    p=p,
+                    grad=p.grad,
+                    lr=group['lr'],
+                    weight_decay=group['weight_decay'],
+                    weight_decouple=group['weight_decouple'],
+                    fixed_decay=group['fixed_decay'],
+                )
+
+                grad_p2 = torch.mul(grad, grad)
+
+                m, b = state['m'], state['b']
+                b.mul_(b).add_(grad_p2).add_(group['eps'])
+
+                m.mul_(m).add_(grad_p2, alpha=group['delta']).add_(grad_p2 / b).sqrt_()
+
+                update = m.mul(grad).div_(b)
+
+                p.add_(update, alpha=-group['lr'])
+
+                b.sqrt_()
+
+        return loss
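A quick way to exercise the new `Kate` class added above is shown below. This is a minimal sketch, not part of the change itself: the toy model and data are invented for illustration, and the `lr=5e-2` value simply mirrors the test constant added to `tests/constants.py` further down.

```python
import torch
from torch import nn

from pytorch_optimizer import Kate  # re-exported via pytorch_optimizer/__init__.py in this change

model = nn.Linear(10, 1)  # toy model, for illustration only
optimizer = Kate(model.parameters(), lr=5e-2, weight_decay=1e-3)

x, y = torch.randn(64, 10), torch.randn(64, 1)

for _ in range(10):
    optimizer.zero_grad()
    loss = nn.functional.mse_loss(model(x), y)
    loss.backward()  # Kate reads dense gradients in step(); sparse gradients raise NoSparseGradientError
    optimizer.step()
```

Note that `delta` defaults to `0.0`; per the docstring above, `0.0` or `1e-8` are the intended settings, and `weight_decouple=True` applies AdamW-style decoupled weight decay.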
"3.8" and python_full_version < "4.0.0" pytest==8.2.2 ; python_version >= "3.8" and python_full_version < "4.0.0" -ruff==0.5.0 ; python_version >= "3.8" and python_full_version < "4.0.0" +ruff==0.5.1 ; python_version >= "3.8" and python_full_version < "4.0.0" sympy==1.12.1 ; python_version >= "3.8" and python_full_version < "4.0.0" tbb==2021.13.0 ; python_version >= "3.8" and python_full_version < "4.0.0" and platform_system == "Windows" tomli==2.0.1 ; python_version >= "3.8" and python_full_version <= "3.11.0a6" diff --git a/tests/constants.py b/tests/constants.py index 4d3b1183a..7daf9d6a8 100644 --- a/tests/constants.py +++ b/tests/constants.py @@ -50,6 +50,7 @@ GaLore, Gravity, GrokFastAdamW, + Kate, Lamb, Lion, Nero, @@ -461,6 +462,7 @@ (ScheduleFreeAdamW, {'lr': 1e0, 'weight_decay': 1e-3}, 5), (FAdam, {'lr': 1e0, 'weight_decay': 1e-3}, 5), (GrokFastAdamW, {'lr': 1e0, 'weight_decay': 1e-3}, 10), + (Kate, {'lr': 5e-2}, 10), ] ADANORM_SUPPORTED_OPTIMIZERS: List[Tuple[Any, Dict[str, Union[float, bool, int]], int]] = [ (AdaBelief, {'lr': 5e-1, 'weight_decay': 1e-3, 'adanorm': True}, 10), diff --git a/tests/test_load_modules.py b/tests/test_load_modules.py index 3b1620a38..d13f0877b 100644 --- a/tests/test_load_modules.py +++ b/tests/test_load_modules.py @@ -38,7 +38,7 @@ def test_load_lr_scheduler_invalid(invalid_lr_scheduler_names): def test_get_supported_optimizers(): - assert len(get_supported_optimizers()) == 68 + assert len(get_supported_optimizers()) == 69 def test_get_supported_lr_schedulers():