diff --git a/README.md b/README.md index 38fc998f1..89095bbe2 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ **pytorch-optimizer** is optimizer & lr scheduler collections in PyTorch. I just re-implemented (speed & memory tweaks, plug-ins) the algorithm while based on the original paper. Also, It includes useful and practical optimization ideas. -Currently, **72 optimizers (+ `bitsandbytes`)**, **16 lr schedulers**, and **13 loss functions** are supported! +Currently, **73 optimizers (+ `bitsandbytes`, `qgalore`)**, **16 lr schedulers**, and **13 loss functions** are supported! Highly inspired by [pytorch-optimizer](https://github.com/jettify/pytorch-optimizer). @@ -27,8 +27,9 @@ So, please double-check the license before using it at your work. $ pip3 install pytorch-optimizer ``` -From `v2.12.0`, you can install and import `bitsandbytes` optimizers. -please check [the requirements](https://github.com/TimDettmers/bitsandbytes?tab=readme-ov-file#tldr) before installing it. +From `v2.12.0`, `v3.1.0`, you can use `bitsandbytes`, `q-galore-torch` optimizers respectively! +please check [the bnb requirements](https://github.com/TimDettmers/bitsandbytes?tab=readme-ov-file#tldr), [q-galore-torch installation](https://github.com/VITA-Group/Q-GaLore?tab=readme-ov-file#install-q-galore-optimizer) + before installing it. From `v3.0.0`, drop `Python 3.7` support. However, you can still use this package with `Python 3.7` by installing with `--ignore-requires-python` option. @@ -93,82 +94,83 @@ from pytorch_optimizer import get_supported_optimizers supported_optimizers = get_supported_optimizers() ``` -| Optimizer | Description | Official Code | Paper | Citation | -|---------------|---------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------| -| AdaBelief | *Adapting Step-sizes by the Belief in Observed Gradients* | [github](https://github.com/juntang-zhuang/Adabelief-Optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201007468Z/exportcitation) | -| AdaBound | *Adaptive Gradient Methods with Dynamic Bound of Learning Rate* | [github](https://github.com/Luolc/AdaBound/blob/master/adabound/adabound.py) | | [cite](https://github.com/Luolc/AdaBound#citing) | -| AdaHessian | *An Adaptive Second Order Optimizer for Machine Learning* | [github](https://github.com/amirgholami/adahessian) | | [cite](https://github.com/amirgholami/adahessian#citation) | -| AdamD | *Improved bias-correction in Adam* | | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv211010828S/exportcitation) | -| AdamP | *Slowing Down the Slowdown for Momentum Optimizers on Scale-invariant Weights* | [github](https://github.com/clovaai/AdamP) | | [cite](https://github.com/clovaai/AdamP#how-to-cite) | -| diffGrad | *An Optimization Method for Convolutional Neural Networks* | [github](https://github.com/shivram1987/diffGrad) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190911015D/exportcitation) | -| MADGRAD | *A Momentumized, Adaptive, Dual Averaged Gradient Method for Stochastic* | [github](https://github.com/facebookresearch/madgrad) | | [cite](https://github.com/facebookresearch/madgrad#tech-report) | -| RAdam | *On the Variance of the 
Adaptive Learning Rate and Beyond* | [github](https://github.com/LiyuanLucasLiu/RAdam) | | [cite](https://github.com/LiyuanLucasLiu/RAdam#citation) | -| Ranger | *a synergistic optimizer combining RAdam and LookAhead, and now GC in one optimizer* | [github](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer) | | [cite](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer#citing-this-work) | -| Ranger21 | *a synergistic deep learning optimizer* | [github](https://github.com/lessw2020/Ranger21) | | [cite](https://github.com/lessw2020/Ranger21#referencing-this-work) | -| Lamb | *Large Batch Optimization for Deep Learning* | [github](https://github.com/cybertronai/pytorch-lamb) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190400962Y/exportcitation) | -| Shampoo | *Preconditioned Stochastic Tensor Optimization* | [github](https://github.com/moskomule/shampoo.pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180209568G/exportcitation) | -| Nero | *Learning by Turning: Neural Architecture Aware Optimisation* | [github](https://github.com/jxbz/nero) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210207227L/exportcitation) | -| Adan | *Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models* | [github](https://github.com/sail-sg/Adan) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220806677X/exportcitation) | -| Adai | *Disentangling the Effects of Adaptive Learning Rate and Momentum* | [github](https://github.com/zeke-xie/adaptive-inertia-adai) | | [cite](https://github.com/zeke-xie/adaptive-inertia-adai#citing) | -| SAM | *Sharpness-Aware Minimization* | [github](https://github.com/davda54/sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201001412F/exportcitation) | -| ASAM | *Adaptive Sharpness-Aware Minimization* | [github](https://github.com/davda54/sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210211600K/exportcitation) | -| GSAM | *Surrogate Gap Guided Sharpness-Aware Minimization* | [github](https://github.com/juntang-zhuang/GSAM) | | [cite](https://github.com/juntang-zhuang/GSAM#citation) | -| D-Adaptation | *Learning-Rate-Free Learning by D-Adaptation* | [github](https://github.com/facebookresearch/dadaptation) | | [cite](https://ui.adsabs.harvard.edu/abs/2023arXiv230107733D/exportcitation) | -| AdaFactor | *Adaptive Learning Rates with Sublinear Memory Cost* | [github](https://github.com/DeadAt0m/adafactor-pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180404235S/exportcitation) | -| Apollo | *An Adaptive Parameter-wise Diagonal Quasi-Newton Method for Nonconvex Stochastic Optimization* | [github](https://github.com/XuezheMax/apollo) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv200913586M/exportcitation) | -| NovoGrad | *Stochastic Gradient Methods with Layer-wise Adaptive Moments for Training of Deep Networks* | [github](https://github.com/lonePatient/NovoGrad-pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190511286G/exportcitation) | -| Lion | *Symbolic Discovery of Optimization Algorithms* | [github](https://github.com/google/automl/tree/master/lion) | | [cite](https://github.com/google/automl/tree/master/lion#citation) | -| Ali-G | *Adaptive Learning Rates for Interpolation with Gradients* | [github](https://github.com/oval-group/ali-g) | | [cite](https://github.com/oval-group/ali-g#adaptive-learning-rates-for-interpolation-with-gradients) | -| SM3 | *Memory-Efficient Adaptive Optimization* | 
[github](https://github.com/google-research/google-research/tree/master/sm3) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190111150A/exportcitation) | -| AdaNorm | *Adaptive Gradient Norm Correction based Optimizer for CNNs* | [github](https://github.com/shivram1987/AdaNorm) | | [cite](https://github.com/shivram1987/AdaNorm/tree/main#citation) | -| RotoGrad | *Gradient Homogenization in Multitask Learning* | [github](https://github.com/adrianjav/rotograd) | | [cite](https://github.com/adrianjav/rotograd#citing) | -| A2Grad | *Optimal Adaptive and Accelerated Stochastic Gradient Descent* | [github](https://github.com/severilov/A2Grad_optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv181000553D/exportcitation) | -| AccSGD | *Accelerating Stochastic Gradient Descent For Least Squares Regression* | [github](https://github.com/rahulkidambi/AccSGD) | | [cite](https://github.com/rahulkidambi/AccSGD#citation) | -| SGDW | *Decoupled Weight Decay Regularization* | [github](https://github.com/loshchil/AdamW-and-SGDW) | | [cite](https://github.com/loshchil/AdamW-and-SGDW#contact) | -| ASGD | *Adaptive Gradient Descent without Descent* | [github](https://github.com/ymalitsky/adaptive_GD) | | [cite](https://github.com/ymalitsky/adaptive_GD#reference) | -| Yogi | *Adaptive Methods for Nonconvex Optimization* | | [NIPS 2018](https://papers.nips.cc/paper/8186-adaptive-methods-for-nonconvex-optimization) | [cite](https://proceedings.neurips.cc/paper_files/paper/2018/hash/90365351ccc7437a1309dc64e4db32a3-Abstract.html) | -| SWATS | *Improving Generalization Performance by Switching from Adam to SGD* | | | [cite](https://ui.adsabs.harvard.edu/abs/2017arXiv171207628S/exportcitation) | -| Fromage | *On the distance between two neural networks and the stability of learning* | [github](https://github.com/jxbz/fromage) | | [cite](https://github.com/jxbz/fromage#citation) | -| MSVAG | *Dissecting Adam: The Sign, Magnitude and Variance of Stochastic Gradients* | [github](https://github.com/lballes/msvag) | | [cite](https://github.com/lballes/msvag#citation) | -| AdaMod | *An Adaptive and Momental Bound Method for Stochastic Learning* | [github](https://github.com/lancopku/AdaMod) | | [cite](https://github.com/lancopku/AdaMod#citation) | -| AggMo | *Aggregated Momentum: Stability Through Passive Damping* | [github](https://github.com/AtheMathmo/AggMo) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180400325L/exportcitation) | -| QHAdam | *Quasi-hyperbolic momentum and Adam for deep learning* | [github](https://github.com/facebookresearch/qhoptim) | | [cite](https://github.com/facebookresearch/qhoptim#reference) | -| PID | *A PID Controller Approach for Stochastic Optimization of Deep Networks* | [github](https://github.com/tensorboy/PIDOptimizer) | [CVPR 18](http://www4.comp.polyu.edu.hk/~cslzhang/paper/CVPR18_PID.pdf) | [cite](https://github.com/tensorboy/PIDOptimizer#citation) | -| Gravity | *a Kinematic Approach on Optimization in Deep Learning* | [github](https://github.com/dariush-bahrami/gravity.optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210109192B/exportcitation) | -| AdaSmooth | *An Adaptive Learning Rate Method based on Effective Ratio* | | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220400825L/exportcitation) | +| Optimizer | Description | Official Code | Paper | Citation | 
+|---------------|--------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------| +| AdaBelief | *Adapting Step-sizes by the Belief in Observed Gradients* | [github](https://github.com/juntang-zhuang/Adabelief-Optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201007468Z/exportcitation) | +| AdaBound | *Adaptive Gradient Methods with Dynamic Bound of Learning Rate* | [github](https://github.com/Luolc/AdaBound/blob/master/adabound/adabound.py) | | [cite](https://github.com/Luolc/AdaBound#citing) | +| AdaHessian | *An Adaptive Second Order Optimizer for Machine Learning* | [github](https://github.com/amirgholami/adahessian) | | [cite](https://github.com/amirgholami/adahessian#citation) | +| AdamD | *Improved bias-correction in Adam* | | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv211010828S/exportcitation) | +| AdamP | *Slowing Down the Slowdown for Momentum Optimizers on Scale-invariant Weights* | [github](https://github.com/clovaai/AdamP) | | [cite](https://github.com/clovaai/AdamP#how-to-cite) | +| diffGrad | *An Optimization Method for Convolutional Neural Networks* | [github](https://github.com/shivram1987/diffGrad) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190911015D/exportcitation) | +| MADGRAD | *A Momentumized, Adaptive, Dual Averaged Gradient Method for Stochastic* | [github](https://github.com/facebookresearch/madgrad) | | [cite](https://github.com/facebookresearch/madgrad#tech-report) | +| RAdam | *On the Variance of the Adaptive Learning Rate and Beyond* | [github](https://github.com/LiyuanLucasLiu/RAdam) | | [cite](https://github.com/LiyuanLucasLiu/RAdam#citation) | +| Ranger | *a synergistic optimizer combining RAdam and LookAhead, and now GC in one optimizer* | [github](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer) | | [cite](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer#citing-this-work) | +| Ranger21 | *a synergistic deep learning optimizer* | [github](https://github.com/lessw2020/Ranger21) | | [cite](https://github.com/lessw2020/Ranger21#referencing-this-work) | +| Lamb | *Large Batch Optimization for Deep Learning* | [github](https://github.com/cybertronai/pytorch-lamb) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190400962Y/exportcitation) | +| Shampoo | *Preconditioned Stochastic Tensor Optimization* | [github](https://github.com/moskomule/shampoo.pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180209568G/exportcitation) | +| Nero | *Learning by Turning: Neural Architecture Aware Optimisation* | [github](https://github.com/jxbz/nero) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210207227L/exportcitation) | +| Adan | *Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models* | [github](https://github.com/sail-sg/Adan) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220806677X/exportcitation) | +| Adai | *Disentangling the Effects of Adaptive Learning Rate and Momentum* | [github](https://github.com/zeke-xie/adaptive-inertia-adai) | | [cite](https://github.com/zeke-xie/adaptive-inertia-adai#citing) | +| SAM | *Sharpness-Aware Minimization* | [github](https://github.com/davda54/sam) | | 
[cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201001412F/exportcitation) | +| ASAM | *Adaptive Sharpness-Aware Minimization* | [github](https://github.com/davda54/sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210211600K/exportcitation) | +| GSAM | *Surrogate Gap Guided Sharpness-Aware Minimization* | [github](https://github.com/juntang-zhuang/GSAM) | | [cite](https://github.com/juntang-zhuang/GSAM#citation) | +| D-Adaptation | *Learning-Rate-Free Learning by D-Adaptation* | [github](https://github.com/facebookresearch/dadaptation) | | [cite](https://ui.adsabs.harvard.edu/abs/2023arXiv230107733D/exportcitation) | +| AdaFactor | *Adaptive Learning Rates with Sublinear Memory Cost* | [github](https://github.com/DeadAt0m/adafactor-pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180404235S/exportcitation) | +| Apollo | *An Adaptive Parameter-wise Diagonal Quasi-Newton Method for Nonconvex Stochastic Optimization* | [github](https://github.com/XuezheMax/apollo) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv200913586M/exportcitation) | +| NovoGrad | *Stochastic Gradient Methods with Layer-wise Adaptive Moments for Training of Deep Networks* | [github](https://github.com/lonePatient/NovoGrad-pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190511286G/exportcitation) | +| Lion | *Symbolic Discovery of Optimization Algorithms* | [github](https://github.com/google/automl/tree/master/lion) | | [cite](https://github.com/google/automl/tree/master/lion#citation) | +| Ali-G | *Adaptive Learning Rates for Interpolation with Gradients* | [github](https://github.com/oval-group/ali-g) | | [cite](https://github.com/oval-group/ali-g#adaptive-learning-rates-for-interpolation-with-gradients) | +| SM3 | *Memory-Efficient Adaptive Optimization* | [github](https://github.com/google-research/google-research/tree/master/sm3) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190111150A/exportcitation) | +| AdaNorm | *Adaptive Gradient Norm Correction based Optimizer for CNNs* | [github](https://github.com/shivram1987/AdaNorm) | | [cite](https://github.com/shivram1987/AdaNorm/tree/main#citation) | +| RotoGrad | *Gradient Homogenization in Multitask Learning* | [github](https://github.com/adrianjav/rotograd) | | [cite](https://github.com/adrianjav/rotograd#citing) | +| A2Grad | *Optimal Adaptive and Accelerated Stochastic Gradient Descent* | [github](https://github.com/severilov/A2Grad_optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv181000553D/exportcitation) | +| AccSGD | *Accelerating Stochastic Gradient Descent For Least Squares Regression* | [github](https://github.com/rahulkidambi/AccSGD) | | [cite](https://github.com/rahulkidambi/AccSGD#citation) | +| SGDW | *Decoupled Weight Decay Regularization* | [github](https://github.com/loshchil/AdamW-and-SGDW) | | [cite](https://github.com/loshchil/AdamW-and-SGDW#contact) | +| ASGD | *Adaptive Gradient Descent without Descent* | [github](https://github.com/ymalitsky/adaptive_GD) | | [cite](https://github.com/ymalitsky/adaptive_GD#reference) | +| Yogi | *Adaptive Methods for Nonconvex Optimization* | | [NIPS 2018](https://papers.nips.cc/paper/8186-adaptive-methods-for-nonconvex-optimization) | [cite](https://proceedings.neurips.cc/paper_files/paper/2018/hash/90365351ccc7437a1309dc64e4db32a3-Abstract.html) | +| SWATS | *Improving Generalization Performance by Switching from Adam to SGD* | | | [cite](https://ui.adsabs.harvard.edu/abs/2017arXiv171207628S/exportcitation) | +| Fromage | *On the distance 
between two neural networks and the stability of learning* | [github](https://github.com/jxbz/fromage) | | [cite](https://github.com/jxbz/fromage#citation) | +| MSVAG | *Dissecting Adam: The Sign, Magnitude and Variance of Stochastic Gradients* | [github](https://github.com/lballes/msvag) | | [cite](https://github.com/lballes/msvag#citation) | +| AdaMod | *An Adaptive and Momental Bound Method for Stochastic Learning* | [github](https://github.com/lancopku/AdaMod) | | [cite](https://github.com/lancopku/AdaMod#citation) | +| AggMo | *Aggregated Momentum: Stability Through Passive Damping* | [github](https://github.com/AtheMathmo/AggMo) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180400325L/exportcitation) | +| QHAdam | *Quasi-hyperbolic momentum and Adam for deep learning* | [github](https://github.com/facebookresearch/qhoptim) | | [cite](https://github.com/facebookresearch/qhoptim#reference) | +| PID | *A PID Controller Approach for Stochastic Optimization of Deep Networks* | [github](https://github.com/tensorboy/PIDOptimizer) | [CVPR 18](http://www4.comp.polyu.edu.hk/~cslzhang/paper/CVPR18_PID.pdf) | [cite](https://github.com/tensorboy/PIDOptimizer#citation) | +| Gravity | *a Kinematic Approach on Optimization in Deep Learning* | [github](https://github.com/dariush-bahrami/gravity.optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210109192B/exportcitation) | +| AdaSmooth | *An Adaptive Learning Rate Method based on Effective Ratio* | | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220400825L/exportcitation) | | SRMM | *Stochastic regularized majorization-minimization with weakly convex and multi-convex surrogates* | [github](https://github.com/HanbaekLyu/SRMM) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220101652L/exportcitation) | -| AvaGrad | *Domain-independent Dominance of Adaptive Methods* | [github](https://github.com/lolemacs/avagrad) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv191201823S/exportcitation) | -| PCGrad | *Gradient Surgery for Multi-Task Learning* | [github](https://github.com/tianheyu927/PCGrad) | | [cite](https://github.com/tianheyu927/PCGrad#reference) | -| AMSGrad | *On the Convergence of Adam and Beyond* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190409237R/exportcitation) | -| Lookahead | *k steps forward, 1 step back* | [github](https://github.com/pytorch/examples/tree/main/imagenet) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190708610Z/exportcitation) | -| PNM | *Manipulating Stochastic Gradient Noise to Improve Generalization* | [github](https://github.com/zeke-xie/Positive-Negative-Momentum) | | [cite](https://github.com/zeke-xie/Positive-Negative-Momentum#citing) | -| GC | *Gradient Centralization* | [github](https://github.com/Yonghongwei/Gradient-Centralization) | | [cite](https://github.com/Yonghongwei/Gradient-Centralization#citation) | -| AGC | *Adaptive Gradient Clipping* | [github](https://github.com/deepmind/deepmind-research/tree/master/nfnets) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210206171B/exportcitation) | -| Stable WD | *Understanding and Scheduling Weight Decay* | [github](https://github.com/zeke-xie/stable-weight-decay-regularization) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201111152X/exportcitation) | -| Softplus T | *Calibrating the Adaptive Learning Rate to Improve Convergence of ADAM* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190800700T/exportcitation) | -| Un-tuned w/u | *On the adequacy of untuned warmup for adaptive 
optimization* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv191004209M/exportcitation) | -| Norm Loss | *An efficient yet effective regularization method for deep neural networks* | | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210306583G/exportcitation) | -| AdaShift | *Decorrelation and Convergence of Adaptive Learning Rate Methods* | [github](https://github.com/MichaelKonobeev/adashift) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv181000143Z/exportcitation) | -| AdaDelta | *An Adaptive Learning Rate Method* | | | [cite](https://ui.adsabs.harvard.edu/abs/2012arXiv1212.5701Z/exportcitation) | -| Amos | *An Adam-style Optimizer with Adaptive Weight Decay towards Model-Oriented Scale* | [github](https://github.com/google-research/jestimator) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv221011693T/exportcitation) | -| SignSGD | *Compressed Optimisation for Non-Convex Problems* | [github](https://github.com/jxbz/signSGD) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180204434B/exportcitation) | -| Sophia | *A Scalable Stochastic Second-order Optimizer for Language Model Pre-training* | [github](https://github.com/Liuhong99/Sophia) | | [cite](https://github.com/Liuhong99/Sophia) | -| Prodigy | *An Expeditiously Adaptive Parameter-Free Learner* | [github](https://github.com/konstmish/prodigy) | | [cite](https://github.com/konstmish/prodigy#how-to-cite) | -| PAdam | *Closing the Generalization Gap of Adaptive Gradient Methods in Training Deep Neural Networks* | [github](https://github.com/uclaml/Padam) | | [cite](https://github.com/uclaml/Padam#citation) | -| LOMO | *Full Parameter Fine-tuning for Large Language Models with Limited Resources* | [github](https://github.com/OpenLMLab/LOMO) | | [cite](https://github.com/OpenLMLab/LOMO#citation) | -| Tiger | *A Tight-fisted Optimizer, an optimizer that is extremely budget-conscious* | [github](https://github.com/bojone/tiger) | | [cite](https://github.com/bojone/tiger/blob/main/README_en.md#citation) | -| CAME | *Confidence-guided Adaptive Memory Efficient Optimization* | [github](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/CAME) | | [cite](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/CAME#citation) | -| WSAM | *Sharpness-Aware Minimization Revisited: Weighted Sharpness as a Regularization Term* | [github](https://github.com/intelligent-machine-learning/dlrover/blob/master/atorch/atorch/optimizers/wsam.py) | | [cite](https://github.com/intelligent-machine-learning/dlrover) | -| Aida | *A DNN Optimizer that Improves over AdaBelief by Suppression of the Adaptive Stepsize Range* | [github](https://github.com/guoqiang-zhang-x/Aida-Optimizer) | | [cite](https://github.com/guoqiang-zhang-x/Aida-Optimizer?tab=readme-ov-file#1-brief-description-of-aida) | -| GaLore | *Memory-Efficient LLM Training by Gradient Low-Rank Projection* | [github](https://github.com/jiaweizzhao/GaLore) | | [cite](https://github.com/jiaweizzhao/GaLore/tree/master?tab=readme-ov-file#citation) | -| Adalite | *Adalite optimizer* | [github](https://github.com/VatsaDev/adalite) | | [cite](https://github.com/VatsaDev/adalite) | -| bSAM | *SAM as an Optimal Relaxation of Bayes* | [github](https://github.com/team-approx-bayes/bayesian-sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv221001620M/exportcitation) | -| Schedule-Free | *Schedule-Free Optimizers* | [github](https://github.com/facebookresearch/schedule_free) | | [cite](https://github.com/facebookresearch/schedule_free) | -| FAdam 
| *Adam is a natural gradient optimizer using diagonal empirical Fisher information* | [github](https://github.com/lessw2020/fadam_pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2024arXiv240512807H/exportcitation) | -| Grokfast | *Accelerated Grokking by Amplifying Slow Gradients* | [github](https://github.com/ironjr/grokfast) | | [cite](https://github.com/ironjr/grokfast?tab=readme-ov-file#citation) | -| Kate | *Remove that Square Root: A New Efficient Scale-Invariant Version of AdaGrad* | [github](https://github.com/nazya/KATE) | | [cite](https://github.com/nazya/KATE?tab=readme-ov-file#remove-that-square-root-a-new-efficient-scale-invariant-version-of-adagrad) | -| StableAdamW | *Stable and low-precision training for large-scale vision-language models* | | | [cite](https://ui.adsabs.harvard.edu/abs/2023arXiv230413013W/exportcitation) | -| AdamMini | *Use Fewer Learning Rates To Gain More* | [github](https://github.com/zyushun/Adam-mini) | | [cite](https://github.com/zyushun/Adam-mini?tab=readme-ov-file#citation) | +| AvaGrad | *Domain-independent Dominance of Adaptive Methods* | [github](https://github.com/lolemacs/avagrad) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv191201823S/exportcitation) | +| PCGrad | *Gradient Surgery for Multi-Task Learning* | [github](https://github.com/tianheyu927/PCGrad) | | [cite](https://github.com/tianheyu927/PCGrad#reference) | +| AMSGrad | *On the Convergence of Adam and Beyond* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190409237R/exportcitation) | +| Lookahead | *k steps forward, 1 step back* | [github](https://github.com/pytorch/examples/tree/main/imagenet) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190708610Z/exportcitation) | +| PNM | *Manipulating Stochastic Gradient Noise to Improve Generalization* | [github](https://github.com/zeke-xie/Positive-Negative-Momentum) | | [cite](https://github.com/zeke-xie/Positive-Negative-Momentum#citing) | +| GC | *Gradient Centralization* | [github](https://github.com/Yonghongwei/Gradient-Centralization) | | [cite](https://github.com/Yonghongwei/Gradient-Centralization#citation) | +| AGC | *Adaptive Gradient Clipping* | [github](https://github.com/deepmind/deepmind-research/tree/master/nfnets) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210206171B/exportcitation) | +| Stable WD | *Understanding and Scheduling Weight Decay* | [github](https://github.com/zeke-xie/stable-weight-decay-regularization) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201111152X/exportcitation) | +| Softplus T | *Calibrating the Adaptive Learning Rate to Improve Convergence of ADAM* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190800700T/exportcitation) | +| Un-tuned w/u | *On the adequacy of untuned warmup for adaptive optimization* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv191004209M/exportcitation) | +| Norm Loss | *An efficient yet effective regularization method for deep neural networks* | | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210306583G/exportcitation) | +| AdaShift | *Decorrelation and Convergence of Adaptive Learning Rate Methods* | [github](https://github.com/MichaelKonobeev/adashift) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv181000143Z/exportcitation) | +| AdaDelta | *An Adaptive Learning Rate Method* | | | [cite](https://ui.adsabs.harvard.edu/abs/2012arXiv1212.5701Z/exportcitation) | +| Amos | *An Adam-style Optimizer with Adaptive Weight Decay towards Model-Oriented Scale* | 
[github](https://github.com/google-research/jestimator) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv221011693T/exportcitation) | +| SignSGD | *Compressed Optimisation for Non-Convex Problems* | [github](https://github.com/jxbz/signSGD) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180204434B/exportcitation) | +| Sophia | *A Scalable Stochastic Second-order Optimizer for Language Model Pre-training* | [github](https://github.com/Liuhong99/Sophia) | | [cite](https://github.com/Liuhong99/Sophia) | +| Prodigy | *An Expeditiously Adaptive Parameter-Free Learner* | [github](https://github.com/konstmish/prodigy) | | [cite](https://github.com/konstmish/prodigy#how-to-cite) | +| PAdam | *Closing the Generalization Gap of Adaptive Gradient Methods in Training Deep Neural Networks* | [github](https://github.com/uclaml/Padam) | | [cite](https://github.com/uclaml/Padam#citation) | +| LOMO | *Full Parameter Fine-tuning for Large Language Models with Limited Resources* | [github](https://github.com/OpenLMLab/LOMO) | | [cite](https://github.com/OpenLMLab/LOMO#citation) | +| AdaLOMO | *Low-memory Optimization with Adaptive Learning Rate* | [github](https://github.com/OpenLMLab/LOMO) | | [cite](https://github.com/OpenLMLab/LOMO#citation) | +| Tiger | *A Tight-fisted Optimizer, an optimizer that is extremely budget-conscious* | [github](https://github.com/bojone/tiger) | | [cite](https://github.com/bojone/tiger/blob/main/README_en.md#citation) | +| CAME | *Confidence-guided Adaptive Memory Efficient Optimization* | [github](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/CAME) | | [cite](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/CAME#citation) | +| WSAM | *Sharpness-Aware Minimization Revisited: Weighted Sharpness as a Regularization Term* | [github](https://github.com/intelligent-machine-learning/dlrover/blob/master/atorch/atorch/optimizers/wsam.py) | | [cite](https://github.com/intelligent-machine-learning/dlrover) | +| Aida | *A DNN Optimizer that Improves over AdaBelief by Suppression of the Adaptive Stepsize Range* | [github](https://github.com/guoqiang-zhang-x/Aida-Optimizer) | | [cite](https://github.com/guoqiang-zhang-x/Aida-Optimizer?tab=readme-ov-file#1-brief-description-of-aida) | +| GaLore | *Memory-Efficient LLM Training by Gradient Low-Rank Projection* | [github](https://github.com/jiaweizzhao/GaLore) | | [cite](https://github.com/jiaweizzhao/GaLore/tree/master?tab=readme-ov-file#citation) | +| Adalite | *Adalite optimizer* | [github](https://github.com/VatsaDev/adalite) | | [cite](https://github.com/VatsaDev/adalite) | +| bSAM | *SAM as an Optimal Relaxation of Bayes* | [github](https://github.com/team-approx-bayes/bayesian-sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv221001620M/exportcitation) | +| Schedule-Free | *Schedule-Free Optimizers* | [github](https://github.com/facebookresearch/schedule_free) | | [cite](https://github.com/facebookresearch/schedule_free) | +| FAdam | *Adam is a natural gradient optimizer using diagonal empirical Fisher information* | [github](https://github.com/lessw2020/fadam_pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2024arXiv240512807H/exportcitation) | +| Grokfast | *Accelerated Grokking by Amplifying Slow Gradients* | [github](https://github.com/ironjr/grokfast) | | [cite](https://github.com/ironjr/grokfast?tab=readme-ov-file#citation) | +| Kate | *Remove that Square Root: A New Efficient Scale-Invariant Version of AdaGrad* | [github](https://github.com/nazya/KATE) | | 
[cite](https://github.com/nazya/KATE?tab=readme-ov-file#remove-that-square-root-a-new-efficient-scale-invariant-version-of-adagrad) |
+| StableAdamW | *Stable and low-precision training for large-scale vision-language models* | | | [cite](https://ui.adsabs.harvard.edu/abs/2023arXiv230413013W/exportcitation) |
+| AdamMini | *Use Fewer Learning Rates To Gain More* | [github](https://github.com/zyushun/Adam-mini) | | [cite](https://github.com/zyushun/Adam-mini?tab=readme-ov-file#citation) |

## Supported LR Scheduler

diff --git a/docs/changelogs/v3.1.0.md b/docs/changelogs/v3.1.0.md
new file mode 100644
index 000000000..61d03de06
--- /dev/null
+++ b/docs/changelogs/v3.1.0.md
@@ -0,0 +1,25 @@
+## Change Log
+
+### Feature
+
+* Implement `AdaLomo` optimizer. (#258)
+    * [Low-memory Optimization with Adaptive Learning Rate](https://arxiv.org/abs/2310.10195)
+* Support `Q-GaLore` optimizer. (#258)
+    * [Q-GaLore: Quantized GaLore with INT4 Projection and Layer-Adaptive Low-Rank Gradients.](https://arxiv.org/abs/2407.08296)
+    * you can use it via `optimizer = load_optimizer('q_galore_adamw8bit')`
+* Support more bnb optimizers. (#258)
+    * `bnb_paged_adam8bit`, `bnb_paged_adamw8bit`, `bnb_*_*32bit`.
+
+### Refactor
+
+* Refactor `AdamMini`. (#258)
+* Deprecate optional dependency, `bitsandbytes`. (#258)
+* Move `get_rms`, `approximate_sq_grad` functions to `BaseOptimizer` for reusability. (#258)
+
+### Bug
+
+* Fix several bugs in `AdamMini` optimizer. (#257)
+
+## Contributions
+
+Thanks to @sdbds

diff --git a/docs/index.md b/docs/index.md
index 70a5acd0f..89095bbe2 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -10,7 +10,7 @@
**pytorch-optimizer** is optimizer & lr scheduler collections in PyTorch. I just re-implemented (speed & memory tweaks, plug-ins) the algorithm while based on the original paper. Also, It includes useful and practical optimization ideas.

-Currently, **72 optimizers (+ `bitsandbytes`)**, **16 lr schedulers**, and **13 loss functions** are supported!
+Currently, **73 optimizers (+ `bitsandbytes`, `qgalore`)**, **16 lr schedulers**, and **13 loss functions** are supported!

Highly inspired by [pytorch-optimizer](https://github.com/jettify/pytorch-optimizer).

@@ -27,8 +27,9 @@ So, please double-check the license before using it at your work.
$ pip3 install pytorch-optimizer
```
-From `v2.12.0`, you can install and import `bitsandbytes` optimizers.
-please check [the requirements](https://github.com/TimDettmers/bitsandbytes?tab=readme-ov-file#tldr) before installing it.
+From `v2.12.0` and `v3.1.0`, you can use the `bitsandbytes` and `q-galore-torch` optimizers, respectively!
+Please check [the bnb requirements](https://github.com/TimDettmers/bitsandbytes?tab=readme-ov-file#tldr) and the [q-galore-torch installation guide](https://github.com/VITA-Group/Q-GaLore?tab=readme-ov-file#install-q-galore-optimizer)
+before installing them.

From `v3.0.0`, drop `Python 3.7` support. However, you can still use this package with `Python 3.7` by installing with `--ignore-requires-python` option. 
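To make the install note above concrete, the sketch below shows how the newly supported optimizers can be resolved by name once the optional packages are installed. It uses only `get_supported_optimizers` and `load_optimizer`, which already appear in this change set; the `'adamp'` name and the bare `lr` argument are illustrative stand-ins, and the bnb / Q-GaLore classes typically take additional optimizer-specific arguments (the 8-bit variants also generally expect a CUDA device).

```python
# A minimal sketch, assuming `bitsandbytes` and `q-galore-torch` are installed as
# described above. Constructor arguments are illustrative; each optimizer class
# documents its own options.
import torch
from pytorch_optimizer import get_supported_optimizers, load_optimizer

model = torch.nn.Linear(10, 2)

# Inspect everything that is registered, including the new bnb_* and q_galore_* entries.
print(get_supported_optimizers())

# `load_optimizer` maps a name to an optimizer class; instantiation then follows
# the usual torch.optim pattern. 'adamp' is used here only as a stand-in.
optimizer = load_optimizer('adamp')(model.parameters(), lr=1e-3)

# The names added in this release resolve the same way when the optional
# dependencies are available.
q_galore_adamw8bit_cls = load_optimizer('q_galore_adamw8bit')
bnb_paged_adamw8bit_cls = load_optimizer('bnb_paged_adamw8bit')
```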
@@ -93,82 +94,83 @@ from pytorch_optimizer import get_supported_optimizers supported_optimizers = get_supported_optimizers() ``` -| Optimizer | Description | Official Code | Paper | Citation | -|---------------|---------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------| -| AdaBelief | *Adapting Step-sizes by the Belief in Observed Gradients* | [github](https://github.com/juntang-zhuang/Adabelief-Optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201007468Z/exportcitation) | -| AdaBound | *Adaptive Gradient Methods with Dynamic Bound of Learning Rate* | [github](https://github.com/Luolc/AdaBound/blob/master/adabound/adabound.py) | | [cite](https://github.com/Luolc/AdaBound#citing) | -| AdaHessian | *An Adaptive Second Order Optimizer for Machine Learning* | [github](https://github.com/amirgholami/adahessian) | | [cite](https://github.com/amirgholami/adahessian#citation) | -| AdamD | *Improved bias-correction in Adam* | | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv211010828S/exportcitation) | -| AdamP | *Slowing Down the Slowdown for Momentum Optimizers on Scale-invariant Weights* | [github](https://github.com/clovaai/AdamP) | | [cite](https://github.com/clovaai/AdamP#how-to-cite) | -| diffGrad | *An Optimization Method for Convolutional Neural Networks* | [github](https://github.com/shivram1987/diffGrad) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190911015D/exportcitation) | -| MADGRAD | *A Momentumized, Adaptive, Dual Averaged Gradient Method for Stochastic* | [github](https://github.com/facebookresearch/madgrad) | | [cite](https://github.com/facebookresearch/madgrad#tech-report) | -| RAdam | *On the Variance of the Adaptive Learning Rate and Beyond* | [github](https://github.com/LiyuanLucasLiu/RAdam) | | [cite](https://github.com/LiyuanLucasLiu/RAdam#citation) | -| Ranger | *a synergistic optimizer combining RAdam and LookAhead, and now GC in one optimizer* | [github](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer) | | [cite](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer#citing-this-work) | -| Ranger21 | *a synergistic deep learning optimizer* | [github](https://github.com/lessw2020/Ranger21) | | [cite](https://github.com/lessw2020/Ranger21#referencing-this-work) | -| Lamb | *Large Batch Optimization for Deep Learning* | [github](https://github.com/cybertronai/pytorch-lamb) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190400962Y/exportcitation) | -| Shampoo | *Preconditioned Stochastic Tensor Optimization* | [github](https://github.com/moskomule/shampoo.pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180209568G/exportcitation) | -| Nero | *Learning by Turning: Neural Architecture Aware Optimisation* | [github](https://github.com/jxbz/nero) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210207227L/exportcitation) | -| Adan | *Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models* | [github](https://github.com/sail-sg/Adan) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220806677X/exportcitation) | -| Adai | *Disentangling the Effects of Adaptive Learning Rate and Momentum* | 
[github](https://github.com/zeke-xie/adaptive-inertia-adai) | | [cite](https://github.com/zeke-xie/adaptive-inertia-adai#citing) | -| SAM | *Sharpness-Aware Minimization* | [github](https://github.com/davda54/sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201001412F/exportcitation) | -| ASAM | *Adaptive Sharpness-Aware Minimization* | [github](https://github.com/davda54/sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210211600K/exportcitation) | -| GSAM | *Surrogate Gap Guided Sharpness-Aware Minimization* | [github](https://github.com/juntang-zhuang/GSAM) | | [cite](https://github.com/juntang-zhuang/GSAM#citation) | -| D-Adaptation | *Learning-Rate-Free Learning by D-Adaptation* | [github](https://github.com/facebookresearch/dadaptation) | | [cite](https://ui.adsabs.harvard.edu/abs/2023arXiv230107733D/exportcitation) | -| AdaFactor | *Adaptive Learning Rates with Sublinear Memory Cost* | [github](https://github.com/DeadAt0m/adafactor-pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180404235S/exportcitation) | -| Apollo | *An Adaptive Parameter-wise Diagonal Quasi-Newton Method for Nonconvex Stochastic Optimization* | [github](https://github.com/XuezheMax/apollo) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv200913586M/exportcitation) | -| NovoGrad | *Stochastic Gradient Methods with Layer-wise Adaptive Moments for Training of Deep Networks* | [github](https://github.com/lonePatient/NovoGrad-pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190511286G/exportcitation) | -| Lion | *Symbolic Discovery of Optimization Algorithms* | [github](https://github.com/google/automl/tree/master/lion) | | [cite](https://github.com/google/automl/tree/master/lion#citation) | -| Ali-G | *Adaptive Learning Rates for Interpolation with Gradients* | [github](https://github.com/oval-group/ali-g) | | [cite](https://github.com/oval-group/ali-g#adaptive-learning-rates-for-interpolation-with-gradients) | -| SM3 | *Memory-Efficient Adaptive Optimization* | [github](https://github.com/google-research/google-research/tree/master/sm3) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190111150A/exportcitation) | -| AdaNorm | *Adaptive Gradient Norm Correction based Optimizer for CNNs* | [github](https://github.com/shivram1987/AdaNorm) | | [cite](https://github.com/shivram1987/AdaNorm/tree/main#citation) | -| RotoGrad | *Gradient Homogenization in Multitask Learning* | [github](https://github.com/adrianjav/rotograd) | | [cite](https://github.com/adrianjav/rotograd#citing) | -| A2Grad | *Optimal Adaptive and Accelerated Stochastic Gradient Descent* | [github](https://github.com/severilov/A2Grad_optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv181000553D/exportcitation) | -| AccSGD | *Accelerating Stochastic Gradient Descent For Least Squares Regression* | [github](https://github.com/rahulkidambi/AccSGD) | | [cite](https://github.com/rahulkidambi/AccSGD#citation) | -| SGDW | *Decoupled Weight Decay Regularization* | [github](https://github.com/loshchil/AdamW-and-SGDW) | | [cite](https://github.com/loshchil/AdamW-and-SGDW#contact) | -| ASGD | *Adaptive Gradient Descent without Descent* | [github](https://github.com/ymalitsky/adaptive_GD) | | [cite](https://github.com/ymalitsky/adaptive_GD#reference) | -| Yogi | *Adaptive Methods for Nonconvex Optimization* | | [NIPS 2018](https://papers.nips.cc/paper/8186-adaptive-methods-for-nonconvex-optimization) | 
[cite](https://proceedings.neurips.cc/paper_files/paper/2018/hash/90365351ccc7437a1309dc64e4db32a3-Abstract.html) | -| SWATS | *Improving Generalization Performance by Switching from Adam to SGD* | | | [cite](https://ui.adsabs.harvard.edu/abs/2017arXiv171207628S/exportcitation) | -| Fromage | *On the distance between two neural networks and the stability of learning* | [github](https://github.com/jxbz/fromage) | | [cite](https://github.com/jxbz/fromage#citation) | -| MSVAG | *Dissecting Adam: The Sign, Magnitude and Variance of Stochastic Gradients* | [github](https://github.com/lballes/msvag) | | [cite](https://github.com/lballes/msvag#citation) | -| AdaMod | *An Adaptive and Momental Bound Method for Stochastic Learning* | [github](https://github.com/lancopku/AdaMod) | | [cite](https://github.com/lancopku/AdaMod#citation) | -| AggMo | *Aggregated Momentum: Stability Through Passive Damping* | [github](https://github.com/AtheMathmo/AggMo) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180400325L/exportcitation) | -| QHAdam | *Quasi-hyperbolic momentum and Adam for deep learning* | [github](https://github.com/facebookresearch/qhoptim) | | [cite](https://github.com/facebookresearch/qhoptim#reference) | -| PID | *A PID Controller Approach for Stochastic Optimization of Deep Networks* | [github](https://github.com/tensorboy/PIDOptimizer) | [CVPR 18](http://www4.comp.polyu.edu.hk/~cslzhang/paper/CVPR18_PID.pdf) | [cite](https://github.com/tensorboy/PIDOptimizer#citation) | -| Gravity | *a Kinematic Approach on Optimization in Deep Learning* | [github](https://github.com/dariush-bahrami/gravity.optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210109192B/exportcitation) | -| AdaSmooth | *An Adaptive Learning Rate Method based on Effective Ratio* | | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220400825L/exportcitation) | +| Optimizer | Description | Official Code | Paper | Citation | +|---------------|--------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------| +| AdaBelief | *Adapting Step-sizes by the Belief in Observed Gradients* | [github](https://github.com/juntang-zhuang/Adabelief-Optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201007468Z/exportcitation) | +| AdaBound | *Adaptive Gradient Methods with Dynamic Bound of Learning Rate* | [github](https://github.com/Luolc/AdaBound/blob/master/adabound/adabound.py) | | [cite](https://github.com/Luolc/AdaBound#citing) | +| AdaHessian | *An Adaptive Second Order Optimizer for Machine Learning* | [github](https://github.com/amirgholami/adahessian) | | [cite](https://github.com/amirgholami/adahessian#citation) | +| AdamD | *Improved bias-correction in Adam* | | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv211010828S/exportcitation) | +| AdamP | *Slowing Down the Slowdown for Momentum Optimizers on Scale-invariant Weights* | [github](https://github.com/clovaai/AdamP) | | [cite](https://github.com/clovaai/AdamP#how-to-cite) | +| diffGrad | *An Optimization Method for Convolutional Neural Networks* | [github](https://github.com/shivram1987/diffGrad) | | 
[cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190911015D/exportcitation) | +| MADGRAD | *A Momentumized, Adaptive, Dual Averaged Gradient Method for Stochastic* | [github](https://github.com/facebookresearch/madgrad) | | [cite](https://github.com/facebookresearch/madgrad#tech-report) | +| RAdam | *On the Variance of the Adaptive Learning Rate and Beyond* | [github](https://github.com/LiyuanLucasLiu/RAdam) | | [cite](https://github.com/LiyuanLucasLiu/RAdam#citation) | +| Ranger | *a synergistic optimizer combining RAdam and LookAhead, and now GC in one optimizer* | [github](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer) | | [cite](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer#citing-this-work) | +| Ranger21 | *a synergistic deep learning optimizer* | [github](https://github.com/lessw2020/Ranger21) | | [cite](https://github.com/lessw2020/Ranger21#referencing-this-work) | +| Lamb | *Large Batch Optimization for Deep Learning* | [github](https://github.com/cybertronai/pytorch-lamb) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190400962Y/exportcitation) | +| Shampoo | *Preconditioned Stochastic Tensor Optimization* | [github](https://github.com/moskomule/shampoo.pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180209568G/exportcitation) | +| Nero | *Learning by Turning: Neural Architecture Aware Optimisation* | [github](https://github.com/jxbz/nero) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210207227L/exportcitation) | +| Adan | *Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models* | [github](https://github.com/sail-sg/Adan) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220806677X/exportcitation) | +| Adai | *Disentangling the Effects of Adaptive Learning Rate and Momentum* | [github](https://github.com/zeke-xie/adaptive-inertia-adai) | | [cite](https://github.com/zeke-xie/adaptive-inertia-adai#citing) | +| SAM | *Sharpness-Aware Minimization* | [github](https://github.com/davda54/sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201001412F/exportcitation) | +| ASAM | *Adaptive Sharpness-Aware Minimization* | [github](https://github.com/davda54/sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210211600K/exportcitation) | +| GSAM | *Surrogate Gap Guided Sharpness-Aware Minimization* | [github](https://github.com/juntang-zhuang/GSAM) | | [cite](https://github.com/juntang-zhuang/GSAM#citation) | +| D-Adaptation | *Learning-Rate-Free Learning by D-Adaptation* | [github](https://github.com/facebookresearch/dadaptation) | | [cite](https://ui.adsabs.harvard.edu/abs/2023arXiv230107733D/exportcitation) | +| AdaFactor | *Adaptive Learning Rates with Sublinear Memory Cost* | [github](https://github.com/DeadAt0m/adafactor-pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180404235S/exportcitation) | +| Apollo | *An Adaptive Parameter-wise Diagonal Quasi-Newton Method for Nonconvex Stochastic Optimization* | [github](https://github.com/XuezheMax/apollo) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv200913586M/exportcitation) | +| NovoGrad | *Stochastic Gradient Methods with Layer-wise Adaptive Moments for Training of Deep Networks* | [github](https://github.com/lonePatient/NovoGrad-pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190511286G/exportcitation) | +| Lion | *Symbolic Discovery of Optimization Algorithms* | [github](https://github.com/google/automl/tree/master/lion) | | [cite](https://github.com/google/automl/tree/master/lion#citation) | +| Ali-G | 
*Adaptive Learning Rates for Interpolation with Gradients* | [github](https://github.com/oval-group/ali-g) | | [cite](https://github.com/oval-group/ali-g#adaptive-learning-rates-for-interpolation-with-gradients) | +| SM3 | *Memory-Efficient Adaptive Optimization* | [github](https://github.com/google-research/google-research/tree/master/sm3) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190111150A/exportcitation) | +| AdaNorm | *Adaptive Gradient Norm Correction based Optimizer for CNNs* | [github](https://github.com/shivram1987/AdaNorm) | | [cite](https://github.com/shivram1987/AdaNorm/tree/main#citation) | +| RotoGrad | *Gradient Homogenization in Multitask Learning* | [github](https://github.com/adrianjav/rotograd) | | [cite](https://github.com/adrianjav/rotograd#citing) | +| A2Grad | *Optimal Adaptive and Accelerated Stochastic Gradient Descent* | [github](https://github.com/severilov/A2Grad_optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv181000553D/exportcitation) | +| AccSGD | *Accelerating Stochastic Gradient Descent For Least Squares Regression* | [github](https://github.com/rahulkidambi/AccSGD) | | [cite](https://github.com/rahulkidambi/AccSGD#citation) | +| SGDW | *Decoupled Weight Decay Regularization* | [github](https://github.com/loshchil/AdamW-and-SGDW) | | [cite](https://github.com/loshchil/AdamW-and-SGDW#contact) | +| ASGD | *Adaptive Gradient Descent without Descent* | [github](https://github.com/ymalitsky/adaptive_GD) | | [cite](https://github.com/ymalitsky/adaptive_GD#reference) | +| Yogi | *Adaptive Methods for Nonconvex Optimization* | | [NIPS 2018](https://papers.nips.cc/paper/8186-adaptive-methods-for-nonconvex-optimization) | [cite](https://proceedings.neurips.cc/paper_files/paper/2018/hash/90365351ccc7437a1309dc64e4db32a3-Abstract.html) | +| SWATS | *Improving Generalization Performance by Switching from Adam to SGD* | | | [cite](https://ui.adsabs.harvard.edu/abs/2017arXiv171207628S/exportcitation) | +| Fromage | *On the distance between two neural networks and the stability of learning* | [github](https://github.com/jxbz/fromage) | | [cite](https://github.com/jxbz/fromage#citation) | +| MSVAG | *Dissecting Adam: The Sign, Magnitude and Variance of Stochastic Gradients* | [github](https://github.com/lballes/msvag) | | [cite](https://github.com/lballes/msvag#citation) | +| AdaMod | *An Adaptive and Momental Bound Method for Stochastic Learning* | [github](https://github.com/lancopku/AdaMod) | | [cite](https://github.com/lancopku/AdaMod#citation) | +| AggMo | *Aggregated Momentum: Stability Through Passive Damping* | [github](https://github.com/AtheMathmo/AggMo) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180400325L/exportcitation) | +| QHAdam | *Quasi-hyperbolic momentum and Adam for deep learning* | [github](https://github.com/facebookresearch/qhoptim) | | [cite](https://github.com/facebookresearch/qhoptim#reference) | +| PID | *A PID Controller Approach for Stochastic Optimization of Deep Networks* | [github](https://github.com/tensorboy/PIDOptimizer) | [CVPR 18](http://www4.comp.polyu.edu.hk/~cslzhang/paper/CVPR18_PID.pdf) | [cite](https://github.com/tensorboy/PIDOptimizer#citation) | +| Gravity | *a Kinematic Approach on Optimization in Deep Learning* | [github](https://github.com/dariush-bahrami/gravity.optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210109192B/exportcitation) | +| AdaSmooth | *An Adaptive Learning Rate Method based on Effective Ratio* | | | 
[cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220400825L/exportcitation) | | SRMM | *Stochastic regularized majorization-minimization with weakly convex and multi-convex surrogates* | [github](https://github.com/HanbaekLyu/SRMM) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220101652L/exportcitation) | -| AvaGrad | *Domain-independent Dominance of Adaptive Methods* | [github](https://github.com/lolemacs/avagrad) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv191201823S/exportcitation) | -| PCGrad | *Gradient Surgery for Multi-Task Learning* | [github](https://github.com/tianheyu927/PCGrad) | | [cite](https://github.com/tianheyu927/PCGrad#reference) | -| AMSGrad | *On the Convergence of Adam and Beyond* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190409237R/exportcitation) | -| Lookahead | *k steps forward, 1 step back* | [github](https://github.com/pytorch/examples/tree/main/imagenet) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190708610Z/exportcitation) | -| PNM | *Manipulating Stochastic Gradient Noise to Improve Generalization* | [github](https://github.com/zeke-xie/Positive-Negative-Momentum) | | [cite](https://github.com/zeke-xie/Positive-Negative-Momentum#citing) | -| GC | *Gradient Centralization* | [github](https://github.com/Yonghongwei/Gradient-Centralization) | | [cite](https://github.com/Yonghongwei/Gradient-Centralization#citation) | -| AGC | *Adaptive Gradient Clipping* | [github](https://github.com/deepmind/deepmind-research/tree/master/nfnets) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210206171B/exportcitation) | -| Stable WD | *Understanding and Scheduling Weight Decay* | [github](https://github.com/zeke-xie/stable-weight-decay-regularization) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201111152X/exportcitation) | -| Softplus T | *Calibrating the Adaptive Learning Rate to Improve Convergence of ADAM* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190800700T/exportcitation) | -| Un-tuned w/u | *On the adequacy of untuned warmup for adaptive optimization* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv191004209M/exportcitation) | -| Norm Loss | *An efficient yet effective regularization method for deep neural networks* | | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210306583G/exportcitation) | -| AdaShift | *Decorrelation and Convergence of Adaptive Learning Rate Methods* | [github](https://github.com/MichaelKonobeev/adashift) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv181000143Z/exportcitation) | -| AdaDelta | *An Adaptive Learning Rate Method* | | | [cite](https://ui.adsabs.harvard.edu/abs/2012arXiv1212.5701Z/exportcitation) | -| Amos | *An Adam-style Optimizer with Adaptive Weight Decay towards Model-Oriented Scale* | [github](https://github.com/google-research/jestimator) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv221011693T/exportcitation) | -| SignSGD | *Compressed Optimisation for Non-Convex Problems* | [github](https://github.com/jxbz/signSGD) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180204434B/exportcitation) | -| Sophia | *A Scalable Stochastic Second-order Optimizer for Language Model Pre-training* | [github](https://github.com/Liuhong99/Sophia) | | [cite](https://github.com/Liuhong99/Sophia) | -| Prodigy | *An Expeditiously Adaptive Parameter-Free Learner* | [github](https://github.com/konstmish/prodigy) | | [cite](https://github.com/konstmish/prodigy#how-to-cite) | -| PAdam | *Closing the Generalization Gap of Adaptive Gradient Methods in 
Training Deep Neural Networks* | [github](https://github.com/uclaml/Padam) | | [cite](https://github.com/uclaml/Padam#citation) | -| LOMO | *Full Parameter Fine-tuning for Large Language Models with Limited Resources* | [github](https://github.com/OpenLMLab/LOMO) | | [cite](https://github.com/OpenLMLab/LOMO#citation) | -| Tiger | *A Tight-fisted Optimizer, an optimizer that is extremely budget-conscious* | [github](https://github.com/bojone/tiger) | | [cite](https://github.com/bojone/tiger/blob/main/README_en.md#citation) | -| CAME | *Confidence-guided Adaptive Memory Efficient Optimization* | [github](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/CAME) | | [cite](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/CAME#citation) | -| WSAM | *Sharpness-Aware Minimization Revisited: Weighted Sharpness as a Regularization Term* | [github](https://github.com/intelligent-machine-learning/dlrover/blob/master/atorch/atorch/optimizers/wsam.py) | | [cite](https://github.com/intelligent-machine-learning/dlrover) | -| Aida | *A DNN Optimizer that Improves over AdaBelief by Suppression of the Adaptive Stepsize Range* | [github](https://github.com/guoqiang-zhang-x/Aida-Optimizer) | | [cite](https://github.com/guoqiang-zhang-x/Aida-Optimizer?tab=readme-ov-file#1-brief-description-of-aida) | -| GaLore | *Memory-Efficient LLM Training by Gradient Low-Rank Projection* | [github](https://github.com/jiaweizzhao/GaLore) | | [cite](https://github.com/jiaweizzhao/GaLore/tree/master?tab=readme-ov-file#citation) | -| Adalite | *Adalite optimizer* | [github](https://github.com/VatsaDev/adalite) | | [cite](https://github.com/VatsaDev/adalite) | -| bSAM | *SAM as an Optimal Relaxation of Bayes* | [github](https://github.com/team-approx-bayes/bayesian-sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv221001620M/exportcitation) | -| Schedule-Free | *Schedule-Free Optimizers* | [github](https://github.com/facebookresearch/schedule_free) | | [cite](https://github.com/facebookresearch/schedule_free) | -| FAdam | *Adam is a natural gradient optimizer using diagonal empirical Fisher information* | [github](https://github.com/lessw2020/fadam_pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2024arXiv240512807H/exportcitation) | -| Grokfast | *Accelerated Grokking by Amplifying Slow Gradients* | [github](https://github.com/ironjr/grokfast) | | [cite](https://github.com/ironjr/grokfast?tab=readme-ov-file#citation) | -| Kate | *Remove that Square Root: A New Efficient Scale-Invariant Version of AdaGrad* | [github](https://github.com/nazya/KATE) | | [cite](https://github.com/nazya/KATE?tab=readme-ov-file#remove-that-square-root-a-new-efficient-scale-invariant-version-of-adagrad) | -| StableAdamW | *Stable and low-precision training for large-scale vision-language models* | | | [cite](https://ui.adsabs.harvard.edu/abs/2023arXiv230413013W/exportcitation) | -| AdamMini | *Use Fewer Learning Rates To Gain More* | [github](https://github.com/zyushun/Adam-mini) | | [cite](https://github.com/zyushun/Adam-mini?tab=readme-ov-file#citation) | +| AvaGrad | *Domain-independent Dominance of Adaptive Methods* | [github](https://github.com/lolemacs/avagrad) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv191201823S/exportcitation) | +| PCGrad | *Gradient Surgery for Multi-Task Learning* | [github](https://github.com/tianheyu927/PCGrad) | | [cite](https://github.com/tianheyu927/PCGrad#reference) | +| AMSGrad | *On the Convergence of Adam and Beyond* | | | 
[cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190409237R/exportcitation) | +| Lookahead | *k steps forward, 1 step back* | [github](https://github.com/pytorch/examples/tree/main/imagenet) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190708610Z/exportcitation) | +| PNM | *Manipulating Stochastic Gradient Noise to Improve Generalization* | [github](https://github.com/zeke-xie/Positive-Negative-Momentum) | | [cite](https://github.com/zeke-xie/Positive-Negative-Momentum#citing) | +| GC | *Gradient Centralization* | [github](https://github.com/Yonghongwei/Gradient-Centralization) | | [cite](https://github.com/Yonghongwei/Gradient-Centralization#citation) | +| AGC | *Adaptive Gradient Clipping* | [github](https://github.com/deepmind/deepmind-research/tree/master/nfnets) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210206171B/exportcitation) | +| Stable WD | *Understanding and Scheduling Weight Decay* | [github](https://github.com/zeke-xie/stable-weight-decay-regularization) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201111152X/exportcitation) | +| Softplus T | *Calibrating the Adaptive Learning Rate to Improve Convergence of ADAM* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190800700T/exportcitation) | +| Un-tuned w/u | *On the adequacy of untuned warmup for adaptive optimization* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv191004209M/exportcitation) | +| Norm Loss | *An efficient yet effective regularization method for deep neural networks* | | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210306583G/exportcitation) | +| AdaShift | *Decorrelation and Convergence of Adaptive Learning Rate Methods* | [github](https://github.com/MichaelKonobeev/adashift) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv181000143Z/exportcitation) | +| AdaDelta | *An Adaptive Learning Rate Method* | | | [cite](https://ui.adsabs.harvard.edu/abs/2012arXiv1212.5701Z/exportcitation) | +| Amos | *An Adam-style Optimizer with Adaptive Weight Decay towards Model-Oriented Scale* | [github](https://github.com/google-research/jestimator) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv221011693T/exportcitation) | +| SignSGD | *Compressed Optimisation for Non-Convex Problems* | [github](https://github.com/jxbz/signSGD) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180204434B/exportcitation) | +| Sophia | *A Scalable Stochastic Second-order Optimizer for Language Model Pre-training* | [github](https://github.com/Liuhong99/Sophia) | | [cite](https://github.com/Liuhong99/Sophia) | +| Prodigy | *An Expeditiously Adaptive Parameter-Free Learner* | [github](https://github.com/konstmish/prodigy) | | [cite](https://github.com/konstmish/prodigy#how-to-cite) | +| PAdam | *Closing the Generalization Gap of Adaptive Gradient Methods in Training Deep Neural Networks* | [github](https://github.com/uclaml/Padam) | | [cite](https://github.com/uclaml/Padam#citation) | +| LOMO | *Full Parameter Fine-tuning for Large Language Models with Limited Resources* | [github](https://github.com/OpenLMLab/LOMO) | | [cite](https://github.com/OpenLMLab/LOMO#citation) | +| AdaLOMO | *Low-memory Optimization with Adaptive Learning Rate* | [github](https://github.com/OpenLMLab/LOMO) | | [cite](https://github.com/OpenLMLab/LOMO#citation) | +| Tiger | *A Tight-fisted Optimizer, an optimizer that is extremely budget-conscious* | [github](https://github.com/bojone/tiger) | | [cite](https://github.com/bojone/tiger/blob/main/README_en.md#citation) | +| CAME | *Confidence-guided 
Adaptive Memory Efficient Optimization* | [github](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/CAME) | | [cite](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/CAME#citation) | +| WSAM | *Sharpness-Aware Minimization Revisited: Weighted Sharpness as a Regularization Term* | [github](https://github.com/intelligent-machine-learning/dlrover/blob/master/atorch/atorch/optimizers/wsam.py) | | [cite](https://github.com/intelligent-machine-learning/dlrover) | +| Aida | *A DNN Optimizer that Improves over AdaBelief by Suppression of the Adaptive Stepsize Range* | [github](https://github.com/guoqiang-zhang-x/Aida-Optimizer) | | [cite](https://github.com/guoqiang-zhang-x/Aida-Optimizer?tab=readme-ov-file#1-brief-description-of-aida) | +| GaLore | *Memory-Efficient LLM Training by Gradient Low-Rank Projection* | [github](https://github.com/jiaweizzhao/GaLore) | | [cite](https://github.com/jiaweizzhao/GaLore/tree/master?tab=readme-ov-file#citation) | +| Adalite | *Adalite optimizer* | [github](https://github.com/VatsaDev/adalite) | | [cite](https://github.com/VatsaDev/adalite) | +| bSAM | *SAM as an Optimal Relaxation of Bayes* | [github](https://github.com/team-approx-bayes/bayesian-sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv221001620M/exportcitation) | +| Schedule-Free | *Schedule-Free Optimizers* | [github](https://github.com/facebookresearch/schedule_free) | | [cite](https://github.com/facebookresearch/schedule_free) | +| FAdam | *Adam is a natural gradient optimizer using diagonal empirical Fisher information* | [github](https://github.com/lessw2020/fadam_pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2024arXiv240512807H/exportcitation) | +| Grokfast | *Accelerated Grokking by Amplifying Slow Gradients* | [github](https://github.com/ironjr/grokfast) | | [cite](https://github.com/ironjr/grokfast?tab=readme-ov-file#citation) | +| Kate | *Remove that Square Root: A New Efficient Scale-Invariant Version of AdaGrad* | [github](https://github.com/nazya/KATE) | | [cite](https://github.com/nazya/KATE?tab=readme-ov-file#remove-that-square-root-a-new-efficient-scale-invariant-version-of-adagrad) | +| StableAdamW | *Stable and low-precision training for large-scale vision-language models* | | | [cite](https://ui.adsabs.harvard.edu/abs/2023arXiv230413013W/exportcitation) | +| AdamMini | *Use Fewer Learning Rates To Gain More* | [github](https://github.com/zyushun/Adam-mini) | | [cite](https://github.com/zyushun/Adam-mini?tab=readme-ov-file#citation) | ## Supported LR Scheduler @@ -318,7 +320,11 @@ Correcting the norm of a gradient in each iteration based on the adaptive traini ## Frequently asked questions -[here](./qa.md) +[here](docs/qa.md) + +## Visualization + +[here](docs/visualization.md) ## Citation diff --git a/docs/optimizer.md b/docs/optimizer.md index 6c22e0f0d..bd4f4e6ac 100644 --- a/docs/optimizer.md +++ b/docs/optimizer.md @@ -24,6 +24,10 @@ :docstring: :members: +::: pytorch_optimizer.AdaLOMO + :docstring: + :members: + ::: pytorch_optimizer.Adai :docstring: :members: diff --git a/examples/visualize_optimizers.py b/examples/visualize_optimizers.py index 9ebe50ccc..e13f8e991 100644 --- a/examples/visualize_optimizers.py +++ b/examples/visualize_optimizers.py @@ -158,7 +158,7 @@ def main(): ] for optimizer_name, optimizer in OPTIMIZERS.items(): - if optimizer_name.lower() in {'alig', 'lomo', 'bsam', 'adammini'}: + if optimizer_name.lower() in {'alig', 'lomo', 'adalomo', 'bsam', 'adammini'}: continue optimizers.append((optimizer, 
-6, 0.2)) diff --git a/poetry.lock b/poetry.lock index da0f176fa..500416372 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,24 +1,5 @@ # This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. -[[package]] -name = "bitsandbytes" -version = "0.43.1" -description = "k-bit optimizers and matrix multiplication routines." -optional = true -python-versions = "*" -files = [ - {file = "bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl", hash = "sha256:a81c826d576d6d691c7b4a7491c8fdc0f37f769795d6ca2e54afa605d2c260a3"}, - {file = "bitsandbytes-0.43.1-py3-none-win_amd64.whl", hash = "sha256:52c1c7189a6ca006555a9663e544e75f40520a97a26e075411f9f9aca0771fcd"}, -] - -[package.dependencies] -numpy = "*" -torch = "*" - -[package.extras] -benchmark = ["matplotlib", "pandas"] -test = ["scipy"] - [[package]] name = "black" version = "24.4.2" @@ -92,63 +73,63 @@ files = [ [[package]] name = "coverage" -version = "7.5.4" +version = "7.6.0" description = "Code coverage measurement for Python" optional = false python-versions = ">=3.8" files = [ - {file = "coverage-7.5.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6cfb5a4f556bb51aba274588200a46e4dd6b505fb1a5f8c5ae408222eb416f99"}, - {file = "coverage-7.5.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2174e7c23e0a454ffe12267a10732c273243b4f2d50d07544a91198f05c48f47"}, - {file = "coverage-7.5.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2214ee920787d85db1b6a0bd9da5f8503ccc8fcd5814d90796c2f2493a2f4d2e"}, - {file = "coverage-7.5.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1137f46adb28e3813dec8c01fefadcb8c614f33576f672962e323b5128d9a68d"}, - {file = "coverage-7.5.4-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b385d49609f8e9efc885790a5a0e89f2e3ae042cdf12958b6034cc442de428d3"}, - {file = "coverage-7.5.4-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:b4a474f799456e0eb46d78ab07303286a84a3140e9700b9e154cfebc8f527016"}, - {file = "coverage-7.5.4-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:5cd64adedf3be66f8ccee418473c2916492d53cbafbfcff851cbec5a8454b136"}, - {file = "coverage-7.5.4-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:e564c2cf45d2f44a9da56f4e3a26b2236504a496eb4cb0ca7221cd4cc7a9aca9"}, - {file = "coverage-7.5.4-cp310-cp310-win32.whl", hash = "sha256:7076b4b3a5f6d2b5d7f1185fde25b1e54eb66e647a1dfef0e2c2bfaf9b4c88c8"}, - {file = "coverage-7.5.4-cp310-cp310-win_amd64.whl", hash = "sha256:018a12985185038a5b2bcafab04ab833a9a0f2c59995b3cec07e10074c78635f"}, - {file = "coverage-7.5.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:db14f552ac38f10758ad14dd7b983dbab424e731588d300c7db25b6f89e335b5"}, - {file = "coverage-7.5.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3257fdd8e574805f27bb5342b77bc65578e98cbc004a92232106344053f319ba"}, - {file = "coverage-7.5.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3a6612c99081d8d6134005b1354191e103ec9705d7ba2754e848211ac8cacc6b"}, - {file = "coverage-7.5.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d45d3cbd94159c468b9b8c5a556e3f6b81a8d1af2a92b77320e887c3e7a5d080"}, - {file = "coverage-7.5.4-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ed550e7442f278af76d9d65af48069f1fb84c9f745ae249c1a183c1e9d1b025c"}, - {file = 
"coverage-7.5.4-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:7a892be37ca35eb5019ec85402c3371b0f7cda5ab5056023a7f13da0961e60da"}, - {file = "coverage-7.5.4-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:8192794d120167e2a64721d88dbd688584675e86e15d0569599257566dec9bf0"}, - {file = "coverage-7.5.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:820bc841faa502e727a48311948e0461132a9c8baa42f6b2b84a29ced24cc078"}, - {file = "coverage-7.5.4-cp311-cp311-win32.whl", hash = "sha256:6aae5cce399a0f065da65c7bb1e8abd5c7a3043da9dceb429ebe1b289bc07806"}, - {file = "coverage-7.5.4-cp311-cp311-win_amd64.whl", hash = "sha256:d2e344d6adc8ef81c5a233d3a57b3c7d5181f40e79e05e1c143da143ccb6377d"}, - {file = "coverage-7.5.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:54317c2b806354cbb2dc7ac27e2b93f97096912cc16b18289c5d4e44fc663233"}, - {file = "coverage-7.5.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:042183de01f8b6d531e10c197f7f0315a61e8d805ab29c5f7b51a01d62782747"}, - {file = "coverage-7.5.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a6bb74ed465d5fb204b2ec41d79bcd28afccf817de721e8a807d5141c3426638"}, - {file = "coverage-7.5.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b3d45ff86efb129c599a3b287ae2e44c1e281ae0f9a9bad0edc202179bcc3a2e"}, - {file = "coverage-7.5.4-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5013ed890dc917cef2c9f765c4c6a8ae9df983cd60dbb635df8ed9f4ebc9f555"}, - {file = "coverage-7.5.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1014fbf665fef86cdfd6cb5b7371496ce35e4d2a00cda501cf9f5b9e6fced69f"}, - {file = "coverage-7.5.4-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:3684bc2ff328f935981847082ba4fdc950d58906a40eafa93510d1b54c08a66c"}, - {file = "coverage-7.5.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:581ea96f92bf71a5ec0974001f900db495488434a6928a2ca7f01eee20c23805"}, - {file = "coverage-7.5.4-cp312-cp312-win32.whl", hash = "sha256:73ca8fbc5bc622e54627314c1a6f1dfdd8db69788f3443e752c215f29fa87a0b"}, - {file = "coverage-7.5.4-cp312-cp312-win_amd64.whl", hash = "sha256:cef4649ec906ea7ea5e9e796e68b987f83fa9a718514fe147f538cfeda76d7a7"}, - {file = "coverage-7.5.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:cdd31315fc20868c194130de9ee6bfd99755cc9565edff98ecc12585b90be882"}, - {file = "coverage-7.5.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:02ff6e898197cc1e9fa375581382b72498eb2e6d5fc0b53f03e496cfee3fac6d"}, - {file = "coverage-7.5.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d05c16cf4b4c2fc880cb12ba4c9b526e9e5d5bb1d81313d4d732a5b9fe2b9d53"}, - {file = "coverage-7.5.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c5986ee7ea0795a4095ac4d113cbb3448601efca7f158ec7f7087a6c705304e4"}, - {file = "coverage-7.5.4-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5df54843b88901fdc2f598ac06737f03d71168fd1175728054c8f5a2739ac3e4"}, - {file = "coverage-7.5.4-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:ab73b35e8d109bffbda9a3e91c64e29fe26e03e49addf5b43d85fc426dde11f9"}, - {file = "coverage-7.5.4-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:aea072a941b033813f5e4814541fc265a5c12ed9720daef11ca516aeacd3bd7f"}, - {file = "coverage-7.5.4-cp38-cp38-musllinux_1_2_x86_64.whl", hash = 
"sha256:16852febd96acd953b0d55fc842ce2dac1710f26729b31c80b940b9afcd9896f"}, - {file = "coverage-7.5.4-cp38-cp38-win32.whl", hash = "sha256:8f894208794b164e6bd4bba61fc98bf6b06be4d390cf2daacfa6eca0a6d2bb4f"}, - {file = "coverage-7.5.4-cp38-cp38-win_amd64.whl", hash = "sha256:e2afe743289273209c992075a5a4913e8d007d569a406ffed0bd080ea02b0633"}, - {file = "coverage-7.5.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b95c3a8cb0463ba9f77383d0fa8c9194cf91f64445a63fc26fb2327e1e1eb088"}, - {file = "coverage-7.5.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3d7564cc09dd91b5a6001754a5b3c6ecc4aba6323baf33a12bd751036c998be4"}, - {file = "coverage-7.5.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:44da56a2589b684813f86d07597fdf8a9c6ce77f58976727329272f5a01f99f7"}, - {file = "coverage-7.5.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e16f3d6b491c48c5ae726308e6ab1e18ee830b4cdd6913f2d7f77354b33f91c8"}, - {file = "coverage-7.5.4-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dbc5958cb471e5a5af41b0ddaea96a37e74ed289535e8deca404811f6cb0bc3d"}, - {file = "coverage-7.5.4-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:a04e990a2a41740b02d6182b498ee9796cf60eefe40cf859b016650147908029"}, - {file = "coverage-7.5.4-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:ddbd2f9713a79e8e7242d7c51f1929611e991d855f414ca9996c20e44a895f7c"}, - {file = "coverage-7.5.4-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:b1ccf5e728ccf83acd313c89f07c22d70d6c375a9c6f339233dcf792094bcbf7"}, - {file = "coverage-7.5.4-cp39-cp39-win32.whl", hash = "sha256:56b4eafa21c6c175b3ede004ca12c653a88b6f922494b023aeb1e836df953ace"}, - {file = "coverage-7.5.4-cp39-cp39-win_amd64.whl", hash = "sha256:65e528e2e921ba8fd67d9055e6b9f9e34b21ebd6768ae1c1723f4ea6ace1234d"}, - {file = "coverage-7.5.4-pp38.pp39.pp310-none-any.whl", hash = "sha256:79b356f3dd5b26f3ad23b35c75dbdaf1f9e2450b6bcefc6d0825ea0aa3f86ca5"}, - {file = "coverage-7.5.4.tar.gz", hash = "sha256:a44963520b069e12789d0faea4e9fdb1e410cdc4aab89d94f7f55cbb7fef0353"}, + {file = "coverage-7.6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:dff044f661f59dace805eedb4a7404c573b6ff0cdba4a524141bc63d7be5c7fd"}, + {file = "coverage-7.6.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a8659fd33ee9e6ca03950cfdcdf271d645cf681609153f218826dd9805ab585c"}, + {file = "coverage-7.6.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7792f0ab20df8071d669d929c75c97fecfa6bcab82c10ee4adb91c7a54055463"}, + {file = "coverage-7.6.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d4b3cd1ca7cd73d229487fa5caca9e4bc1f0bca96526b922d61053ea751fe791"}, + {file = "coverage-7.6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e7e128f85c0b419907d1f38e616c4f1e9f1d1b37a7949f44df9a73d5da5cd53c"}, + {file = "coverage-7.6.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:a94925102c89247530ae1dab7dc02c690942566f22e189cbd53579b0693c0783"}, + {file = "coverage-7.6.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:dcd070b5b585b50e6617e8972f3fbbee786afca71b1936ac06257f7e178f00f6"}, + {file = "coverage-7.6.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:d50a252b23b9b4dfeefc1f663c568a221092cbaded20a05a11665d0dbec9b8fb"}, + {file = "coverage-7.6.0-cp310-cp310-win32.whl", hash = 
"sha256:0e7b27d04131c46e6894f23a4ae186a6a2207209a05df5b6ad4caee6d54a222c"}, + {file = "coverage-7.6.0-cp310-cp310-win_amd64.whl", hash = "sha256:54dece71673b3187c86226c3ca793c5f891f9fc3d8aa183f2e3653da18566169"}, + {file = "coverage-7.6.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c7b525ab52ce18c57ae232ba6f7010297a87ced82a2383b1afd238849c1ff933"}, + {file = "coverage-7.6.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4bea27c4269234e06f621f3fac3925f56ff34bc14521484b8f66a580aacc2e7d"}, + {file = "coverage-7.6.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ed8d1d1821ba5fc88d4a4f45387b65de52382fa3ef1f0115a4f7a20cdfab0e94"}, + {file = "coverage-7.6.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:01c322ef2bbe15057bc4bf132b525b7e3f7206f071799eb8aa6ad1940bcf5fb1"}, + {file = "coverage-7.6.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:03cafe82c1b32b770a29fd6de923625ccac3185a54a5e66606da26d105f37dac"}, + {file = "coverage-7.6.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0d1b923fc4a40c5832be4f35a5dab0e5ff89cddf83bb4174499e02ea089daf57"}, + {file = "coverage-7.6.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:4b03741e70fb811d1a9a1d75355cf391f274ed85847f4b78e35459899f57af4d"}, + {file = "coverage-7.6.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:a73d18625f6a8a1cbb11eadc1d03929f9510f4131879288e3f7922097a429f63"}, + {file = "coverage-7.6.0-cp311-cp311-win32.whl", hash = "sha256:65fa405b837060db569a61ec368b74688f429b32fa47a8929a7a2f9b47183713"}, + {file = "coverage-7.6.0-cp311-cp311-win_amd64.whl", hash = "sha256:6379688fb4cfa921ae349c76eb1a9ab26b65f32b03d46bb0eed841fd4cb6afb1"}, + {file = "coverage-7.6.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:f7db0b6ae1f96ae41afe626095149ecd1b212b424626175a6633c2999eaad45b"}, + {file = "coverage-7.6.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:bbdf9a72403110a3bdae77948b8011f644571311c2fb35ee15f0f10a8fc082e8"}, + {file = "coverage-7.6.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9cc44bf0315268e253bf563f3560e6c004efe38f76db03a1558274a6e04bf5d5"}, + {file = "coverage-7.6.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:da8549d17489cd52f85a9829d0e1d91059359b3c54a26f28bec2c5d369524807"}, + {file = "coverage-7.6.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0086cd4fc71b7d485ac93ca4239c8f75732c2ae3ba83f6be1c9be59d9e2c6382"}, + {file = "coverage-7.6.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1fad32ee9b27350687035cb5fdf9145bc9cf0a094a9577d43e909948ebcfa27b"}, + {file = "coverage-7.6.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:044a0985a4f25b335882b0966625270a8d9db3d3409ddc49a4eb00b0ef5e8cee"}, + {file = "coverage-7.6.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:76d5f82213aa78098b9b964ea89de4617e70e0d43e97900c2778a50856dac605"}, + {file = "coverage-7.6.0-cp312-cp312-win32.whl", hash = "sha256:3c59105f8d58ce500f348c5b56163a4113a440dad6daa2294b5052a10db866da"}, + {file = "coverage-7.6.0-cp312-cp312-win_amd64.whl", hash = "sha256:ca5d79cfdae420a1d52bf177de4bc2289c321d6c961ae321503b2ca59c17ae67"}, + {file = "coverage-7.6.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:d39bd10f0ae453554798b125d2f39884290c480f56e8a02ba7a6ed552005243b"}, + {file = 
"coverage-7.6.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:beb08e8508e53a568811016e59f3234d29c2583f6b6e28572f0954a6b4f7e03d"}, + {file = "coverage-7.6.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b2e16f4cd2bc4d88ba30ca2d3bbf2f21f00f382cf4e1ce3b1ddc96c634bc48ca"}, + {file = "coverage-7.6.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6616d1c9bf1e3faea78711ee42a8b972367d82ceae233ec0ac61cc7fec09fa6b"}, + {file = "coverage-7.6.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ad4567d6c334c46046d1c4c20024de2a1c3abc626817ae21ae3da600f5779b44"}, + {file = "coverage-7.6.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:d17c6a415d68cfe1091d3296ba5749d3d8696e42c37fca5d4860c5bf7b729f03"}, + {file = "coverage-7.6.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:9146579352d7b5f6412735d0f203bbd8d00113a680b66565e205bc605ef81bc6"}, + {file = "coverage-7.6.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:cdab02a0a941af190df8782aafc591ef3ad08824f97850b015c8c6a8b3877b0b"}, + {file = "coverage-7.6.0-cp38-cp38-win32.whl", hash = "sha256:df423f351b162a702c053d5dddc0fc0ef9a9e27ea3f449781ace5f906b664428"}, + {file = "coverage-7.6.0-cp38-cp38-win_amd64.whl", hash = "sha256:f2501d60d7497fd55e391f423f965bbe9e650e9ffc3c627d5f0ac516026000b8"}, + {file = "coverage-7.6.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7221f9ac9dad9492cecab6f676b3eaf9185141539d5c9689d13fd6b0d7de840c"}, + {file = "coverage-7.6.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ddaaa91bfc4477d2871442bbf30a125e8fe6b05da8a0015507bfbf4718228ab2"}, + {file = "coverage-7.6.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c4cbe651f3904e28f3a55d6f371203049034b4ddbce65a54527a3f189ca3b390"}, + {file = "coverage-7.6.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:831b476d79408ab6ccfadaaf199906c833f02fdb32c9ab907b1d4aa0713cfa3b"}, + {file = "coverage-7.6.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:46c3d091059ad0b9c59d1034de74a7f36dcfa7f6d3bde782c49deb42438f2450"}, + {file = "coverage-7.6.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:4d5fae0a22dc86259dee66f2cc6c1d3e490c4a1214d7daa2a93d07491c5c04b6"}, + {file = "coverage-7.6.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:07ed352205574aad067482e53dd606926afebcb5590653121063fbf4e2175166"}, + {file = "coverage-7.6.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:49c76cdfa13015c4560702574bad67f0e15ca5a2872c6a125f6327ead2b731dd"}, + {file = "coverage-7.6.0-cp39-cp39-win32.whl", hash = "sha256:482855914928c8175735a2a59c8dc5806cf7d8f032e4820d52e845d1f731dca2"}, + {file = "coverage-7.6.0-cp39-cp39-win_amd64.whl", hash = "sha256:543ef9179bc55edfd895154a51792b01c017c87af0ebaae092720152e19e42ca"}, + {file = "coverage-7.6.0-pp38.pp39.pp310-none-any.whl", hash = "sha256:6fe885135c8a479d3e37a7aae61cbd3a0fb2deccb4dda3c25f92a49189f766d6"}, + {file = "coverage-7.6.0.tar.gz", hash = "sha256:289cc803fa1dc901f84701ac10c9ee873619320f2f9aff38794db4a4a0268d51"}, ] [package.dependencies] @@ -159,13 +140,13 @@ toml = ["tomli"] [[package]] name = "exceptiongroup" -version = "1.2.1" +version = "1.2.2" description = "Backport of PEP 654 (exception groups)" optional = false python-versions = ">=3.7" files = [ - {file = "exceptiongroup-1.2.1-py3-none-any.whl", hash = 
"sha256:5258b9ed329c5bbdd31a309f53cbfb0b155341807f6ff7606a1e801a891b29ad"}, - {file = "exceptiongroup-1.2.1.tar.gz", hash = "sha256:a4785e48b045528f5bfe627b6ad554ff32def154f42372786903b7abcfe1aa16"}, + {file = "exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b"}, + {file = "exceptiongroup-1.2.2.tar.gz", hash = "sha256:47c2edf7c6738fafb49fd34290706d1a1a2f4d1c6df275526b62cbb4aa5393cc"}, ] [package.extras] @@ -574,17 +555,20 @@ files = [ [[package]] name = "sympy" -version = "1.12.1" +version = "1.13.0" description = "Computer algebra system (CAS) in Python" optional = false python-versions = ">=3.8" files = [ - {file = "sympy-1.12.1-py3-none-any.whl", hash = "sha256:9b2cbc7f1a640289430e13d2a56f02f867a1da0190f2f99d8968c2f74da0e515"}, - {file = "sympy-1.12.1.tar.gz", hash = "sha256:2877b03f998cd8c08f07cd0de5b767119cd3ef40d09f41c30d722f6686b0fb88"}, + {file = "sympy-1.13.0-py3-none-any.whl", hash = "sha256:6b0b32a4673fb91bd3cac3b55406c8e01d53ae22780be467301cc452f6680c92"}, + {file = "sympy-1.13.0.tar.gz", hash = "sha256:3b6af8f4d008b9a1a6a4268b335b984b23835f26d1d60b0526ebc71d48a25f57"}, ] [package.dependencies] -mpmath = ">=1.1.0,<1.4.0" +mpmath = ">=1.1.0,<1.4" + +[package.extras] +dev = ["hypothesis (>=6.70.0)", "pytest (>=7.1.0)"] [[package]] name = "tbb" @@ -659,9 +643,9 @@ files = [ ] [extras] -bitsandbytes = ["bitsandbytes"] +bitsandbytes = [] [metadata] lock-version = "2.0" python-versions = ">=3.8,<4.0.0" -content-hash = "d51586f8352db14a18dd407b19285c9649564b029e6e6aae52a0d566515e5c81" +content-hash = "6b1f5dc9eb871995523a04d9544e6036d24ec0c3b355375514d237b91098f4ea" diff --git a/pyproject.toml b/pyproject.toml index 6c672ea9b..12f7c3809 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pytorch_optimizer" -version = "3.0.2" +version = "3.1.0" description = "optimizer & lr scheduler & objective function collections in PyTorch" license = "Apache-2.0" authors = ["kozistr "] @@ -12,13 +12,14 @@ documentation = "https://pytorch-optimizers.readthedocs.io/en/latest" keywords = [ "pytorch", "deep-learning", "optimizer", "lr scheduler", "A2Grad", "ASGD", "AccSGD", "AdaBelief", "AdaBound", "AdaDelta", "AdaFactor", "AdaMax", "AdaMod", "AdaNorm", "AdaPNM", "AdaSmooth", "AdaHessian", "Adai", "Adalite", - "AdamMini", "AdamP", "AdamS", "Adan", "AggMo", "Aida", "AliG", "Amos", "Apollo", "AvaGrad", "bSAM", "CAME", - "DAdaptAdaGrad", "DAdaptAdam", "DAdaptAdan", "DAdaptSGD", "DAdaptLion", "DiffGrad", "FAdam", "Fromage", "GaLore", - "Gravity", "GrokFast", "GSAM", "Kate", "Lamb", "LARS", "Lion", "LOMO", "Lookahead", "MADGRAD", "MSVAG", "Nero", - "NovoGrad", "PAdam", "PCGrad", "PID", "PNM", "Prodigy", "QHAdam", "QHM", "RAdam", "Ranger", "Ranger21", "RotoGrad", - "SAM", "ScheduleFreeSGD", "ScheduleFreeAdamW", "SGDP", "Shampoo", "ScalableShampoo", "SGDW", "SignSGD", "SM3", - "SopihaH", "SRMM", "StableAdamW", "SWATS", "Tiger", "WSAM", "Yogi", "BCE", "BCEFocal", "Focal", "FocalCosine", - "SoftF1", "Dice", "LDAM", "Jaccard", "Bi-Tempered", "Tversky", "FocalTversky", "LovaszHinge", "bitsandbytes", "WSD", + "AdaLomo", "AdamMini", "AdamP", "AdamS", "Adan", "AggMo", "Aida", "AliG", "Amos", "Apollo", "AvaGrad", "bSAM", + "CAME", "DAdaptAdaGrad", "DAdaptAdam", "DAdaptAdan", "DAdaptSGD", "DAdaptLion", "DiffGrad", "FAdam", "Fromage", + "GaLore", "Gravity", "GrokFast", "GSAM", "Kate", "Lamb", "LARS", "Lion", "LOMO", "Lookahead", "MADGRAD", "MSVAG", + "Nero", "NovoGrad", "PAdam", "PCGrad", "PID", "PNM", "Prodigy", 
"QHAdam", "QHM", "RAdam", "Ranger", "Ranger21", + "RotoGrad", "SAM", "ScheduleFreeSGD", "ScheduleFreeAdamW", "SGDP", "Shampoo", "ScalableShampoo", "SGDW", "SignSGD", + "SM3", "SopihaH", "SRMM", "StableAdamW", "SWATS", "Tiger", "WSAM", "Yogi", "BCE", "BCEFocal", "Focal", + "FocalCosine", "SoftF1", "Dice", "LDAM", "Jaccard", "Bi-Tempered", "Tversky", "FocalTversky", "LovaszHinge", + "bitsandbytes", "WSD", "QGaLore", ] classifiers = [ "License :: OSI Approved :: Apache Software License", @@ -46,7 +47,6 @@ classifiers = [ python = ">=3.8,<4.0.0" numpy = { version = "*", python = ">=3.8" } torch = { version = ">=1.10", python = ">=3.8", source = "torch" } -bitsandbytes = { version = "^0.43", optional = true } [tool.poetry.dev-dependencies] isort = { version = "^5", python = ">=3.8" } @@ -55,9 +55,6 @@ ruff = "*" pytest = "*" pytest-cov = "*" -[tool.poetry.extras] -bitsandbytes = ["bitsandbytes"] - [[tool.poetry.source]] name = "torch" url = "https://download.pytorch.org/whl/cpu" diff --git a/pytorch_optimizer/__init__.py b/pytorch_optimizer/__init__.py index 1f4dd552d..3628bb74e 100644 --- a/pytorch_optimizer/__init__.py +++ b/pytorch_optimizer/__init__.py @@ -1,4 +1,5 @@ # ruff: noqa +from importlib.util import find_spec from typing import Dict, List import torch.cuda @@ -72,7 +73,7 @@ from pytorch_optimizer.optimizer.lamb import Lamb from pytorch_optimizer.optimizer.lars import LARS from pytorch_optimizer.optimizer.lion import Lion -from pytorch_optimizer.optimizer.lomo import LOMO +from pytorch_optimizer.optimizer.lomo import LOMO, AdaLOMO from pytorch_optimizer.optimizer.lookahead import Lookahead from pytorch_optimizer.optimizer.madgrad import MADGRAD from pytorch_optimizer.optimizer.msvag import MSVAG @@ -126,12 +127,8 @@ ) from pytorch_optimizer.optimizer.yogi import Yogi -try: - import bitsandbytes as bnb - - HAS_BNB: bool = True # pragma: no cover -except ImportError: - HAS_BNB: bool = False +HAS_BNB: bool = find_spec('bitsandbytes') is not None +HAS_Q_GALORE: bool = find_spec('q-galore-torch') is not None OPTIMIZER_LIST: List[OPTIMIZER] = [ AdaBelief, @@ -205,6 +202,7 @@ Kate, StableAdamW, AdamMini, + AdaLOMO, ] OPTIMIZERS: Dict[str, OPTIMIZER] = {str(optimizer.__name__).lower(): optimizer for optimizer in OPTIMIZER_LIST} @@ -252,22 +250,58 @@ def load_bnb_optimizer(optimizer: str) -> OPTIMIZER: # pragma: no cover r"""load bnb optimizer instance.""" + from bitsandbytes import optim + if 'sgd8bit' in optimizer: - return bnb.optim.SGD8bit + return optim.SGD8bit if 'adam8bit' in optimizer: - return bnb.optim.Adam8bit + return optim.Adam8bit + if 'paged_adam8bit' in optimizer: + return optim.PagedAdam8bit if 'adamw8bit' in optimizer: - return bnb.optim.AdamW8bit + return optim.AdamW8bit + if 'paged_adamw8bit' in optimizer: + return optim.PagedAdamW8bit if 'lamb8bit' in optimizer: - return bnb.optim.LAMB8bit + return optim.LAMB8bit if 'lars8bit' in optimizer: - return bnb.optim.LARS8bit + return optim.LARS8bit if 'lion8bit' in optimizer: - return bnb.optim.Lion8bit + return optim.Lion8bit if 'adagrad8bit' in optimizer: - return bnb.optim.Adagrad8bit + return optim.Adagrad8bit if 'rmsprop8bit' in optimizer: - return bnb.optim.RMSprop8bit + return optim.RMSprop8bit + if 'adagrad32bit' in optimizer: + return optim.Adagrad32bit + if 'adam32bit' in optimizer: + return optim.Adam32bit + if 'paged_adam32bit' in optimizer: + return optim.PagedAdam32bit + if 'adamw32bit' in optimizer: + return optim.AdamW32bit + if 'lamb32bit' in optimizer: + return optim.LAMB32bit + if 'lars32bit' in optimizer: + 
return optim.LARS32bit + if 'lion32bit' in optimizer: + return optim.Lion32bit + if 'paged_lion32bit' in optimizer: + return optim.PagedLion32bit + if 'rmsprop32bit' in optimizer: + return optim.RMSprop32bit + if 'sgd32bit' in optimizer: + return optim.SGD32bit + raise NotImplementedError(f'[-] not implemented optimizer : {optimizer}') + + +def load_q_galore_optimizer(optimizer: str) -> OPTIMIZER: # pragma: no cover + r"""load Q-GaLore optimizer instance.""" + import q_galore_torch + + if 'adamw8bit' in optimizer: + return q_galore_torch.QGaLoreAdamW8bit + raise NotImplementedError(f'[-] not implemented optimizer : {optimizer}') @@ -277,7 +311,11 @@ def load_optimizer(optimizer: str) -> OPTIMIZER: if optimizer.startswith('bnb'): if HAS_BNB and torch.cuda.is_available(): return load_bnb_optimizer(optimizer) # pragma: no cover - raise ImportError(f'[-] bitsandbytes and CUDA required for bnb optimizers : {optimizer}') + raise ImportError(f'[-] bitsandbytes and CUDA required for the optimizer {optimizer}') + if optimizer.startswith('q_galore'): + if HAS_Q_GALORE and torch.cuda.is_available(): + return load_q_galore_optimizer(optimizer) # pragma: no cover + raise ImportError(f'[-] bitsandbytes, q-galore-torch, and CUDA required for the optimizer {optimizer}') if optimizer not in OPTIMIZERS: raise NotImplementedError(f'[-] not implemented optimizer : {optimizer}') diff --git a/pytorch_optimizer/base/optimizer.py b/pytorch_optimizer/base/optimizer.py index e729b13e7..655fba7e4 100644 --- a/pytorch_optimizer/base/optimizer.py +++ b/pytorch_optimizer/base/optimizer.py @@ -214,6 +214,22 @@ def get_adanorm_gradient( return grad * exp_grad_norm / grad_norm if exp_grad_norm > grad_norm else grad + @staticmethod + def get_rms(x: torch.Tensor) -> float: + r"""Get RMS.""" + return x.norm(2) / math.sqrt(x.numel()) + + @staticmethod + def approximate_sq_grad( + exp_avg_sq_row: torch.Tensor, + exp_avg_sq_col: torch.Tensor, + output: torch.Tensor, + ) -> None: + r"""Get approximation of EMA of squared gradient.""" + r_factor: torch.Tensor = (exp_avg_sq_row / exp_avg_sq_row.mean(dim=-1, keepdim=True)).rsqrt_().unsqueeze(-1) + c_factor: torch.Tensor = exp_avg_sq_col.unsqueeze(-2).rsqrt() + torch.mul(r_factor, c_factor, out=output) + @staticmethod def validate_range(x: float, name: str, low: float, high: float, range_type: str = '[)') -> None: if range_type == '[)' and not low <= x < high: diff --git a/pytorch_optimizer/optimizer/adafactor.py b/pytorch_optimizer/optimizer/adafactor.py index cc7b0fa35..9299cde12 100644 --- a/pytorch_optimizer/optimizer/adafactor.py +++ b/pytorch_optimizer/optimizer/adafactor.py @@ -127,22 +127,6 @@ def get_options(shape: Tuple[int, ...]) -> bool: r"""Get `factored`.""" return len(shape) >= 2 - @staticmethod - def get_rms(x: torch.Tensor) -> float: - r"""Get RMS.""" - return x.norm(2) / math.sqrt(x.numel()) - - @staticmethod - def approximate_sq_grad( - exp_avg_sq_row: torch.Tensor, - exp_avg_sq_col: torch.Tensor, - output: torch.Tensor, - ): - r"""Get approximation of EMA of squared gradient.""" - r_factor: torch.Tensor = (exp_avg_sq_row / exp_avg_sq_row.mean(dim=-1, keepdim=True)).rsqrt_().unsqueeze(-1) - c_factor: torch.Tensor = exp_avg_sq_col.unsqueeze(-2).rsqrt() - torch.mul(r_factor, c_factor, out=output) - @torch.no_grad() def step(self, closure: CLOSURE = None) -> LOSS: loss: LOSS = None diff --git a/pytorch_optimizer/optimizer/adam_mini.py b/pytorch_optimizer/optimizer/adam_mini.py index 67f732a65..043ae8439 100644 --- a/pytorch_optimizer/optimizer/adam_mini.py +++ 
b/pytorch_optimizer/optimizer/adam_mini.py @@ -1,5 +1,5 @@ import math -from typing import Optional +from typing import Optional, Set import torch from torch import distributed as dist @@ -57,6 +57,9 @@ def __init__( self.num_embeds = num_embeds self.num_heads = num_heads + self.embed_blocks: Set[str] = {'embed', 'embd', 'wte', 'lm_head.weight', 'output.weight'} + self.qk_blocks: Set[str] = {'k_proj.weight', 'q_proj.weight', 'wq.weight', 'wk.weight'} + groups = self.get_optimizer_groups(weight_decay) defaults: DEFAULTS = {'lr': lr, 'betas': betas, 'eps': eps} @@ -77,12 +80,7 @@ def get_optimizer_groups(self, weight_decay: float): 'weight_decay': 0.0 if ('norm' in name or 'ln_f' in name) else weight_decay, } - if ( - 'self_attn.k_proj.weight' in name - or 'self_attn.q_proj.weight' in name - or 'attn.wq.weight' in name - or 'attn.wk.weight' in name - ): + if any(block in name for block in self.qk_blocks): group['parameter_per_head'] = self.num_embeds * self.num_embeds // self.num_heads if 'attn.attn.weight' in name or 'attn.qkv.weight' in name: @@ -303,16 +301,11 @@ def step(self, closure: CLOSURE = None) -> LOSS: fixed_decay=False, ) - if 'embed_tokens' in name or 'wte' in name or 'lm_head' in name: + if any(block in name for block in self.embed_blocks): self.step_embed( p, grad, state, group['lr'], beta1, beta2, bias_correction1, bias_correction2_sq, group['eps'] ) - elif ( - 'self_attn.k_proj.weight' in name - or 'self_attn.q_proj.weight' in name - or 'attn.wq.weight' in name - or 'attn.wk.weight' in name - ): + elif any(block in name for block in self.qk_blocks): self.step_attn_proj( p, grad, diff --git a/pytorch_optimizer/optimizer/lomo.py b/pytorch_optimizer/optimizer/lomo.py index 784998c94..16d7e7a1b 100644 --- a/pytorch_optimizer/optimizer/lomo.py +++ b/pytorch_optimizer/optimizer/lomo.py @@ -1,3 +1,4 @@ +import math import os from typing import Any, Callable, List, Optional @@ -9,7 +10,7 @@ from pytorch_optimizer.base.optimizer import BaseOptimizer from pytorch_optimizer.base.types import DEFAULTS from pytorch_optimizer.optimizer.fp16 import DynamicLossScaler -from pytorch_optimizer.optimizer.utils import has_overflow +from pytorch_optimizer.optimizer.utils import has_overflow, is_deepspeed_zero3_enabled class LOMO(BaseOptimizer, Optimizer): @@ -199,3 +200,267 @@ def grad_norm(self, loss): self.clip_coef = torch.clamp(float(self.clip_grad_norm) / (total_norm + 1e-6), max=1.0) self.gather_norm = False + + +class AdaLOMO(BaseOptimizer, Optimizer): + r"""Low-memory Optimization with Adaptive Learning Rate. + + :param model: nn.Module. pytorch model. + :param lr: float. learning rate. + :param weight_decay: float. weight decay (L2 penalty). + :param loss_scale: float. loss scale. + :param clip_threshold: float. threshold of root-mean-square of final gradient update. + :param decay_rate: float. coefficient used to compute running averages of square gradient. + :param clip_grad_norm: Optional[float]. clip grad norm. + :param clip_grad_value: Optional[float]. clip grad value. + :param eps1: float. term added to the denominator to improve numerical stability. + :param eps2: float. term added to the denominator to improve numerical stability. 
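+
+    Example (an illustrative sketch only; the toy ``nn.Linear`` model is not part of the package)::
+
+        model = nn.Linear(4, 1)
+        optimizer = AdaLOMO(model, lr=1e-3, clip_grad_norm=1.0)
+
+        loss = model(torch.randn(2, 4)).sum()
+        optimizer.grad_norm(loss)                # 1st backward pass: gather gradient norms for clipping
+        optimizer.fused_backward(loss, lr=1e-3)  # 2nd backward pass: fused, in-place parameter update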
+ """ + + def __init__( + self, + model: nn.Module, + lr: float = 1e-3, + weight_decay: float = 0.0, + loss_scale: float = 2.0 ** 10, + clip_threshold: float = 1.0, + decay_rate: float = -0.8, + clip_grad_norm: Optional[float] = None, + clip_grad_value: Optional[float] = None, + eps1: float = 1e-30, + eps2: float = 1e-3, + ) -> None: # fmt: skip + self.validate_learning_rate(lr) + self.validate_non_negative(weight_decay, 'weight_decay') + self.validate_non_negative(loss_scale, 'loss_scale') + self.validate_non_negative(clip_threshold, 'clip_threshold') + self.validate_non_negative(clip_grad_norm, 'clip_grad_norm') + self.validate_non_negative(clip_grad_value, 'clip_grad_value') + self.validate_non_negative(eps1, 'eps1') + self.validate_non_negative(eps2, 'eps2') + + self.model = model + self.lr = lr + self.weight_decay = weight_decay + self.loss_scale = loss_scale + self.clip_threshold = clip_threshold + self.decay_rate = decay_rate + self.clip_grad_norm = clip_grad_norm + self.clip_grad_value = clip_grad_value + self.eps1 = eps1 + self.eps2 = eps2 + + self.num_steps: int = 0 + self.gather_norm: bool = False + self.grad_norms: List[torch.Tensor] = [] + self.clip_coef: Optional[float] = None + + self.local_rank: int = int(os.environ.get('LOCAL_RANK', 0)) + self.zero3_enabled: bool = is_deepspeed_zero3_enabled() + + self.grad_func: Callable[[Any], Any] = self.fuse_update_zero3() if self.zero3_enabled else self.fuse_update() + + self.exp_avg_sq = {} + self.exp_avg_sq_row = {} + self.exp_avg_sq_col = {} + + self.initialize_states() + + defaults: DEFAULTS = { + 'lr': lr, + 'weight_decay': weight_decay, + 'clip_grad_norm': clip_grad_norm, + 'clip_grad_value': clip_grad_value, + 'eps1': eps1, + 'eps2': eps2, + } + super().__init__(self.model.parameters(), defaults) + + def __str__(self) -> str: + return 'AdaLOMO' + + def initialize_states(self) -> None: + for n, p in self.model.named_parameters(): + if self.zero3_enabled: # pragma: no cover + if len(p.ds_shape) == 1: + self.exp_avg_sq[n] = torch.zeros(p.ds_shape[0], dtype=torch.float32, device=p.device) + else: + self.exp_avg_sq_row[n] = torch.zeros(p.ds_shape[0], dtype=torch.float32, device=p.device) + self.exp_avg_sq_col[n] = torch.zeros(p.ds_shape[1], dtype=torch.float32, device=p.device) + elif len(p.shape) == 1: + self.exp_avg_sq[n] = torch.zeros(p.shape[0], dtype=torch.float32, device=p.device) + else: + self.exp_avg_sq_row[n] = torch.zeros(p.shape[0], dtype=torch.float32, device=p.device) + self.exp_avg_sq_col[n] = torch.zeros(p.shape[1], dtype=torch.float32, device=p.device) + + if p.requires_grad: + p.register_hook(self.grad_func) + + @torch.no_grad() + def reset(self): + pass + + def fuse_update(self) -> Callable[[Any], Any]: + @torch.no_grad() + def func(x: Any) -> Any: + for n, p in self.model.named_parameters(): + if not p.requires_grad or p.grad is None: + continue + + grad_fp32 = p.grad.to(torch.float32) + p.grad = None + + if self.loss_scale: + grad_fp32.div_(self.loss_scale) + + if self.gather_norm: + self.grad_norms.append(torch.norm(grad_fp32, 2.0)) + else: + if self.clip_grad_value is not None and self.clip_grad_value > 0.0: + grad_fp32.clamp_(min=-self.clip_grad_value, max=self.clip_grad_value) + if self.clip_grad_norm is not None and self.clip_grad_norm > 0.0 and self.clip_coef is not None: + grad_fp32.mul_(self.clip_coef) + + beta2_t: float = 1.0 - math.pow( + self.num_steps, self.decay_rate if self.num_steps > 0 else -self.decay_rate + ) + + update = grad_fp32.pow(2).add_(self.eps1) + + if len(p.shape) > 1: + 
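+                        # Factored second moment (Adafactor-style): for 2D parameters, keep only
+                        # row-wise and column-wise EMAs of the squared gradient; approximate_sq_grad()
+                        # below rebuilds the per-element preconditioner from these two vectors,
+                        # which is what keeps the optimizer state memory low.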
self.exp_avg_sq_row[n].mul_(beta2_t).add_(update.mean(dim=-1), alpha=1.0 - beta2_t) + self.exp_avg_sq_col[n].mul_(beta2_t).add_(update.mean(dim=-2), alpha=1.0 - beta2_t) + + self.approximate_sq_grad(self.exp_avg_sq_row[n], self.exp_avg_sq_col[n], update) + update.mul_(grad_fp32) + else: + self.exp_avg_sq[n].mul_(beta2_t).add_(update, alpha=1.0 - beta2_t) + update = self.exp_avg_sq[n].rsqrt().mul_(grad_fp32) + + update.div_((self.get_rms(update) / self.clip_threshold).clamp_(min=1.0)) + + p_fp32 = p.to(torch.float32) + p_rms = torch.norm(p_fp32, 2.0) / math.sqrt(p.numel()) + + lr = self.lr * max(self.eps2, p_rms) + + self.apply_weight_decay( + p, + grad_fp32, + lr, + self.weight_decay, + weight_decouple=True, + fixed_decay=False, + ) + + p_fp32.add_(grad_fp32, alpha=-lr) + p.copy_(p_fp32) + + return x + + return func + + def fuse_update_zero3(self) -> Callable[[Any], Any]: # pragma: no cover + @torch.no_grad() + def func(x: torch.Tensor) -> torch.Tensor: + for n, p in self.model.named_parameters(): + if p.grad is None: + continue + + all_reduce(p.grad, op=ReduceOp.AVG, async_op=False) + + grad_fp32 = p.grad.to(torch.float32) + p.grad = None + + if self.loss_scale: + grad_fp32.div_(self.loss_scale) + + if self.gather_norm: + self.grad_norms.append(torch.norm(grad_fp32, 2.0)) + else: + partition_size: int = p.ds_tensor.numel() + start = partition_size * self.local_rank + end = min(start + partition_size, grad_fp32.numel()) + + if self.clip_grad_value is not None: + grad_fp32.clamp_(min=-self.clip_grad_value, max=self.clip_grad_value) + if self.clip_grad_norm is not None and self.clip_grad_norm > 0 and self.clip_coef is not None: + grad_fp32.mul_(self.clip_coef) + + beta2_t: float = 1.0 - math.pow( + self.num_steps, self.decay_rate if self.num_steps > 0 else -self.decay_rate + ) + + update = grad_fp32.pow(2).add_(self.eps1) + + if len(p.ds_shape) > 1: + self.exp_avg_sq_row[n].mul_(beta2_t).add_(update.mean(dim=-1), alpha=1.0 - beta2_t) + self.exp_avg_sq_col[n].mul_(beta2_t).add_(update.mean(dim=-2), alpha=1.0 - beta2_t) + + self.approximate_sq_grad(self.exp_avg_sq_row[n], self.exp_avg_sq_col[n], update) + update.mul_(grad_fp32) + else: + self.exp_avg_sq[n].mul_(beta2_t).add_(update, alpha=1.0 - beta2_t) + update = self.exp_avg_sq[n].rsqrt().mul_(grad_fp32) + + update.div_((self.get_rms(update) / self.clip_threshold).clamp_(min=1.0)) + + one_dim_update = update.view(-1) + partitioned_update = one_dim_update.narrow(0, start, end - start) + + param_fp32 = p.ds_tensor.to(torch.float32) + partitioned_p = param_fp32.narrow(0, 0, end - start) + + p_rms = torch.norm(partitioned_p, 2.0).pow_(2) + all_reduce(p_rms, op=ReduceOp.SUM) + p_rms.div_(p.ds_numel).sqrt_() + + lr = self.lr * max(self.eps2, p_rms) + + self.apply_weight_decay( + partitioned_p, + grad_fp32, + lr, + self.weight_decay, + weight_decouple=True, + fixed_decay=False, + ) + + partitioned_p.add_(partitioned_update, alpha=-lr) + + p.ds_tensor[: end - start] = partitioned_p + + return x + + return func + + def fused_backward(self, loss, lr: float) -> None: + self.lr = lr + + if self.loss_scale: + loss = loss * self.loss_scale + + self.num_steps += 1 + + loss.backward() + + self.grad_func(0) + + def grad_norm(self, loss) -> None: + self.gather_norm = True + self.grad_norms = [] + + if self.loss_scale: + loss = loss * self.loss_scale + + loss.backward(retain_graph=True) + + self.grad_func(0) + + with torch.no_grad(): + self.grad_norms = torch.stack(self.grad_norms) + + total_norm = torch.norm(self.grad_norms, 2.0) + self.clip_coef = 
torch.clamp(float(self.clip_grad_norm) / (total_norm + 1e-6), max=1.0) + + self.gather_norm = False diff --git a/pytorch_optimizer/optimizer/utils.py b/pytorch_optimizer/optimizer/utils.py index 084f3576a..464cc3e9f 100644 --- a/pytorch_optimizer/optimizer/utils.py +++ b/pytorch_optimizer/optimizer/utils.py @@ -1,4 +1,6 @@ import math +import warnings +from importlib.util import find_spec from typing import Callable, Dict, List, Optional, Tuple, Union import numpy as np @@ -11,6 +13,28 @@ from pytorch_optimizer.base.types import PARAMETERS +HAS_TRANSFORMERS: bool = find_spec('transformers') is not None + +if HAS_TRANSFORMERS: # pragma: no cover + try: + from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled + except ImportError: + from transformers.deepspeed import is_deepspeed_zero3_enabled +else: + + def is_deepspeed_zero3_enabled() -> bool: + r"""Check if DeepSpeed zero3 is enabled.""" + if HAS_TRANSFORMERS: + return is_deepspeed_zero3_enabled() # pragma: no cover + + warnings.warn( + 'you need to install `transformers` to use `is_deepspeed_zero3_enabled` function. it\'ll return False.', + category=ImportWarning, + stacklevel=2, + ) + + return False + def debias_beta(beta: float, step: int) -> float: r"""Apply the Adam-style debias correction into beta. diff --git a/requirements-dev.txt b/requirements-dev.txt index da0c1d41b..e431b1b07 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -3,8 +3,8 @@ black==24.4.2 ; python_version >= "3.8" and python_full_version < "4.0.0" click==8.1.7 ; python_version >= "3.8" and python_full_version < "4.0.0" colorama==0.4.6 ; python_version >= "3.8" and python_full_version < "4.0.0" and (sys_platform == "win32" or platform_system == "Windows") -coverage[toml]==7.5.4 ; python_version >= "3.8" and python_full_version < "4.0.0" -exceptiongroup==1.2.1 ; python_version >= "3.8" and python_version < "3.11" +coverage[toml]==7.6.0 ; python_version >= "3.8" and python_full_version < "4.0.0" +exceptiongroup==1.2.2 ; python_version >= "3.8" and python_version < "3.11" filelock==3.15.4 ; python_version >= "3.8" and python_full_version < "4.0.0" fsspec==2024.6.1 ; python_version >= "3.8" and python_full_version < "4.0.0" iniconfig==2.0.0 ; python_version >= "3.8" and python_full_version < "4.0.0" @@ -24,7 +24,7 @@ pluggy==1.5.0 ; python_version >= "3.8" and python_full_version < "4.0.0" pytest-cov==5.0.0 ; python_version >= "3.8" and python_full_version < "4.0.0" pytest==8.2.2 ; python_version >= "3.8" and python_full_version < "4.0.0" ruff==0.5.1 ; python_version >= "3.8" and python_full_version < "4.0.0" -sympy==1.12.1 ; python_version >= "3.8" and python_full_version < "4.0.0" +sympy==1.13.0 ; python_version >= "3.8" and python_full_version < "4.0.0" tbb==2021.13.0 ; python_version >= "3.8" and python_full_version < "4.0.0" and platform_system == "Windows" tomli==2.0.1 ; python_version >= "3.8" and python_full_version <= "3.11.0a6" torch==2.3.1+cpu ; python_version >= "3.8" and python_full_version < "4.0.0" diff --git a/requirements.txt b/requirements.txt index 2dc1435a9..deac15a7f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,7 +9,7 @@ mkl==2021.4.0 ; python_version >= "3.8" and python_full_version < "4.0.0" and pl mpmath==1.3.0 ; python_version >= "3.8" and python_full_version < "4.0.0" networkx==3.1 ; python_version >= "3.8" and python_full_version < "4.0.0" numpy==1.24.4 ; python_version >= "3.8" and python_full_version < "4.0.0" -sympy==1.12.1 ; python_version >= "3.8" and python_full_version < "4.0.0" 
+sympy==1.13.0 ; python_version >= "3.8" and python_full_version < "4.0.0" tbb==2021.13.0 ; python_version >= "3.8" and python_full_version < "4.0.0" and platform_system == "Windows" torch==2.3.1+cpu ; python_version >= "3.8" and python_full_version < "4.0.0" typing-extensions==4.12.2 ; python_version >= "3.8" and python_full_version < "4.0.0" diff --git a/tests/test_create_optimizer.py b/tests/test_create_optimizer.py index d84589fdf..56d9c5914 100644 --- a/tests/test_create_optimizer.py +++ b/tests/test_create_optimizer.py @@ -14,3 +14,8 @@ def test_create_optimizer(): def test_bnb_optimizer(): with pytest.raises(ImportError): load_optimizer('bnb_adamw8bit') + + +def test_q_galore_optimizer(): + with pytest.raises(ImportError): + load_optimizer('q_galore_adamw8bit') diff --git a/tests/test_general_optimizer_parameters.py b/tests/test_general_optimizer_parameters.py index 58a3c2b5e..800b6e839 100644 --- a/tests/test_general_optimizer_parameters.py +++ b/tests/test_general_optimizer_parameters.py @@ -50,6 +50,7 @@ def test_epsilon(optimizer_name): 'came', 'adalite', 'bsam', + 'adalomo', ): pytest.skip(f'skip {optimizer_name} optimizer') diff --git a/tests/test_gradients.py b/tests/test_gradients.py index 290c628fa..76e5a301f 100644 --- a/tests/test_gradients.py +++ b/tests/test_gradients.py @@ -9,7 +9,7 @@ @pytest.mark.parametrize('optimizer_name', [*VALID_OPTIMIZER_NAMES, 'lookahead']) def test_no_gradients(optimizer_name): - if optimizer_name in {'lomo', 'adammini'}: + if optimizer_name in {'lomo', 'adalomo', 'adammini'}: pytest.skip(f'skip {optimizer_name} optimizer.') p1 = simple_parameter(require_grad=True) @@ -39,7 +39,7 @@ def test_no_gradients(optimizer_name): @pytest.mark.parametrize('no_sparse_optimizer', NO_SPARSE_OPTIMIZERS) def test_sparse_not_supported(no_sparse_optimizer): - if no_sparse_optimizer in ('lomo', 'bsam', 'adammini'): + if no_sparse_optimizer in {'lomo', 'adalomo', 'bsam', 'adammini'}: pytest.skip(f'skip {no_sparse_optimizer} optimizer.') param = simple_sparse_parameter()[1] @@ -113,7 +113,7 @@ def test_sparse_supported(sparse_optimizer): @pytest.mark.parametrize('optimizer_name', VALID_OPTIMIZER_NAMES) def test_bf16_gradient(optimizer_name): - if optimizer_name in ('shampoo', 'lomo', 'bsam', 'adammini'): + if optimizer_name in {'shampoo', 'lomo', 'adalomo', 'bsam', 'adammini'}: pytest.skip(f'skip {optimizer_name}') param = torch.randn(1, 1).bfloat16().requires_grad_(True) diff --git a/tests/test_load_modules.py b/tests/test_load_modules.py index c5831884b..8b5fc4d0d 100644 --- a/tests/test_load_modules.py +++ b/tests/test_load_modules.py @@ -38,7 +38,7 @@ def test_load_lr_scheduler_invalid(invalid_lr_scheduler_names): def test_get_supported_optimizers(): - assert len(get_supported_optimizers()) == 71 + assert len(get_supported_optimizers()) == 72 def test_get_supported_lr_schedulers(): diff --git a/tests/test_optimizers.py b/tests/test_optimizers.py index 07f25c00f..21cdd86fa 100644 --- a/tests/test_optimizers.py +++ b/tests/test_optimizers.py @@ -28,6 +28,7 @@ PULLBACK_MOMENTUM, ) from tests.utils import ( + Example, MultiHeadLogisticRegression, build_environment, dummy_closure, @@ -545,15 +546,16 @@ def test_sm3_rank0(): assert str(optimizer) == 'SM3' -def test_lomo_deepspeed_zero3(environment): +@pytest.mark.parametrize('optimizer_name', ['lomo', 'adalomo']) +def test_lomo_deepspeed_zero3(optimizer_name, environment): _, model, _ = environment model.fc1.weight.__setattr__('ds_tensor', 0) - optimizer = load_optimizer('lomo')(model) + optimizer = 
load_optimizer(optimizer_name)(model) optimizer.reset() - assert str(optimizer) == 'LOMO' + assert str(optimizer).lower() == optimizer_name def test_lomo_clip_grad_norm_with_fp16(environment): @@ -566,34 +568,45 @@ def test_lomo_clip_grad_norm_with_fp16(environment): load_optimizer('lomo')(model, clip_grad_norm=None) -def test_lomo_fused_backward(environment): +@pytest.mark.parametrize('optimizer_name', ['lomo']) +def test_lomo_fused_backward(optimizer_name, environment): _, model, _ = environment - optimizer = load_optimizer('lomo')(model, clip_grad_norm=1.0) + optimizer = load_optimizer(optimizer_name)(model, clip_grad_norm=1.0) with pytest.raises(ValueError): optimizer.fused_backward(loss=0.1, lr=0.1) +@pytest.mark.parametrize('optimizer_name', ['lomo', 'adalomo']) @pytest.mark.parametrize('precision', [16, 32]) -def test_lomo_optimizer(precision, environment): - _, model, _ = environment +def test_lomo_optimizer(optimizer_name, precision): + model = Example() + + model.fc1.bias.data = torch.randn(1, dtype=torch.float32) + model.fc1.bias.grad = torch.randn(1, dtype=torch.float32) if precision == 16: - model.fc1.weight.data = torch.randn(2, 2, dtype=torch.float16) - model.fc1.weight.grad = torch.zeros(2, 2, dtype=torch.float16) + model.fc1.weight.data = torch.randn(1, 1, dtype=torch.float16) + model.fc1.weight.grad = torch.zeros(1, 1, dtype=torch.float16) - optimizer = load_optimizer('lomo')(model, clip_grad_norm=1.0, clip_grad_value=1.0) + optimizer = load_optimizer(optimizer_name)(model, clip_grad_norm=1.0, clip_grad_value=1.0) if precision == 16: optimizer.clip_coef = 0.9 - loss = sphere_loss(next(iter(model.parameters()))) + parameters = iter(model.parameters()) + + loss = sphere_loss(next(parameters)) + optimizer.grad_norm(loss) + optimizer.fused_backward(loss, lr=0.1) + + loss = sphere_loss(next(parameters)) optimizer.grad_norm(loss) optimizer.fused_backward(loss, lr=0.1) def test_dynamic_scaler(): - scaler = DynamicLossScaler(init_scale=2.0**15, scale_window=1, threshold=1e-2) + scaler = DynamicLossScaler(init_scale=2.0 ** 15, scale_window=1, threshold=1e-2) # fmt: skip scaler.decrease_loss_scale() scaler.update_scale(overflow=False)
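A minimal usage sketch tying the pieces of this change together: `AdaLOMO` is registered in `OPTIMIZERS` under its lower-cased class name, while the `q_galore_*` prefix (like `bnb_*`) is resolved lazily and raises `ImportError` unless `q-galore-torch` (or `bitsandbytes`) and CUDA are available, as the tests above assert. The `nn.Linear` toy model and tensor shapes are illustrative only.

```python
import torch
from torch import nn

from pytorch_optimizer import load_optimizer

model = nn.Linear(4, 1)  # illustrative toy model, not part of the package

# AdaLOMO is looked up by its lower-cased class name.
optimizer = load_optimizer('adalomo')(model, clip_grad_norm=1.0)

loss = model(torch.randn(2, 4)).sum()
optimizer.grad_norm(loss)                # first backward pass: gather gradient norms
optimizer.fused_backward(loss, lr=1e-3)  # second backward pass: fused in-place update

# Q-GaLore (and bnb) optimizers need their extra dependency plus CUDA;
# without them, load_optimizer raises ImportError.
try:
    load_optimizer('q_galore_adamw8bit')
except ImportError as exc:
    print(exc)
```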