From 715fda59e02d693689771de72c1ff7192230f134 Mon Sep 17 00:00:00 2001 From: kozistr Date: Sun, 5 May 2024 15:37:30 +0900 Subject: [PATCH 01/28] docs: bSAM optimizer --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index e8f6267dc..a187669ff 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ **pytorch-optimizer** is optimizer & lr scheduler collections in PyTorch. I just re-implemented (speed & memory tweaks, plug-ins) the algorithm while based on the original paper. Also, It includes useful and practical optimization ideas. -Currently, **64 optimizers (+ `bitsandbytes`)**, **11 lr schedulers**, and **13 loss functions** are supported! +Currently, **65 optimizers (+ `bitsandbytes`)**, **11 lr schedulers**, and **13 loss functions** are supported! Highly inspired by [pytorch-optimizer](https://github.com/jettify/pytorch-optimizer). @@ -162,6 +162,7 @@ supported_optimizers = get_supported_optimizers() | Aida | *A DNN Optimizer that Improves over AdaBelief by Suppression of the Adaptive Stepsize Range* | [github](https://github.com/guoqiang-zhang-x/Aida-Optimizer) | | [cite](https://github.com/guoqiang-zhang-x/Aida-Optimizer?tab=readme-ov-file#1-brief-description-of-aida) | | GaLore | *Memory-Efficient LLM Training by Gradient Low-Rank Projection* | [github](https://github.com/jiaweizzhao/GaLore) | | [cite](https://github.com/jiaweizzhao/GaLore/tree/master?tab=readme-ov-file#citation) | | Adalite | *Adalite optimizer* | [github](https://github.com/VatsaDev/adalite) | | [cite](https://github.com/VatsaDev/adalite) | +| bSAM | *SAM as an Optimal Relaxation of Bayes* | [github](https://github.com/team-approx-bayes/bayesian-sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv221001620M/exportcitation) | ## Supported LR Scheduler From c9ee9e2a6b8b60e40fcef6e3f78552ffce1b520c Mon Sep 17 00:00:00 2001 From: kozistr Date: Sun, 5 May 2024 15:37:35 +0900 Subject: [PATCH 02/28] docs: bSAM optimizer --- docs/changelogs/v3.0.0.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/changelogs/v3.0.0.md b/docs/changelogs/v3.0.0.md index eb1f6aedc..3e836fff5 100644 --- a/docs/changelogs/v3.0.0.md +++ b/docs/changelogs/v3.0.0.md @@ -13,6 +13,8 @@ Major version is updated! (`v2.12.0` -> `v3.0.0`) (#164) * Implement `GaLore` optimizer. (#224, #228) * [Memory-Efficient LLM Training by Gradient Low-Rank Projection](https://arxiv.org/abs/2403.03507) * Implement `Adalite` optimizer. (#225, #229) +* Implement `bSAM` optimizer. 
(#233) + * [SAM as an Optimal Relaxation of Bayes](https://arxiv.org/abs/2210.01620) ### Fix @@ -35,4 +37,5 @@ thanks to @sdbds, @i404788 ## Diff -[2.12.0...3.0.0](https://github.com/kozistr/pytorch_optimizer/compare/v2.12.0...v3.0.0) +* from the previous major version : [2.0.0...3.0.0](https://github.com/kozistr/pytorch_optimizer/compare/v2.0.0...v3.0.0) +* from the previous version: [2.12.0...3.0.0](https://github.com/kozistr/pytorch_optimizer/compare/v2.12.0...v3.0.0) From 9bfc7e5ea038e64d952d257404ab55bf59731b12 Mon Sep 17 00:00:00 2001 From: kozistr Date: Sun, 5 May 2024 15:38:04 +0900 Subject: [PATCH 03/28] docs: bSAM optimizer --- docs/optimizer.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/optimizer.md b/docs/optimizer.md index 6c08ba865..0ea4a5049 100644 --- a/docs/optimizer.md +++ b/docs/optimizer.md @@ -96,6 +96,10 @@ :docstring: :members: +::: pytorch_optimizer.BSAM + :docstring: + :members: + ::: pytorch_optimizer.CAME :docstring: :members: From 67b702001fc5a1185035b42747547e3562bae1e1 Mon Sep 17 00:00:00 2001 From: kozistr Date: Sun, 5 May 2024 15:38:12 +0900 Subject: [PATCH 04/28] docs: README --- docs/index.md | 150 ++++++++++++++++++++++++++------------------------ 1 file changed, 79 insertions(+), 71 deletions(-) diff --git a/docs/index.md b/docs/index.md index 939258fbf..a187669ff 100644 --- a/docs/index.md +++ b/docs/index.md @@ -10,7 +10,7 @@ **pytorch-optimizer** is optimizer & lr scheduler collections in PyTorch. I just re-implemented (speed & memory tweaks, plug-ins) the algorithm while based on the original paper. Also, It includes useful and practical optimization ideas. -Currently, **60 optimizers (+ `bitsandbytes`)**, **10 lr schedulers**, and **13 loss functions** are supported! +Currently, **65 optimizers (+ `bitsandbytes`)**, **11 lr schedulers**, and **13 loss functions** are supported! Highly inspired by [pytorch-optimizer](https://github.com/jettify/pytorch-optimizer). @@ -27,9 +27,11 @@ So, please double-check the license before using it at your work. $ pip3 install pytorch-optimizer ``` -From `pytorch-optimizer v2.12.0`, you can install and import `bitsandbytes` optimizers. +From `v2.12.0`, you can install and import `bitsandbytes` optimizers. please check [the requirements](https://github.com/TimDettmers/bitsandbytes?tab=readme-ov-file#tldr) before installing it. +From `v3.0.0`, drop `Python 3.7` support. However, you can still use this package with `Python 3.7` by installing with `--ignore-requires-python` option. 
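For example, the override described above would look roughly like this (illustrative command only, assuming a `Python 3.7` interpreter; adjust the package spec to your needs):

```bash
# bypass the package's Requires-Python metadata check (unsupported configuration, use at your own risk)
$ pip install pytorch-optimizer --ignore-requires-python
```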
+ ```bash $ pip install "pytorch-optimizer[bitsandbytes]" ``` @@ -91,71 +93,76 @@ from pytorch_optimizer import get_supported_optimizers supported_optimizers = get_supported_optimizers() ``` -| Optimizer | Description | Official Code | Paper | Citation | -|--------------|---------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------| -| AdaBelief | *Adapting Step-sizes by the Belief in Observed Gradients* | [github](https://github.com/juntang-zhuang/Adabelief-Optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201007468Z/exportcitation) | -| AdaBound | *Adaptive Gradient Methods with Dynamic Bound of Learning Rate* | [github](https://github.com/Luolc/AdaBound/blob/master/adabound/adabound.py) | | [cite](https://github.com/Luolc/AdaBound#citing) | -| AdaHessian | *An Adaptive Second Order Optimizer for Machine Learning* | [github](https://github.com/amirgholami/adahessian) | | [cite](https://github.com/amirgholami/adahessian#citation) | -| AdamD | *Improved bias-correction in Adam* | | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv211010828S/exportcitation) | -| AdamP | *Slowing Down the Slowdown for Momentum Optimizers on Scale-invariant Weights* | [github](https://github.com/clovaai/AdamP) | | [cite](https://github.com/clovaai/AdamP#how-to-cite) | -| diffGrad | *An Optimization Method for Convolutional Neural Networks* | [github](https://github.com/shivram1987/diffGrad) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190911015D/exportcitation) | -| MADGRAD | *A Momentumized, Adaptive, Dual Averaged Gradient Method for Stochastic* | [github](https://github.com/facebookresearch/madgrad) | | [cite](https://github.com/facebookresearch/madgrad#tech-report) | -| RAdam | *On the Variance of the Adaptive Learning Rate and Beyond* | [github](https://github.com/LiyuanLucasLiu/RAdam) | | [cite](https://github.com/LiyuanLucasLiu/RAdam#citation) | -| Ranger | *a synergistic optimizer combining RAdam and LookAhead, and now GC in one optimizer* | [github](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer) | | [cite](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer#citing-this-work) | -| Ranger21 | *a synergistic deep learning optimizer* | [github](https://github.com/lessw2020/Ranger21) | | [cite](https://github.com/lessw2020/Ranger21#referencing-this-work) | -| Lamb | *Large Batch Optimization for Deep Learning* | [github](https://github.com/cybertronai/pytorch-lamb) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190400962Y/exportcitation) | -| Shampoo | *Preconditioned Stochastic Tensor Optimization* | [github](https://github.com/moskomule/shampoo.pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180209568G/exportcitation) | -| Nero | *Learning by Turning: Neural Architecture Aware Optimisation* | [github](https://github.com/jxbz/nero) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210207227L/exportcitation) | -| Adan | *Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models* | [github](https://github.com/sail-sg/Adan) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220806677X/exportcitation) | -| Adai | *Disentangling the Effects of Adaptive Learning Rate and Momentum* | 
[github](https://github.com/zeke-xie/adaptive-inertia-adai) | | [cite](https://github.com/zeke-xie/adaptive-inertia-adai#citing) | -| SAM | *Sharpness-Aware Minimization* | [github](https://github.com/davda54/sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201001412F/exportcitation) | -| ASAM | *Adaptive Sharpness-Aware Minimization* | [github](https://github.com/davda54/sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210211600K/exportcitation) | -| GSAM | *Surrogate Gap Guided Sharpness-Aware Minimization* | [github](https://github.com/juntang-zhuang/GSAM) | | [cite](https://github.com/juntang-zhuang/GSAM#citation) | -| D-Adaptation | *Learning-Rate-Free Learning by D-Adaptation* | [github](https://github.com/facebookresearch/dadaptation) | | [cite](https://ui.adsabs.harvard.edu/abs/2023arXiv230107733D/exportcitation) | -| AdaFactor | *Adaptive Learning Rates with Sublinear Memory Cost* | [github](https://github.com/DeadAt0m/adafactor-pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180404235S/exportcitation) | -| Apollo | *An Adaptive Parameter-wise Diagonal Quasi-Newton Method for Nonconvex Stochastic Optimization* | [github](https://github.com/XuezheMax/apollo) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv200913586M/exportcitation) | -| NovoGrad | *Stochastic Gradient Methods with Layer-wise Adaptive Moments for Training of Deep Networks* | [github](https://github.com/lonePatient/NovoGrad-pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190511286G/exportcitation) | -| Lion | *Symbolic Discovery of Optimization Algorithms* | [github](https://github.com/google/automl/tree/master/lion) | | [cite](https://github.com/google/automl/tree/master/lion#citation) | -| Ali-G | *Adaptive Learning Rates for Interpolation with Gradients* | [github](https://github.com/oval-group/ali-g) | | [cite](https://github.com/oval-group/ali-g#adaptive-learning-rates-for-interpolation-with-gradients) | -| SM3 | *Memory-Efficient Adaptive Optimization* | [github](https://github.com/google-research/google-research/tree/master/sm3) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190111150A/exportcitation) | -| AdaNorm | *Adaptive Gradient Norm Correction based Optimizer for CNNs* | [github](https://github.com/shivram1987/AdaNorm) | | [cite](https://github.com/shivram1987/AdaNorm/tree/main#citation) | -| RotoGrad | *Gradient Homogenization in Multitask Learning* | [github](https://github.com/adrianjav/rotograd) | | [cite](https://github.com/adrianjav/rotograd#citing) | -| A2Grad | *Optimal Adaptive and Accelerated Stochastic Gradient Descent* | [github](https://github.com/severilov/A2Grad_optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv181000553D/exportcitation) | -| AccSGD | *Accelerating Stochastic Gradient Descent For Least Squares Regression* | [github](https://github.com/rahulkidambi/AccSGD) | | [cite](https://github.com/rahulkidambi/AccSGD#citation) | -| SGDW | *Decoupled Weight Decay Regularization* | [github](https://github.com/loshchil/AdamW-and-SGDW) | | [cite](https://github.com/loshchil/AdamW-and-SGDW#contact) | -| ASGD | *Adaptive Gradient Descent without Descent* | [github](https://github.com/ymalitsky/adaptive_GD) | | [cite](https://github.com/ymalitsky/adaptive_GD#reference) | -| Yogi | *Adaptive Methods for Nonconvex Optimization* | | [NIPS 2018](https://papers.nips.cc/paper/8186-adaptive-methods-for-nonconvex-optimization) | 
[cite](https://proceedings.neurips.cc/paper_files/paper/2018/hash/90365351ccc7437a1309dc64e4db32a3-Abstract.html) | -| SWATS | *Improving Generalization Performance by Switching from Adam to SGD* | | | [cite](https://ui.adsabs.harvard.edu/abs/2017arXiv171207628S/exportcitation) | -| Fromage | *On the distance between two neural networks and the stability of learning* | [github](https://github.com/jxbz/fromage) | | [cite](https://github.com/jxbz/fromage#citation) | -| MSVAG | *Dissecting Adam: The Sign, Magnitude and Variance of Stochastic Gradients* | [github](https://github.com/lballes/msvag) | | [cite](https://github.com/lballes/msvag#citation) | -| AdaMod | *An Adaptive and Momental Bound Method for Stochastic Learning* | [github](https://github.com/lancopku/AdaMod) | | [cite](https://github.com/lancopku/AdaMod#citation) | -| AggMo | *Aggregated Momentum: Stability Through Passive Damping* | [github](https://github.com/AtheMathmo/AggMo) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180400325L/exportcitation) | -| QHAdam | *Quasi-hyperbolic momentum and Adam for deep learning* | [github](https://github.com/facebookresearch/qhoptim) | | [cite](https://github.com/facebookresearch/qhoptim#reference) | -| PID | *A PID Controller Approach for Stochastic Optimization of Deep Networks* | [github](https://github.com/tensorboy/PIDOptimizer) | [CVPR 18](http://www4.comp.polyu.edu.hk/~cslzhang/paper/CVPR18_PID.pdf) | [cite](https://github.com/tensorboy/PIDOptimizer#citation) | -| Gravity | *a Kinematic Approach on Optimization in Deep Learning* | [github](https://github.com/dariush-bahrami/gravity.optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210109192B/exportcitation) | -| AdaSmooth | *An Adaptive Learning Rate Method based on Effective Ratio* | | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220400825L/exportcitation) | -| SRMM | *Stochastic regularized majorization-minimization with weakly convex and multi-convex surrogates* | [github](https://github.com/HanbaekLyu/SRMM) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220101652L/exportcitation) | -| AvaGrad | *Domain-independent Dominance of Adaptive Methods* | [github](https://github.com/lolemacs/avagrad) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv191201823S/exportcitation) | -| PCGrad | *Gradient Surgery for Multi-Task Learning* | [github](https://github.com/tianheyu927/PCGrad) | | [cite](https://github.com/tianheyu927/PCGrad#reference) | -| AMSGrad | *On the Convergence of Adam and Beyond* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190409237R/exportcitation) | -| Lookahead | *k steps forward, 1 step back* | [github](https://github.com/pytorch/examples/tree/main/imagenet) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190708610Z/exportcitation) | -| PNM | *Manipulating Stochastic Gradient Noise to Improve Generalization* | [github](https://github.com/zeke-xie/Positive-Negative-Momentum) | | [cite](https://github.com/zeke-xie/Positive-Negative-Momentum#citing) | -| GC | *Gradient Centralization* | [github](https://github.com/Yonghongwei/Gradient-Centralization) | | [cite](https://github.com/Yonghongwei/Gradient-Centralization#citation) | -| AGC | *Adaptive Gradient Clipping* | [github](https://github.com/deepmind/deepmind-research/tree/master/nfnets) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210206171B/exportcitation) | -| Stable WD | *Understanding and Scheduling Weight Decay* | [github](https://github.com/zeke-xie/stable-weight-decay-regularization) | | 
[cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201111152X/exportcitation) | -| Softplus T | *Calibrating the Adaptive Learning Rate to Improve Convergence of ADAM* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190800700T/exportcitation) | -| Un-tuned w/u | *On the adequacy of untuned warmup for adaptive optimization* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv191004209M/exportcitation) | -| Norm Loss | *An efficient yet effective regularization method for deep neural networks* | | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210306583G/exportcitation) | -| AdaShift | *Decorrelation and Convergence of Adaptive Learning Rate Methods* | [github](https://github.com/MichaelKonobeev/adashift) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv181000143Z/exportcitation) | -| AdaDelta | *An Adaptive Learning Rate Method* | | | [cite](https://ui.adsabs.harvard.edu/abs/2012arXiv1212.5701Z/exportcitation) | -| Amos | *An Adam-style Optimizer with Adaptive Weight Decay towards Model-Oriented Scale* | [github](https://github.com/google-research/jestimator) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv221011693T/exportcitation) | -| SignSGD | *Compressed Optimisation for Non-Convex Problems* | [github](https://github.com/jxbz/signSGD) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180204434B/exportcitation) | -| Sophia | *A Scalable Stochastic Second-order Optimizer for Language Model Pre-training* | [github](https://github.com/Liuhong99/Sophia) | | [cite](https://github.com/Liuhong99/Sophia) | -| Prodigy | *An Expeditiously Adaptive Parameter-Free Learner* | [github](https://github.com/konstmish/prodigy) | | [cite](https://github.com/konstmish/prodigy#how-to-cite) | -| PAdam | *Closing the Generalization Gap of Adaptive Gradient Methods in Training Deep Neural Networks* | [github](https://github.com/uclaml/Padam) | | [cite](https://github.com/uclaml/Padam#citation) | -| LOMO | *Full Parameter Fine-tuning for Large Language Models with Limited Resources* | [github](https://github.com/OpenLMLab/LOMO) | | [cite](https://github.com/OpenLMLab/LOMO#citation) | -| Tiger | *A Tight-fisted Optimizer, an optimizer that is extremely budget-conscious* | [github](https://github.com/bojone/tiger) | | [cite](https://github.com/bojone/tiger/blob/main/README_en.md#citation) | -| CAME | *Confidence-guided Adaptive Memory Efficient Optimization* | [github](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/CAME) | | [cite](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/CAME#citation) | +| Optimizer | Description | Official Code | Paper | Citation | +|--------------|---------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------| +| AdaBelief | *Adapting Step-sizes by the Belief in Observed Gradients* | [github](https://github.com/juntang-zhuang/Adabelief-Optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201007468Z/exportcitation) | +| AdaBound | *Adaptive Gradient Methods with Dynamic Bound of Learning Rate* | [github](https://github.com/Luolc/AdaBound/blob/master/adabound/adabound.py) | | [cite](https://github.com/Luolc/AdaBound#citing) | +| AdaHessian | *An 
Adaptive Second Order Optimizer for Machine Learning* | [github](https://github.com/amirgholami/adahessian) | | [cite](https://github.com/amirgholami/adahessian#citation) | +| AdamD | *Improved bias-correction in Adam* | | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv211010828S/exportcitation) | +| AdamP | *Slowing Down the Slowdown for Momentum Optimizers on Scale-invariant Weights* | [github](https://github.com/clovaai/AdamP) | | [cite](https://github.com/clovaai/AdamP#how-to-cite) | +| diffGrad | *An Optimization Method for Convolutional Neural Networks* | [github](https://github.com/shivram1987/diffGrad) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190911015D/exportcitation) | +| MADGRAD | *A Momentumized, Adaptive, Dual Averaged Gradient Method for Stochastic* | [github](https://github.com/facebookresearch/madgrad) | | [cite](https://github.com/facebookresearch/madgrad#tech-report) | +| RAdam | *On the Variance of the Adaptive Learning Rate and Beyond* | [github](https://github.com/LiyuanLucasLiu/RAdam) | | [cite](https://github.com/LiyuanLucasLiu/RAdam#citation) | +| Ranger | *a synergistic optimizer combining RAdam and LookAhead, and now GC in one optimizer* | [github](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer) | | [cite](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer#citing-this-work) | +| Ranger21 | *a synergistic deep learning optimizer* | [github](https://github.com/lessw2020/Ranger21) | | [cite](https://github.com/lessw2020/Ranger21#referencing-this-work) | +| Lamb | *Large Batch Optimization for Deep Learning* | [github](https://github.com/cybertronai/pytorch-lamb) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190400962Y/exportcitation) | +| Shampoo | *Preconditioned Stochastic Tensor Optimization* | [github](https://github.com/moskomule/shampoo.pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180209568G/exportcitation) | +| Nero | *Learning by Turning: Neural Architecture Aware Optimisation* | [github](https://github.com/jxbz/nero) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210207227L/exportcitation) | +| Adan | *Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models* | [github](https://github.com/sail-sg/Adan) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220806677X/exportcitation) | +| Adai | *Disentangling the Effects of Adaptive Learning Rate and Momentum* | [github](https://github.com/zeke-xie/adaptive-inertia-adai) | | [cite](https://github.com/zeke-xie/adaptive-inertia-adai#citing) | +| SAM | *Sharpness-Aware Minimization* | [github](https://github.com/davda54/sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201001412F/exportcitation) | +| ASAM | *Adaptive Sharpness-Aware Minimization* | [github](https://github.com/davda54/sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210211600K/exportcitation) | +| GSAM | *Surrogate Gap Guided Sharpness-Aware Minimization* | [github](https://github.com/juntang-zhuang/GSAM) | | [cite](https://github.com/juntang-zhuang/GSAM#citation) | +| D-Adaptation | *Learning-Rate-Free Learning by D-Adaptation* | [github](https://github.com/facebookresearch/dadaptation) | | [cite](https://ui.adsabs.harvard.edu/abs/2023arXiv230107733D/exportcitation) | +| AdaFactor | *Adaptive Learning Rates with Sublinear Memory Cost* | [github](https://github.com/DeadAt0m/adafactor-pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180404235S/exportcitation) | +| Apollo | *An Adaptive Parameter-wise Diagonal Quasi-Newton Method 
for Nonconvex Stochastic Optimization* | [github](https://github.com/XuezheMax/apollo) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv200913586M/exportcitation) | +| NovoGrad | *Stochastic Gradient Methods with Layer-wise Adaptive Moments for Training of Deep Networks* | [github](https://github.com/lonePatient/NovoGrad-pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190511286G/exportcitation) | +| Lion | *Symbolic Discovery of Optimization Algorithms* | [github](https://github.com/google/automl/tree/master/lion) | | [cite](https://github.com/google/automl/tree/master/lion#citation) | +| Ali-G | *Adaptive Learning Rates for Interpolation with Gradients* | [github](https://github.com/oval-group/ali-g) | | [cite](https://github.com/oval-group/ali-g#adaptive-learning-rates-for-interpolation-with-gradients) | +| SM3 | *Memory-Efficient Adaptive Optimization* | [github](https://github.com/google-research/google-research/tree/master/sm3) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190111150A/exportcitation) | +| AdaNorm | *Adaptive Gradient Norm Correction based Optimizer for CNNs* | [github](https://github.com/shivram1987/AdaNorm) | | [cite](https://github.com/shivram1987/AdaNorm/tree/main#citation) | +| RotoGrad | *Gradient Homogenization in Multitask Learning* | [github](https://github.com/adrianjav/rotograd) | | [cite](https://github.com/adrianjav/rotograd#citing) | +| A2Grad | *Optimal Adaptive and Accelerated Stochastic Gradient Descent* | [github](https://github.com/severilov/A2Grad_optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv181000553D/exportcitation) | +| AccSGD | *Accelerating Stochastic Gradient Descent For Least Squares Regression* | [github](https://github.com/rahulkidambi/AccSGD) | | [cite](https://github.com/rahulkidambi/AccSGD#citation) | +| SGDW | *Decoupled Weight Decay Regularization* | [github](https://github.com/loshchil/AdamW-and-SGDW) | | [cite](https://github.com/loshchil/AdamW-and-SGDW#contact) | +| ASGD | *Adaptive Gradient Descent without Descent* | [github](https://github.com/ymalitsky/adaptive_GD) | | [cite](https://github.com/ymalitsky/adaptive_GD#reference) | +| Yogi | *Adaptive Methods for Nonconvex Optimization* | | [NIPS 2018](https://papers.nips.cc/paper/8186-adaptive-methods-for-nonconvex-optimization) | [cite](https://proceedings.neurips.cc/paper_files/paper/2018/hash/90365351ccc7437a1309dc64e4db32a3-Abstract.html) | +| SWATS | *Improving Generalization Performance by Switching from Adam to SGD* | | | [cite](https://ui.adsabs.harvard.edu/abs/2017arXiv171207628S/exportcitation) | +| Fromage | *On the distance between two neural networks and the stability of learning* | [github](https://github.com/jxbz/fromage) | | [cite](https://github.com/jxbz/fromage#citation) | +| MSVAG | *Dissecting Adam: The Sign, Magnitude and Variance of Stochastic Gradients* | [github](https://github.com/lballes/msvag) | | [cite](https://github.com/lballes/msvag#citation) | +| AdaMod | *An Adaptive and Momental Bound Method for Stochastic Learning* | [github](https://github.com/lancopku/AdaMod) | | [cite](https://github.com/lancopku/AdaMod#citation) | +| AggMo | *Aggregated Momentum: Stability Through Passive Damping* | [github](https://github.com/AtheMathmo/AggMo) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180400325L/exportcitation) | +| QHAdam | *Quasi-hyperbolic momentum and Adam for deep learning* | [github](https://github.com/facebookresearch/qhoptim) | | [cite](https://github.com/facebookresearch/qhoptim#reference) | 
+| PID | *A PID Controller Approach for Stochastic Optimization of Deep Networks* | [github](https://github.com/tensorboy/PIDOptimizer) | [CVPR 18](http://www4.comp.polyu.edu.hk/~cslzhang/paper/CVPR18_PID.pdf) | [cite](https://github.com/tensorboy/PIDOptimizer#citation) | +| Gravity | *a Kinematic Approach on Optimization in Deep Learning* | [github](https://github.com/dariush-bahrami/gravity.optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210109192B/exportcitation) | +| AdaSmooth | *An Adaptive Learning Rate Method based on Effective Ratio* | | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220400825L/exportcitation) | +| SRMM | *Stochastic regularized majorization-minimization with weakly convex and multi-convex surrogates* | [github](https://github.com/HanbaekLyu/SRMM) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220101652L/exportcitation) | +| AvaGrad | *Domain-independent Dominance of Adaptive Methods* | [github](https://github.com/lolemacs/avagrad) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv191201823S/exportcitation) | +| PCGrad | *Gradient Surgery for Multi-Task Learning* | [github](https://github.com/tianheyu927/PCGrad) | | [cite](https://github.com/tianheyu927/PCGrad#reference) | +| AMSGrad | *On the Convergence of Adam and Beyond* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190409237R/exportcitation) | +| Lookahead | *k steps forward, 1 step back* | [github](https://github.com/pytorch/examples/tree/main/imagenet) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190708610Z/exportcitation) | +| PNM | *Manipulating Stochastic Gradient Noise to Improve Generalization* | [github](https://github.com/zeke-xie/Positive-Negative-Momentum) | | [cite](https://github.com/zeke-xie/Positive-Negative-Momentum#citing) | +| GC | *Gradient Centralization* | [github](https://github.com/Yonghongwei/Gradient-Centralization) | | [cite](https://github.com/Yonghongwei/Gradient-Centralization#citation) | +| AGC | *Adaptive Gradient Clipping* | [github](https://github.com/deepmind/deepmind-research/tree/master/nfnets) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210206171B/exportcitation) | +| Stable WD | *Understanding and Scheduling Weight Decay* | [github](https://github.com/zeke-xie/stable-weight-decay-regularization) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201111152X/exportcitation) | +| Softplus T | *Calibrating the Adaptive Learning Rate to Improve Convergence of ADAM* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190800700T/exportcitation) | +| Un-tuned w/u | *On the adequacy of untuned warmup for adaptive optimization* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv191004209M/exportcitation) | +| Norm Loss | *An efficient yet effective regularization method for deep neural networks* | | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210306583G/exportcitation) | +| AdaShift | *Decorrelation and Convergence of Adaptive Learning Rate Methods* | [github](https://github.com/MichaelKonobeev/adashift) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv181000143Z/exportcitation) | +| AdaDelta | *An Adaptive Learning Rate Method* | | | [cite](https://ui.adsabs.harvard.edu/abs/2012arXiv1212.5701Z/exportcitation) | +| Amos | *An Adam-style Optimizer with Adaptive Weight Decay towards Model-Oriented Scale* | [github](https://github.com/google-research/jestimator) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv221011693T/exportcitation) | +| SignSGD | *Compressed Optimisation for Non-Convex 
Problems* | [github](https://github.com/jxbz/signSGD) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180204434B/exportcitation) | +| Sophia | *A Scalable Stochastic Second-order Optimizer for Language Model Pre-training* | [github](https://github.com/Liuhong99/Sophia) | | [cite](https://github.com/Liuhong99/Sophia) | +| Prodigy | *An Expeditiously Adaptive Parameter-Free Learner* | [github](https://github.com/konstmish/prodigy) | | [cite](https://github.com/konstmish/prodigy#how-to-cite) | +| PAdam | *Closing the Generalization Gap of Adaptive Gradient Methods in Training Deep Neural Networks* | [github](https://github.com/uclaml/Padam) | | [cite](https://github.com/uclaml/Padam#citation) | +| LOMO | *Full Parameter Fine-tuning for Large Language Models with Limited Resources* | [github](https://github.com/OpenLMLab/LOMO) | | [cite](https://github.com/OpenLMLab/LOMO#citation) | +| Tiger | *A Tight-fisted Optimizer, an optimizer that is extremely budget-conscious* | [github](https://github.com/bojone/tiger) | | [cite](https://github.com/bojone/tiger/blob/main/README_en.md#citation) | +| CAME | *Confidence-guided Adaptive Memory Efficient Optimization* | [github](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/CAME) | | [cite](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/CAME#citation) | +| WSAM | *Sharpness-Aware Minimization Revisited: Weighted Sharpness as a Regularization Term* | [github](https://github.com/intelligent-machine-learning/dlrover/blob/master/atorch/atorch/optimizers/wsam.py) | | [cite](https://github.com/intelligent-machine-learning/dlrover) | +| Aida | *A DNN Optimizer that Improves over AdaBelief by Suppression of the Adaptive Stepsize Range* | [github](https://github.com/guoqiang-zhang-x/Aida-Optimizer) | | [cite](https://github.com/guoqiang-zhang-x/Aida-Optimizer?tab=readme-ov-file#1-brief-description-of-aida) | +| GaLore | *Memory-Efficient LLM Training by Gradient Low-Rank Projection* | [github](https://github.com/jiaweizzhao/GaLore) | | [cite](https://github.com/jiaweizzhao/GaLore/tree/master?tab=readme-ov-file#citation) | +| Adalite | *Adalite optimizer* | [github](https://github.com/VatsaDev/adalite) | | [cite](https://github.com/VatsaDev/adalite) | +| bSAM | *SAM as an Optimal Relaxation of Bayes* | [github](https://github.com/team-approx-bayes/bayesian-sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv221001620M/exportcitation) | ## Supported LR Scheduler @@ -167,10 +174,11 @@ from pytorch_optimizer import get_supported_lr_schedulers supported_lr_schedulers = get_supported_lr_schedulers() ``` -| LR Scheduler | Description | Official Code | Paper | Citation | -|-----------------|---------------------------------------------------------------------------------|---------------|------------------------------------|------------------------------------------------------------------------------| -| Explore-Exploit | *Wide-minima Density Hypothesis and the Explore-Exploit Learning Rate Schedule* | | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv200303977I/exportcitation) | -| Chebyshev | *Acceleration via Fractal Learning Rate Schedules* | | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210301338A/exportcitation) | +| LR Scheduler | Description | Official Code | Paper | Citation | 
+|-----------------|---------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------|------------------------------------|------------------------------------------------------------------------------| +| Explore-Exploit | *Wide-minima Density Hypothesis and the Explore-Exploit Learning Rate Schedule* | | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv200303977I/exportcitation) | +| Chebyshev | *Acceleration via Fractal Learning Rate Schedules* | | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210301338A/exportcitation) | +| REX | *Revisiting Budgeted Training with an Improved Schedule* | [github](https://github.com/Nerogar/OneTrainer/blob/2c6f34ea0838e5a86774a1cf75093d7e97c70f03/modules/util/lr_scheduler_util.py#L66) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210704197C/exportcitation) | ## Supported Loss Function From b289288cff8aa24f5db20fc681771c1c170c0f81 Mon Sep 17 00:00:00 2001 From: kozistr Date: Sun, 5 May 2024 15:38:20 +0900 Subject: [PATCH 05/28] feature: implement bSAM optimizer --- pytorch_optimizer/optimizer/sam.py | 186 +++++++++++++++++++++++++++-- 1 file changed, 174 insertions(+), 12 deletions(-) diff --git a/pytorch_optimizer/optimizer/sam.py b/pytorch_optimizer/optimizer/sam.py index 4e919952e..22e2a9041 100644 --- a/pytorch_optimizer/optimizer/sam.py +++ b/pytorch_optimizer/optimizer/sam.py @@ -10,7 +10,7 @@ from pytorch_optimizer.base.exception import NoClosureError from pytorch_optimizer.base.optimizer import BaseOptimizer -from pytorch_optimizer.base.types import CLOSURE, DEFAULTS, OPTIMIZER, PARAMETERS +from pytorch_optimizer.base.types import BETAS, CLOSURE, DEFAULTS, OPTIMIZER, PARAMETERS from pytorch_optimizer.optimizer.utils import disable_running_stats, enable_running_stats @@ -58,6 +58,7 @@ def closure(): :param base_optimizer: Optimizer. base optimizer. :param rho: float. size of the neighborhood for computing the max loss. :param adaptive: bool. element-wise Adaptive SAM. + :param perturb_eps: float. eps for perturbation. :param kwargs: Dict. parameters for optimizer. 
""" @@ -67,9 +68,13 @@ def __init__( base_optimizer: OPTIMIZER, rho: float = 0.05, adaptive: bool = False, + perturb_eps: float = 1e-12, **kwargs, ): self.validate_non_negative(rho, 'rho') + self.validate_non_negative(perturb_eps, 'perturb_eps') + + self.perturb_eps = perturb_eps defaults: DEFAULTS = {'rho': rho, 'adaptive': adaptive} defaults.update(kwargs) @@ -89,7 +94,7 @@ def reset(self): def first_step(self, zero_grad: bool = False): grad_norm = self.grad_norm() for group in self.param_groups: - scale = group['rho'] / (grad_norm + 1e-12) + scale = group['rho'] / (grad_norm + self.perturb_eps) for p in group['params']: if p.grad is None: @@ -98,7 +103,6 @@ def first_step(self, zero_grad: bool = False): self.state[p]['old_p'] = p.clone() e_w = (torch.pow(p, 2) if group['adaptive'] else 1.0) * p.grad * scale.to(p) - # climb to the local maximum "w + e(w)" p.add_(e_w) if zero_grad: @@ -111,10 +115,8 @@ def second_step(self, zero_grad: bool = False): if p.grad is None: continue - # get back to "w" from "w + e(w)" p.data = self.state[p]['old_p'] - # do the actual "sharpness-aware" update self.base_optimizer.step() if zero_grad: @@ -127,14 +129,12 @@ def step(self, closure: CLOSURE = None): self.first_step(zero_grad=True) - # the closure should do a full forward-backward pass with torch.enable_grad(): closure() self.second_step() def grad_norm(self) -> torch.Tensor: - # put everything on the same device, in case of model parallelism shared_device = self.param_groups[0]['params'][0].device return torch.norm( torch.stack( @@ -248,7 +248,8 @@ def perturb_weights(self, rho: float): self.state[p]['old_g'] = p.grad.clone() e_w = (torch.pow(p, 2) if self.adaptive else 1.0) * p.grad * scale.to(p) - p.add_(e_w) # climb to the local maximum "w + e(w)" + + p.add_(e_w) self.state[p]['e_w'] = e_w @@ -274,7 +275,6 @@ def gradient_decompose(self, alpha: float = 0.0): cosine = inner_prod / (new_grad_norm * old_grad_norm + self.perturb_eps) - # gradient decomposition for group in self.param_groups: for p in group['params']: if p.grad is None: @@ -408,6 +408,7 @@ def __init__( defaults: DEFAULTS = {'rho': rho, 'alpha': alpha, 'adaptive': adaptive, 'sam_eps': eps} defaults.update(kwargs) + super().__init__(params, defaults) self.base_optimizer = base_optimizer(self.param_groups, **kwargs) @@ -432,7 +433,6 @@ def first_step(self, zero_grad: bool = False): e_w = (torch.pow(p, 2) if group['adaptive'] else 1.0) * p.grad * scale.to(p) - # climb to the local maximum "w + e(w)" p.add_(e_w) self.state[p]['e_w'] = e_w @@ -460,7 +460,6 @@ def second_step(self, zero_grad: bool = False): if is_initialized(): # pragma: no cover all_reduce(p.grad, ReduceOp.AVG) - # get back to "w" from "w + e(w)" p.add_(self.state[p]['e_w'], alpha=-1.0) if self.max_norm is not None: @@ -477,7 +476,6 @@ def second_step(self, zero_grad: bool = False): self.state[p]['sharpness'] = p.grad.clone() - self.state[p]['grad'] p.grad.mul_(0.0).add_(self.state[p]['grad'], alpha=1.0) - # do the actual "sharpness-aware" update self.base_optimizer.step() if self.decouple: @@ -500,16 +498,19 @@ def step(self, closure: CLOSURE = None): enable_running_stats(self.model) loss = closure() + self.first_step(zero_grad=True) disable_running_stats(self.model) closure() + self.second_step() return loss def grad_norm(self) -> torch.Tensor: shared_device = self.param_groups[0]['params'][0].device + return torch.norm( torch.stack( [ @@ -525,3 +526,164 @@ def grad_norm(self) -> torch.Tensor: def load_state_dict(self, state_dict: Dict): 
super().load_state_dict(state_dict) self.base_optimizer.param_groups = self.param_groups + + +class BSAM(Optimizer, BaseOptimizer): + r"""SAM as an Optimal Relaxation of Bayes. + + Example: + ------- + Here's an example:: + + model = YourModel() + optimizer = BSAM(model.parameters(), ...) + + for input, output in data: + # first forward-backward pass + + loss = loss_function(output, model(input)) + loss.backward() + optimizer.step(zero_grad=True) + + # second forward-backward pass + # make sure to do a full forward pass + loss_function(output, model(input)).backward() + optimizer.second_step(zero_grad=True) + + # third forward-backward pass + # make sure to do a full forward pass + loss_function(output, model(input)).backward() + optimizer.second_step(zero_grad=True) + + Alternative example with a single closure-based step function:: + + model = YourModel() + optimizer = BSAM(model.parameters(), ...) + + def closure(): + loss = loss_function(output, model(input)) + loss.backward() + return loss + + for input, output in data: + loss = loss_function(output, model(input)) + loss.backward() + optimizer.step(closure) + optimizer.zero_grad() + + :param params: PARAMETERS. iterable of parameters to optimize or dicts defining parameter groups. + :param num_data: int. number of training data. + :param lr: float. learning rate. + :param betas: BETAS. coefficients used for computing running averages of gradient and the squared hessian trace. + :param weight_decay: float. weight decay (L2 penalty). + :param rho: float. size of the neighborhood for computing the max loss. + :param adaptive: bool. element-wise Adaptive SAM. + :param damping: float. damping to stabilize the method. + :param kwargs: Dict. parameters for optimizer. + """ + + def __init__( + self, + params: PARAMETERS, + num_data: int, + lr: float = 5e-1, + betas: BETAS = (0.9, 0.999), + weight_decay: float = 1e-4, + rho: float = 0.05, + adaptive: bool = False, + damping: float = 0.1, + **kwargs, + ): + self.validate_learning_rate(lr) + self.validate_betas(betas) + self.validate_non_negative(weight_decay, 'weight_decay') + self.validate_non_negative(rho, 'rho') + self.validate_non_negative(num_data, 'num_data') + self.validate_non_negative(damping, 'damping') + + self.num_data = num_data + self.damping = damping + + defaults: DEFAULTS = {'lr': lr, 'betas': betas, 'weight_decay': weight_decay, 'rho': rho, 'adaptive': adaptive} + defaults.update(kwargs) + super().__init__(params, defaults) + + def __str__(self) -> str: + return 'bSAM' + + @torch.no_grad() + def reset(self): + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + + state = self.state[p] + + state['s'] = torch.ones_like(p) + state['noisy_gradient'] = torch.zeros_like(p.grad) + state['momentum'] = torch.zeros_like(p) + + @torch.no_grad() + def first_step(self): + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + + state = self.state[p] + + noise = torch.normal(0.0, 1 / (self.num_data * state['s'])) + + p.add_(noise) + + @torch.no_grad() + def second_step(self): + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + + state = self.state[p] + + state['noisy_gradient'] = p.grad.clone() + + e_w = (torch.pow(p, 2) if group['adaptive'] else 1.0) * group['rho'] * p.grad / state['s'] + + p.add_(e_w) + + @torch.no_grad() + def third_step(self): + for group in self.param_groups: + beta1, beta2 = group['betas'] + weight_decay = group['weight_decay'] + for p in 
group['params']: + if p.grad is None: + continue + + state = self.state[p] + + momentum, s = state['momentum'], state['s'] + momentum.mul_(beta1).add_(p.grad * weight_decay, alpha=1.0 - beta1) + + var = (torch.sqrt(s).mul_(p.grad.abs()).add_(weight_decay + self.damping)).pow_(2) + s.mul_(beta2).add_(var, alpha=1.0 - beta2) + + p.add_(momentum / s, alpha=-group['lr']) + + @torch.no_grad() + def step(self, closure: CLOSURE = None): + if closure is None: + raise NoClosureError(str(self)) + + self.first_step() + + with torch.enable_grad(): + closure() + + self.second_step() + + with torch.enable_grad(): + closure() + + self.third_step() From 61385bca41915ef0b349b06909d45c1931c361c8 Mon Sep 17 00:00:00 2001 From: kozistr Date: Sun, 5 May 2024 15:38:30 +0900 Subject: [PATCH 06/28] build(deps): packages --- poetry.lock | 301 +++++++++++++++++++++++++------------------ pyproject.toml | 6 +- requirements-dev.txt | 21 +-- requirements.txt | 7 +- 4 files changed, 194 insertions(+), 141 deletions(-) diff --git a/poetry.lock b/poetry.lock index f8cc69312..b63cdbb3b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,14 +1,14 @@ -# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. [[package]] name = "bitsandbytes" -version = "0.43.0" +version = "0.43.1" description = "k-bit optimizers and matrix multiplication routines." optional = true python-versions = "*" files = [ - {file = "bitsandbytes-0.43.0-py3-none-manylinux_2_24_x86_64.whl", hash = "sha256:b2626ada0ae447ae0cf3dd0be8f5b0abad7abdec7056c7fb738aa13a5a862007"}, - {file = "bitsandbytes-0.43.0-py3-none-win_amd64.whl", hash = "sha256:6fa7f3255fe9f3e549fb110bc60794079761a4e608b5fb86ebe7b4047467dd99"}, + {file = "bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl", hash = "sha256:a81c826d576d6d691c7b4a7491c8fdc0f37f769795d6ca2e54afa605d2c260a3"}, + {file = "bitsandbytes-0.43.1-py3-none-win_amd64.whl", hash = "sha256:52c1c7189a6ca006555a9663e544e75f40520a97a26e075411f9f9aca0771fcd"}, ] [package.dependencies] @@ -21,33 +21,33 @@ test = ["scipy"] [[package]] name = "black" -version = "24.3.0" +version = "24.4.2" description = "The uncompromising code formatter." 
optional = false python-versions = ">=3.8" files = [ - {file = "black-24.3.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7d5e026f8da0322b5662fa7a8e752b3fa2dac1c1cbc213c3d7ff9bdd0ab12395"}, - {file = "black-24.3.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9f50ea1132e2189d8dff0115ab75b65590a3e97de1e143795adb4ce317934995"}, - {file = "black-24.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e2af80566f43c85f5797365077fb64a393861a3730bd110971ab7a0c94e873e7"}, - {file = "black-24.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:4be5bb28e090456adfc1255e03967fb67ca846a03be7aadf6249096100ee32d0"}, - {file = "black-24.3.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4f1373a7808a8f135b774039f61d59e4be7eb56b2513d3d2f02a8b9365b8a8a9"}, - {file = "black-24.3.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:aadf7a02d947936ee418777e0247ea114f78aff0d0959461057cae8a04f20597"}, - {file = "black-24.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:65c02e4ea2ae09d16314d30912a58ada9a5c4fdfedf9512d23326128ac08ac3d"}, - {file = "black-24.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:bf21b7b230718a5f08bd32d5e4f1db7fc8788345c8aea1d155fc17852b3410f5"}, - {file = "black-24.3.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:2818cf72dfd5d289e48f37ccfa08b460bf469e67fb7c4abb07edc2e9f16fb63f"}, - {file = "black-24.3.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4acf672def7eb1725f41f38bf6bf425c8237248bb0804faa3965c036f7672d11"}, - {file = "black-24.3.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c7ed6668cbbfcd231fa0dc1b137d3e40c04c7f786e626b405c62bcd5db5857e4"}, - {file = "black-24.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:56f52cfbd3dabe2798d76dbdd299faa046a901041faf2cf33288bc4e6dae57b5"}, - {file = "black-24.3.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:79dcf34b33e38ed1b17434693763301d7ccbd1c5860674a8f871bd15139e7837"}, - {file = "black-24.3.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e19cb1c6365fd6dc38a6eae2dcb691d7d83935c10215aef8e6c38edee3f77abd"}, - {file = "black-24.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:65b76c275e4c1c5ce6e9870911384bff5ca31ab63d19c76811cb1fb162678213"}, - {file = "black-24.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:b5991d523eee14756f3c8d5df5231550ae8993e2286b8014e2fdea7156ed0959"}, - {file = "black-24.3.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c45f8dff244b3c431b36e3224b6be4a127c6aca780853574c00faf99258041eb"}, - {file = "black-24.3.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:6905238a754ceb7788a73f02b45637d820b2f5478b20fec82ea865e4f5d4d9f7"}, - {file = "black-24.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d7de8d330763c66663661a1ffd432274a2f92f07feeddd89ffd085b5744f85e7"}, - {file = "black-24.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:7bb041dca0d784697af4646d3b62ba4a6b028276ae878e53f6b4f74ddd6db99f"}, - {file = "black-24.3.0-py3-none-any.whl", hash = "sha256:41622020d7120e01d377f74249e677039d20e6344ff5851de8a10f11f513bf93"}, - {file = "black-24.3.0.tar.gz", hash = "sha256:a0c9c4a0771afc6919578cec71ce82a3e31e054904e7197deacbc9382671c41f"}, + {file = "black-24.4.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:dd1b5a14e417189db4c7b64a6540f31730713d173f0b63e55fabd52d61d8fdce"}, + {file = "black-24.4.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8e537d281831ad0e71007dcdcbe50a71470b978c453fa41ce77186bbe0ed6021"}, + {file = 
"black-24.4.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eaea3008c281f1038edb473c1aa8ed8143a5535ff18f978a318f10302b254063"}, + {file = "black-24.4.2-cp310-cp310-win_amd64.whl", hash = "sha256:7768a0dbf16a39aa5e9a3ded568bb545c8c2727396d063bbaf847df05b08cd96"}, + {file = "black-24.4.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:257d724c2c9b1660f353b36c802ccece186a30accc7742c176d29c146df6e474"}, + {file = "black-24.4.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:bdde6f877a18f24844e381d45e9947a49e97933573ac9d4345399be37621e26c"}, + {file = "black-24.4.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e151054aa00bad1f4e1f04919542885f89f5f7d086b8a59e5000e6c616896ffb"}, + {file = "black-24.4.2-cp311-cp311-win_amd64.whl", hash = "sha256:7e122b1c4fb252fd85df3ca93578732b4749d9be076593076ef4d07a0233c3e1"}, + {file = "black-24.4.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:accf49e151c8ed2c0cdc528691838afd217c50412534e876a19270fea1e28e2d"}, + {file = "black-24.4.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:88c57dc656038f1ab9f92b3eb5335ee9b021412feaa46330d5eba4e51fe49b04"}, + {file = "black-24.4.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:be8bef99eb46d5021bf053114442914baeb3649a89dc5f3a555c88737e5e98fc"}, + {file = "black-24.4.2-cp312-cp312-win_amd64.whl", hash = "sha256:415e686e87dbbe6f4cd5ef0fbf764af7b89f9057b97c908742b6008cc554b9c0"}, + {file = "black-24.4.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:bf10f7310db693bb62692609b397e8d67257c55f949abde4c67f9cc574492cc7"}, + {file = "black-24.4.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:98e123f1d5cfd42f886624d84464f7756f60ff6eab89ae845210631714f6db94"}, + {file = "black-24.4.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:48a85f2cb5e6799a9ef05347b476cce6c182d6c71ee36925a6c194d074336ef8"}, + {file = "black-24.4.2-cp38-cp38-win_amd64.whl", hash = "sha256:b1530ae42e9d6d5b670a34db49a94115a64596bc77710b1d05e9801e62ca0a7c"}, + {file = "black-24.4.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:37aae07b029fa0174d39daf02748b379399b909652a806e5708199bd93899da1"}, + {file = "black-24.4.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:da33a1a5e49c4122ccdfd56cd021ff1ebc4a1ec4e2d01594fef9b6f267a9e741"}, + {file = "black-24.4.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ef703f83fc32e131e9bcc0a5094cfe85599e7109f896fe8bc96cc402f3eb4b6e"}, + {file = "black-24.4.2-cp39-cp39-win_amd64.whl", hash = "sha256:b9176b9832e84308818a99a561e90aa479e73c523b3f77afd07913380ae2eab7"}, + {file = "black-24.4.2-py3-none-any.whl", hash = "sha256:d36ed1124bb81b32f8614555b34cc4259c3fbc7eec17870e8ff8ded335b58d8c"}, + {file = "black-24.4.2.tar.gz", hash = "sha256:c872b53057f000085da66a19c55d68f6f8ddcac2642392ad3a355878406fbd4d"}, ] [package.dependencies] @@ -92,63 +92,63 @@ files = [ [[package]] name = "coverage" -version = "7.4.4" +version = "7.5.1" description = "Code coverage measurement for Python" optional = false python-versions = ">=3.8" files = [ - {file = "coverage-7.4.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e0be5efd5127542ef31f165de269f77560d6cdef525fffa446de6f7e9186cfb2"}, - {file = "coverage-7.4.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ccd341521be3d1b3daeb41960ae94a5e87abe2f46f17224ba5d6f2b8398016cf"}, - {file = "coverage-7.4.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:09fa497a8ab37784fbb20ab699c246053ac294d13fc7eb40ec007a5043ec91f8"}, - {file = "coverage-7.4.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b1a93009cb80730c9bca5d6d4665494b725b6e8e157c1cb7f2db5b4b122ea562"}, - {file = "coverage-7.4.4-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:690db6517f09336559dc0b5f55342df62370a48f5469fabf502db2c6d1cffcd2"}, - {file = "coverage-7.4.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:09c3255458533cb76ef55da8cc49ffab9e33f083739c8bd4f58e79fecfe288f7"}, - {file = "coverage-7.4.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:8ce1415194b4a6bd0cdcc3a1dfbf58b63f910dcb7330fe15bdff542c56949f87"}, - {file = "coverage-7.4.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:b91cbc4b195444e7e258ba27ac33769c41b94967919f10037e6355e998af255c"}, - {file = "coverage-7.4.4-cp310-cp310-win32.whl", hash = "sha256:598825b51b81c808cb6f078dcb972f96af96b078faa47af7dfcdf282835baa8d"}, - {file = "coverage-7.4.4-cp310-cp310-win_amd64.whl", hash = "sha256:09ef9199ed6653989ebbcaacc9b62b514bb63ea2f90256e71fea3ed74bd8ff6f"}, - {file = "coverage-7.4.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0f9f50e7ef2a71e2fae92774c99170eb8304e3fdf9c8c3c7ae9bab3e7229c5cf"}, - {file = "coverage-7.4.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:623512f8ba53c422fcfb2ce68362c97945095b864cda94a92edbaf5994201083"}, - {file = "coverage-7.4.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0513b9508b93da4e1716744ef6ebc507aff016ba115ffe8ecff744d1322a7b63"}, - {file = "coverage-7.4.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:40209e141059b9370a2657c9b15607815359ab3ef9918f0196b6fccce8d3230f"}, - {file = "coverage-7.4.4-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8a2b2b78c78293782fd3767d53e6474582f62443d0504b1554370bde86cc8227"}, - {file = "coverage-7.4.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:73bfb9c09951125d06ee473bed216e2c3742f530fc5acc1383883125de76d9cd"}, - {file = "coverage-7.4.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:1f384c3cc76aeedce208643697fb3e8437604b512255de6d18dae3f27655a384"}, - {file = "coverage-7.4.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:54eb8d1bf7cacfbf2a3186019bcf01d11c666bd495ed18717162f7eb1e9dd00b"}, - {file = "coverage-7.4.4-cp311-cp311-win32.whl", hash = "sha256:cac99918c7bba15302a2d81f0312c08054a3359eaa1929c7e4b26ebe41e9b286"}, - {file = "coverage-7.4.4-cp311-cp311-win_amd64.whl", hash = "sha256:b14706df8b2de49869ae03a5ccbc211f4041750cd4a66f698df89d44f4bd30ec"}, - {file = "coverage-7.4.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:201bef2eea65e0e9c56343115ba3814e896afe6d36ffd37bab783261db430f76"}, - {file = "coverage-7.4.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:41c9c5f3de16b903b610d09650e5e27adbfa7f500302718c9ffd1c12cf9d6818"}, - {file = "coverage-7.4.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d898fe162d26929b5960e4e138651f7427048e72c853607f2b200909794ed978"}, - {file = "coverage-7.4.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3ea79bb50e805cd6ac058dfa3b5c8f6c040cb87fe83de10845857f5535d1db70"}, - {file = 
"coverage-7.4.4-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ce4b94265ca988c3f8e479e741693d143026632672e3ff924f25fab50518dd51"}, - {file = "coverage-7.4.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:00838a35b882694afda09f85e469c96367daa3f3f2b097d846a7216993d37f4c"}, - {file = "coverage-7.4.4-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:fdfafb32984684eb03c2d83e1e51f64f0906b11e64482df3c5db936ce3839d48"}, - {file = "coverage-7.4.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:69eb372f7e2ece89f14751fbcbe470295d73ed41ecd37ca36ed2eb47512a6ab9"}, - {file = "coverage-7.4.4-cp312-cp312-win32.whl", hash = "sha256:137eb07173141545e07403cca94ab625cc1cc6bc4c1e97b6e3846270e7e1fea0"}, - {file = "coverage-7.4.4-cp312-cp312-win_amd64.whl", hash = "sha256:d71eec7d83298f1af3326ce0ff1d0ea83c7cb98f72b577097f9083b20bdaf05e"}, - {file = "coverage-7.4.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:d5ae728ff3b5401cc320d792866987e7e7e880e6ebd24433b70a33b643bb0384"}, - {file = "coverage-7.4.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:cc4f1358cb0c78edef3ed237ef2c86056206bb8d9140e73b6b89fbcfcbdd40e1"}, - {file = "coverage-7.4.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8130a2aa2acb8788e0b56938786c33c7c98562697bf9f4c7d6e8e5e3a0501e4a"}, - {file = "coverage-7.4.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cf271892d13e43bc2b51e6908ec9a6a5094a4df1d8af0bfc360088ee6c684409"}, - {file = "coverage-7.4.4-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a4cdc86d54b5da0df6d3d3a2f0b710949286094c3a6700c21e9015932b81447e"}, - {file = "coverage-7.4.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:ae71e7ddb7a413dd60052e90528f2f65270aad4b509563af6d03d53e979feafd"}, - {file = "coverage-7.4.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:38dd60d7bf242c4ed5b38e094baf6401faa114fc09e9e6632374388a404f98e7"}, - {file = "coverage-7.4.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:aa5b1c1bfc28384f1f53b69a023d789f72b2e0ab1b3787aae16992a7ca21056c"}, - {file = "coverage-7.4.4-cp38-cp38-win32.whl", hash = "sha256:dfa8fe35a0bb90382837b238fff375de15f0dcdb9ae68ff85f7a63649c98527e"}, - {file = "coverage-7.4.4-cp38-cp38-win_amd64.whl", hash = "sha256:b2991665420a803495e0b90a79233c1433d6ed77ef282e8e152a324bbbc5e0c8"}, - {file = "coverage-7.4.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3b799445b9f7ee8bf299cfaed6f5b226c0037b74886a4e11515e569b36fe310d"}, - {file = "coverage-7.4.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b4d33f418f46362995f1e9d4f3a35a1b6322cb959c31d88ae56b0298e1c22357"}, - {file = "coverage-7.4.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aadacf9a2f407a4688d700e4ebab33a7e2e408f2ca04dbf4aef17585389eff3e"}, - {file = "coverage-7.4.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7c95949560050d04d46b919301826525597f07b33beba6187d04fa64d47ac82e"}, - {file = "coverage-7.4.4-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ff7687ca3d7028d8a5f0ebae95a6e4827c5616b31a4ee1192bdfde697db110d4"}, - {file = "coverage-7.4.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:5fc1de20b2d4a061b3df27ab9b7c7111e9a710f10dc2b84d33a4ab25065994ec"}, - {file = "coverage-7.4.4-cp39-cp39-musllinux_1_1_i686.whl", hash 
= "sha256:c74880fc64d4958159fbd537a091d2a585448a8f8508bf248d72112723974cbd"}, - {file = "coverage-7.4.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:742a76a12aa45b44d236815d282b03cfb1de3b4323f3e4ec933acfae08e54ade"}, - {file = "coverage-7.4.4-cp39-cp39-win32.whl", hash = "sha256:d89d7b2974cae412400e88f35d86af72208e1ede1a541954af5d944a8ba46c57"}, - {file = "coverage-7.4.4-cp39-cp39-win_amd64.whl", hash = "sha256:9ca28a302acb19b6af89e90f33ee3e1906961f94b54ea37de6737b7ca9d8827c"}, - {file = "coverage-7.4.4-pp38.pp39.pp310-none-any.whl", hash = "sha256:b2c5edc4ac10a7ef6605a966c58929ec6c1bd0917fb8c15cb3363f65aa40e677"}, - {file = "coverage-7.4.4.tar.gz", hash = "sha256:c901df83d097649e257e803be22592aedfd5182f07b3cc87d640bbb9afd50f49"}, + {file = "coverage-7.5.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c0884920835a033b78d1c73b6d3bbcda8161a900f38a488829a83982925f6c2e"}, + {file = "coverage-7.5.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:39afcd3d4339329c5f58de48a52f6e4e50f6578dd6099961cf22228feb25f38f"}, + {file = "coverage-7.5.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4a7b0ceee8147444347da6a66be737c9d78f3353b0681715b668b72e79203e4a"}, + {file = "coverage-7.5.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4a9ca3f2fae0088c3c71d743d85404cec8df9be818a005ea065495bedc33da35"}, + {file = "coverage-7.5.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5fd215c0c7d7aab005221608a3c2b46f58c0285a819565887ee0b718c052aa4e"}, + {file = "coverage-7.5.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:4bf0655ab60d754491004a5efd7f9cccefcc1081a74c9ef2da4735d6ee4a6223"}, + {file = "coverage-7.5.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:61c4bf1ba021817de12b813338c9be9f0ad5b1e781b9b340a6d29fc13e7c1b5e"}, + {file = "coverage-7.5.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:db66fc317a046556a96b453a58eced5024af4582a8dbdc0c23ca4dbc0d5b3146"}, + {file = "coverage-7.5.1-cp310-cp310-win32.whl", hash = "sha256:b016ea6b959d3b9556cb401c55a37547135a587db0115635a443b2ce8f1c7228"}, + {file = "coverage-7.5.1-cp310-cp310-win_amd64.whl", hash = "sha256:df4e745a81c110e7446b1cc8131bf986157770fa405fe90e15e850aaf7619bc8"}, + {file = "coverage-7.5.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:796a79f63eca8814ca3317a1ea443645c9ff0d18b188de470ed7ccd45ae79428"}, + {file = "coverage-7.5.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4fc84a37bfd98db31beae3c2748811a3fa72bf2007ff7902f68746d9757f3746"}, + {file = "coverage-7.5.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6175d1a0559986c6ee3f7fccfc4a90ecd12ba0a383dcc2da30c2b9918d67d8a3"}, + {file = "coverage-7.5.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1fc81d5878cd6274ce971e0a3a18a8803c3fe25457165314271cf78e3aae3aa2"}, + {file = "coverage-7.5.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:556cf1a7cbc8028cb60e1ff0be806be2eded2daf8129b8811c63e2b9a6c43bca"}, + {file = "coverage-7.5.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:9981706d300c18d8b220995ad22627647be11a4276721c10911e0e9fa44c83e8"}, + {file = "coverage-7.5.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:d7fed867ee50edf1a0b4a11e8e5d0895150e572af1cd6d315d557758bfa9c057"}, + {file = 
"coverage-7.5.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:ef48e2707fb320c8f139424a596f5b69955a85b178f15af261bab871873bb987"}, + {file = "coverage-7.5.1-cp311-cp311-win32.whl", hash = "sha256:9314d5678dcc665330df5b69c1e726a0e49b27df0461c08ca12674bcc19ef136"}, + {file = "coverage-7.5.1-cp311-cp311-win_amd64.whl", hash = "sha256:5fa567e99765fe98f4e7d7394ce623e794d7cabb170f2ca2ac5a4174437e90dd"}, + {file = "coverage-7.5.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b6cf3764c030e5338e7f61f95bd21147963cf6aa16e09d2f74f1fa52013c1206"}, + {file = "coverage-7.5.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2ec92012fefebee89a6b9c79bc39051a6cb3891d562b9270ab10ecfdadbc0c34"}, + {file = "coverage-7.5.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:16db7f26000a07efcf6aea00316f6ac57e7d9a96501e990a36f40c965ec7a95d"}, + {file = "coverage-7.5.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:beccf7b8a10b09c4ae543582c1319c6df47d78fd732f854ac68d518ee1fb97fa"}, + {file = "coverage-7.5.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8748731ad392d736cc9ccac03c9845b13bb07d020a33423fa5b3a36521ac6e4e"}, + {file = "coverage-7.5.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:7352b9161b33fd0b643ccd1f21f3a3908daaddf414f1c6cb9d3a2fd618bf2572"}, + {file = "coverage-7.5.1-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:7a588d39e0925f6a2bff87154752481273cdb1736270642aeb3635cb9b4cad07"}, + {file = "coverage-7.5.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:68f962d9b72ce69ea8621f57551b2fa9c70509af757ee3b8105d4f51b92b41a7"}, + {file = "coverage-7.5.1-cp312-cp312-win32.whl", hash = "sha256:f152cbf5b88aaeb836127d920dd0f5e7edff5a66f10c079157306c4343d86c19"}, + {file = "coverage-7.5.1-cp312-cp312-win_amd64.whl", hash = "sha256:5a5740d1fb60ddf268a3811bcd353de34eb56dc24e8f52a7f05ee513b2d4f596"}, + {file = "coverage-7.5.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:e2213def81a50519d7cc56ed643c9e93e0247f5bbe0d1247d15fa520814a7cd7"}, + {file = "coverage-7.5.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5037f8fcc2a95b1f0e80585bd9d1ec31068a9bcb157d9750a172836e98bc7a90"}, + {file = "coverage-7.5.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5c3721c2c9e4c4953a41a26c14f4cef64330392a6d2d675c8b1db3b645e31f0e"}, + {file = "coverage-7.5.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ca498687ca46a62ae590253fba634a1fe9836bc56f626852fb2720f334c9e4e5"}, + {file = "coverage-7.5.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0cdcbc320b14c3e5877ee79e649677cb7d89ef588852e9583e6b24c2e5072661"}, + {file = "coverage-7.5.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:57e0204b5b745594e5bc14b9b50006da722827f0b8c776949f1135677e88d0b8"}, + {file = "coverage-7.5.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:8fe7502616b67b234482c3ce276ff26f39ffe88adca2acf0261df4b8454668b4"}, + {file = "coverage-7.5.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:9e78295f4144f9dacfed4f92935fbe1780021247c2fabf73a819b17f0ccfff8d"}, + {file = "coverage-7.5.1-cp38-cp38-win32.whl", hash = "sha256:1434e088b41594baa71188a17533083eabf5609e8e72f16ce8c186001e6b8c41"}, + {file = "coverage-7.5.1-cp38-cp38-win_amd64.whl", hash = 
"sha256:0646599e9b139988b63704d704af8e8df7fa4cbc4a1f33df69d97f36cb0a38de"}, + {file = "coverage-7.5.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:4cc37def103a2725bc672f84bd939a6fe4522310503207aae4d56351644682f1"}, + {file = "coverage-7.5.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:fc0b4d8bfeabd25ea75e94632f5b6e047eef8adaed0c2161ada1e922e7f7cece"}, + {file = "coverage-7.5.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0d0a0f5e06881ecedfe6f3dd2f56dcb057b6dbeb3327fd32d4b12854df36bf26"}, + {file = "coverage-7.5.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9735317685ba6ec7e3754798c8871c2f49aa5e687cc794a0b1d284b2389d1bd5"}, + {file = "coverage-7.5.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d21918e9ef11edf36764b93101e2ae8cc82aa5efdc7c5a4e9c6c35a48496d601"}, + {file = "coverage-7.5.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:c3e757949f268364b96ca894b4c342b41dc6f8f8b66c37878aacef5930db61be"}, + {file = "coverage-7.5.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:79afb6197e2f7f60c4824dd4b2d4c2ec5801ceb6ba9ce5d2c3080e5660d51a4f"}, + {file = "coverage-7.5.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:d1d0d98d95dd18fe29dc66808e1accf59f037d5716f86a501fc0256455219668"}, + {file = "coverage-7.5.1-cp39-cp39-win32.whl", hash = "sha256:1cc0fe9b0b3a8364093c53b0b4c0c2dd4bb23acbec4c9240b5f284095ccf7981"}, + {file = "coverage-7.5.1-cp39-cp39-win_amd64.whl", hash = "sha256:dde0070c40ea8bb3641e811c1cfbf18e265d024deff6de52c5950677a8fb1e0f"}, + {file = "coverage-7.5.1-pp38.pp39.pp310-none-any.whl", hash = "sha256:6537e7c10cc47c595828b8a8be04c72144725c383c4702703ff4e42e44577312"}, + {file = "coverage-7.5.1.tar.gz", hash = "sha256:54de9ef3a9da981f7af93eafde4ede199e0846cd819eb27c88e2b712aae9708c"}, ] [package.dependencies] @@ -159,13 +159,13 @@ toml = ["tomli"] [[package]] name = "exceptiongroup" -version = "1.2.0" +version = "1.2.1" description = "Backport of PEP 654 (exception groups)" optional = false python-versions = ">=3.7" files = [ - {file = "exceptiongroup-1.2.0-py3-none-any.whl", hash = "sha256:4bfd3996ac73b41e9b9628b04e079f193850720ea5945fc96a08633c66912f14"}, - {file = "exceptiongroup-1.2.0.tar.gz", hash = "sha256:91f5c769735f051a4290d52edd0858999b57e5876e9f85937691bd4c9fa3ed68"}, + {file = "exceptiongroup-1.2.1-py3-none-any.whl", hash = "sha256:5258b9ed329c5bbdd31a309f53cbfb0b155341807f6ff7606a1e801a891b29ad"}, + {file = "exceptiongroup-1.2.1.tar.gz", hash = "sha256:a4785e48b045528f5bfe627b6ad554ff32def154f42372786903b7abcfe1aa16"}, ] [package.extras] @@ -173,13 +173,13 @@ test = ["pytest (>=6)"] [[package]] name = "filelock" -version = "3.13.3" +version = "3.14.0" description = "A platform independent file lock." 
optional = false python-versions = ">=3.8" files = [ - {file = "filelock-3.13.3-py3-none-any.whl", hash = "sha256:5ffa845303983e7a0b7ae17636509bc97997d58afeafa72fb141a17b152284cb"}, - {file = "filelock-3.13.3.tar.gz", hash = "sha256:a79895a25bbefdf55d1a2a0a80968f7dbb28edcd6d4234a0afb3f37ecde4b546"}, + {file = "filelock-3.14.0-py3-none-any.whl", hash = "sha256:43339835842f110ca7ae60f1e1c160714c5a6afd15a2873419ab185334975c0f"}, + {file = "filelock-3.14.0.tar.gz", hash = "sha256:6ea72da3be9b8c82afd3edcf99f2fffbb5076335a5ae4d03248bb5b6c3eae78a"}, ] [package.extras] @@ -233,6 +233,20 @@ files = [ {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, ] +[[package]] +name = "intel-openmp" +version = "2021.4.0" +description = "Intel OpenMP* Runtime Library" +optional = false +python-versions = "*" +files = [ + {file = "intel_openmp-2021.4.0-py2.py3-none-macosx_10_15_x86_64.macosx_11_0_x86_64.whl", hash = "sha256:41c01e266a7fdb631a7609191709322da2bbf24b252ba763f125dd651bcc7675"}, + {file = "intel_openmp-2021.4.0-py2.py3-none-manylinux1_i686.whl", hash = "sha256:3b921236a38384e2016f0f3d65af6732cf2c12918087128a9163225451e776f2"}, + {file = "intel_openmp-2021.4.0-py2.py3-none-manylinux1_x86_64.whl", hash = "sha256:e2240ab8d01472fed04f3544a878cda5da16c26232b7ea1b59132dbfb48b186e"}, + {file = "intel_openmp-2021.4.0-py2.py3-none-win32.whl", hash = "sha256:6e863d8fd3d7e8ef389d52cf97a50fe2afe1a19247e8c0d168ce021546f96fc9"}, + {file = "intel_openmp-2021.4.0-py2.py3-none-win_amd64.whl", hash = "sha256:eef4c8bcc8acefd7f5cd3b9384dbf73d59e2c99fc56545712ded913f43c4a94f"}, +] + [[package]] name = "isort" version = "5.13.2" @@ -333,6 +347,24 @@ files = [ {file = "MarkupSafe-2.1.5.tar.gz", hash = "sha256:d283d37a890ba4c1ae73ffadf8046435c76e7bc2247bbb63c00bd1a709c6544b"}, ] +[[package]] +name = "mkl" +version = "2021.4.0" +description = "Intel® oneAPI Math Kernel Library" +optional = false +python-versions = "*" +files = [ + {file = "mkl-2021.4.0-py2.py3-none-macosx_10_15_x86_64.macosx_11_0_x86_64.whl", hash = "sha256:67460f5cd7e30e405b54d70d1ed3ca78118370b65f7327d495e9c8847705e2fb"}, + {file = "mkl-2021.4.0-py2.py3-none-manylinux1_i686.whl", hash = "sha256:636d07d90e68ccc9630c654d47ce9fdeb036bb46e2b193b3a9ac8cfea683cce5"}, + {file = "mkl-2021.4.0-py2.py3-none-manylinux1_x86_64.whl", hash = "sha256:398dbf2b0d12acaf54117a5210e8f191827f373d362d796091d161f610c1ebfb"}, + {file = "mkl-2021.4.0-py2.py3-none-win32.whl", hash = "sha256:439c640b269a5668134e3dcbcea4350459c4a8bc46469669b2d67e07e3d330e8"}, + {file = "mkl-2021.4.0-py2.py3-none-win_amd64.whl", hash = "sha256:ceef3cafce4c009dd25f65d7ad0d833a0fbadc3d8903991ec92351fe5de1e718"}, +] + +[package.dependencies] +intel-openmp = "==2021.*" +tbb = "==2021.*" + [[package]] name = "mpmath" version = "1.3.0" @@ -440,28 +472,29 @@ files = [ [[package]] name = "platformdirs" -version = "4.2.0" -description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." +version = "4.2.1" +description = "A small Python package for determining appropriate platform-specific dirs, e.g. a `user data dir`." 
optional = false python-versions = ">=3.8" files = [ - {file = "platformdirs-4.2.0-py3-none-any.whl", hash = "sha256:0614df2a2f37e1a662acbd8e2b25b92ccf8632929bc6d43467e17fe89c75e068"}, - {file = "platformdirs-4.2.0.tar.gz", hash = "sha256:ef0cc731df711022c174543cb70a9b5bd22e5a9337c8624ef2c2ceb8ddad8768"}, + {file = "platformdirs-4.2.1-py3-none-any.whl", hash = "sha256:17d5a1161b3fd67b390023cb2d3b026bbd40abde6fdb052dfbd3a29c3ba22ee1"}, + {file = "platformdirs-4.2.1.tar.gz", hash = "sha256:031cd18d4ec63ec53e82dceaac0417d218a6863f7745dfcc9efe7793b7039bdf"}, ] [package.extras] docs = ["furo (>=2023.9.10)", "proselint (>=0.13)", "sphinx (>=7.2.6)", "sphinx-autodoc-typehints (>=1.25.2)"] test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.4.3)", "pytest-cov (>=4.1)", "pytest-mock (>=3.12)"] +type = ["mypy (>=1.8)"] [[package]] name = "pluggy" -version = "1.4.0" +version = "1.5.0" description = "plugin and hook calling mechanisms for python" optional = false python-versions = ">=3.8" files = [ - {file = "pluggy-1.4.0-py3-none-any.whl", hash = "sha256:7db9f7b503d67d1c5b95f59773ebb58a8c1c288129a88665838012cfb07b8981"}, - {file = "pluggy-1.4.0.tar.gz", hash = "sha256:8c85c2876142a764e5b7548e7d9a0e0ddb46f5185161049a79b7e974454223be"}, + {file = "pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669"}, + {file = "pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1"}, ] [package.extras] @@ -470,13 +503,13 @@ testing = ["pytest", "pytest-benchmark"] [[package]] name = "pytest" -version = "8.1.1" +version = "8.2.0" description = "pytest: simple powerful testing with Python" optional = false python-versions = ">=3.8" files = [ - {file = "pytest-8.1.1-py3-none-any.whl", hash = "sha256:2a8386cfc11fa9d2c50ee7b2a57e7d898ef90470a7a34c4b949ff59662bb78b7"}, - {file = "pytest-8.1.1.tar.gz", hash = "sha256:ac978141a75948948817d360297b7aae0fcb9d6ff6bc9ec6d514b85d5a65c044"}, + {file = "pytest-8.2.0-py3-none-any.whl", hash = "sha256:1733f0620f6cda4095bbf0d9ff8022486e91892245bb9e7d5542c018f612f233"}, + {file = "pytest-8.2.0.tar.gz", hash = "sha256:d507d4482197eac0ba2bae2e9babf0672eb333017bcedaa5fb1a3d42c1174b3f"}, ] [package.dependencies] @@ -484,11 +517,11 @@ colorama = {version = "*", markers = "sys_platform == \"win32\""} exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} iniconfig = "*" packaging = "*" -pluggy = ">=1.4,<2.0" +pluggy = ">=1.5,<2.0" tomli = {version = ">=1", markers = "python_version < \"3.11\""} [package.extras] -testing = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] +dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] [[package]] name = "pytest-cov" @@ -510,28 +543,28 @@ testing = ["fields", "hunter", "process-tests", "pytest-xdist", "virtualenv"] [[package]] name = "ruff" -version = "0.3.5" +version = "0.4.3" description = "An extremely fast Python linter and code formatter, written in Rust." 
optional = false python-versions = ">=3.7" files = [ - {file = "ruff-0.3.5-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:aef5bd3b89e657007e1be6b16553c8813b221ff6d92c7526b7e0227450981eac"}, - {file = "ruff-0.3.5-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:89b1e92b3bd9fca249153a97d23f29bed3992cff414b222fcd361d763fc53f12"}, - {file = "ruff-0.3.5-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5e55771559c89272c3ebab23326dc23e7f813e492052391fe7950c1a5a139d89"}, - {file = "ruff-0.3.5-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:dabc62195bf54b8a7876add6e789caae0268f34582333cda340497c886111c39"}, - {file = "ruff-0.3.5-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3a05f3793ba25f194f395578579c546ca5d83e0195f992edc32e5907d142bfa3"}, - {file = "ruff-0.3.5-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:dfd3504e881082959b4160ab02f7a205f0fadc0a9619cc481982b6837b2fd4c0"}, - {file = "ruff-0.3.5-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:87258e0d4b04046cf1d6cc1c56fadbf7a880cc3de1f7294938e923234cf9e498"}, - {file = "ruff-0.3.5-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:712e71283fc7d9f95047ed5f793bc019b0b0a29849b14664a60fd66c23b96da1"}, - {file = "ruff-0.3.5-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a532a90b4a18d3f722c124c513ffb5e5eaff0cc4f6d3aa4bda38e691b8600c9f"}, - {file = "ruff-0.3.5-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:122de171a147c76ada00f76df533b54676f6e321e61bd8656ae54be326c10296"}, - {file = "ruff-0.3.5-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:d80a6b18a6c3b6ed25b71b05eba183f37d9bc8b16ace9e3d700997f00b74660b"}, - {file = "ruff-0.3.5-py3-none-musllinux_1_2_i686.whl", hash = "sha256:a7b6e63194c68bca8e71f81de30cfa6f58ff70393cf45aab4c20f158227d5936"}, - {file = "ruff-0.3.5-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:a759d33a20c72f2dfa54dae6e85e1225b8e302e8ac655773aff22e542a300985"}, - {file = "ruff-0.3.5-py3-none-win32.whl", hash = "sha256:9d8605aa990045517c911726d21293ef4baa64f87265896e491a05461cae078d"}, - {file = "ruff-0.3.5-py3-none-win_amd64.whl", hash = "sha256:dc56bb16a63c1303bd47563c60482a1512721053d93231cf7e9e1c6954395a0e"}, - {file = "ruff-0.3.5-py3-none-win_arm64.whl", hash = "sha256:faeeae9905446b975dcf6d4499dc93439b131f1443ee264055c5716dd947af55"}, - {file = "ruff-0.3.5.tar.gz", hash = "sha256:a067daaeb1dc2baf9b82a32dae67d154d95212080c80435eb052d95da647763d"}, + {file = "ruff-0.4.3-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:b70800c290f14ae6fcbb41bbe201cf62dfca024d124a1f373e76371a007454ce"}, + {file = "ruff-0.4.3-py3-none-macosx_11_0_arm64.whl", hash = "sha256:08a0d6a22918ab2552ace96adeaca308833873a4d7d1d587bb1d37bae8728eb3"}, + {file = "ruff-0.4.3-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eba1f14df3c758dd7de5b55fbae7e1c8af238597961e5fb628f3de446c3c40c5"}, + {file = "ruff-0.4.3-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:819fb06d535cc76dfddbfe8d3068ff602ddeb40e3eacbc90e0d1272bb8d97113"}, + {file = "ruff-0.4.3-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0bfc9e955e6dc6359eb6f82ea150c4f4e82b660e5b58d9a20a0e42ec3bb6342b"}, + {file = "ruff-0.4.3-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:510a67d232d2ebe983fddea324dbf9d69b71c4d2dfeb8a862f4a127536dd4cfb"}, + {file = 
"ruff-0.4.3-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:dc9ff11cd9a092ee7680a56d21f302bdda14327772cd870d806610a3503d001f"}, + {file = "ruff-0.4.3-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:29efff25bf9ee685c2c8390563a5b5c006a3fee5230d28ea39f4f75f9d0b6f2f"}, + {file = "ruff-0.4.3-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:18b00e0bcccf0fc8d7186ed21e311dffd19761cb632241a6e4fe4477cc80ef6e"}, + {file = "ruff-0.4.3-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:262f5635e2c74d80b7507fbc2fac28fe0d4fef26373bbc62039526f7722bca1b"}, + {file = "ruff-0.4.3-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:7363691198719c26459e08cc17c6a3dac6f592e9ea3d2fa772f4e561b5fe82a3"}, + {file = "ruff-0.4.3-py3-none-musllinux_1_2_i686.whl", hash = "sha256:eeb039f8428fcb6725bb63cbae92ad67b0559e68b5d80f840f11914afd8ddf7f"}, + {file = "ruff-0.4.3-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:927b11c1e4d0727ce1a729eace61cee88a334623ec424c0b1c8fe3e5f9d3c865"}, + {file = "ruff-0.4.3-py3-none-win32.whl", hash = "sha256:25cacda2155778beb0d064e0ec5a3944dcca9c12715f7c4634fd9d93ac33fd30"}, + {file = "ruff-0.4.3-py3-none-win_amd64.whl", hash = "sha256:7a1c3a450bc6539ef00da6c819fb1b76b6b065dec585f91456e7c0d6a0bbc725"}, + {file = "ruff-0.4.3-py3-none-win_arm64.whl", hash = "sha256:71ca5f8ccf1121b95a59649482470c5601c60a416bf189d553955b0338e34614"}, + {file = "ruff-0.4.3.tar.gz", hash = "sha256:ff0a3ef2e3c4b6d133fbedcf9586abfbe38d076041f2dc18ffb2c7e0485d5a07"}, ] [[package]] @@ -548,6 +581,19 @@ files = [ [package.dependencies] mpmath = ">=0.19" +[[package]] +name = "tbb" +version = "2021.12.0" +description = "Intel® oneAPI Threading Building Blocks (oneTBB)" +optional = false +python-versions = "*" +files = [ + {file = "tbb-2021.12.0-py2.py3-none-manylinux1_i686.whl", hash = "sha256:f2cc9a7f8ababaa506cbff796ce97c3bf91062ba521e15054394f773375d81d8"}, + {file = "tbb-2021.12.0-py2.py3-none-manylinux1_x86_64.whl", hash = "sha256:a925e9a7c77d3a46ae31c34b0bb7f801c4118e857d137b68f68a8e458fcf2bd7"}, + {file = "tbb-2021.12.0-py3-none-win32.whl", hash = "sha256:b1725b30c174048edc8be70bd43bb95473f396ce895d91151a474d0fa9f450a8"}, + {file = "tbb-2021.12.0-py3-none-win_amd64.whl", hash = "sha256:fc2772d850229f2f3df85f1109c4844c495a2db7433d38200959ee9265b34789"}, +] + [[package]] name = "tomli" version = "2.0.1" @@ -561,27 +607,28 @@ files = [ [[package]] name = "torch" -version = "2.2.2+cpu" +version = "2.3.0+cpu" description = "Tensors and Dynamic neural networks in Python with strong GPU acceleration" optional = false python-versions = ">=3.8.0" files = [ - {file = "torch-2.2.2+cpu-cp310-cp310-linux_x86_64.whl", hash = "sha256:02c4fac3c964e73f5f49003e0060c697f73b67c10cc23f51c592facb29e1bd53"}, - {file = "torch-2.2.2+cpu-cp310-cp310-win_amd64.whl", hash = "sha256:fc29dda2795dd7220d769c5926b1c50ddac9b4827897e30a10467063691cdf54"}, - {file = "torch-2.2.2+cpu-cp311-cp311-linux_x86_64.whl", hash = "sha256:90089cae572672fb449c8ff1dc1b29daaffa117bf97ede7463dcd2fd1b991e4c"}, - {file = "torch-2.2.2+cpu-cp311-cp311-win_amd64.whl", hash = "sha256:88e63c916e3275fa30a220ee736423a95573b96072ded85e5c0171fd8f37a755"}, - {file = "torch-2.2.2+cpu-cp312-cp312-linux_x86_64.whl", hash = "sha256:431a747b5a880cf8e1fb6d58db6bfafa6768cbec76517d046854537c03323edf"}, - {file = "torch-2.2.2+cpu-cp312-cp312-win_amd64.whl", hash = "sha256:2b0cf041f878607a361116945f82ce2dba4b7a747151da7619a63cb5fccb72df"}, - {file = "torch-2.2.2+cpu-cp38-cp38-linux_x86_64.whl", hash = 
"sha256:8914ce932168e572a09b4a7e5b0806d279f771dfe58d7e1d8de2291fac4ce69b"}, - {file = "torch-2.2.2+cpu-cp38-cp38-win_amd64.whl", hash = "sha256:4ef2911ffde6d86f643c23aa99f25f1a1df8bee93bf8d0c69cf1b9ba0ca521dc"}, - {file = "torch-2.2.2+cpu-cp39-cp39-linux_x86_64.whl", hash = "sha256:6e3d323a21df22415770e88d39e13591079b9356dabb8b394d1ee29ac6c92481"}, - {file = "torch-2.2.2+cpu-cp39-cp39-win_amd64.whl", hash = "sha256:c2c9e7d5e3c7d58e4b78d6aebfa8002af7cda16cde08d0e3ed00300dc21a8efc"}, + {file = "torch-2.3.0+cpu-cp310-cp310-linux_x86_64.whl", hash = "sha256:e3c220702d82c7596924150e0499fbbffcf62a88a59adc860fa357cd8dc1c302"}, + {file = "torch-2.3.0+cpu-cp310-cp310-win_amd64.whl", hash = "sha256:ab0c05525195b8fecdf2ea75968ed32ccd87dff16381b6e13249babb4a9596ff"}, + {file = "torch-2.3.0+cpu-cp311-cp311-linux_x86_64.whl", hash = "sha256:97a38b25ee0e3d020691e7846efbca62a3d8a57645c027dcb5ba0adfec36fe55"}, + {file = "torch-2.3.0+cpu-cp311-cp311-win_amd64.whl", hash = "sha256:a8ac195974be6f067245bae8156b8c06fb0a723b0eed8f2e244b5dd58c7e2a49"}, + {file = "torch-2.3.0+cpu-cp312-cp312-linux_x86_64.whl", hash = "sha256:a8982e52185771591dad577a124a7770f72f288f8ae5833317b1e329c0d2f07e"}, + {file = "torch-2.3.0+cpu-cp312-cp312-win_amd64.whl", hash = "sha256:483131a7997995d867313ee902743084e844e830ab2a0c5e079c61ec2da3cd17"}, + {file = "torch-2.3.0+cpu-cp38-cp38-linux_x86_64.whl", hash = "sha256:8c52484880d5fbe511cffc255dd34847ddeced3f94334c6bf7eb2b0445f10cb4"}, + {file = "torch-2.3.0+cpu-cp38-cp38-win_amd64.whl", hash = "sha256:28a11bcc0d709b397d675cff689707019b8cc122e6bf328b57b900f47c36f156"}, + {file = "torch-2.3.0+cpu-cp39-cp39-linux_x86_64.whl", hash = "sha256:1e86e225e472392440ace378ba3165b5e87648e8b5fbf16adc41c0df881c38b8"}, + {file = "torch-2.3.0+cpu-cp39-cp39-win_amd64.whl", hash = "sha256:5c2afdff80203eaabf4c223a294c2f465020b3360e8e87f76b52ace9c5801ebe"}, ] [package.dependencies] filelock = "*" fsspec = "*" jinja2 = "*" +mkl = {version = ">=2021.1.1,<=2021.4.0", markers = "platform_system == \"Windows\""} networkx = "*" sympy = "*" typing-extensions = ">=4.8.0" diff --git a/pyproject.toml b/pyproject.toml index 72654dbe5..eede72a0e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pytorch_optimizer" -version = "2.12.0" +version = "3.0.0" description = "optimizer & lr scheduler & objective function collections in PyTorch" license = "Apache-2.0" authors = ["kozistr "] @@ -12,7 +12,7 @@ documentation = "https://pytorch-optimizers.readthedocs.io/en/latest" keywords = [ "pytorch", "deep-learning", "optimizer", "lr scheduler", "A2Grad", "ASGD", "AccSGD", "AdaBelief", "AdaBound", "AdaDelta", "AdaFactor", "AdaMax", "AdaMod", "AdaNorm", "AdaPNM", "AdaSmooth", "AdaHessian", "Adai", "Adalite", - "AdamP", "AdamS", "Adan", "AggMo", "Aida", "AliG", "Amos", "Apollo", "AvaGrad", "CAME", "DAdaptAdaGrad", + "AdamP", "AdamS", "Adan", "AggMo", "Aida", "AliG", "Amos", "Apollo", "AvaGrad", "bSAM", "CAME", "DAdaptAdaGrad", "DAdaptAdam", "DAdaptAdan", "DAdaptSGD", "DAdaptLion", "DiffGrad", "Fromage", "GaLore", "Gravity", "GSAM", "LARS", "Lamb", "Lion", "LOMO", "Lookahead", "MADGRAD", "MSVAG", "Nero", "NovoGrad", "PAdam", "PCGrad", "PID", "PNM", "Prodigy", "QHAdam", "QHM", "RAdam", "Ranger", "Ranger21", "RotoGrad", "SAM", "SGDP", "Shampoo", "ScalableShampoo", @@ -50,7 +50,7 @@ bitsandbytes = { version = "^0.43", optional = true } [tool.poetry.dev-dependencies] isort = { version = "^5", python = ">=3.8" } -black = { version = "^24", python = ">=3.8"} +black = { version = "^24", python = 
">=3.8" } ruff = "*" pytest = "*" pytest-cov = "*" diff --git a/requirements-dev.txt b/requirements-dev.txt index 6ed086a44..3f37e1960 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,28 +1,31 @@ --extra-index-url https://download.pytorch.org/whl/cpu -black==24.3.0 ; python_version >= "3.8" and python_full_version < "4.0.0" +black==24.4.2 ; python_version >= "3.8" and python_full_version < "4.0.0" click==8.1.7 ; python_version >= "3.8" and python_full_version < "4.0.0" colorama==0.4.6 ; python_version >= "3.8" and python_full_version < "4.0.0" and (sys_platform == "win32" or platform_system == "Windows") -coverage[toml]==7.4.4 ; python_version >= "3.8" and python_full_version < "4.0.0" -exceptiongroup==1.2.0 ; python_version >= "3.8" and python_version < "3.11" -filelock==3.13.3 ; python_version >= "3.8" and python_full_version < "4.0.0" +coverage[toml]==7.5.1 ; python_version >= "3.8" and python_full_version < "4.0.0" +exceptiongroup==1.2.1 ; python_version >= "3.8" and python_version < "3.11" +filelock==3.14.0 ; python_version >= "3.8" and python_full_version < "4.0.0" fsspec==2024.3.1 ; python_version >= "3.8" and python_full_version < "4.0.0" iniconfig==2.0.0 ; python_version >= "3.8" and python_full_version < "4.0.0" +intel-openmp==2021.4.0 ; python_version >= "3.8" and python_full_version < "4.0.0" and platform_system == "Windows" isort==5.13.2 ; python_version >= "3.8" and python_full_version < "4.0.0" jinja2==3.1.3 ; python_version >= "3.8" and python_full_version < "4.0.0" markupsafe==2.1.5 ; python_version >= "3.8" and python_full_version < "4.0.0" +mkl==2021.4.0 ; python_version >= "3.8" and python_full_version < "4.0.0" and platform_system == "Windows" mpmath==1.3.0 ; python_version >= "3.8" and python_full_version < "4.0.0" mypy-extensions==1.0.0 ; python_version >= "3.8" and python_full_version < "4.0.0" networkx==3.1 ; python_version >= "3.8" and python_full_version < "4.0.0" numpy==1.24.4 ; python_version >= "3.8" and python_full_version < "4.0.0" packaging==24.0 ; python_version >= "3.8" and python_full_version < "4.0.0" pathspec==0.12.1 ; python_version >= "3.8" and python_full_version < "4.0.0" -platformdirs==4.2.0 ; python_version >= "3.8" and python_full_version < "4.0.0" -pluggy==1.4.0 ; python_version >= "3.8" and python_full_version < "4.0.0" +platformdirs==4.2.1 ; python_version >= "3.8" and python_full_version < "4.0.0" +pluggy==1.5.0 ; python_version >= "3.8" and python_full_version < "4.0.0" pytest-cov==5.0.0 ; python_version >= "3.8" and python_full_version < "4.0.0" -pytest==8.1.1 ; python_version >= "3.8" and python_full_version < "4.0.0" -ruff==0.3.5 ; python_version >= "3.8" and python_full_version < "4.0.0" +pytest==8.2.0 ; python_version >= "3.8" and python_full_version < "4.0.0" +ruff==0.4.3 ; python_version >= "3.8" and python_full_version < "4.0.0" sympy==1.12 ; python_version >= "3.8" and python_full_version < "4.0.0" +tbb==2021.12.0 ; python_version >= "3.8" and python_full_version < "4.0.0" and platform_system == "Windows" tomli==2.0.1 ; python_version >= "3.8" and python_full_version <= "3.11.0a6" -torch==2.2.2+cpu ; python_version >= "3.8" and python_full_version < "4.0.0" +torch==2.3.0+cpu ; python_version >= "3.8" and python_full_version < "4.0.0" typing-extensions==4.11.0 ; python_version >= "3.8" and python_full_version < "4.0.0" diff --git a/requirements.txt b/requirements.txt index fe27e8f9a..f54d84a9d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,12 +1,15 @@ --extra-index-url 
https://download.pytorch.org/whl/cpu
-filelock==3.13.3 ; python_version >= "3.8" and python_full_version < "4.0.0"
+filelock==3.14.0 ; python_version >= "3.8" and python_full_version < "4.0.0"
 fsspec==2024.3.1 ; python_version >= "3.8" and python_full_version < "4.0.0"
+intel-openmp==2021.4.0 ; python_version >= "3.8" and python_full_version < "4.0.0" and platform_system == "Windows"
 jinja2==3.1.3 ; python_version >= "3.8" and python_full_version < "4.0.0"
 markupsafe==2.1.5 ; python_version >= "3.8" and python_full_version < "4.0.0"
+mkl==2021.4.0 ; python_version >= "3.8" and python_full_version < "4.0.0" and platform_system == "Windows"
 mpmath==1.3.0 ; python_version >= "3.8" and python_full_version < "4.0.0"
 networkx==3.1 ; python_version >= "3.8" and python_full_version < "4.0.0"
 numpy==1.24.4 ; python_version >= "3.8" and python_full_version < "4.0.0"
 sympy==1.12 ; python_version >= "3.8" and python_full_version < "4.0.0"
-torch==2.2.2+cpu ; python_version >= "3.8" and python_full_version < "4.0.0"
+tbb==2021.12.0 ; python_version >= "3.8" and python_full_version < "4.0.0" and platform_system == "Windows"
+torch==2.3.0+cpu ; python_version >= "3.8" and python_full_version < "4.0.0"
 typing-extensions==4.11.0 ; python_version >= "3.8" and python_full_version < "4.0.0"

From f02fa47f9dc5ea9cd03b2e07b48734233481de5c Mon Sep 17 00:00:00 2001
From: kozistr
Date: Sun, 5 May 2024 16:03:11 +0900
Subject: [PATCH 07/28] update: bSAM optimizer

---
 pytorch_optimizer/__init__.py | 3 ++-
 pytorch_optimizer/optimizer/sam.py | 9 ++++++++-
 tests/constants.py | 2 ++
 tests/test_load_modules.py | 2 +-
 tests/test_optimizers.py | 27 +++++++++++++++++++++++++++
 5 files changed, 40 insertions(+), 3 deletions(-)

diff --git a/pytorch_optimizer/__init__.py b/pytorch_optimizer/__init__.py
index 6c3b1c2c3..e23749831 100644
--- a/pytorch_optimizer/__init__.py
+++ b/pytorch_optimizer/__init__.py
@@ -79,7 +79,7 @@
 from pytorch_optimizer.optimizer.ranger import Ranger
 from pytorch_optimizer.optimizer.ranger21 import Ranger21
 from pytorch_optimizer.optimizer.rotograd import RotoGrad
-from pytorch_optimizer.optimizer.sam import GSAM, SAM, WSAM
+from pytorch_optimizer.optimizer.sam import GSAM, SAM, WSAM, BSAM
 from pytorch_optimizer.optimizer.sgd import ASGD, SGDW, AccSGD, SignSGD
 from pytorch_optimizer.optimizer.sgdp import SGDP
 from pytorch_optimizer.optimizer.shampoo import ScalableShampoo, Shampoo
@@ -186,6 +186,7 @@
     Aida,
     GaLore,
     Adalite,
+    BSAM,
 ]
 
 OPTIMIZERS: Dict[str, OPTIMIZER] = {str(optimizer.__name__).lower(): optimizer for optimizer in OPTIMIZER_LIST}
diff --git a/pytorch_optimizer/optimizer/sam.py b/pytorch_optimizer/optimizer/sam.py
index 22e2a9041..030a8e750 100644
--- a/pytorch_optimizer/optimizer/sam.py
+++ b/pytorch_optimizer/optimizer/sam.py
@@ -633,6 +633,11 @@ def first_step(self):
 
                 state = self.state[p]
 
+                if 's' not in state:
+                    state['s'] = torch.ones_like(p)
+                    state['noisy_gradient'] = torch.zeros_like(p.grad)
+                    state['momentum'] = torch.zeros_like(p)
+
                 noise = torch.normal(0.0, 1 / (self.num_data * state['s']))
                 p.add_(noise)
 
@@ -684,6 +689,8 @@ def step(self, closure: CLOSURE = None):
         self.second_step()
 
         with torch.enable_grad():
-            closure()
+            loss = closure()
 
         self.third_step()
+
+        return loss
diff --git a/tests/constants.py b/tests/constants.py
index 4b322bdec..355ba5451 100644
--- a/tests/constants.py
+++ b/tests/constants.py
@@ -64,6 +64,7 @@
     SophiaH,
     Tiger,
     Yogi,
+    BSAM,
 )
 from tests.utils import build_lookahead
 
@@ -123,6 +124,7 @@
     'aida',
     'galore',
     'adalite',
+    'bsam',
 ]
 
 VALID_LR_SCHEDULER_NAMES: List[str] = [
diff --git a/tests/test_load_modules.py b/tests/test_load_modules.py
index e6662b13c..5d899ae5c 100644
--- a/tests/test_load_modules.py
+++ b/tests/test_load_modules.py
@@ -38,7 +38,7 @@ def test_load_lr_scheduler_invalid(invalid_lr_scheduler_names):
 
 
 def test_get_supported_optimizers():
-    assert len(get_supported_optimizers()) == 63
+    assert len(get_supported_optimizers()) == 64
 
 
 def test_get_supported_lr_schedulers():
diff --git a/tests/test_optimizers.py b/tests/test_optimizers.py
index 0546f8fd1..df379bf44 100644
--- a/tests/test_optimizers.py
+++ b/tests/test_optimizers.py
@@ -6,6 +6,7 @@
 from pytorch_optimizer import (
     GSAM,
     SAM,
+    BSAM,
     WSAM,
     CosineScheduler,
     DynamicLossScaler,
@@ -236,6 +237,32 @@ def test_gsam_optimizer(adaptive, environment):
     assert tensor_to_numpy(init_loss) > 1.2 * tensor_to_numpy(loss)
 
 
+@pytest.mark.parametrize('adaptive', ADAPTIVE_FLAGS)
+def test_bsam_optimizer(adaptive, environment):
+    (x_data, y_data), model, loss_fn = environment
+
+    optimizer = BSAM(model.parameters(), lr=2e-3, num_data=len(x_data), rho=1e-5, adaptive=adaptive)
+    optimizer.reset()
+
+    def closure():
+        first_loss = loss_fn(y_data, model(x_data))
+        first_loss.backward()
+        return first_loss
+
+    init_loss, loss = np.inf, np.inf
+    for _ in range(20):
+        loss = loss_fn(y_data, model(x_data))
+        loss.backward()
+
+        optimizer.step(closure)
+        optimizer.zero_grad()
+
+        if init_loss == np.inf:
+            init_loss = loss
+
+    assert tensor_to_numpy(init_loss) > tensor_to_numpy(loss)
+
+
 @pytest.mark.parametrize('optimizer_config', ADANORM_SUPPORTED_OPTIMIZERS, ids=ids)
 def test_adanorm_optimizer(optimizer_config, environment):
     (x_data, y_data), model, loss_fn = environment

From d1078ab204bbccbcb6853517b75566b45f9b756d Mon Sep 17 00:00:00 2001
From: kozistr
Date: Sun, 5 May 2024 16:05:22 +0900
Subject: [PATCH 08/28] update: bSAM optimizer

---
 pytorch_optimizer/__init__.py | 2 +-
 pytorch_optimizer/optimizer/sam.py | 23 +----------------------
 tests/constants.py | 1 -
 tests/test_optimizers.py | 2 +-
 4 files changed, 3 insertions(+), 25 deletions(-)

diff --git a/pytorch_optimizer/__init__.py b/pytorch_optimizer/__init__.py
index e23749831..ce489d12c 100644
--- a/pytorch_optimizer/__init__.py
+++ b/pytorch_optimizer/__init__.py
@@ -79,7 +79,7 @@
 from pytorch_optimizer.optimizer.ranger import Ranger
 from pytorch_optimizer.optimizer.ranger21 import Ranger21
 from pytorch_optimizer.optimizer.rotograd import RotoGrad
-from pytorch_optimizer.optimizer.sam import GSAM, SAM, WSAM, BSAM
+from pytorch_optimizer.optimizer.sam import BSAM, GSAM, SAM, WSAM
 from pytorch_optimizer.optimizer.sgd import ASGD, SGDW, AccSGD, SignSGD
 from pytorch_optimizer.optimizer.sgdp import SGDP
 from pytorch_optimizer.optimizer.shampoo import ScalableShampoo, Shampoo
diff --git a/pytorch_optimizer/optimizer/sam.py b/pytorch_optimizer/optimizer/sam.py
index 030a8e750..eec1b8cc6 100644
--- a/pytorch_optimizer/optimizer/sam.py
+++ b/pytorch_optimizer/optimizer/sam.py
@@ -538,28 +538,6 @@ class BSAM(Optimizer, BaseOptimizer):
             model = YourModel()
             optimizer = BSAM(model.parameters(), ...)
 
-            for input, output in data:
-                # first forward-backward pass
-
-                loss = loss_function(output, model(input))
-                loss.backward()
-                optimizer.step(zero_grad=True)
-
-                # second forward-backward pass
-                # make sure to do a full forward pass
-                loss_function(output, model(input)).backward()
-                optimizer.second_step(zero_grad=True)
-
-                # third forward-backward pass
-                # make sure to do a full forward pass
-                loss_function(output, model(input)).backward()
-                optimizer.second_step(zero_grad=True)
-
-        Alternative example with a single closure-based step function::
-
-            model = YourModel()
-            optimizer = BSAM(model.parameters(), ...)
-
             def closure():
                 loss = loss_function(output, model(input))
                 loss.backward()
@@ -568,6 +546,7 @@ def closure():
             for input, output in data:
                 loss = loss_function(output, model(input))
                 loss.backward()
+
                 optimizer.step(closure)
                 optimizer.zero_grad()
 
diff --git a/tests/constants.py b/tests/constants.py
index 355ba5451..d1572c195 100644
--- a/tests/constants.py
+++ b/tests/constants.py
@@ -64,7 +64,6 @@
     SophiaH,
     Tiger,
     Yogi,
-    BSAM,
 )
 from tests.utils import build_lookahead
 
diff --git a/tests/test_optimizers.py b/tests/test_optimizers.py
index df379bf44..c8064aa51 100644
--- a/tests/test_optimizers.py
+++ b/tests/test_optimizers.py
@@ -4,9 +4,9 @@
 from torch import nn
 
 from pytorch_optimizer import (
+    BSAM,
     GSAM,
     SAM,
-    BSAM,
     WSAM,
     CosineScheduler,
     DynamicLossScaler,

From 04456cbaf887887bff951db240d121e5c78640ea Mon Sep 17 00:00:00 2001
From: kozistr
Date: Sun, 5 May 2024 16:05:29 +0900
Subject: [PATCH 09/28] docs: bSAM optimizer

---
 docs/changelogs/v3.0.0.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/changelogs/v3.0.0.md b/docs/changelogs/v3.0.0.md
index 3e836fff5..e37dc2557 100644
--- a/docs/changelogs/v3.0.0.md
+++ b/docs/changelogs/v3.0.0.md
@@ -13,7 +13,7 @@ Major version is updated! (`v2.12.0` -> `v3.0.0`) (#164)
 * Implement `GaLore` optimizer. (#224, #228)
   * [Memory-Efficient LLM Training by Gradient Low-Rank Projection](https://arxiv.org/abs/2403.03507)
 * Implement `Adalite` optimizer. (#225, #229)
-* Implement `bSAM` optimizer. (#233)
+* Implement `bSAM` optimizer. (#212, #233)
   * [SAM as an Optimal Relaxation of Bayes](https://arxiv.org/abs/2210.01620)
 
 ### Fix

From c77856826e766dca8e35089f5f32303d4664092e Mon Sep 17 00:00:00 2001
From: kozistr
Date: Sun, 5 May 2024 16:11:35 +0900
Subject: [PATCH 10/28] fix: bSAM cases

---
 tests/test_general_optimizer_parameters.py | 8 ++++++++
 tests/test_gradients.py | 6 ++++--
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/tests/test_general_optimizer_parameters.py b/tests/test_general_optimizer_parameters.py
index 19266f098..58a3c2b5e 100644
--- a/tests/test_general_optimizer_parameters.py
+++ b/tests/test_general_optimizer_parameters.py
@@ -16,6 +16,8 @@ def test_learning_rate(optimizer_name):
     config = {'lr': -1e-2}
     if optimizer_name == 'ranger21':
         config.update({'num_iterations': 100})
+    elif optimizer_name == 'bsam':
+        config.update({'num_data': 100})
 
     with pytest.raises(NegativeLRError):
         optimizer(None, **config)
@@ -47,6 +49,7 @@ def test_epsilon(optimizer_name):
         'tiger',
         'came',
         'adalite',
+        'bsam',
     ):
         pytest.skip(f'skip {optimizer_name} optimizer')
 
@@ -82,6 +85,8 @@ def test_weight_decay(optimizer_name):
     config = {'weight_decay': -1e-3}
     if optimizer_name == 'ranger21':
         config.update({'num_iterations': 100})
+    elif optimizer_name == 'bsam':
+        config.update({'num_data': 100})
 
     with pytest.raises(ValueError):
         optimizer(None, **config)
@@ -124,6 +129,9 @@ def test_betas(optimizer_name):
     if optimizer_name == 'ranger21':
         config1.update({'num_iterations': 100})
         config2.update({'num_iterations': 100})
+    elif optimizer_name == 'bsam':
+        config1.update({'num_data': 100})
+        config2.update({'num_data': 100})
 
     if optimizer_name not in ('adapnm', 'adan', 'adamod', 'aggmo', 'came'):
         with pytest.raises(ValueError):
diff --git a/tests/test_gradients.py b/tests/test_gradients.py
index 7fa8a594d..a1c29e15b 100644
--- a/tests/test_gradients.py
+++ b/tests/test_gradients.py
@@ -20,6 +20,8 @@ def test_no_gradients(optimizer_name):
 
     if optimizer_name == 'ranger21':
         optimizer = load_optimizer(optimizer_name)(params, num_iterations=1, lookahead_merge_time=1)
+    elif optimizer_name == 'bsam':
+        optimizer = load_optimizer(optimizer_name)(params, num_data=1)
     elif optimizer_name in ('lamb', 'ralamb'):
         optimizer = load_optimizer(optimizer_name)(params, pre_norm=True)
     elif optimizer_name == 'lookahead':
@@ -37,7 +39,7 @@ def test_no_gradients(optimizer_name):
 
 @pytest.mark.parametrize('no_sparse_optimizer', NO_SPARSE_OPTIMIZERS)
 def test_sparse_not_supported(no_sparse_optimizer):
-    if no_sparse_optimizer == 'lomo':
+    if no_sparse_optimizer in ('lomo', 'bsam'):
         pytest.skip(f'skip {no_sparse_optimizer} optimizer.')
 
     param = simple_sparse_parameter()[1]
@@ -111,7 +113,7 @@ def test_sparse_supported(sparse_optimizer):
 
 @pytest.mark.parametrize('optimizer_name', VALID_OPTIMIZER_NAMES)
 def test_bf16_gradient(optimizer_name):
-    if optimizer_name in ('shampoo', 'lomo'):
+    if optimizer_name in ('shampoo', 'lomo', 'bsam'):
         pytest.skip(f'skip {optimizer_name}')
 
     param = torch.randn(1, 1).bfloat16().requires_grad_(True)

From 06026db314efd5e49daa62bad76a616023543bd1 Mon Sep 17 00:00:00 2001
From: kozistr
Date: Sun, 5 May 2024 16:18:19 +0900
Subject: [PATCH 11/28] update: test_no_closure

---
 tests/test_optimizers.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/tests/test_optimizers.py b/tests/test_optimizers.py
index c8064aa51..f27f82d06 100644
--- a/tests/test_optimizers.py
+++ b/tests/test_optimizers.py
@@ -242,7 +242,6 @@ def test_bsam_optimizer(adaptive, environment):
     (x_data, y_data), model, loss_fn = environment
 
     optimizer = BSAM(model.parameters(), lr=2e-3, num_data=len(x_data), rho=1e-5, adaptive=adaptive)
-    optimizer.reset()
 
     def closure():
         first_loss = loss_fn(y_data, model(x_data))
@@ -365,8 +364,11 @@ def test_closure(optimizer):
         param.grad = None
 
     optimizer_name: str = optimizer.__name__
+    if optimizer_name == 'Ranger21':
+        optimizer = optimizer([param], num_iterations=1)
+    else:
+        optimizer = optimizer([param])
 
-    optimizer = optimizer([param], num_iterations=1) if optimizer_name == 'Ranger21' else optimizer([param])
     optimizer.zero_grad()
 
     if optimizer_name in ('Ranger21', 'Adai', 'AdamS'):
@@ -394,6 +396,12 @@ def test_no_closure():
     with pytest.raises(NoClosureError):
         optimizer.step()
 
+    optimizer = BSAM([param], 1)
+    optimizer.zero_grad()
+
+    with pytest.raises(NoClosureError):
+        optimizer.step()
+
 
 def test_nero_zero_scale():
     param = simple_parameter()
@@ -462,6 +470,8 @@ def test_reset(optimizer_config):
     optimizer_class, config, _ = optimizer_config
     if optimizer_class.__name__ == 'Ranger21':
         config.update({'num_iterations': 1})
+    elif optimizer_class.__name__ == 'bSAM':
+        config.update({'num_data': 1})
 
     optimizer = optimizer_class([simple_parameter()], **config)
     optimizer.reset()

From ae7dbed0a49a6868cb9939df216d10303068ab70 Mon Sep 17 00:00:00 2001
From: kozistr
Date: Sun, 5 May 2024 16:18:33 +0900
Subject: [PATCH 12/28] update: reset

---
 pytorch_optimizer/optimizer/sam.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/pytorch_optimizer/optimizer/sam.py b/pytorch_optimizer/optimizer/sam.py
index eec1b8cc6..4c820e5a4 100644
--- a/pytorch_optimizer/optimizer/sam.py
+++ b/pytorch_optimizer/optimizer/sam.py
@@ -594,9 +594,6 @@ def __str__(self) -> str:
     def reset(self):
         for group in self.param_groups:
             for p in group['params']:
-                if p.grad is None:
-                    continue
-
                 state = self.state[p]
 
                 state['s'] = torch.ones_like(p)

From 546011eb433c21e48bc478d7ac071d77b5bf43a6 Mon Sep 17 00:00:00 2001
From: kozistr
Date: Sun, 5 May 2024 16:19:16 +0900
Subject: [PATCH 13/28] style: fix SIM108

---
 tests/test_optimizers.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/tests/test_optimizers.py b/tests/test_optimizers.py
index f27f82d06..042fd638f 100644
--- a/tests/test_optimizers.py
+++ b/tests/test_optimizers.py
@@ -364,10 +364,7 @@ def test_closure(optimizer):
         param.grad = None
 
     optimizer_name: str = optimizer.__name__
-    if optimizer_name == 'Ranger21':
-        optimizer = optimizer([param], num_iterations=1)
-    else:
-        optimizer = optimizer([param])
 
+    optimizer = optimizer([param], num_iterations=1) if optimizer_name == 'Ranger21' else optimizer([param])
     optimizer.zero_grad()
 
     if optimizer_name in ('Ranger21', 'Adai', 'AdamS'):

From 5a5cc99ff4a48a757cc46e8f8253b86fa45c0ad7 Mon Sep 17 00:00:00 2001
From: kozistr
Date: Sun, 5 May 2024 16:22:37 +0900
Subject: [PATCH 14/28] fix: typo

---
 tests/test_optimizers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_optimizers.py b/tests/test_optimizers.py
index 042fd638f..c543c7442 100644
--- a/tests/test_optimizers.py
+++ b/tests/test_optimizers.py
@@ -467,7 +467,7 @@ def test_reset(optimizer_config):
     optimizer_class, config, _ = optimizer_config
     if optimizer_class.__name__ == 'Ranger21':
         config.update({'num_iterations': 1})
-    elif optimizer_class.__name__ == 'bSAM':
+    elif optimizer_class.__name__ == 'BSAM':
         config.update({'num_data': 1})
 
     optimizer = optimizer_class([simple_parameter()], **config)
     optimizer.reset()

From 33239c9d1e20552de8d7b7140850b14adaceb640 Mon Sep 17 00:00:00 2001
From: kozistr
Date: Sun, 5 May 2024 16:48:45 +0900
Subject: [PATCH 15/28] docs: ScheduleFree
optimizers --- README.md | 143 +++++++++++++++++++++++----------------------- docs/index.md | 143 +++++++++++++++++++++++----------------------- docs/optimizer.md | 8 +++ 3 files changed, 152 insertions(+), 142 deletions(-) diff --git a/README.md b/README.md index a187669ff..a78dd76fa 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ **pytorch-optimizer** is optimizer & lr scheduler collections in PyTorch. I just re-implemented (speed & memory tweaks, plug-ins) the algorithm while based on the original paper. Also, It includes useful and practical optimization ideas. -Currently, **65 optimizers (+ `bitsandbytes`)**, **11 lr schedulers**, and **13 loss functions** are supported! +Currently, **67 optimizers (+ `bitsandbytes`)**, **11 lr schedulers**, and **13 loss functions** are supported! Highly inspired by [pytorch-optimizer](https://github.com/jettify/pytorch-optimizer). @@ -93,76 +93,77 @@ from pytorch_optimizer import get_supported_optimizers supported_optimizers = get_supported_optimizers() ``` -| Optimizer | Description | Official Code | Paper | Citation | -|--------------|---------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------| -| AdaBelief | *Adapting Step-sizes by the Belief in Observed Gradients* | [github](https://github.com/juntang-zhuang/Adabelief-Optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201007468Z/exportcitation) | -| AdaBound | *Adaptive Gradient Methods with Dynamic Bound of Learning Rate* | [github](https://github.com/Luolc/AdaBound/blob/master/adabound/adabound.py) | | [cite](https://github.com/Luolc/AdaBound#citing) | -| AdaHessian | *An Adaptive Second Order Optimizer for Machine Learning* | [github](https://github.com/amirgholami/adahessian) | | [cite](https://github.com/amirgholami/adahessian#citation) | -| AdamD | *Improved bias-correction in Adam* | | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv211010828S/exportcitation) | -| AdamP | *Slowing Down the Slowdown for Momentum Optimizers on Scale-invariant Weights* | [github](https://github.com/clovaai/AdamP) | | [cite](https://github.com/clovaai/AdamP#how-to-cite) | -| diffGrad | *An Optimization Method for Convolutional Neural Networks* | [github](https://github.com/shivram1987/diffGrad) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190911015D/exportcitation) | -| MADGRAD | *A Momentumized, Adaptive, Dual Averaged Gradient Method for Stochastic* | [github](https://github.com/facebookresearch/madgrad) | | [cite](https://github.com/facebookresearch/madgrad#tech-report) | -| RAdam | *On the Variance of the Adaptive Learning Rate and Beyond* | [github](https://github.com/LiyuanLucasLiu/RAdam) | | [cite](https://github.com/LiyuanLucasLiu/RAdam#citation) | -| Ranger | *a synergistic optimizer combining RAdam and LookAhead, and now GC in one optimizer* | [github](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer) | | [cite](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer#citing-this-work) | -| Ranger21 | *a synergistic deep learning optimizer* | [github](https://github.com/lessw2020/Ranger21) | | [cite](https://github.com/lessw2020/Ranger21#referencing-this-work) | -| Lamb | *Large Batch 
Optimization for Deep Learning* | [github](https://github.com/cybertronai/pytorch-lamb) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190400962Y/exportcitation) | -| Shampoo | *Preconditioned Stochastic Tensor Optimization* | [github](https://github.com/moskomule/shampoo.pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180209568G/exportcitation) | -| Nero | *Learning by Turning: Neural Architecture Aware Optimisation* | [github](https://github.com/jxbz/nero) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210207227L/exportcitation) | -| Adan | *Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models* | [github](https://github.com/sail-sg/Adan) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220806677X/exportcitation) | -| Adai | *Disentangling the Effects of Adaptive Learning Rate and Momentum* | [github](https://github.com/zeke-xie/adaptive-inertia-adai) | | [cite](https://github.com/zeke-xie/adaptive-inertia-adai#citing) | -| SAM | *Sharpness-Aware Minimization* | [github](https://github.com/davda54/sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201001412F/exportcitation) | -| ASAM | *Adaptive Sharpness-Aware Minimization* | [github](https://github.com/davda54/sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210211600K/exportcitation) | -| GSAM | *Surrogate Gap Guided Sharpness-Aware Minimization* | [github](https://github.com/juntang-zhuang/GSAM) | | [cite](https://github.com/juntang-zhuang/GSAM#citation) | -| D-Adaptation | *Learning-Rate-Free Learning by D-Adaptation* | [github](https://github.com/facebookresearch/dadaptation) | | [cite](https://ui.adsabs.harvard.edu/abs/2023arXiv230107733D/exportcitation) | -| AdaFactor | *Adaptive Learning Rates with Sublinear Memory Cost* | [github](https://github.com/DeadAt0m/adafactor-pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180404235S/exportcitation) | -| Apollo | *An Adaptive Parameter-wise Diagonal Quasi-Newton Method for Nonconvex Stochastic Optimization* | [github](https://github.com/XuezheMax/apollo) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv200913586M/exportcitation) | -| NovoGrad | *Stochastic Gradient Methods with Layer-wise Adaptive Moments for Training of Deep Networks* | [github](https://github.com/lonePatient/NovoGrad-pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190511286G/exportcitation) | -| Lion | *Symbolic Discovery of Optimization Algorithms* | [github](https://github.com/google/automl/tree/master/lion) | | [cite](https://github.com/google/automl/tree/master/lion#citation) | -| Ali-G | *Adaptive Learning Rates for Interpolation with Gradients* | [github](https://github.com/oval-group/ali-g) | | [cite](https://github.com/oval-group/ali-g#adaptive-learning-rates-for-interpolation-with-gradients) | -| SM3 | *Memory-Efficient Adaptive Optimization* | [github](https://github.com/google-research/google-research/tree/master/sm3) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190111150A/exportcitation) | -| AdaNorm | *Adaptive Gradient Norm Correction based Optimizer for CNNs* | [github](https://github.com/shivram1987/AdaNorm) | | [cite](https://github.com/shivram1987/AdaNorm/tree/main#citation) | -| RotoGrad | *Gradient Homogenization in Multitask Learning* | [github](https://github.com/adrianjav/rotograd) | | [cite](https://github.com/adrianjav/rotograd#citing) | -| A2Grad | *Optimal Adaptive and Accelerated Stochastic Gradient Descent* | [github](https://github.com/severilov/A2Grad_optimizer) | | 
[cite](https://ui.adsabs.harvard.edu/abs/2018arXiv181000553D/exportcitation) | -| AccSGD | *Accelerating Stochastic Gradient Descent For Least Squares Regression* | [github](https://github.com/rahulkidambi/AccSGD) | | [cite](https://github.com/rahulkidambi/AccSGD#citation) | -| SGDW | *Decoupled Weight Decay Regularization* | [github](https://github.com/loshchil/AdamW-and-SGDW) | | [cite](https://github.com/loshchil/AdamW-and-SGDW#contact) | -| ASGD | *Adaptive Gradient Descent without Descent* | [github](https://github.com/ymalitsky/adaptive_GD) | | [cite](https://github.com/ymalitsky/adaptive_GD#reference) | -| Yogi | *Adaptive Methods for Nonconvex Optimization* | | [NIPS 2018](https://papers.nips.cc/paper/8186-adaptive-methods-for-nonconvex-optimization) | [cite](https://proceedings.neurips.cc/paper_files/paper/2018/hash/90365351ccc7437a1309dc64e4db32a3-Abstract.html) | -| SWATS | *Improving Generalization Performance by Switching from Adam to SGD* | | | [cite](https://ui.adsabs.harvard.edu/abs/2017arXiv171207628S/exportcitation) | -| Fromage | *On the distance between two neural networks and the stability of learning* | [github](https://github.com/jxbz/fromage) | | [cite](https://github.com/jxbz/fromage#citation) | -| MSVAG | *Dissecting Adam: The Sign, Magnitude and Variance of Stochastic Gradients* | [github](https://github.com/lballes/msvag) | | [cite](https://github.com/lballes/msvag#citation) | -| AdaMod | *An Adaptive and Momental Bound Method for Stochastic Learning* | [github](https://github.com/lancopku/AdaMod) | | [cite](https://github.com/lancopku/AdaMod#citation) | -| AggMo | *Aggregated Momentum: Stability Through Passive Damping* | [github](https://github.com/AtheMathmo/AggMo) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180400325L/exportcitation) | -| QHAdam | *Quasi-hyperbolic momentum and Adam for deep learning* | [github](https://github.com/facebookresearch/qhoptim) | | [cite](https://github.com/facebookresearch/qhoptim#reference) | -| PID | *A PID Controller Approach for Stochastic Optimization of Deep Networks* | [github](https://github.com/tensorboy/PIDOptimizer) | [CVPR 18](http://www4.comp.polyu.edu.hk/~cslzhang/paper/CVPR18_PID.pdf) | [cite](https://github.com/tensorboy/PIDOptimizer#citation) | -| Gravity | *a Kinematic Approach on Optimization in Deep Learning* | [github](https://github.com/dariush-bahrami/gravity.optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210109192B/exportcitation) | -| AdaSmooth | *An Adaptive Learning Rate Method based on Effective Ratio* | | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220400825L/exportcitation) | -| SRMM | *Stochastic regularized majorization-minimization with weakly convex and multi-convex surrogates* | [github](https://github.com/HanbaekLyu/SRMM) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220101652L/exportcitation) | -| AvaGrad | *Domain-independent Dominance of Adaptive Methods* | [github](https://github.com/lolemacs/avagrad) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv191201823S/exportcitation) | -| PCGrad | *Gradient Surgery for Multi-Task Learning* | [github](https://github.com/tianheyu927/PCGrad) | | [cite](https://github.com/tianheyu927/PCGrad#reference) | -| AMSGrad | *On the Convergence of Adam and Beyond* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190409237R/exportcitation) | -| Lookahead | *k steps forward, 1 step back* | [github](https://github.com/pytorch/examples/tree/main/imagenet) | | 
[cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190708610Z/exportcitation) | -| PNM | *Manipulating Stochastic Gradient Noise to Improve Generalization* | [github](https://github.com/zeke-xie/Positive-Negative-Momentum) | | [cite](https://github.com/zeke-xie/Positive-Negative-Momentum#citing) | -| GC | *Gradient Centralization* | [github](https://github.com/Yonghongwei/Gradient-Centralization) | | [cite](https://github.com/Yonghongwei/Gradient-Centralization#citation) | -| AGC | *Adaptive Gradient Clipping* | [github](https://github.com/deepmind/deepmind-research/tree/master/nfnets) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210206171B/exportcitation) | -| Stable WD | *Understanding and Scheduling Weight Decay* | [github](https://github.com/zeke-xie/stable-weight-decay-regularization) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201111152X/exportcitation) | -| Softplus T | *Calibrating the Adaptive Learning Rate to Improve Convergence of ADAM* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190800700T/exportcitation) | -| Un-tuned w/u | *On the adequacy of untuned warmup for adaptive optimization* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv191004209M/exportcitation) | -| Norm Loss | *An efficient yet effective regularization method for deep neural networks* | | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210306583G/exportcitation) | -| AdaShift | *Decorrelation and Convergence of Adaptive Learning Rate Methods* | [github](https://github.com/MichaelKonobeev/adashift) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv181000143Z/exportcitation) | -| AdaDelta | *An Adaptive Learning Rate Method* | | | [cite](https://ui.adsabs.harvard.edu/abs/2012arXiv1212.5701Z/exportcitation) | -| Amos | *An Adam-style Optimizer with Adaptive Weight Decay towards Model-Oriented Scale* | [github](https://github.com/google-research/jestimator) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv221011693T/exportcitation) | -| SignSGD | *Compressed Optimisation for Non-Convex Problems* | [github](https://github.com/jxbz/signSGD) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180204434B/exportcitation) | -| Sophia | *A Scalable Stochastic Second-order Optimizer for Language Model Pre-training* | [github](https://github.com/Liuhong99/Sophia) | | [cite](https://github.com/Liuhong99/Sophia) | -| Prodigy | *An Expeditiously Adaptive Parameter-Free Learner* | [github](https://github.com/konstmish/prodigy) | | [cite](https://github.com/konstmish/prodigy#how-to-cite) | -| PAdam | *Closing the Generalization Gap of Adaptive Gradient Methods in Training Deep Neural Networks* | [github](https://github.com/uclaml/Padam) | | [cite](https://github.com/uclaml/Padam#citation) | -| LOMO | *Full Parameter Fine-tuning for Large Language Models with Limited Resources* | [github](https://github.com/OpenLMLab/LOMO) | | [cite](https://github.com/OpenLMLab/LOMO#citation) | -| Tiger | *A Tight-fisted Optimizer, an optimizer that is extremely budget-conscious* | [github](https://github.com/bojone/tiger) | | [cite](https://github.com/bojone/tiger/blob/main/README_en.md#citation) | -| CAME | *Confidence-guided Adaptive Memory Efficient Optimization* | [github](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/CAME) | | [cite](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/CAME#citation) | -| WSAM | *Sharpness-Aware Minimization Revisited: Weighted Sharpness as a Regularization Term* | 
[github](https://github.com/intelligent-machine-learning/dlrover/blob/master/atorch/atorch/optimizers/wsam.py) | | [cite](https://github.com/intelligent-machine-learning/dlrover) | -| Aida | *A DNN Optimizer that Improves over AdaBelief by Suppression of the Adaptive Stepsize Range* | [github](https://github.com/guoqiang-zhang-x/Aida-Optimizer) | | [cite](https://github.com/guoqiang-zhang-x/Aida-Optimizer?tab=readme-ov-file#1-brief-description-of-aida) | -| GaLore | *Memory-Efficient LLM Training by Gradient Low-Rank Projection* | [github](https://github.com/jiaweizzhao/GaLore) | | [cite](https://github.com/jiaweizzhao/GaLore/tree/master?tab=readme-ov-file#citation) | -| Adalite | *Adalite optimizer* | [github](https://github.com/VatsaDev/adalite) | | [cite](https://github.com/VatsaDev/adalite) | -| bSAM | *SAM as an Optimal Relaxation of Bayes* | [github](https://github.com/team-approx-bayes/bayesian-sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv221001620M/exportcitation) | +| Optimizer | Description | Official Code | Paper | Citation | +|---------------|---------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------| +| AdaBelief | *Adapting Step-sizes by the Belief in Observed Gradients* | [github](https://github.com/juntang-zhuang/Adabelief-Optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201007468Z/exportcitation) | +| AdaBound | *Adaptive Gradient Methods with Dynamic Bound of Learning Rate* | [github](https://github.com/Luolc/AdaBound/blob/master/adabound/adabound.py) | | [cite](https://github.com/Luolc/AdaBound#citing) | +| AdaHessian | *An Adaptive Second Order Optimizer for Machine Learning* | [github](https://github.com/amirgholami/adahessian) | | [cite](https://github.com/amirgholami/adahessian#citation) | +| AdamD | *Improved bias-correction in Adam* | | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv211010828S/exportcitation) | +| AdamP | *Slowing Down the Slowdown for Momentum Optimizers on Scale-invariant Weights* | [github](https://github.com/clovaai/AdamP) | | [cite](https://github.com/clovaai/AdamP#how-to-cite) | +| diffGrad | *An Optimization Method for Convolutional Neural Networks* | [github](https://github.com/shivram1987/diffGrad) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190911015D/exportcitation) | +| MADGRAD | *A Momentumized, Adaptive, Dual Averaged Gradient Method for Stochastic* | [github](https://github.com/facebookresearch/madgrad) | | [cite](https://github.com/facebookresearch/madgrad#tech-report) | +| RAdam | *On the Variance of the Adaptive Learning Rate and Beyond* | [github](https://github.com/LiyuanLucasLiu/RAdam) | | [cite](https://github.com/LiyuanLucasLiu/RAdam#citation) | +| Ranger | *a synergistic optimizer combining RAdam and LookAhead, and now GC in one optimizer* | [github](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer) | | [cite](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer#citing-this-work) | +| Ranger21 | *a synergistic deep learning optimizer* | [github](https://github.com/lessw2020/Ranger21) | | [cite](https://github.com/lessw2020/Ranger21#referencing-this-work) | +| Lamb | *Large Batch Optimization for Deep 
Learning* | [github](https://github.com/cybertronai/pytorch-lamb) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190400962Y/exportcitation) | +| Shampoo | *Preconditioned Stochastic Tensor Optimization* | [github](https://github.com/moskomule/shampoo.pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180209568G/exportcitation) | +| Nero | *Learning by Turning: Neural Architecture Aware Optimisation* | [github](https://github.com/jxbz/nero) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210207227L/exportcitation) | +| Adan | *Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models* | [github](https://github.com/sail-sg/Adan) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220806677X/exportcitation) | +| Adai | *Disentangling the Effects of Adaptive Learning Rate and Momentum* | [github](https://github.com/zeke-xie/adaptive-inertia-adai) | | [cite](https://github.com/zeke-xie/adaptive-inertia-adai#citing) | +| SAM | *Sharpness-Aware Minimization* | [github](https://github.com/davda54/sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201001412F/exportcitation) | +| ASAM | *Adaptive Sharpness-Aware Minimization* | [github](https://github.com/davda54/sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210211600K/exportcitation) | +| GSAM | *Surrogate Gap Guided Sharpness-Aware Minimization* | [github](https://github.com/juntang-zhuang/GSAM) | | [cite](https://github.com/juntang-zhuang/GSAM#citation) | +| D-Adaptation | *Learning-Rate-Free Learning by D-Adaptation* | [github](https://github.com/facebookresearch/dadaptation) | | [cite](https://ui.adsabs.harvard.edu/abs/2023arXiv230107733D/exportcitation) | +| AdaFactor | *Adaptive Learning Rates with Sublinear Memory Cost* | [github](https://github.com/DeadAt0m/adafactor-pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180404235S/exportcitation) | +| Apollo | *An Adaptive Parameter-wise Diagonal Quasi-Newton Method for Nonconvex Stochastic Optimization* | [github](https://github.com/XuezheMax/apollo) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv200913586M/exportcitation) | +| NovoGrad | *Stochastic Gradient Methods with Layer-wise Adaptive Moments for Training of Deep Networks* | [github](https://github.com/lonePatient/NovoGrad-pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190511286G/exportcitation) | +| Lion | *Symbolic Discovery of Optimization Algorithms* | [github](https://github.com/google/automl/tree/master/lion) | | [cite](https://github.com/google/automl/tree/master/lion#citation) | +| Ali-G | *Adaptive Learning Rates for Interpolation with Gradients* | [github](https://github.com/oval-group/ali-g) | | [cite](https://github.com/oval-group/ali-g#adaptive-learning-rates-for-interpolation-with-gradients) | +| SM3 | *Memory-Efficient Adaptive Optimization* | [github](https://github.com/google-research/google-research/tree/master/sm3) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190111150A/exportcitation) | +| AdaNorm | *Adaptive Gradient Norm Correction based Optimizer for CNNs* | [github](https://github.com/shivram1987/AdaNorm) | | [cite](https://github.com/shivram1987/AdaNorm/tree/main#citation) | +| RotoGrad | *Gradient Homogenization in Multitask Learning* | [github](https://github.com/adrianjav/rotograd) | | [cite](https://github.com/adrianjav/rotograd#citing) | +| A2Grad | *Optimal Adaptive and Accelerated Stochastic Gradient Descent* | [github](https://github.com/severilov/A2Grad_optimizer) | | 
[cite](https://ui.adsabs.harvard.edu/abs/2018arXiv181000553D/exportcitation) | +| AccSGD | *Accelerating Stochastic Gradient Descent For Least Squares Regression* | [github](https://github.com/rahulkidambi/AccSGD) | | [cite](https://github.com/rahulkidambi/AccSGD#citation) | +| SGDW | *Decoupled Weight Decay Regularization* | [github](https://github.com/loshchil/AdamW-and-SGDW) | | [cite](https://github.com/loshchil/AdamW-and-SGDW#contact) | +| ASGD | *Adaptive Gradient Descent without Descent* | [github](https://github.com/ymalitsky/adaptive_GD) | | [cite](https://github.com/ymalitsky/adaptive_GD#reference) | +| Yogi | *Adaptive Methods for Nonconvex Optimization* | | [NIPS 2018](https://papers.nips.cc/paper/8186-adaptive-methods-for-nonconvex-optimization) | [cite](https://proceedings.neurips.cc/paper_files/paper/2018/hash/90365351ccc7437a1309dc64e4db32a3-Abstract.html) | +| SWATS | *Improving Generalization Performance by Switching from Adam to SGD* | | | [cite](https://ui.adsabs.harvard.edu/abs/2017arXiv171207628S/exportcitation) | +| Fromage | *On the distance between two neural networks and the stability of learning* | [github](https://github.com/jxbz/fromage) | | [cite](https://github.com/jxbz/fromage#citation) | +| MSVAG | *Dissecting Adam: The Sign, Magnitude and Variance of Stochastic Gradients* | [github](https://github.com/lballes/msvag) | | [cite](https://github.com/lballes/msvag#citation) | +| AdaMod | *An Adaptive and Momental Bound Method for Stochastic Learning* | [github](https://github.com/lancopku/AdaMod) | | [cite](https://github.com/lancopku/AdaMod#citation) | +| AggMo | *Aggregated Momentum: Stability Through Passive Damping* | [github](https://github.com/AtheMathmo/AggMo) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180400325L/exportcitation) | +| QHAdam | *Quasi-hyperbolic momentum and Adam for deep learning* | [github](https://github.com/facebookresearch/qhoptim) | | [cite](https://github.com/facebookresearch/qhoptim#reference) | +| PID | *A PID Controller Approach for Stochastic Optimization of Deep Networks* | [github](https://github.com/tensorboy/PIDOptimizer) | [CVPR 18](http://www4.comp.polyu.edu.hk/~cslzhang/paper/CVPR18_PID.pdf) | [cite](https://github.com/tensorboy/PIDOptimizer#citation) | +| Gravity | *a Kinematic Approach on Optimization in Deep Learning* | [github](https://github.com/dariush-bahrami/gravity.optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210109192B/exportcitation) | +| AdaSmooth | *An Adaptive Learning Rate Method based on Effective Ratio* | | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220400825L/exportcitation) | +| SRMM | *Stochastic regularized majorization-minimization with weakly convex and multi-convex surrogates* | [github](https://github.com/HanbaekLyu/SRMM) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220101652L/exportcitation) | +| AvaGrad | *Domain-independent Dominance of Adaptive Methods* | [github](https://github.com/lolemacs/avagrad) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv191201823S/exportcitation) | +| PCGrad | *Gradient Surgery for Multi-Task Learning* | [github](https://github.com/tianheyu927/PCGrad) | | [cite](https://github.com/tianheyu927/PCGrad#reference) | +| AMSGrad | *On the Convergence of Adam and Beyond* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190409237R/exportcitation) | +| Lookahead | *k steps forward, 1 step back* | [github](https://github.com/pytorch/examples/tree/main/imagenet) | | 
[cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190708610Z/exportcitation) | +| PNM | *Manipulating Stochastic Gradient Noise to Improve Generalization* | [github](https://github.com/zeke-xie/Positive-Negative-Momentum) | | [cite](https://github.com/zeke-xie/Positive-Negative-Momentum#citing) | +| GC | *Gradient Centralization* | [github](https://github.com/Yonghongwei/Gradient-Centralization) | | [cite](https://github.com/Yonghongwei/Gradient-Centralization#citation) | +| AGC | *Adaptive Gradient Clipping* | [github](https://github.com/deepmind/deepmind-research/tree/master/nfnets) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210206171B/exportcitation) | +| Stable WD | *Understanding and Scheduling Weight Decay* | [github](https://github.com/zeke-xie/stable-weight-decay-regularization) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201111152X/exportcitation) | +| Softplus T | *Calibrating the Adaptive Learning Rate to Improve Convergence of ADAM* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190800700T/exportcitation) | +| Un-tuned w/u | *On the adequacy of untuned warmup for adaptive optimization* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv191004209M/exportcitation) | +| Norm Loss | *An efficient yet effective regularization method for deep neural networks* | | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210306583G/exportcitation) | +| AdaShift | *Decorrelation and Convergence of Adaptive Learning Rate Methods* | [github](https://github.com/MichaelKonobeev/adashift) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv181000143Z/exportcitation) | +| AdaDelta | *An Adaptive Learning Rate Method* | | | [cite](https://ui.adsabs.harvard.edu/abs/2012arXiv1212.5701Z/exportcitation) | +| Amos | *An Adam-style Optimizer with Adaptive Weight Decay towards Model-Oriented Scale* | [github](https://github.com/google-research/jestimator) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv221011693T/exportcitation) | +| SignSGD | *Compressed Optimisation for Non-Convex Problems* | [github](https://github.com/jxbz/signSGD) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180204434B/exportcitation) | +| Sophia | *A Scalable Stochastic Second-order Optimizer for Language Model Pre-training* | [github](https://github.com/Liuhong99/Sophia) | | [cite](https://github.com/Liuhong99/Sophia) | +| Prodigy | *An Expeditiously Adaptive Parameter-Free Learner* | [github](https://github.com/konstmish/prodigy) | | [cite](https://github.com/konstmish/prodigy#how-to-cite) | +| PAdam | *Closing the Generalization Gap of Adaptive Gradient Methods in Training Deep Neural Networks* | [github](https://github.com/uclaml/Padam) | | [cite](https://github.com/uclaml/Padam#citation) | +| LOMO | *Full Parameter Fine-tuning for Large Language Models with Limited Resources* | [github](https://github.com/OpenLMLab/LOMO) | | [cite](https://github.com/OpenLMLab/LOMO#citation) | +| Tiger | *A Tight-fisted Optimizer, an optimizer that is extremely budget-conscious* | [github](https://github.com/bojone/tiger) | | [cite](https://github.com/bojone/tiger/blob/main/README_en.md#citation) | +| CAME | *Confidence-guided Adaptive Memory Efficient Optimization* | [github](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/CAME) | | [cite](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/CAME#citation) | +| WSAM | *Sharpness-Aware Minimization Revisited: Weighted Sharpness as a Regularization Term* | 
[github](https://github.com/intelligent-machine-learning/dlrover/blob/master/atorch/atorch/optimizers/wsam.py) | | [cite](https://github.com/intelligent-machine-learning/dlrover) | +| Aida | *A DNN Optimizer that Improves over AdaBelief by Suppression of the Adaptive Stepsize Range* | [github](https://github.com/guoqiang-zhang-x/Aida-Optimizer) | | [cite](https://github.com/guoqiang-zhang-x/Aida-Optimizer?tab=readme-ov-file#1-brief-description-of-aida) | +| GaLore | *Memory-Efficient LLM Training by Gradient Low-Rank Projection* | [github](https://github.com/jiaweizzhao/GaLore) | | [cite](https://github.com/jiaweizzhao/GaLore/tree/master?tab=readme-ov-file#citation) | +| Adalite | *Adalite optimizer* | [github](https://github.com/VatsaDev/adalite) | | [cite](https://github.com/VatsaDev/adalite) | +| bSAM | *SAM as an Optimal Relaxation of Bayes* | [github](https://github.com/team-approx-bayes/bayesian-sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv221001620M/exportcitation) | +| Schedule-Free | *Schedule-Free Optimizers* | [github](https://github.com/facebookresearch/schedule_free) | | [cite](https://github.com/facebookresearch/schedule_free) | ## Supported LR Scheduler diff --git a/docs/index.md b/docs/index.md index a187669ff..a78dd76fa 100644 --- a/docs/index.md +++ b/docs/index.md @@ -10,7 +10,7 @@ **pytorch-optimizer** is optimizer & lr scheduler collections in PyTorch. I just re-implemented (speed & memory tweaks, plug-ins) the algorithm while based on the original paper. Also, It includes useful and practical optimization ideas. -Currently, **65 optimizers (+ `bitsandbytes`)**, **11 lr schedulers**, and **13 loss functions** are supported! +Currently, **67 optimizers (+ `bitsandbytes`)**, **11 lr schedulers**, and **13 loss functions** are supported! Highly inspired by [pytorch-optimizer](https://github.com/jettify/pytorch-optimizer). 
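A minimal usage sketch for the newly listed schedule-free optimizers; the toy model, data, and hyper-parameter values below are illustrative assumptions, and only the optimizer API itself (`ScheduleFreeAdamW` with its `train()`/`eval()` switching) comes from the implementation added later in this series.

```python
import torch

from pytorch_optimizer import ScheduleFreeAdamW

# sketch under assumptions: model, data, and hyper-parameters are toy placeholders
model = torch.nn.Linear(8, 1)
optimizer = ScheduleFreeAdamW(model.parameters(), lr=2.5e-3, warmup_steps=100)

optimizer.train()  # make sure parameters are in the training representation
for _ in range(10):
    x, y = torch.randn(16, 8), torch.randn(16, 1)
    loss = (model(x) - y).pow(2).mean()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

optimizer.eval()  # switch to the averaged weights before validation / checkpointing
```

Calling `eval()` moves the parameters onto the averaged iterate for validation or checkpointing, and `train()` switches them back before further updates.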
@@ -93,76 +93,77 @@ from pytorch_optimizer import get_supported_optimizers supported_optimizers = get_supported_optimizers() ``` -| Optimizer | Description | Official Code | Paper | Citation | -|--------------|---------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------| -| AdaBelief | *Adapting Step-sizes by the Belief in Observed Gradients* | [github](https://github.com/juntang-zhuang/Adabelief-Optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201007468Z/exportcitation) | -| AdaBound | *Adaptive Gradient Methods with Dynamic Bound of Learning Rate* | [github](https://github.com/Luolc/AdaBound/blob/master/adabound/adabound.py) | | [cite](https://github.com/Luolc/AdaBound#citing) | -| AdaHessian | *An Adaptive Second Order Optimizer for Machine Learning* | [github](https://github.com/amirgholami/adahessian) | | [cite](https://github.com/amirgholami/adahessian#citation) | -| AdamD | *Improved bias-correction in Adam* | | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv211010828S/exportcitation) | -| AdamP | *Slowing Down the Slowdown for Momentum Optimizers on Scale-invariant Weights* | [github](https://github.com/clovaai/AdamP) | | [cite](https://github.com/clovaai/AdamP#how-to-cite) | -| diffGrad | *An Optimization Method for Convolutional Neural Networks* | [github](https://github.com/shivram1987/diffGrad) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190911015D/exportcitation) | -| MADGRAD | *A Momentumized, Adaptive, Dual Averaged Gradient Method for Stochastic* | [github](https://github.com/facebookresearch/madgrad) | | [cite](https://github.com/facebookresearch/madgrad#tech-report) | -| RAdam | *On the Variance of the Adaptive Learning Rate and Beyond* | [github](https://github.com/LiyuanLucasLiu/RAdam) | | [cite](https://github.com/LiyuanLucasLiu/RAdam#citation) | -| Ranger | *a synergistic optimizer combining RAdam and LookAhead, and now GC in one optimizer* | [github](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer) | | [cite](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer#citing-this-work) | -| Ranger21 | *a synergistic deep learning optimizer* | [github](https://github.com/lessw2020/Ranger21) | | [cite](https://github.com/lessw2020/Ranger21#referencing-this-work) | -| Lamb | *Large Batch Optimization for Deep Learning* | [github](https://github.com/cybertronai/pytorch-lamb) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190400962Y/exportcitation) | -| Shampoo | *Preconditioned Stochastic Tensor Optimization* | [github](https://github.com/moskomule/shampoo.pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180209568G/exportcitation) | -| Nero | *Learning by Turning: Neural Architecture Aware Optimisation* | [github](https://github.com/jxbz/nero) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210207227L/exportcitation) | -| Adan | *Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models* | [github](https://github.com/sail-sg/Adan) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220806677X/exportcitation) | -| Adai | *Disentangling the Effects of Adaptive Learning Rate and Momentum* | 
[github](https://github.com/zeke-xie/adaptive-inertia-adai) | | [cite](https://github.com/zeke-xie/adaptive-inertia-adai#citing) | -| SAM | *Sharpness-Aware Minimization* | [github](https://github.com/davda54/sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201001412F/exportcitation) | -| ASAM | *Adaptive Sharpness-Aware Minimization* | [github](https://github.com/davda54/sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210211600K/exportcitation) | -| GSAM | *Surrogate Gap Guided Sharpness-Aware Minimization* | [github](https://github.com/juntang-zhuang/GSAM) | | [cite](https://github.com/juntang-zhuang/GSAM#citation) | -| D-Adaptation | *Learning-Rate-Free Learning by D-Adaptation* | [github](https://github.com/facebookresearch/dadaptation) | | [cite](https://ui.adsabs.harvard.edu/abs/2023arXiv230107733D/exportcitation) | -| AdaFactor | *Adaptive Learning Rates with Sublinear Memory Cost* | [github](https://github.com/DeadAt0m/adafactor-pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180404235S/exportcitation) | -| Apollo | *An Adaptive Parameter-wise Diagonal Quasi-Newton Method for Nonconvex Stochastic Optimization* | [github](https://github.com/XuezheMax/apollo) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv200913586M/exportcitation) | -| NovoGrad | *Stochastic Gradient Methods with Layer-wise Adaptive Moments for Training of Deep Networks* | [github](https://github.com/lonePatient/NovoGrad-pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190511286G/exportcitation) | -| Lion | *Symbolic Discovery of Optimization Algorithms* | [github](https://github.com/google/automl/tree/master/lion) | | [cite](https://github.com/google/automl/tree/master/lion#citation) | -| Ali-G | *Adaptive Learning Rates for Interpolation with Gradients* | [github](https://github.com/oval-group/ali-g) | | [cite](https://github.com/oval-group/ali-g#adaptive-learning-rates-for-interpolation-with-gradients) | -| SM3 | *Memory-Efficient Adaptive Optimization* | [github](https://github.com/google-research/google-research/tree/master/sm3) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190111150A/exportcitation) | -| AdaNorm | *Adaptive Gradient Norm Correction based Optimizer for CNNs* | [github](https://github.com/shivram1987/AdaNorm) | | [cite](https://github.com/shivram1987/AdaNorm/tree/main#citation) | -| RotoGrad | *Gradient Homogenization in Multitask Learning* | [github](https://github.com/adrianjav/rotograd) | | [cite](https://github.com/adrianjav/rotograd#citing) | -| A2Grad | *Optimal Adaptive and Accelerated Stochastic Gradient Descent* | [github](https://github.com/severilov/A2Grad_optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv181000553D/exportcitation) | -| AccSGD | *Accelerating Stochastic Gradient Descent For Least Squares Regression* | [github](https://github.com/rahulkidambi/AccSGD) | | [cite](https://github.com/rahulkidambi/AccSGD#citation) | -| SGDW | *Decoupled Weight Decay Regularization* | [github](https://github.com/loshchil/AdamW-and-SGDW) | | [cite](https://github.com/loshchil/AdamW-and-SGDW#contact) | -| ASGD | *Adaptive Gradient Descent without Descent* | [github](https://github.com/ymalitsky/adaptive_GD) | | [cite](https://github.com/ymalitsky/adaptive_GD#reference) | -| Yogi | *Adaptive Methods for Nonconvex Optimization* | | [NIPS 2018](https://papers.nips.cc/paper/8186-adaptive-methods-for-nonconvex-optimization) | 
[cite](https://proceedings.neurips.cc/paper_files/paper/2018/hash/90365351ccc7437a1309dc64e4db32a3-Abstract.html) | -| SWATS | *Improving Generalization Performance by Switching from Adam to SGD* | | | [cite](https://ui.adsabs.harvard.edu/abs/2017arXiv171207628S/exportcitation) | -| Fromage | *On the distance between two neural networks and the stability of learning* | [github](https://github.com/jxbz/fromage) | | [cite](https://github.com/jxbz/fromage#citation) | -| MSVAG | *Dissecting Adam: The Sign, Magnitude and Variance of Stochastic Gradients* | [github](https://github.com/lballes/msvag) | | [cite](https://github.com/lballes/msvag#citation) | -| AdaMod | *An Adaptive and Momental Bound Method for Stochastic Learning* | [github](https://github.com/lancopku/AdaMod) | | [cite](https://github.com/lancopku/AdaMod#citation) | -| AggMo | *Aggregated Momentum: Stability Through Passive Damping* | [github](https://github.com/AtheMathmo/AggMo) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180400325L/exportcitation) | -| QHAdam | *Quasi-hyperbolic momentum and Adam for deep learning* | [github](https://github.com/facebookresearch/qhoptim) | | [cite](https://github.com/facebookresearch/qhoptim#reference) | -| PID | *A PID Controller Approach for Stochastic Optimization of Deep Networks* | [github](https://github.com/tensorboy/PIDOptimizer) | [CVPR 18](http://www4.comp.polyu.edu.hk/~cslzhang/paper/CVPR18_PID.pdf) | [cite](https://github.com/tensorboy/PIDOptimizer#citation) | -| Gravity | *a Kinematic Approach on Optimization in Deep Learning* | [github](https://github.com/dariush-bahrami/gravity.optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210109192B/exportcitation) | -| AdaSmooth | *An Adaptive Learning Rate Method based on Effective Ratio* | | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220400825L/exportcitation) | -| SRMM | *Stochastic regularized majorization-minimization with weakly convex and multi-convex surrogates* | [github](https://github.com/HanbaekLyu/SRMM) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220101652L/exportcitation) | -| AvaGrad | *Domain-independent Dominance of Adaptive Methods* | [github](https://github.com/lolemacs/avagrad) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv191201823S/exportcitation) | -| PCGrad | *Gradient Surgery for Multi-Task Learning* | [github](https://github.com/tianheyu927/PCGrad) | | [cite](https://github.com/tianheyu927/PCGrad#reference) | -| AMSGrad | *On the Convergence of Adam and Beyond* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190409237R/exportcitation) | -| Lookahead | *k steps forward, 1 step back* | [github](https://github.com/pytorch/examples/tree/main/imagenet) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190708610Z/exportcitation) | -| PNM | *Manipulating Stochastic Gradient Noise to Improve Generalization* | [github](https://github.com/zeke-xie/Positive-Negative-Momentum) | | [cite](https://github.com/zeke-xie/Positive-Negative-Momentum#citing) | -| GC | *Gradient Centralization* | [github](https://github.com/Yonghongwei/Gradient-Centralization) | | [cite](https://github.com/Yonghongwei/Gradient-Centralization#citation) | -| AGC | *Adaptive Gradient Clipping* | [github](https://github.com/deepmind/deepmind-research/tree/master/nfnets) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210206171B/exportcitation) | -| Stable WD | *Understanding and Scheduling Weight Decay* | [github](https://github.com/zeke-xie/stable-weight-decay-regularization) | | 
[cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201111152X/exportcitation) | -| Softplus T | *Calibrating the Adaptive Learning Rate to Improve Convergence of ADAM* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190800700T/exportcitation) | -| Un-tuned w/u | *On the adequacy of untuned warmup for adaptive optimization* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv191004209M/exportcitation) | -| Norm Loss | *An efficient yet effective regularization method for deep neural networks* | | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210306583G/exportcitation) | -| AdaShift | *Decorrelation and Convergence of Adaptive Learning Rate Methods* | [github](https://github.com/MichaelKonobeev/adashift) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv181000143Z/exportcitation) | -| AdaDelta | *An Adaptive Learning Rate Method* | | | [cite](https://ui.adsabs.harvard.edu/abs/2012arXiv1212.5701Z/exportcitation) | -| Amos | *An Adam-style Optimizer with Adaptive Weight Decay towards Model-Oriented Scale* | [github](https://github.com/google-research/jestimator) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv221011693T/exportcitation) | -| SignSGD | *Compressed Optimisation for Non-Convex Problems* | [github](https://github.com/jxbz/signSGD) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180204434B/exportcitation) | -| Sophia | *A Scalable Stochastic Second-order Optimizer for Language Model Pre-training* | [github](https://github.com/Liuhong99/Sophia) | | [cite](https://github.com/Liuhong99/Sophia) | -| Prodigy | *An Expeditiously Adaptive Parameter-Free Learner* | [github](https://github.com/konstmish/prodigy) | | [cite](https://github.com/konstmish/prodigy#how-to-cite) | -| PAdam | *Closing the Generalization Gap of Adaptive Gradient Methods in Training Deep Neural Networks* | [github](https://github.com/uclaml/Padam) | | [cite](https://github.com/uclaml/Padam#citation) | -| LOMO | *Full Parameter Fine-tuning for Large Language Models with Limited Resources* | [github](https://github.com/OpenLMLab/LOMO) | | [cite](https://github.com/OpenLMLab/LOMO#citation) | -| Tiger | *A Tight-fisted Optimizer, an optimizer that is extremely budget-conscious* | [github](https://github.com/bojone/tiger) | | [cite](https://github.com/bojone/tiger/blob/main/README_en.md#citation) | -| CAME | *Confidence-guided Adaptive Memory Efficient Optimization* | [github](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/CAME) | | [cite](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/CAME#citation) | -| WSAM | *Sharpness-Aware Minimization Revisited: Weighted Sharpness as a Regularization Term* | [github](https://github.com/intelligent-machine-learning/dlrover/blob/master/atorch/atorch/optimizers/wsam.py) | | [cite](https://github.com/intelligent-machine-learning/dlrover) | -| Aida | *A DNN Optimizer that Improves over AdaBelief by Suppression of the Adaptive Stepsize Range* | [github](https://github.com/guoqiang-zhang-x/Aida-Optimizer) | | [cite](https://github.com/guoqiang-zhang-x/Aida-Optimizer?tab=readme-ov-file#1-brief-description-of-aida) | -| GaLore | *Memory-Efficient LLM Training by Gradient Low-Rank Projection* | [github](https://github.com/jiaweizzhao/GaLore) | | [cite](https://github.com/jiaweizzhao/GaLore/tree/master?tab=readme-ov-file#citation) | -| Adalite | *Adalite optimizer* | [github](https://github.com/VatsaDev/adalite) | | [cite](https://github.com/VatsaDev/adalite) | -| bSAM | *SAM as an Optimal Relaxation of Bayes* | 
[github](https://github.com/team-approx-bayes/bayesian-sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv221001620M/exportcitation) | +| Optimizer | Description | Official Code | Paper | Citation | +|---------------|---------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------| +| AdaBelief | *Adapting Step-sizes by the Belief in Observed Gradients* | [github](https://github.com/juntang-zhuang/Adabelief-Optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201007468Z/exportcitation) | +| AdaBound | *Adaptive Gradient Methods with Dynamic Bound of Learning Rate* | [github](https://github.com/Luolc/AdaBound/blob/master/adabound/adabound.py) | | [cite](https://github.com/Luolc/AdaBound#citing) | +| AdaHessian | *An Adaptive Second Order Optimizer for Machine Learning* | [github](https://github.com/amirgholami/adahessian) | | [cite](https://github.com/amirgholami/adahessian#citation) | +| AdamD | *Improved bias-correction in Adam* | | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv211010828S/exportcitation) | +| AdamP | *Slowing Down the Slowdown for Momentum Optimizers on Scale-invariant Weights* | [github](https://github.com/clovaai/AdamP) | | [cite](https://github.com/clovaai/AdamP#how-to-cite) | +| diffGrad | *An Optimization Method for Convolutional Neural Networks* | [github](https://github.com/shivram1987/diffGrad) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190911015D/exportcitation) | +| MADGRAD | *A Momentumized, Adaptive, Dual Averaged Gradient Method for Stochastic* | [github](https://github.com/facebookresearch/madgrad) | | [cite](https://github.com/facebookresearch/madgrad#tech-report) | +| RAdam | *On the Variance of the Adaptive Learning Rate and Beyond* | [github](https://github.com/LiyuanLucasLiu/RAdam) | | [cite](https://github.com/LiyuanLucasLiu/RAdam#citation) | +| Ranger | *a synergistic optimizer combining RAdam and LookAhead, and now GC in one optimizer* | [github](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer) | | [cite](https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer#citing-this-work) | +| Ranger21 | *a synergistic deep learning optimizer* | [github](https://github.com/lessw2020/Ranger21) | | [cite](https://github.com/lessw2020/Ranger21#referencing-this-work) | +| Lamb | *Large Batch Optimization for Deep Learning* | [github](https://github.com/cybertronai/pytorch-lamb) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190400962Y/exportcitation) | +| Shampoo | *Preconditioned Stochastic Tensor Optimization* | [github](https://github.com/moskomule/shampoo.pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180209568G/exportcitation) | +| Nero | *Learning by Turning: Neural Architecture Aware Optimisation* | [github](https://github.com/jxbz/nero) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210207227L/exportcitation) | +| Adan | *Adaptive Nesterov Momentum Algorithm for Faster Optimizing Deep Models* | [github](https://github.com/sail-sg/Adan) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220806677X/exportcitation) | +| Adai | *Disentangling the Effects of Adaptive Learning Rate and Momentum* | 
[github](https://github.com/zeke-xie/adaptive-inertia-adai) | | [cite](https://github.com/zeke-xie/adaptive-inertia-adai#citing) | +| SAM | *Sharpness-Aware Minimization* | [github](https://github.com/davda54/sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201001412F/exportcitation) | +| ASAM | *Adaptive Sharpness-Aware Minimization* | [github](https://github.com/davda54/sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210211600K/exportcitation) | +| GSAM | *Surrogate Gap Guided Sharpness-Aware Minimization* | [github](https://github.com/juntang-zhuang/GSAM) | | [cite](https://github.com/juntang-zhuang/GSAM#citation) | +| D-Adaptation | *Learning-Rate-Free Learning by D-Adaptation* | [github](https://github.com/facebookresearch/dadaptation) | | [cite](https://ui.adsabs.harvard.edu/abs/2023arXiv230107733D/exportcitation) | +| AdaFactor | *Adaptive Learning Rates with Sublinear Memory Cost* | [github](https://github.com/DeadAt0m/adafactor-pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180404235S/exportcitation) | +| Apollo | *An Adaptive Parameter-wise Diagonal Quasi-Newton Method for Nonconvex Stochastic Optimization* | [github](https://github.com/XuezheMax/apollo) | | [cite](https://ui.adsabs.harvard.edu/abs/2020arXiv200913586M/exportcitation) | +| NovoGrad | *Stochastic Gradient Methods with Layer-wise Adaptive Moments for Training of Deep Networks* | [github](https://github.com/lonePatient/NovoGrad-pytorch) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190511286G/exportcitation) | +| Lion | *Symbolic Discovery of Optimization Algorithms* | [github](https://github.com/google/automl/tree/master/lion) | | [cite](https://github.com/google/automl/tree/master/lion#citation) | +| Ali-G | *Adaptive Learning Rates for Interpolation with Gradients* | [github](https://github.com/oval-group/ali-g) | | [cite](https://github.com/oval-group/ali-g#adaptive-learning-rates-for-interpolation-with-gradients) | +| SM3 | *Memory-Efficient Adaptive Optimization* | [github](https://github.com/google-research/google-research/tree/master/sm3) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190111150A/exportcitation) | +| AdaNorm | *Adaptive Gradient Norm Correction based Optimizer for CNNs* | [github](https://github.com/shivram1987/AdaNorm) | | [cite](https://github.com/shivram1987/AdaNorm/tree/main#citation) | +| RotoGrad | *Gradient Homogenization in Multitask Learning* | [github](https://github.com/adrianjav/rotograd) | | [cite](https://github.com/adrianjav/rotograd#citing) | +| A2Grad | *Optimal Adaptive and Accelerated Stochastic Gradient Descent* | [github](https://github.com/severilov/A2Grad_optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv181000553D/exportcitation) | +| AccSGD | *Accelerating Stochastic Gradient Descent For Least Squares Regression* | [github](https://github.com/rahulkidambi/AccSGD) | | [cite](https://github.com/rahulkidambi/AccSGD#citation) | +| SGDW | *Decoupled Weight Decay Regularization* | [github](https://github.com/loshchil/AdamW-and-SGDW) | | [cite](https://github.com/loshchil/AdamW-and-SGDW#contact) | +| ASGD | *Adaptive Gradient Descent without Descent* | [github](https://github.com/ymalitsky/adaptive_GD) | | [cite](https://github.com/ymalitsky/adaptive_GD#reference) | +| Yogi | *Adaptive Methods for Nonconvex Optimization* | | [NIPS 2018](https://papers.nips.cc/paper/8186-adaptive-methods-for-nonconvex-optimization) | 
[cite](https://proceedings.neurips.cc/paper_files/paper/2018/hash/90365351ccc7437a1309dc64e4db32a3-Abstract.html) | +| SWATS | *Improving Generalization Performance by Switching from Adam to SGD* | | | [cite](https://ui.adsabs.harvard.edu/abs/2017arXiv171207628S/exportcitation) | +| Fromage | *On the distance between two neural networks and the stability of learning* | [github](https://github.com/jxbz/fromage) | | [cite](https://github.com/jxbz/fromage#citation) | +| MSVAG | *Dissecting Adam: The Sign, Magnitude and Variance of Stochastic Gradients* | [github](https://github.com/lballes/msvag) | | [cite](https://github.com/lballes/msvag#citation) | +| AdaMod | *An Adaptive and Momental Bound Method for Stochastic Learning* | [github](https://github.com/lancopku/AdaMod) | | [cite](https://github.com/lancopku/AdaMod#citation) | +| AggMo | *Aggregated Momentum: Stability Through Passive Damping* | [github](https://github.com/AtheMathmo/AggMo) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180400325L/exportcitation) | +| QHAdam | *Quasi-hyperbolic momentum and Adam for deep learning* | [github](https://github.com/facebookresearch/qhoptim) | | [cite](https://github.com/facebookresearch/qhoptim#reference) | +| PID | *A PID Controller Approach for Stochastic Optimization of Deep Networks* | [github](https://github.com/tensorboy/PIDOptimizer) | [CVPR 18](http://www4.comp.polyu.edu.hk/~cslzhang/paper/CVPR18_PID.pdf) | [cite](https://github.com/tensorboy/PIDOptimizer#citation) | +| Gravity | *a Kinematic Approach on Optimization in Deep Learning* | [github](https://github.com/dariush-bahrami/gravity.optimizer) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210109192B/exportcitation) | +| AdaSmooth | *An Adaptive Learning Rate Method based on Effective Ratio* | | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220400825L/exportcitation) | +| SRMM | *Stochastic regularized majorization-minimization with weakly convex and multi-convex surrogates* | [github](https://github.com/HanbaekLyu/SRMM) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220101652L/exportcitation) | +| AvaGrad | *Domain-independent Dominance of Adaptive Methods* | [github](https://github.com/lolemacs/avagrad) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv191201823S/exportcitation) | +| PCGrad | *Gradient Surgery for Multi-Task Learning* | [github](https://github.com/tianheyu927/PCGrad) | | [cite](https://github.com/tianheyu927/PCGrad#reference) | +| AMSGrad | *On the Convergence of Adam and Beyond* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190409237R/exportcitation) | +| Lookahead | *k steps forward, 1 step back* | [github](https://github.com/pytorch/examples/tree/main/imagenet) | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190708610Z/exportcitation) | +| PNM | *Manipulating Stochastic Gradient Noise to Improve Generalization* | [github](https://github.com/zeke-xie/Positive-Negative-Momentum) | | [cite](https://github.com/zeke-xie/Positive-Negative-Momentum#citing) | +| GC | *Gradient Centralization* | [github](https://github.com/Yonghongwei/Gradient-Centralization) | | [cite](https://github.com/Yonghongwei/Gradient-Centralization#citation) | +| AGC | *Adaptive Gradient Clipping* | [github](https://github.com/deepmind/deepmind-research/tree/master/nfnets) | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210206171B/exportcitation) | +| Stable WD | *Understanding and Scheduling Weight Decay* | [github](https://github.com/zeke-xie/stable-weight-decay-regularization) | | 
[cite](https://ui.adsabs.harvard.edu/abs/2020arXiv201111152X/exportcitation) | +| Softplus T | *Calibrating the Adaptive Learning Rate to Improve Convergence of ADAM* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv190800700T/exportcitation) | +| Un-tuned w/u | *On the adequacy of untuned warmup for adaptive optimization* | | | [cite](https://ui.adsabs.harvard.edu/abs/2019arXiv191004209M/exportcitation) | +| Norm Loss | *An efficient yet effective regularization method for deep neural networks* | | | [cite](https://ui.adsabs.harvard.edu/abs/2021arXiv210306583G/exportcitation) | +| AdaShift | *Decorrelation and Convergence of Adaptive Learning Rate Methods* | [github](https://github.com/MichaelKonobeev/adashift) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv181000143Z/exportcitation) | +| AdaDelta | *An Adaptive Learning Rate Method* | | | [cite](https://ui.adsabs.harvard.edu/abs/2012arXiv1212.5701Z/exportcitation) | +| Amos | *An Adam-style Optimizer with Adaptive Weight Decay towards Model-Oriented Scale* | [github](https://github.com/google-research/jestimator) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv221011693T/exportcitation) | +| SignSGD | *Compressed Optimisation for Non-Convex Problems* | [github](https://github.com/jxbz/signSGD) | | [cite](https://ui.adsabs.harvard.edu/abs/2018arXiv180204434B/exportcitation) | +| Sophia | *A Scalable Stochastic Second-order Optimizer for Language Model Pre-training* | [github](https://github.com/Liuhong99/Sophia) | | [cite](https://github.com/Liuhong99/Sophia) | +| Prodigy | *An Expeditiously Adaptive Parameter-Free Learner* | [github](https://github.com/konstmish/prodigy) | | [cite](https://github.com/konstmish/prodigy#how-to-cite) | +| PAdam | *Closing the Generalization Gap of Adaptive Gradient Methods in Training Deep Neural Networks* | [github](https://github.com/uclaml/Padam) | | [cite](https://github.com/uclaml/Padam#citation) | +| LOMO | *Full Parameter Fine-tuning for Large Language Models with Limited Resources* | [github](https://github.com/OpenLMLab/LOMO) | | [cite](https://github.com/OpenLMLab/LOMO#citation) | +| Tiger | *A Tight-fisted Optimizer, an optimizer that is extremely budget-conscious* | [github](https://github.com/bojone/tiger) | | [cite](https://github.com/bojone/tiger/blob/main/README_en.md#citation) | +| CAME | *Confidence-guided Adaptive Memory Efficient Optimization* | [github](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/CAME) | | [cite](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/CAME#citation) | +| WSAM | *Sharpness-Aware Minimization Revisited: Weighted Sharpness as a Regularization Term* | [github](https://github.com/intelligent-machine-learning/dlrover/blob/master/atorch/atorch/optimizers/wsam.py) | | [cite](https://github.com/intelligent-machine-learning/dlrover) | +| Aida | *A DNN Optimizer that Improves over AdaBelief by Suppression of the Adaptive Stepsize Range* | [github](https://github.com/guoqiang-zhang-x/Aida-Optimizer) | | [cite](https://github.com/guoqiang-zhang-x/Aida-Optimizer?tab=readme-ov-file#1-brief-description-of-aida) | +| GaLore | *Memory-Efficient LLM Training by Gradient Low-Rank Projection* | [github](https://github.com/jiaweizzhao/GaLore) | | [cite](https://github.com/jiaweizzhao/GaLore/tree/master?tab=readme-ov-file#citation) | +| Adalite | *Adalite optimizer* | [github](https://github.com/VatsaDev/adalite) | | [cite](https://github.com/VatsaDev/adalite) | +| bSAM | *SAM as an Optimal Relaxation of Bayes* | 
[github](https://github.com/team-approx-bayes/bayesian-sam) | | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv221001620M/exportcitation) | +| Schedule-Free | *Schedule-Free Optimizers* | [github](https://github.com/facebookresearch/schedule_free) | | [cite](https://github.com/facebookresearch/schedule_free) | ## Supported LR Scheduler diff --git a/docs/optimizer.md b/docs/optimizer.md index 0ea4a5049..604f811aa 100644 --- a/docs/optimizer.md +++ b/docs/optimizer.md @@ -240,6 +240,14 @@ :docstring: :members: +::: pytorch_optimizer.ScheduleFreeSGD + :docstring: + :members: + +::: pytorch_optimizer.ScheduleFreeAdamW + :docstring: + :members: + ::: pytorch_optimizer.AccSGD :docstring: :members: From f3e37d6671f195df3300d427159c5527529c8efc Mon Sep 17 00:00:00 2001 From: kozistr Date: Sun, 5 May 2024 16:48:52 +0900 Subject: [PATCH 16/28] chore: add ScheduleFree optimizers --- pyproject.toml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index eede72a0e..2a298d6e7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,10 +15,10 @@ keywords = [ "AdamP", "AdamS", "Adan", "AggMo", "Aida", "AliG", "Amos", "Apollo", "AvaGrad", "bSAM", "CAME", "DAdaptAdaGrad", "DAdaptAdam", "DAdaptAdan", "DAdaptSGD", "DAdaptLion", "DiffGrad", "Fromage", "GaLore", "Gravity", "GSAM", "LARS", "Lamb", "Lion", "LOMO", "Lookahead", "MADGRAD", "MSVAG", "Nero", "NovoGrad", "PAdam", "PCGrad", "PID", "PNM", - "Prodigy", "QHAdam", "QHM", "RAdam", "Ranger", "Ranger21", "RotoGrad", "SAM", "SGDP", "Shampoo", "ScalableShampoo", - "SGDW", "SignSGD", "SM3", "SopihaH", "SRMM", "SWATS", "Tiger", "WSAM", "Yogi", "BCE", "BCEFocal", "Focal", - "FocalCosine", "SoftF1", "Dice", "LDAM", "Jaccard", "Bi-Tempered", "Tversky", "FocalTversky", "LovaszHinge", - "bitsandbytes", + "Prodigy", "QHAdam", "QHM", "RAdam", "Ranger", "Ranger21", "RotoGrad", "SAM", "ScheduleFreeSGD", + "ScheduleFreeAdamW", "SGDP", "Shampoo", "ScalableShampoo", "SGDW", "SignSGD", "SM3", "SopihaH", "SRMM", "SWATS", + "Tiger", "WSAM", "Yogi", "BCE", "BCEFocal", "Focal", "FocalCosine", "SoftF1", "Dice", "LDAM", "Jaccard", + "Bi-Tempered", "Tversky", "FocalTversky", "LovaszHinge", "bitsandbytes", ] classifiers = [ "License :: OSI Approved :: Apache Software License", From 851c486d1938e0e66ad471f47147f62464aa7736 Mon Sep 17 00:00:00 2001 From: kozistr Date: Sun, 5 May 2024 16:49:01 +0900 Subject: [PATCH 17/28] feature: implement ScheduleFree optimizers --- pytorch_optimizer/optimizer/schedulefree.py | 310 ++++++++++++++++++++ 1 file changed, 310 insertions(+) create mode 100644 pytorch_optimizer/optimizer/schedulefree.py diff --git a/pytorch_optimizer/optimizer/schedulefree.py b/pytorch_optimizer/optimizer/schedulefree.py new file mode 100644 index 000000000..5d4546d06 --- /dev/null +++ b/pytorch_optimizer/optimizer/schedulefree.py @@ -0,0 +1,310 @@ +import math +from typing import List + +import torch +from torch.optim.optimizer import Optimizer + +from pytorch_optimizer.base.exception import NoSparseGradientError +from pytorch_optimizer.base.optimizer import BaseOptimizer +from pytorch_optimizer.base.types import BETAS, CLOSURE, DEFAULTS, LOSS, PARAMETERS + + +class ScheduleFreeSGD(Optimizer, BaseOptimizer): + r"""Schedule-Free SGD. + + :param params: PARAMETERS. iterable of parameters to optimize or dicts defining parameter groups. + :param lr: float. learning rate. + :param momentum: float. momentum factor, must be between 0 and 1 exclusive. + :param weight_decay: float. weight decay (L2 penalty). 
+ :param weight_decouple: bool. the optimizer uses decoupled weight decay as in AdamW. + :param fixed_decay: bool. fix weight decay. + :param r: float. use polynomial weighting in the average with power r. + :param weight_lr_power: float. during warmup, the weights in the average will be equal to lr raised to this power. + set to 0 for no weighting. + :param warmup_steps: int. enables a linear learning rate warmup. + :param eps: float. term added to the denominator to improve numerical stability. + """ + + def __init__( + self, + params: PARAMETERS, + lr: float = 1.0, + momentum: float = 0.9, + weight_decay: float = 0.0, + weight_decouple: bool = True, + fixed_decay: bool = False, + r: float = 0.0, + weight_lr_power: float = 2.0, + warmup_steps: int = 0, + eps: float = 1e-8, + ): + self.validate_learning_rate(lr) + self.validate_range(momentum, 'momentum', 0.0, 1.0, range_type='[]') + self.validate_non_negative(weight_decay, 'weight_decay') + self.validate_non_negative(eps, 'eps') + + defaults: DEFAULTS = { + 'lr': lr, + 'momentum': momentum, + 'weight_decay': weight_decay, + 'weight_decouple': weight_decouple, + 'fixed_decay': fixed_decay, + 'r': r, + 'weight_lr_power': weight_lr_power, + 'warmup_steps': warmup_steps, + 'eps': eps, + 'train_mode': True, + 'weight_sum': 0.0, + 'lr_max': -1.0, + } + super().__init__(params, defaults) + + self.base_lrs: List[float] = [group['lr'] for group in self.param_groups] + + def __str__(self) -> str: + return 'ScheduleFreeSGD' + + def eval(self): + for group in self.param_groups: + momentum = group['momentum'] + if group['train_mode']: + for p in group['params']: + state = self.state[p] + if 'z' in state: + p.lerp_(end=state['z'], weight=1.0 - 1.0 / momentum) + group['train_mode'] = False + + def train(self): + for group in self.param_groups: + momentum = group['momentum'] + if not group['train_mode']: + for p in group['params']: + state = self.state[p] + if 'z' in state: + p.lerp_(end=state['z'], weight=1.0 - momentum) + group['train_mode'] = True + + @torch.no_grad() + def reset(self): + for group in self.param_groups: + group['step'] = 0 + for p in group['params']: + state = self.state[p] + + state['z'] = p.clone() + + @torch.no_grad() + def step(self, closure: CLOSURE = None) -> LOSS: + loss: LOSS = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + + for group in self.param_groups: + if 'step' in group: + group['step'] += 1 + else: + group['step'] = 1 + + warmup_steps: int = group['warmup_steps'] + schedule: float = group['step'] / warmup_steps if group['step'] < warmup_steps else 1.0 + + momentum = group['momentum'] + + lr: float = group['lr'] * schedule + lr_max = group['lr_max'] = max(lr, group['lr_max']) + + weight = (group['step'] ** group['r']) * (lr_max ** group['weight_lr_power']) + weight_sum = group['weight_sum'] = group['weight_sum'] + weight + + checkpoint: float = weight / weight_sum if weight_sum != 0.0 else 0.0 + + for p in group['params']: + if p.grad is None: + continue + + grad = p.grad + if grad.is_sparse: + raise NoSparseGradientError(str(self)) + + state = self.state[p] + + if len(state) == 0: + state['z'] = p.clone() + + self.apply_weight_decay( + p=p, + grad=grad, + lr=lr, + weight_decay=group['weight_decay'], + weight_decouple=group['weight_decouple'], + fixed_decay=group['fixed_decay'], + ) + + z = state['z'] + + p.lerp_(z, weight=checkpoint) + p.add_(grad, alpha=lr * (momentum * (1.0 - checkpoint) - 1)) + + z.sub_(grad, alpha=lr) + + return loss + + +class ScheduleFreeAdamW(Optimizer, 
BaseOptimizer): + r"""Schedule-Free AdamW. + + :param params: PARAMETERS. iterable of parameters to optimize or dicts defining parameter groups. + :param lr: float. learning rate. + :param betas: BETAS. coefficients used for computing running averages of gradient and the squared hessian trace. + :param weight_decay: float. weight decay (L2 penalty). + :param weight_decouple: bool. the optimizer uses decoupled weight decay as in AdamW. + :param fixed_decay: bool. fix weight decay. + :param r: float. use polynomial weighting in the average with power r. + :param weight_lr_power: float. during warmup, the weights in the average will be equal to lr raised to this power. + set to 0 for no weighting. + :param warmup_steps: int. enables a linear learning rate warmup. + :param ams_bound: bool. whether to use the AMSBound variant. + :param eps: float. term added to the denominator to improve numerical stability. + """ + + def __init__( + self, + params: PARAMETERS, + lr: float = 2.5e-3, + betas: BETAS = (0.9, 0.999), + weight_decay: float = 0.0, + weight_decouple: bool = True, + fixed_decay: bool = False, + r: float = 0.0, + weight_lr_power: float = 2.0, + warmup_steps: int = 0, + ams_bound: bool = False, + eps: float = 1e-8, + ): + self.validate_learning_rate(lr) + self.validate_betas(betas) + self.validate_non_negative(weight_decay, 'weight_decay') + self.validate_non_negative(eps, 'eps') + + defaults: DEFAULTS = { + 'lr': lr, + 'betas': betas, + 'weight_decay': weight_decay, + 'weight_decouple': weight_decouple, + 'fixed_decay': fixed_decay, + 'r': r, + 'weight_lr_power': weight_lr_power, + 'warmup_steps': warmup_steps, + 'ams_bound': ams_bound, + 'eps': eps, + 'train_mode': True, + 'weight_sum': 0.0, + 'lr_max': -1.0, + } + super().__init__(params, defaults) + + self.base_lrs: List[float] = [group['lr'] for group in self.param_groups] + + def __str__(self) -> str: + return 'ScheduleFreeAdamW' + + def eval(self): + for group in self.param_groups: + beta1, _ = group['betas'] + if group['train_mode']: + for p in group['params']: + state = self.state[p] + if 'z' in state: + p.lerp_(end=state['z'], weight=1.0 - 1.0 / beta1) + group['train_mode'] = False + + def train(self): + for group in self.param_groups: + beta1, _ = group['betas'] + if not group['train_mode']: + for p in group['params']: + state = self.state[p] + if 'z' in state: + p.lerp_(end=state['z'], weight=1.0 - beta1) + group['train_mode'] = True + + @torch.no_grad() + def reset(self): + for group in self.param_groups: + group['step'] = 0 + for p in group['params']: + state = self.state[p] + + state['z'] = p.clone() + state['exp_avg_sq'] = torch.zeros_like(p) + + @torch.no_grad() + def step(self, closure: CLOSURE = None) -> LOSS: + loss: LOSS = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + + for group in self.param_groups: + if 'step' in group: + group['step'] += 1 + else: + group['step'] = 1 + + warmup_steps: int = group['warmup_steps'] + schedule: float = group['step'] / warmup_steps if group['step'] < warmup_steps else 1.0 + + beta1, beta2 = group['betas'] + + bias_correction2_sq: float = math.sqrt(1.0 - beta2 ** group['step']) + + lr: float = group['lr'] * schedule * bias_correction2_sq + lr_max = group['lr_max'] = max(lr, group['lr_max']) + + weight = (group['step'] ** group['r']) * (lr_max ** group['weight_lr_power']) + weight_sum = group['weight_sum'] = group['weight_sum'] + weight + + checkpoint: float = weight / weight_sum if weight_sum != 0.0 else 0.0 + + for p in group['params']: + if p.grad is 
None: + continue + + grad = p.grad + if grad.is_sparse: + raise NoSparseGradientError(str(self)) + + state = self.state[p] + + if len(state) == 0: + state['z'] = p.clone() + state['exp_avg_sq'] = torch.zeros_like(p) + + self.apply_weight_decay( + p=p, + grad=grad, + lr=lr, + weight_decay=group['weight_decay'], + weight_decouple=group['weight_decouple'], + fixed_decay=group['fixed_decay'], + ) + + z, exp_avg_sq = state['z'], state['exp_avg_sq'] + exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1.0 - beta2) + + de_nom = self.apply_ams_bound( + ams_bound=group['ams_bound'], + exp_avg_sq=exp_avg_sq, + max_exp_avg_sq=state.get('max_exp_avg_sq', None), + eps=group['eps'], + ) + + grad.div_(de_nom) + + p.lerp_(z, weight=checkpoint) + p.add_(grad, alpha=lr * (beta1 * (1.0 - checkpoint) - 1)) + + z.sub_(grad, alpha=lr) + + return loss From 11ecf092675dc2093cf4f0b13e97b3672901be05 Mon Sep 17 00:00:00 2001 From: kozistr Date: Sun, 5 May 2024 16:49:08 +0900 Subject: [PATCH 18/28] update: test_get_supported_optimizers --- tests/test_load_modules.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_load_modules.py b/tests/test_load_modules.py index 5d899ae5c..f4439498d 100644 --- a/tests/test_load_modules.py +++ b/tests/test_load_modules.py @@ -38,7 +38,7 @@ def test_load_lr_scheduler_invalid(invalid_lr_scheduler_names): def test_get_supported_optimizers(): - assert len(get_supported_optimizers()) == 64 + assert len(get_supported_optimizers()) == 66 def test_get_supported_lr_schedulers(): From 518e32d960f61447254ac5ea49db879b927dd5ad Mon Sep 17 00:00:00 2001 From: kozistr Date: Sun, 5 May 2024 16:49:14 +0900 Subject: [PATCH 19/28] update: optimizers --- pytorch_optimizer/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pytorch_optimizer/__init__.py b/pytorch_optimizer/__init__.py index ce489d12c..1420e5b4a 100644 --- a/pytorch_optimizer/__init__.py +++ b/pytorch_optimizer/__init__.py @@ -80,6 +80,7 @@ from pytorch_optimizer.optimizer.ranger21 import Ranger21 from pytorch_optimizer.optimizer.rotograd import RotoGrad from pytorch_optimizer.optimizer.sam import BSAM, GSAM, SAM, WSAM +from pytorch_optimizer.optimizer.schedulefree import ScheduleFreeAdamW, ScheduleFreeSGD from pytorch_optimizer.optimizer.sgd import ASGD, SGDW, AccSGD, SignSGD from pytorch_optimizer.optimizer.sgdp import SGDP from pytorch_optimizer.optimizer.shampoo import ScalableShampoo, Shampoo @@ -187,6 +188,8 @@ GaLore, Adalite, BSAM, + ScheduleFreeSGD, + ScheduleFreeAdamW, ] OPTIMIZERS: Dict[str, OPTIMIZER] = {str(optimizer.__name__).lower(): optimizer for optimizer in OPTIMIZER_LIST} From 0f2d6b725c0b4cad61f0f3a0b4f0900a37bfacc1 Mon Sep 17 00:00:00 2001 From: kozistr Date: Sun, 5 May 2024 16:53:52 +0900 Subject: [PATCH 20/28] fix: lerp --- pytorch_optimizer/optimizer/schedulefree.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pytorch_optimizer/optimizer/schedulefree.py b/pytorch_optimizer/optimizer/schedulefree.py index 5d4546d06..0898f531d 100644 --- a/pytorch_optimizer/optimizer/schedulefree.py +++ b/pytorch_optimizer/optimizer/schedulefree.py @@ -71,7 +71,7 @@ def eval(self): for p in group['params']: state = self.state[p] if 'z' in state: - p.lerp_(end=state['z'], weight=1.0 - 1.0 / momentum) + p.data.lerp_(end=state['z'], weight=1.0 - 1.0 / momentum) group['train_mode'] = False def train(self): @@ -81,7 +81,7 @@ def train(self): for p in group['params']: state = self.state[p] if 'z' in state: - p.lerp_(end=state['z'], weight=1.0 - momentum) + 
p.data.lerp_(end=state['z'], weight=1.0 - momentum) group['train_mode'] = True @torch.no_grad() @@ -216,7 +216,7 @@ def eval(self): for p in group['params']: state = self.state[p] if 'z' in state: - p.lerp_(end=state['z'], weight=1.0 - 1.0 / beta1) + p.data.lerp_(end=state['z'], weight=1.0 - 1.0 / beta1) group['train_mode'] = False def train(self): @@ -226,7 +226,7 @@ def train(self): for p in group['params']: state = self.state[p] if 'z' in state: - p.lerp_(end=state['z'], weight=1.0 - beta1) + p.data.lerp_(end=state['z'], weight=1.0 - beta1) group['train_mode'] = True @torch.no_grad() From 723f9a06b25ec0747f0aba8a6a9b5859e2e4416b Mon Sep 17 00:00:00 2001 From: kozistr Date: Sun, 5 May 2024 16:54:04 +0900 Subject: [PATCH 21/28] update: test_schedule_free_train_mode --- tests/constants.py | 5 +++++ tests/test_optimizers.py | 14 ++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/tests/constants.py b/tests/constants.py index d1572c195..65c0afb33 100644 --- a/tests/constants.py +++ b/tests/constants.py @@ -59,6 +59,8 @@ Ranger, Ranger21, ScalableShampoo, + ScheduleFreeAdamW, + ScheduleFreeSGD, Shampoo, SignSGD, SophiaH, @@ -124,6 +126,7 @@ 'galore', 'adalite', 'bsam', + 'schedulefreeadamw', ] VALID_LR_SCHEDULER_NAMES: List[str] = [ @@ -439,6 +442,8 @@ 5, ), (Adalite, {'lr': 1e0, 'weight_decay': 1e-3}, 5), + (ScheduleFreeSGD, {'lr': 1e0, 'weight_decay': 1e-3}, 5), + (ScheduleFreeAdamW, {'lr': 1e0, 'weight_decay': 1e-3}, 5), ] ADANORM_SUPPORTED_OPTIMIZERS: List[Tuple[Any, Dict[str, Union[float, bool, int]], int]] = [ (AdaBelief, {'lr': 5e-1, 'weight_decay': 1e-3, 'adanorm': True}, 10), diff --git a/tests/test_optimizers.py b/tests/test_optimizers.py index c543c7442..4b68b6e7d 100644 --- a/tests/test_optimizers.py +++ b/tests/test_optimizers.py @@ -594,3 +594,17 @@ def test_dynamic_scaler(): scaler = DynamicLossScaler(init_scale=2.0**15, scale_window=1, threshold=1e-2) scaler.decrease_loss_scale() scaler.update_scale(overflow=False) + + +def test_schedule_free_train_mode(): + param = simple_parameter(True) + + opt = load_optimizer('ScheduleFreeAdamW')([param]) + opt.reset() + opt.train() + opt.eval() + + opt = load_optimizer('ScheduleFreeSGD')([param]) + opt.reset() + opt.train() + opt.eval() From 00337606005916a0524f6b8e37d6dd9464ffb2ee Mon Sep 17 00:00:00 2001 From: kozistr Date: Sun, 5 May 2024 16:55:06 +0900 Subject: [PATCH 22/28] docs: v3.0.0 changelog --- docs/changelogs/v3.0.0.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/changelogs/v3.0.0.md b/docs/changelogs/v3.0.0.md index e37dc2557..86db0129b 100644 --- a/docs/changelogs/v3.0.0.md +++ b/docs/changelogs/v3.0.0.md @@ -15,6 +15,8 @@ Major version is updated! (`v2.12.0` -> `v3.0.0`) (#164) * Implement `Adalite` optimizer. (#225, #229) * Implement `bSAM` optimizer. (#212, #233) * [SAM as an Optimal Relaxation of Bayes](https://arxiv.org/abs/2210.01620) +* Implement `Schedule-Free` optimizer. 
(#230, #233) + * [Schedule-Free optimizers](https://github.com/facebookresearch/schedule_free) ### Fix From 8fa4cc4be966131f2ba1bb75984134185d029761 Mon Sep 17 00:00:00 2001 From: kozistr Date: Sun, 5 May 2024 16:57:32 +0900 Subject: [PATCH 23/28] update: test_schedule_free_train_mode --- tests/test_optimizers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_optimizers.py b/tests/test_optimizers.py index 4b68b6e7d..65063bc3b 100644 --- a/tests/test_optimizers.py +++ b/tests/test_optimizers.py @@ -601,10 +601,10 @@ def test_schedule_free_train_mode(): opt = load_optimizer('ScheduleFreeAdamW')([param]) opt.reset() - opt.train() opt.eval() + opt.train() opt = load_optimizer('ScheduleFreeSGD')([param]) opt.reset() - opt.train() opt.eval() + opt.train() From 416d91b80e6078ef019166cacda1418c941818df Mon Sep 17 00:00:00 2001 From: kozistr Date: Sun, 5 May 2024 16:59:54 +0900 Subject: [PATCH 24/28] update: test_reset --- tests/test_optimizers.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_optimizers.py b/tests/test_optimizers.py index 65063bc3b..bce91239a 100644 --- a/tests/test_optimizers.py +++ b/tests/test_optimizers.py @@ -462,13 +462,13 @@ def test_swats_sgd_phase(environment): opt.step() -@pytest.mark.parametrize('optimizer_config', OPTIMIZERS + ADANORM_SUPPORTED_OPTIMIZERS, ids=ids) +@pytest.mark.parametrize( + 'optimizer_config', OPTIMIZERS + ADANORM_SUPPORTED_OPTIMIZERS + [(BSAM, {'num_data': 1}, 1)], ids=ids +) def test_reset(optimizer_config): optimizer_class, config, _ = optimizer_config if optimizer_class.__name__ == 'Ranger21': config.update({'num_iterations': 1}) - elif optimizer_class.__name__ == 'BSAM': - config.update({'num_data': 1}) optimizer = optimizer_class([simple_parameter()], **config) optimizer.reset() From b585827b36c81215721b942c6e52c85811a4abaf Mon Sep 17 00:00:00 2001 From: kozistr Date: Sun, 5 May 2024 17:11:20 +0900 Subject: [PATCH 25/28] feature: implement reg_noise --- pytorch_optimizer/optimizer/utils.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/pytorch_optimizer/optimizer/utils.py b/pytorch_optimizer/optimizer/utils.py index 2539f99c9..ce172a0d4 100644 --- a/pytorch_optimizer/optimizer/utils.py +++ b/pytorch_optimizer/optimizer/utils.py @@ -278,3 +278,19 @@ def reduce_max_except_dim(x: torch.Tensor, dim: int) -> torch.Tensor: if d != dim: x = x.max(dim=d, keepdim=True).values return x + + +def reg_noise( + network1: nn.Module, network2: nn.Module, num_data: int, lr: float, eta: float = 8e-3, temperature: float = 1e-4 +) -> torch.Tensor | float: + reg_coef: float = 0.5 / (eta * num_data) + noise_coef: float = math.sqrt(2.0 / lr / num_data * temperature) + + loss = 0 + for param1, param2 in zip(network1.parameters(), network2.parameters(), strict=True): + reg = torch.sub(param1, param2).pow_(2) * reg_coef + noise1 = param1 * torch.randn_like(param1) * noise_coef + noise2 = param2 * torch.randn_like(param2) * noise_coef + loss += torch.sum(reg - noise1 - noise2) + + return loss From 65e1159912b9eb6221839f0c4896f2ba5b1c8fed Mon Sep 17 00:00:00 2001 From: kozistr Date: Sun, 5 May 2024 17:11:34 +0900 Subject: [PATCH 26/28] docs: EMCMC --- docs/changelogs/v3.0.0.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/changelogs/v3.0.0.md b/docs/changelogs/v3.0.0.md index 86db0129b..a705e3cbe 100644 --- a/docs/changelogs/v3.0.0.md +++ b/docs/changelogs/v3.0.0.md @@ -17,6 +17,8 @@ Major version is updated! 
(`v2.12.0` -> `v3.0.0`) (#164) * [SAM as an Optimal Relaxation of Bayes](https://arxiv.org/abs/2210.01620) * Implement `Schedule-Free` optimizer. (#230, #233) * [Schedule-Free optimizers](https://github.com/facebookresearch/schedule_free) +* Implement `EMCMC`. (#231, #233) + * [Entropy-MCMC: Sampling from flat basins with ease](https://www.semanticscholar.org/paper/Entropy-MCMC%3A-Sampling-from-Flat-Basins-with-Ease-Li-Zhang/fd95de3f24fc4f955a6fe5719d38d1d06136e0cd) ### Fix From 2da89167cb7352256e3adfb5144beb5752c2ed36 Mon Sep 17 00:00:00 2001 From: kozistr Date: Sun, 5 May 2024 17:17:48 +0900 Subject: [PATCH 27/28] update: test_emcmc --- tests/test_utils.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/test_utils.py b/tests/test_utils.py index a02355620..245c44276 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -22,6 +22,7 @@ neuron_norm, normalize_gradient, reduce_max_except_dim, + reg_noise, to_real, unit_norm, ) @@ -228,3 +229,13 @@ def test_max_reduce_except_dim(): x = torch.zeros((1, 1)) with pytest.raises(ValueError): reduce_max_except_dim(x, 3) + + +def test_emcmc(): + torch.random.manual_seed(42) + + network1 = Example() + network2 = Example() + + loss = reg_noise(network1, network2, int(5e4), 1e-1).detach().numpy() + np.testing.assert_almost_equal(loss, 0.0011383) From 8c8b821281f368e3b9759b4ff46bde96f2de67dd Mon Sep 17 00:00:00 2001 From: kozistr Date: Sun, 5 May 2024 17:20:33 +0900 Subject: [PATCH 28/28] docs: E-MCMC --- docs/util.md | 4 ++++ pytorch_optimizer/optimizer/utils.py | 11 +++++++++++ 2 files changed, 15 insertions(+) diff --git a/docs/util.md b/docs/util.md index 8091e6139..26228d266 100644 --- a/docs/util.md +++ b/docs/util.md @@ -84,6 +84,10 @@ :docstring: :members: +::: pytorch_optimizer.optimizer.utils.reg_noise + :docstring: + :members: + ## Newton methods ::: pytorch_optimizer.optimizer.shampoo_utils.power_iteration diff --git a/pytorch_optimizer/optimizer/utils.py b/pytorch_optimizer/optimizer/utils.py index ce172a0d4..7c943a74d 100644 --- a/pytorch_optimizer/optimizer/utils.py +++ b/pytorch_optimizer/optimizer/utils.py @@ -283,6 +283,17 @@ def reduce_max_except_dim(x: torch.Tensor, dim: int) -> torch.Tensor: def reg_noise( network1: nn.Module, network2: nn.Module, num_data: int, lr: float, eta: float = 8e-3, temperature: float = 1e-4 ) -> torch.Tensor | float: + r"""Entropy-MCMC: Sampling from flat basins with ease. + + usage: https://github.com/lblaoke/EMCMC/blob/master/exp/cifar10_emcmc.py + + :param network1: nn.Module. network. + :param network2: nn.Module. network. + :param num_data: int. number of training data. + :param lr: float. learning rate. + :param eta: float. eta. + :param temperature: float. temperature. + """ reg_coef: float = 0.5 / (eta * num_data) noise_coef: float = math.sqrt(2.0 / lr / num_data * temperature)
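Two of the additions in this series change how a training loop is written, so a couple of rough usage sketches may help. Schedule-free optimizers keep a separate averaged point for evaluation, so the parameters have to be switched with `optimizer.eval()` before validation or checkpointing and switched back with `optimizer.train()` afterwards; that is what the updated `test_schedule_free_train_mode` toggles. The loop below is a minimal, hypothetical sketch: the toy `nn.Linear` model, the random batch, and the hyper-parameters are placeholders rather than anything prescribed by the patches.

```python
import torch
from torch import nn
from torch.nn import functional as F

from pytorch_optimizer import ScheduleFreeAdamW

model = nn.Linear(8, 2)  # toy model (placeholder)
optimizer = ScheduleFreeAdamW(model.parameters(), lr=2.5e-3, warmup_steps=100)

x, y = torch.randn(64, 8), torch.randint(0, 2, (64,))  # random placeholder batch

optimizer.train()  # make sure the optimizer is in train mode before stepping
for _ in range(10):
    optimizer.zero_grad()
    F.cross_entropy(model(x), y).backward()
    optimizer.step()

optimizer.eval()  # swap in the averaged (evaluation) weights before validating
with torch.no_grad():
    val_loss = F.cross_entropy(model(x), y)
print(float(val_loss))

optimizer.train()  # swap back before resuming training
```

`reg_noise` only returns the Entropy-MCMC coupling-plus-noise term; the caller is expected to keep two copies of the network and add the term to the task loss before `backward()`, roughly as in the `cifar10_emcmc.py` script referenced in the docstring. The sketch below just evaluates the term on a duplicated toy network to show the call signature; the `nn.Sequential` network and the `num_data`/`lr` values are made up, and Python >= 3.10 is assumed because `reg_noise` as written relies on `zip(..., strict=True)` and the `torch.Tensor | float` annotation.

```python
import copy

from torch import nn

from pytorch_optimizer.optimizer.utils import reg_noise

# two structurally identical copies: the main network and the coupled auxiliary copy
model = nn.Sequential(nn.Linear(8, 16), nn.ReLU(), nn.Linear(16, 2))
proxy = copy.deepcopy(model)

# coupling + gradient-noise term for a notional dataset of 1,000 samples and lr=1e-1
term = reg_noise(model, proxy, num_data=1_000, lr=1e-1)
print(float(term))  # the copies start identical, so only the noise terms contribute
```

Since the two copies start out identical, the squared-difference coupling is zero and only the injected noise terms contribute; during training the coupling keeps the auxiliary copy close to the main network, while the noise terms add Gaussian noise to the gradients so the update behaves like approximate sampling.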