diff --git a/docs/changelogs/v3.1.2.md b/docs/changelogs/v3.1.2.md new file mode 100644 index 000000000..70cff0628 --- /dev/null +++ b/docs/changelogs/v3.1.2.md @@ -0,0 +1,5 @@ +## Change Log + +### Bug + +* Add `**kwargs` to the parameters for dummy placeholder. (#270, #271) diff --git a/pyproject.toml b/pyproject.toml index 8efa3ae5f..9aae7371d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -71,7 +71,7 @@ lint.select = [ ] lint.ignore = [ "B905", "D100", "D102", "D104", "D105", "D107", "D203", "D213", "D413", "PIE790", "PLR0912", "PLR0913", "PLR0915", - "PLR2004", "RUF013", "Q003", + "PLR2004", "RUF013", "Q003", "ARG002", ] lint.fixable = ["ALL"] lint.unfixable = ["F401"] diff --git a/pytorch_optimizer/lr_scheduler/rex.py b/pytorch_optimizer/lr_scheduler/rex.py index 2c6974911..5d7ad3d31 100644 --- a/pytorch_optimizer/lr_scheduler/rex.py +++ b/pytorch_optimizer/lr_scheduler/rex.py @@ -52,7 +52,7 @@ def get_linear_lr(self) -> float: return self.min_lr + (self.max_lr - self.min_lr) * ((1.0 - progress) / (1.0 - progress / 2.0)) - def step(self, epoch: Optional[int] = None) -> float: # noqa: ARG002 + def step(self, epoch: Optional[int] = None) -> float: value: float = self.get_linear_lr() self.step_t += 1 diff --git a/pytorch_optimizer/optimizer/a2grad.py b/pytorch_optimizer/optimizer/a2grad.py index 93d91adc9..6da2604f5 100644 --- a/pytorch_optimizer/optimizer/a2grad.py +++ b/pytorch_optimizer/optimizer/a2grad.py @@ -27,6 +27,7 @@ def __init__( lips: float = 10.0, rho: float = 0.5, variant: str = 'uni', + **kwargs, ): self.validate_learning_rate(lr) self.validate_non_negative(lips, 'lips') diff --git a/pytorch_optimizer/optimizer/adabelief.py b/pytorch_optimizer/optimizer/adabelief.py index 23a35a59e..f42cc34c8 100644 --- a/pytorch_optimizer/optimizer/adabelief.py +++ b/pytorch_optimizer/optimizer/adabelief.py @@ -42,6 +42,7 @@ def __init__( adanorm: bool = False, adam_debias: bool = False, eps: float = 1e-16, + **kwargs, ): self.validate_learning_rate(lr) self.validate_betas(betas) diff --git a/pytorch_optimizer/optimizer/adabound.py b/pytorch_optimizer/optimizer/adabound.py index 114ef1b14..6685904f5 100644 --- a/pytorch_optimizer/optimizer/adabound.py +++ b/pytorch_optimizer/optimizer/adabound.py @@ -37,6 +37,7 @@ def __init__( ams_bound: bool = False, adam_debias: bool = False, eps: float = 1e-8, + **kwargs, ): self.validate_learning_rate(lr) self.validate_betas(betas) diff --git a/pytorch_optimizer/optimizer/adadelta.py b/pytorch_optimizer/optimizer/adadelta.py index 75693a773..4b3dd79a5 100644 --- a/pytorch_optimizer/optimizer/adadelta.py +++ b/pytorch_optimizer/optimizer/adadelta.py @@ -26,6 +26,7 @@ def __init__( weight_decouple: bool = False, fixed_decay: bool = False, eps: float = 1e-6, + **kwargs, ): self.validate_learning_rate(lr) self.validate_range(rho, 'rho', 0.0, 1.0) diff --git a/pytorch_optimizer/optimizer/adafactor.py b/pytorch_optimizer/optimizer/adafactor.py index bd9ca02d6..fa2a26968 100644 --- a/pytorch_optimizer/optimizer/adafactor.py +++ b/pytorch_optimizer/optimizer/adafactor.py @@ -49,6 +49,7 @@ def __init__( eps1: float = 1e-30, eps2: float = 1e-3, momentum_dtype: torch.dtype = torch.bfloat16, + **kwargs, ): self.validate_learning_rate(lr) self.validate_betas(betas) diff --git a/pytorch_optimizer/optimizer/adahessian.py b/pytorch_optimizer/optimizer/adahessian.py index 284c10e43..4cce37ac5 100644 --- a/pytorch_optimizer/optimizer/adahessian.py +++ b/pytorch_optimizer/optimizer/adahessian.py @@ -40,6 +40,7 @@ def __init__( hessian_distribution: HUTCHINSON_G = 
'rademacher', adam_debias: bool = False, eps: float = 1e-16, + **kwargs, ): self.validate_learning_rate(lr) self.validate_betas(betas) diff --git a/pytorch_optimizer/optimizer/adai.py b/pytorch_optimizer/optimizer/adai.py index 607c1b4ec..c5b173bd2 100644 --- a/pytorch_optimizer/optimizer/adai.py +++ b/pytorch_optimizer/optimizer/adai.py @@ -35,6 +35,7 @@ def __init__( dampening: float = 1.0, use_gc: bool = False, eps: float = 1e-3, + **kwargs, ): self.validate_learning_rate(lr) self.validate_betas(betas) diff --git a/pytorch_optimizer/optimizer/adalite.py b/pytorch_optimizer/optimizer/adalite.py index 4047b4c66..cf43aa824 100644 --- a/pytorch_optimizer/optimizer/adalite.py +++ b/pytorch_optimizer/optimizer/adalite.py @@ -35,6 +35,7 @@ def __init__( tau: float = 1.0, eps1: float = 1e-6, eps2: float = 1e-10, + **kwargs, ): self.validate_learning_rate(lr) self.validate_betas(betas) diff --git a/pytorch_optimizer/optimizer/adam_mini.py b/pytorch_optimizer/optimizer/adam_mini.py index 3f6a308da..28e33484a 100644 --- a/pytorch_optimizer/optimizer/adam_mini.py +++ b/pytorch_optimizer/optimizer/adam_mini.py @@ -38,6 +38,7 @@ def __init__( num_heads: int = 32, num_query_groups: Optional[int] = None, eps: float = 1e-8, + **kwargs, ): self.validate_learning_rate(lr) self.validate_betas(betas) diff --git a/pytorch_optimizer/optimizer/adamax.py b/pytorch_optimizer/optimizer/adamax.py index bafae8109..b0579929b 100644 --- a/pytorch_optimizer/optimizer/adamax.py +++ b/pytorch_optimizer/optimizer/adamax.py @@ -32,6 +32,7 @@ def __init__( adanorm: bool = False, adam_debias: bool = False, eps: float = 1e-8, + **kwargs, ): self.validate_learning_rate(lr) self.validate_betas(betas) diff --git a/pytorch_optimizer/optimizer/adamg.py b/pytorch_optimizer/optimizer/adamg.py index 79689d03c..dd7e47670 100644 --- a/pytorch_optimizer/optimizer/adamg.py +++ b/pytorch_optimizer/optimizer/adamg.py @@ -32,6 +32,7 @@ def __init__( weight_decouple: bool = False, fixed_decay: bool = False, eps: float = 1e-8, + **kwargs, ): self.validate_learning_rate(lr) self.validate_betas(betas) diff --git a/pytorch_optimizer/optimizer/adamod.py b/pytorch_optimizer/optimizer/adamod.py index 6f30433dc..45781ebe9 100644 --- a/pytorch_optimizer/optimizer/adamod.py +++ b/pytorch_optimizer/optimizer/adamod.py @@ -31,6 +31,7 @@ def __init__( fixed_decay: bool = False, adam_debias: bool = False, eps: float = 1e-8, + **kwargs, ): self.validate_learning_rate(lr) self.validate_betas(betas) diff --git a/pytorch_optimizer/optimizer/adamp.py b/pytorch_optimizer/optimizer/adamp.py index 0adb79f16..ae9019b0e 100644 --- a/pytorch_optimizer/optimizer/adamp.py +++ b/pytorch_optimizer/optimizer/adamp.py @@ -45,6 +45,7 @@ def __init__( adanorm: bool = False, adam_debias: bool = False, eps: float = 1e-8, + **kwargs, ): self.validate_learning_rate(lr) self.validate_betas(betas) diff --git a/pytorch_optimizer/optimizer/adams.py b/pytorch_optimizer/optimizer/adams.py index 1b1717169..d5b907825 100644 --- a/pytorch_optimizer/optimizer/adams.py +++ b/pytorch_optimizer/optimizer/adams.py @@ -36,6 +36,7 @@ def __init__( adanorm: bool = False, adam_debias: bool = False, eps: float = 1e-8, + **kwargs, ): self.validate_learning_rate(lr) self.validate_betas(betas) diff --git a/pytorch_optimizer/optimizer/adamw.py b/pytorch_optimizer/optimizer/adamw.py index 9de9768f1..a99d6cbbc 100644 --- a/pytorch_optimizer/optimizer/adamw.py +++ b/pytorch_optimizer/optimizer/adamw.py @@ -29,6 +29,7 @@ def __init__( weight_decay: float = 1e-2, weight_decouple: bool = True, eps: float 
= 1e-8, + **kwargs, ): self.validate_learning_rate(lr) self.validate_betas(betas) diff --git a/pytorch_optimizer/optimizer/adan.py b/pytorch_optimizer/optimizer/adan.py index ebd4557e6..7a24c2d93 100644 --- a/pytorch_optimizer/optimizer/adan.py +++ b/pytorch_optimizer/optimizer/adan.py @@ -37,6 +37,7 @@ def __init__( r: float = 0.95, adanorm: bool = False, eps: float = 1e-8, + **kwargs, ): self.validate_learning_rate(lr) self.validate_betas(betas) diff --git a/pytorch_optimizer/optimizer/adanorm.py b/pytorch_optimizer/optimizer/adanorm.py index 9da5fc1a7..056890b4f 100644 --- a/pytorch_optimizer/optimizer/adanorm.py +++ b/pytorch_optimizer/optimizer/adanorm.py @@ -34,6 +34,7 @@ def __init__( ams_bound: bool = False, adam_debias: bool = False, eps: float = 1e-8, + **kwargs, ): self.validate_learning_rate(lr) self.validate_betas(betas) diff --git a/pytorch_optimizer/optimizer/adapnm.py b/pytorch_optimizer/optimizer/adapnm.py index 22389a6c5..153984f80 100644 --- a/pytorch_optimizer/optimizer/adapnm.py +++ b/pytorch_optimizer/optimizer/adapnm.py @@ -36,6 +36,7 @@ def __init__( adanorm: bool = False, adam_debias: bool = False, eps: float = 1e-8, + **kwargs, ): self.validate_learning_rate(lr) self.validate_betas(betas) diff --git a/pytorch_optimizer/optimizer/adashift.py b/pytorch_optimizer/optimizer/adashift.py index bdb44d841..bd8179d05 100644 --- a/pytorch_optimizer/optimizer/adashift.py +++ b/pytorch_optimizer/optimizer/adashift.py @@ -28,6 +28,7 @@ def __init__( keep_num: int = 10, reduce_func: Optional[Callable] = torch.max, eps: float = 1e-10, + **kwargs, ): self.validate_learning_rate(lr) self.validate_betas(betas) diff --git a/pytorch_optimizer/optimizer/adasmooth.py b/pytorch_optimizer/optimizer/adasmooth.py index 8d65b1a3c..3b28ba91e 100644 --- a/pytorch_optimizer/optimizer/adasmooth.py +++ b/pytorch_optimizer/optimizer/adasmooth.py @@ -26,6 +26,7 @@ def __init__( weight_decouple: bool = False, fixed_decay: bool = False, eps: float = 1e-6, + **kwargs, ): self.validate_learning_rate(lr) self.validate_betas(betas) diff --git a/pytorch_optimizer/optimizer/aggmo.py b/pytorch_optimizer/optimizer/aggmo.py index 1739e4f3c..a3c3b967c 100644 --- a/pytorch_optimizer/optimizer/aggmo.py +++ b/pytorch_optimizer/optimizer/aggmo.py @@ -24,6 +24,7 @@ def __init__( weight_decay: float = 0.0, weight_decouple: bool = False, fixed_decay: bool = False, + **kwargs, ): self.validate_learning_rate(lr) self.validate_betas(betas) diff --git a/pytorch_optimizer/optimizer/aida.py b/pytorch_optimizer/optimizer/aida.py index e04c89ad9..5d790946f 100644 --- a/pytorch_optimizer/optimizer/aida.py +++ b/pytorch_optimizer/optimizer/aida.py @@ -46,6 +46,7 @@ def __init__( adanorm: bool = False, adam_debias: bool = False, eps: float = 1e-8, + **kwargs, ): self.validate_learning_rate(lr) self.validate_betas(betas) diff --git a/pytorch_optimizer/optimizer/alig.py b/pytorch_optimizer/optimizer/alig.py index 75447924b..2931f35a2 100644 --- a/pytorch_optimizer/optimizer/alig.py +++ b/pytorch_optimizer/optimizer/alig.py @@ -25,6 +25,7 @@ def __init__( projection_fn: Optional[Callable] = None, momentum: float = 0.0, adjusted_momentum: bool = False, + **kwargs, ): self.validate_learning_rate(max_lr) self.validate_range(momentum, 'momentum', 0.0, 1.0) diff --git a/pytorch_optimizer/optimizer/amos.py b/pytorch_optimizer/optimizer/amos.py index 1b76f1721..1212815fb 100644 --- a/pytorch_optimizer/optimizer/amos.py +++ b/pytorch_optimizer/optimizer/amos.py @@ -31,6 +31,7 @@ def __init__( c_coef: float = 0.25, d_coef: float = 0.25, 
eps: float = 1e-18, + **kwargs, ): self.validate_learning_rate(lr) self.validate_range(momentum, 'momentum', 0.0, 1.0, range_type='[)') diff --git a/pytorch_optimizer/optimizer/apollo.py b/pytorch_optimizer/optimizer/apollo.py index 785eef28f..04ca47cf2 100644 --- a/pytorch_optimizer/optimizer/apollo.py +++ b/pytorch_optimizer/optimizer/apollo.py @@ -33,6 +33,7 @@ def __init__( weight_decay_type: str = 'l2', warmup_steps: int = 500, eps: float = 1e-4, + **kwargs, ): self.validate_learning_rate(lr) self.validate_range(beta, 'beta', 0.0, 1.0, range_type='[]') diff --git a/pytorch_optimizer/optimizer/avagrad.py b/pytorch_optimizer/optimizer/avagrad.py index 0b9f52dbf..f759e6ee1 100644 --- a/pytorch_optimizer/optimizer/avagrad.py +++ b/pytorch_optimizer/optimizer/avagrad.py @@ -30,6 +30,7 @@ def __init__( fixed_decay: bool = False, adam_debias: bool = False, eps: float = 1e-1, + **kwargs, ): self.validate_learning_rate(lr) self.validate_betas(betas) diff --git a/pytorch_optimizer/optimizer/came.py b/pytorch_optimizer/optimizer/came.py index 411fab23e..ce6308abe 100644 --- a/pytorch_optimizer/optimizer/came.py +++ b/pytorch_optimizer/optimizer/came.py @@ -35,6 +35,7 @@ def __init__( ams_bound: bool = False, eps1: float = 1e-30, eps2: float = 1e-16, + **kwargs, ): self.validate_learning_rate(lr) self.validate_betas(betas) diff --git a/pytorch_optimizer/optimizer/dadapt.py b/pytorch_optimizer/optimizer/dadapt.py index 31ab60eae..5001d8ef4 100644 --- a/pytorch_optimizer/optimizer/dadapt.py +++ b/pytorch_optimizer/optimizer/dadapt.py @@ -39,6 +39,7 @@ def __init__( weight_decouple: bool = False, fixed_decay: bool = False, eps: float = 0.0, + **kwargs, ): self.validate_learning_rate(lr) self.validate_range(momentum, 'momentum', 0.0, 1.0, range_type='[)') @@ -266,6 +267,7 @@ def __init__( fixed_decay: bool = False, bias_correction: bool = False, eps: float = 1e-8, + **kwargs, ): self.validate_learning_rate(lr) self.validate_betas(betas) @@ -423,6 +425,7 @@ def __init__( weight_decay: float = 0.0, weight_decouple: bool = False, fixed_decay: bool = False, + **kwargs, ): self.validate_learning_rate(lr) self.validate_range(momentum, 'momentum', 0.0, 1.0, range_type='[)') @@ -560,6 +563,7 @@ def __init__( d0: float = 1e-6, growth_rate: float = float('inf'), eps: float = 1e-8, + **kwargs, ): self.validate_learning_rate(lr) self.validate_betas(betas) @@ -721,6 +725,7 @@ def __init__( weight_decay: float = 0.0, weight_decouple: bool = False, fixed_decay: bool = False, + **kwargs, ): self.validate_learning_rate(lr) self.validate_betas(betas) diff --git a/pytorch_optimizer/optimizer/diffgrad.py b/pytorch_optimizer/optimizer/diffgrad.py index fd94295ef..da14d98e2 100644 --- a/pytorch_optimizer/optimizer/diffgrad.py +++ b/pytorch_optimizer/optimizer/diffgrad.py @@ -40,6 +40,7 @@ def __init__( adanorm: bool = False, adam_debias: bool = False, eps: float = 1e-8, + **kwargs, ): self.validate_learning_rate(lr) self.validate_betas(betas) diff --git a/pytorch_optimizer/optimizer/fadam.py b/pytorch_optimizer/optimizer/fadam.py index 91a89dc3a..747399846 100644 --- a/pytorch_optimizer/optimizer/fadam.py +++ b/pytorch_optimizer/optimizer/fadam.py @@ -30,6 +30,7 @@ def __init__( eps: float = 1e-8, momentum_dtype: torch.dtype = torch.float32, fim_dtype: torch.dtype = torch.float32, + **kwargs, ): self.validate_learning_rate(lr) self.validate_betas(betas) diff --git a/pytorch_optimizer/optimizer/fromage.py b/pytorch_optimizer/optimizer/fromage.py index a45677697..c14716549 100644 --- a/pytorch_optimizer/optimizer/fromage.py 
+++ b/pytorch_optimizer/optimizer/fromage.py @@ -22,7 +22,7 @@ class Fromage(BaseOptimizer): norms to lie within 2x their initial norms. This regularises the model class. """ - def __init__(self, params: PARAMETERS, lr: float = 1e-2, p_bound: Optional[float] = None): + def __init__(self, params: PARAMETERS, lr: float = 1e-2, p_bound: Optional[float] = None, **kwargs): self.validate_learning_rate(lr) self.p_bound = p_bound diff --git a/pytorch_optimizer/optimizer/galore.py b/pytorch_optimizer/optimizer/galore.py index c7ce06dc8..e05921199 100644 --- a/pytorch_optimizer/optimizer/galore.py +++ b/pytorch_optimizer/optimizer/galore.py @@ -21,7 +21,12 @@ class GaLoreProjector: """ def __init__( - self, rank: int = 128, update_proj_gap: int = 50, scale: float = 1.0, projection_type: PROJECTION_TYPE = 'std' + self, + rank: int = 128, + update_proj_gap: int = 50, + scale: float = 1.0, + projection_type: PROJECTION_TYPE = 'std', + **kwargs, ): self.rank = rank self.update_proj_gap = update_proj_gap diff --git a/pytorch_optimizer/optimizer/gravity.py b/pytorch_optimizer/optimizer/gravity.py index c7e7ed546..61e95669a 100644 --- a/pytorch_optimizer/optimizer/gravity.py +++ b/pytorch_optimizer/optimizer/gravity.py @@ -20,6 +20,7 @@ def __init__( lr: float = 1e-2, alpha: float = 0.01, beta: float = 0.9, + **kwargs, ): self.validate_learning_rate(lr) self.validate_range(alpha, 'alpha', 0.0, 1.0) diff --git a/pytorch_optimizer/optimizer/grokfast.py b/pytorch_optimizer/optimizer/grokfast.py index ad8dc0c43..ef733a0b3 100644 --- a/pytorch_optimizer/optimizer/grokfast.py +++ b/pytorch_optimizer/optimizer/grokfast.py @@ -128,6 +128,7 @@ def __init__( fixed_decay: bool = False, normalize_lr: bool = True, eps: float = 1e-8, + **kwargs, ): self.validate_learning_rate(lr) self.validate_betas(betas) diff --git a/pytorch_optimizer/optimizer/kate.py b/pytorch_optimizer/optimizer/kate.py index f006d1b4c..49fc1e308 100644 --- a/pytorch_optimizer/optimizer/kate.py +++ b/pytorch_optimizer/optimizer/kate.py @@ -26,6 +26,7 @@ def __init__( weight_decouple: bool = True, fixed_decay: bool = False, eps: float = 1e-8, + **kwargs, ): self.validate_learning_rate(lr) self.validate_range(delta, 'delta', 0.0, 1.0, '[)') diff --git a/pytorch_optimizer/optimizer/lamb.py b/pytorch_optimizer/optimizer/lamb.py index 0b4ffbd90..464218e5b 100644 --- a/pytorch_optimizer/optimizer/lamb.py +++ b/pytorch_optimizer/optimizer/lamb.py @@ -53,6 +53,7 @@ def __init__( adanorm: bool = False, adam_debias: bool = False, eps: float = 1e-6, + **kwargs, ): self.validate_learning_rate(lr) self.validate_betas(betas) diff --git a/pytorch_optimizer/optimizer/lars.py b/pytorch_optimizer/optimizer/lars.py index 32c7f4710..c3c979a86 100644 --- a/pytorch_optimizer/optimizer/lars.py +++ b/pytorch_optimizer/optimizer/lars.py @@ -26,6 +26,7 @@ def __init__( dampening: float = 0.0, trust_coefficient: float = 1e-3, nesterov: bool = False, + **kwargs, ): self.validate_learning_rate(lr) self.validate_non_negative(weight_decay, 'weight_decay') diff --git a/pytorch_optimizer/optimizer/lion.py b/pytorch_optimizer/optimizer/lion.py index 98f2faf0e..5665ad31c 100644 --- a/pytorch_optimizer/optimizer/lion.py +++ b/pytorch_optimizer/optimizer/lion.py @@ -31,6 +31,7 @@ def __init__( use_gc: bool = False, r: float = 0.95, adanorm: bool = False, + **kwargs, ): self.validate_learning_rate(lr) self.validate_betas(betas) diff --git a/pytorch_optimizer/optimizer/lomo.py b/pytorch_optimizer/optimizer/lomo.py index 63610c1b2..bfef3d89a 100644 --- 
a/pytorch_optimizer/optimizer/lomo.py +++ b/pytorch_optimizer/optimizer/lomo.py @@ -30,6 +30,7 @@ def __init__( lr: float = 1e-3, clip_grad_norm: Optional[float] = None, clip_grad_value: Optional[float] = None, + **kwargs, ): self.validate_learning_rate(lr) self.validate_non_negative(clip_grad_norm, 'clip_grad_norm') @@ -228,6 +229,7 @@ def __init__( clip_grad_value: Optional[float] = None, eps1: float = 1e-30, eps2: float = 1e-3, + **kwargs, ) -> None: # fmt: skip self.validate_learning_rate(lr) self.validate_non_negative(weight_decay, 'weight_decay') diff --git a/pytorch_optimizer/optimizer/lookahead.py b/pytorch_optimizer/optimizer/lookahead.py index 2291d50a2..63a15ee15 100644 --- a/pytorch_optimizer/optimizer/lookahead.py +++ b/pytorch_optimizer/optimizer/lookahead.py @@ -22,6 +22,7 @@ def __init__( k: int = 5, alpha: float = 0.5, pullback_momentum: str = 'none', + **kwargs, ) -> None: self.validate_positive(k, 'k') self.validate_range(alpha, 'alpha', 0.0, 1.0) diff --git a/pytorch_optimizer/optimizer/madgrad.py b/pytorch_optimizer/optimizer/madgrad.py index 52b8e12e9..743489e25 100644 --- a/pytorch_optimizer/optimizer/madgrad.py +++ b/pytorch_optimizer/optimizer/madgrad.py @@ -32,6 +32,7 @@ def __init__( weight_decay: float = 0.0, weight_decouple: bool = False, eps: float = 1e-6, + **kwargs, ): self.validate_learning_rate(lr) self.validate_non_negative(weight_decay, 'weight_decay') diff --git a/pytorch_optimizer/optimizer/msvag.py b/pytorch_optimizer/optimizer/msvag.py index 792d87eb8..1cf34efd2 100644 --- a/pytorch_optimizer/optimizer/msvag.py +++ b/pytorch_optimizer/optimizer/msvag.py @@ -13,7 +13,13 @@ class MSVAG(BaseOptimizer): :param beta: float. Moving average (momentum) constant (scalar tensor or float value). """ - def __init__(self, params: PARAMETERS, lr: float = 1e-2, beta: float = 0.9): + def __init__( + self, + params: PARAMETERS, + lr: float = 1e-2, + beta: float = 0.9, + **kwargs, + ): self.validate_learning_rate(lr) self.validate_range(beta, 'beta', 0.0, 1.0, range_type='[]') diff --git a/pytorch_optimizer/optimizer/nero.py b/pytorch_optimizer/optimizer/nero.py index 06e499863..bf4c594ca 100644 --- a/pytorch_optimizer/optimizer/nero.py +++ b/pytorch_optimizer/optimizer/nero.py @@ -17,7 +17,13 @@ class Nero(BaseOptimizer): """ def __init__( - self, params: PARAMETERS, lr: float = 0.01, beta: float = 0.999, constraints: bool = True, eps: float = 1e-8 + self, + params: PARAMETERS, + lr: float = 0.01, + beta: float = 0.999, + constraints: bool = True, + eps: float = 1e-8, + **kwargs, ): self.validate_learning_rate(lr) self.validate_range(beta, 'beta', 0.0, 1.0, range_type='[]') diff --git a/pytorch_optimizer/optimizer/novograd.py b/pytorch_optimizer/optimizer/novograd.py index f9aa937c5..9070eab73 100644 --- a/pytorch_optimizer/optimizer/novograd.py +++ b/pytorch_optimizer/optimizer/novograd.py @@ -32,6 +32,7 @@ def __init__( grad_averaging: bool = False, adam_debias: bool = False, eps: float = 1e-8, + **kwargs, ): self.validate_learning_rate(lr) self.validate_betas(betas) diff --git a/pytorch_optimizer/optimizer/padam.py b/pytorch_optimizer/optimizer/padam.py index c15d35f31..011323876 100644 --- a/pytorch_optimizer/optimizer/padam.py +++ b/pytorch_optimizer/optimizer/padam.py @@ -30,6 +30,7 @@ def __init__( weight_decouple: bool = False, fixed_decay: bool = False, eps: float = 1e-8, + **kwargs, ): self.validate_learning_rate(lr) self.validate_betas(betas) diff --git a/pytorch_optimizer/optimizer/pid.py b/pytorch_optimizer/optimizer/pid.py index b148a203c..c537c1b12 
100644 --- a/pytorch_optimizer/optimizer/pid.py +++ b/pytorch_optimizer/optimizer/pid.py @@ -30,6 +30,7 @@ def __init__( weight_decay: float = 0.0, weight_decouple: bool = False, fixed_decay: bool = False, + **kwargs, ): self.validate_learning_rate(lr) self.validate_range(momentum, 'momentum', 0.0, 1.0) diff --git a/pytorch_optimizer/optimizer/pnm.py b/pytorch_optimizer/optimizer/pnm.py index 6d16822a8..4cd0af044 100644 --- a/pytorch_optimizer/optimizer/pnm.py +++ b/pytorch_optimizer/optimizer/pnm.py @@ -28,6 +28,7 @@ def __init__( weight_decouple: bool = True, fixed_decay: bool = False, eps: float = 1e-8, + **kwargs, ): self.validate_learning_rate(lr) self.validate_betas(betas) diff --git a/pytorch_optimizer/optimizer/prodigy.py b/pytorch_optimizer/optimizer/prodigy.py index ba0bb7b06..97ac1a336 100644 --- a/pytorch_optimizer/optimizer/prodigy.py +++ b/pytorch_optimizer/optimizer/prodigy.py @@ -44,6 +44,7 @@ def __init__( bias_correction: bool = False, safeguard_warmup: bool = False, eps: float = 1e-8, + **kwargs, ): self.validate_learning_rate(lr) self.validate_betas((*betas, beta3)) diff --git a/pytorch_optimizer/optimizer/qhadam.py b/pytorch_optimizer/optimizer/qhadam.py index 88700be5d..69789c882 100644 --- a/pytorch_optimizer/optimizer/qhadam.py +++ b/pytorch_optimizer/optimizer/qhadam.py @@ -30,6 +30,7 @@ def __init__( weight_decouple: bool = False, fixed_decay: bool = False, eps: float = 1e-8, + **kwargs, ): self.validate_learning_rate(lr) self.validate_betas(betas) diff --git a/pytorch_optimizer/optimizer/qhm.py b/pytorch_optimizer/optimizer/qhm.py index 986edb3bd..2cf1d21c6 100644 --- a/pytorch_optimizer/optimizer/qhm.py +++ b/pytorch_optimizer/optimizer/qhm.py @@ -26,6 +26,7 @@ def __init__( weight_decay: float = 0.0, weight_decouple: bool = False, fixed_decay: bool = False, + **kwargs, ): self.validate_learning_rate(lr) self.validate_range(momentum, 'momentum', 0.0, 1.0) diff --git a/pytorch_optimizer/optimizer/radam.py b/pytorch_optimizer/optimizer/radam.py index 6243f7f95..3ec09dd5d 100644 --- a/pytorch_optimizer/optimizer/radam.py +++ b/pytorch_optimizer/optimizer/radam.py @@ -36,6 +36,7 @@ def __init__( adanorm: bool = False, adam_debias: bool = False, eps: float = 1e-8, + **kwargs, ): self.validate_learning_rate(lr) self.validate_betas(betas) diff --git a/pytorch_optimizer/optimizer/ranger.py b/pytorch_optimizer/optimizer/ranger.py index 021166977..472712615 100644 --- a/pytorch_optimizer/optimizer/ranger.py +++ b/pytorch_optimizer/optimizer/ranger.py @@ -43,6 +43,7 @@ def __init__( adanorm: bool = False, adam_debias: bool = False, eps: float = 1e-5, + **kwargs, ): self.validate_learning_rate(lr) self.validate_betas(betas) diff --git a/pytorch_optimizer/optimizer/ranger21.py b/pytorch_optimizer/optimizer/ranger21.py index db22febd3..b31f69bbe 100644 --- a/pytorch_optimizer/optimizer/ranger21.py +++ b/pytorch_optimizer/optimizer/ranger21.py @@ -82,6 +82,7 @@ def __init__( # pylint: disable=R0913 norm_loss_factor: float = 1e-4, adam_debias: bool = False, eps: float = 1e-8, + **kwargs, ): self.validate_learning_rate(lr) self.validate_learning_rate(warm_down_min_lr) diff --git a/pytorch_optimizer/optimizer/rotograd.py b/pytorch_optimizer/optimizer/rotograd.py index ce4be27d2..0d367d4a3 100644 --- a/pytorch_optimizer/optimizer/rotograd.py +++ b/pytorch_optimizer/optimizer/rotograd.py @@ -176,7 +176,7 @@ def __init__( backbone: nn.Module, heads: Sequence[nn.Module], latent_size: int, - *args, # noqa: ARG002 + *args, burn_in_period: int = 20, normalize_losses: bool = False, ): 
diff --git a/pytorch_optimizer/optimizer/schedulefree.py b/pytorch_optimizer/optimizer/schedulefree.py index aada3cfa0..9e2436e64 100644 --- a/pytorch_optimizer/optimizer/schedulefree.py +++ b/pytorch_optimizer/optimizer/schedulefree.py @@ -36,6 +36,7 @@ def __init__( weight_lr_power: float = 2.0, warmup_steps: int = 0, eps: float = 1e-8, + **kwargs, ): self.validate_learning_rate(lr) self.validate_range(momentum, 'momentum', 0.0, 1.0, range_type='[]') @@ -180,6 +181,7 @@ def __init__( warmup_steps: int = 0, ams_bound: bool = False, eps: float = 1e-8, + **kwargs, ): self.validate_learning_rate(lr) self.validate_betas(betas) diff --git a/pytorch_optimizer/optimizer/sgd.py b/pytorch_optimizer/optimizer/sgd.py index e94c67796..c3e668a86 100644 --- a/pytorch_optimizer/optimizer/sgd.py +++ b/pytorch_optimizer/optimizer/sgd.py @@ -27,6 +27,7 @@ def __init__( xi: float = 10.0, constant: float = 0.7, weight_decay: float = 0.0, + **kwargs, ): self.validate_learning_rate(lr) self.validate_non_negative(kappa, 'kappa') @@ -124,6 +125,7 @@ def __init__( weight_decouple: bool = True, dampening: float = 0.0, nesterov: bool = False, + **kwargs, ): self.validate_learning_rate(lr) self.validate_range(momentum, 'momentum', 0.0, 1.0) @@ -222,6 +224,7 @@ def __init__( theta: float = 1.0, dampening: float = 1.0, eps: float = 1e-5, + **kwargs, ): self.validate_learning_rate(lr) self.validate_non_negative(amplifier, 'amplifier') @@ -339,6 +342,7 @@ def __init__( momentum: float = 0.9, weight_decay: float = 0.0, weight_decouple: bool = True, + **kwargs, ): self.validate_learning_rate(lr) self.validate_range(momentum, 'beta', 0.0, 1.0) diff --git a/pytorch_optimizer/optimizer/sgdp.py b/pytorch_optimizer/optimizer/sgdp.py index 63cc54827..35691dbbb 100644 --- a/pytorch_optimizer/optimizer/sgdp.py +++ b/pytorch_optimizer/optimizer/sgdp.py @@ -36,6 +36,7 @@ def __init__( wd_ratio: float = 0.1, nesterov: bool = False, eps: float = 1e-8, + **kwargs, ): self.validate_learning_rate(lr) self.validate_non_negative(weight_decay, 'weight_decay') diff --git a/pytorch_optimizer/optimizer/shampoo.py b/pytorch_optimizer/optimizer/shampoo.py index d1c08ba2b..f3f37c4e4 100644 --- a/pytorch_optimizer/optimizer/shampoo.py +++ b/pytorch_optimizer/optimizer/shampoo.py @@ -36,6 +36,7 @@ def __init__( fixed_decay: bool = False, preconditioning_compute_steps: int = 1, matrix_eps: float = 1e-6, + **kwargs, ): self.validate_learning_rate(lr) self.validate_range(momentum, 'momentum', 0.0, 1.0) @@ -211,6 +212,7 @@ def __init__( diagonal_eps: float = 1e-10, matrix_eps: float = 1e-6, use_svd: bool = False, + **kwargs, ): self.validate_learning_rate(lr) self.validate_betas(betas) diff --git a/pytorch_optimizer/optimizer/shampoo_utils.py b/pytorch_optimizer/optimizer/shampoo_utils.py index cb9b9ec1a..cb4a7884b 100644 --- a/pytorch_optimizer/optimizer/shampoo_utils.py +++ b/pytorch_optimizer/optimizer/shampoo_utils.py @@ -37,7 +37,7 @@ def precondition_gradient(self, grad: torch.Tensor) -> torch.Tensor: r"""Get preconditioned gradient.""" return grad - def update_momentum(self, update: torch.Tensor, unused_beta1: float) -> torch.Tensor: # noqa: ARG002 + def update_momentum(self, update: torch.Tensor, unused_beta1: float) -> torch.Tensor: r"""Update momentum.""" return update diff --git a/pytorch_optimizer/optimizer/sm3.py b/pytorch_optimizer/optimizer/sm3.py index 35b0c59ad..984d98103 100644 --- a/pytorch_optimizer/optimizer/sm3.py +++ b/pytorch_optimizer/optimizer/sm3.py @@ -23,6 +23,7 @@ def __init__( momentum: float = 0.0, beta: float = 0.0, 
eps: float = 1e-30, + **kwargs, ): self.validate_learning_rate(lr) self.validate_range(momentum, 'momentum', 0.0, 1.0) diff --git a/pytorch_optimizer/optimizer/sophia.py b/pytorch_optimizer/optimizer/sophia.py index c46c64bc8..cc0a5544c 100644 --- a/pytorch_optimizer/optimizer/sophia.py +++ b/pytorch_optimizer/optimizer/sophia.py @@ -38,6 +38,7 @@ def __init__( num_samples: int = 1, hessian_distribution: HUTCHINSON_G = 'gaussian', eps: float = 1e-12, + **kwargs, ): self.validate_learning_rate(lr) self.validate_betas(betas) diff --git a/pytorch_optimizer/optimizer/srmm.py b/pytorch_optimizer/optimizer/srmm.py index 5c73f9a5c..1669eba37 100644 --- a/pytorch_optimizer/optimizer/srmm.py +++ b/pytorch_optimizer/optimizer/srmm.py @@ -16,7 +16,14 @@ class SRMM(BaseOptimizer): :param memory_length: Optional[int]. internal memory length for moving average. None for no refreshing. """ - def __init__(self, params: PARAMETERS, lr: float = 0.01, beta: float = 0.5, memory_length: Optional[int] = 100): + def __init__( + self, + params: PARAMETERS, + lr: float = 0.01, + beta: float = 0.5, + memory_length: Optional[int] = 100, + **kwargs, + ): self.validate_learning_rate(lr) self.validate_range(beta, 'beta', 0.0, 1.0, range_type='[]') diff --git a/pytorch_optimizer/optimizer/swats.py b/pytorch_optimizer/optimizer/swats.py index 0f1705e2f..6ec5b16ec 100644 --- a/pytorch_optimizer/optimizer/swats.py +++ b/pytorch_optimizer/optimizer/swats.py @@ -38,6 +38,7 @@ def __init__( adanorm: bool = False, adam_debias: bool = False, eps: float = 1e-6, + **kwargs, ): self.validate_learning_rate(lr) self.validate_betas(betas) diff --git a/pytorch_optimizer/optimizer/tiger.py b/pytorch_optimizer/optimizer/tiger.py index 101efa71c..637a4ebc0 100644 --- a/pytorch_optimizer/optimizer/tiger.py +++ b/pytorch_optimizer/optimizer/tiger.py @@ -24,6 +24,7 @@ def __init__( weight_decay: float = 0.01, weight_decouple: bool = True, fixed_decay: bool = False, + **kwargs, ): self.validate_learning_rate(lr) self.validate_range(beta, 'beta', 0.0, 1.0, range_type='[)') diff --git a/pytorch_optimizer/optimizer/trac.py b/pytorch_optimizer/optimizer/trac.py index 7c13d70ac..8b51aa780 100644 --- a/pytorch_optimizer/optimizer/trac.py +++ b/pytorch_optimizer/optimizer/trac.py @@ -105,6 +105,7 @@ def __init__( num_coefs: int = 128, s_prev: float = 1e-8, eps: float = 1e-8, + **kwargs, ): self.validate_positive(num_coefs, 'num_coefs') self.validate_non_negative(s_prev, 's_prev') diff --git a/pytorch_optimizer/optimizer/yogi.py b/pytorch_optimizer/optimizer/yogi.py index 7964e6c8b..eef7455d8 100644 --- a/pytorch_optimizer/optimizer/yogi.py +++ b/pytorch_optimizer/optimizer/yogi.py @@ -36,6 +36,7 @@ def __init__( adanorm: bool = False, adam_debias: bool = False, eps: float = 1e-3, + **kwargs, ): self.validate_learning_rate(lr) self.validate_betas(betas) diff --git a/tests/test_create_optimizer.py b/tests/test_create_optimizer.py index 93a3a0e16..9daa047c2 100644 --- a/tests/test_create_optimizer.py +++ b/tests/test_create_optimizer.py @@ -1,15 +1,28 @@ import pytest from pytorch_optimizer import create_optimizer, load_optimizer +from tests.constants import VALID_OPTIMIZER_NAMES from tests.utils import LogisticRegression -def test_create_optimizer(): - model = LogisticRegression() - - create_optimizer(model, 'adamp', lr=1e-2, weight_decay=1e-3, use_gc=True, use_lookahead=True) - create_optimizer(model, 'alig', lr=1e-2, use_lookahead=True) - create_optimizer(model, 'adalomo', lr=1e-2, use_lookahead=False) +@pytest.mark.parametrize('use_lookahead', 
[True, False]) +@pytest.mark.parametrize('optimizer_name', VALID_OPTIMIZER_NAMES) +def test_create_optimizer(use_lookahead, optimizer_name): + if optimizer_name == 'adamw': + pytest.skip(f'skip {optimizer_name}') + + kwargs = {'eps': 1e-8, 'k': 7} + if optimizer_name == 'ranger21': + kwargs.update({'num_iterations': 1}) + elif optimizer_name == 'bsam': + kwargs.update({'num_data': 1}) + + create_optimizer( + LogisticRegression(), + optimizer_name=optimizer_name, + use_lookahead=use_lookahead, + **kwargs, + ) def test_bnb_optimizer():
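Note (illustrative, not part of the patch): the dummy `**kwargs` placeholder added to every optimizer constructor lets one shared keyword set be forwarded to any optimizer, which is what the parametrized `test_create_optimizer` above relies on, and it is also why `ARG002` (Ruff's unused-method-argument rule) is now ignored project-wide instead of per line. The sketch below mirrors the call pattern used in the test; the `shared` dict and the choice of `AdamP`/`Lion` are examples only, not new API.

```python
# Sketch only: effect of the `**kwargs` placeholder added in this patch.
# Unknown keys are silently swallowed instead of raising
# "TypeError: __init__() got an unexpected keyword argument".
import torch

from pytorch_optimizer import AdamP, Lion, create_optimizer

model = torch.nn.Linear(4, 2)

# `eps` means nothing to Lion and `k` only matters to the Lookahead wrapper,
# yet both constructors now accept the full dict.
shared = {'lr': 1e-3, 'eps': 1e-8, 'k': 7}

adamp = AdamP(model.parameters(), **shared)
lion = Lion(model.parameters(), **shared)

# `create_optimizer` forwards the extra keys to whichever optimizer it builds;
# the Lookahead wrapper, which also gained `**kwargs` in this patch, tolerates them too.
opt = create_optimizer(model, optimizer_name='adamp', use_lookahead=True, **shared)
```

The same pattern is what allows the new test to sweep every optimizer in `VALID_OPTIMIZER_NAMES` with a single kwargs dict, adding per-optimizer keys only where a constructor genuinely requires them (`num_iterations` for `ranger21`, `num_data` for `bsam`).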