Add (Prox-)SVRG as a solver for GLMs #184

Merged · Aug 15, 2024 · 198 commits

Commits
f1de528
Start working on the Prox-SVRG solver
bagibence Jun 21, 2024
ad45bc8
Adapt SVRG and ProxSVRG for use with nemos and allow their use in reg…
bagibence Jun 24, 2024
1c7a9d9
Make ProxSVRG inherit from SVRG and implement early stopping
bagibence Jun 24, 2024
7f5d27b
Speed up (Prox)SVRG with jit, scan, while_loop
bagibence Jun 25, 2024
ef1983d
Remove unnecessary jits
bagibence Jun 25, 2024
3bfa251
Make prox required
bagibence Jun 25, 2024
df751dd
Simplify from scan to fori_loop in update
bagibence Jun 25, 2024
50ea86f
Tiny changes (whitespace and tree_zeros_like)
bagibence Jun 25, 2024
f016f10
Save the initial naive implementation of (Prox)SVRG in solvers_naive
bagibence Jun 25, 2024
56f49a2
Add batch_size and type annotations for (Prox)SVRG
bagibence Jun 25, 2024
47429f9
Rename lr to stepsize in (Prox)SVRG
bagibence Jun 25, 2024
d09fc8d
Move stepsize into state, redifine error, adjust m based on batch_size
bagibence Jun 26, 2024
901adba
Add a (commented out) update method that implements just the inner lo…
bagibence Jun 26, 2024
ac10fa4
Change SVRG.update to update parameters on every data point
bagibence Jun 27, 2024
4b4d3b5
Change epoch_num back to iter_num
bagibence Jun 27, 2024
8905008
Pass args to SVRG.init_state to calculate N, xs, df_xs
bagibence Jun 27, 2024
32dcf6c
Separately define run and update in SVRG and ProxSVRG
bagibence Jun 27, 2024
eb520f1
Fix wrong argument passed to solver.init_state
bagibence Jun 27, 2024
a25e9af
Switch inheritance between SVRG and ProxSVRG
bagibence Jun 27, 2024
4e89a5f
Store the solver in the regularizer
bagibence Jun 27, 2024
372b233
Minor renaming and sanity check
bagibence Jun 27, 2024
34ec663
Remove stale and unreachable code
bagibence Jun 27, 2024
1c483d0
Update solvers to work with mini-batch updates
bagibence Jun 28, 2024
30c7e59
Add x_av to SVRGState and use it in the return value of ProxSVRG.update
bagibence Jun 28, 2024
a48fe7e
Log the loss after each update
bagibence Jun 28, 2024
fa95695
Move the loss logging into the run method
bagibence Jun 28, 2024
1b855e5
Have separate _update_per_batch and _update_per_point methods
bagibence Jul 2, 2024
ccd554e
Add batch_size again and have different update methods
bagibence Jul 8, 2024
811e48b
Use different method for run based on batch size
bagibence Jul 8, 2024
f1618ae
Potentially pre-generate random indices instead of one-by-one
bagibence Jul 10, 2024
f7825cc
Add docstrings and clean up ProxSVRG
bagibence Jul 11, 2024
efc507f
Change how solvers are found: first look in solvers, then in jaxopt
bagibence Jul 11, 2024
e489a15
Switch to new-style RNG keys and remove the uint32 conversion
bagibence Jul 11, 2024
f6b579a
Change default convergence tolerance in (Prox)SVRG
bagibence Jul 12, 2024
3bce956
Add yet another way of defining the error
bagibence Jul 12, 2024
75ec116
Add missing import and change the error calculation to the previous one
bagibence Jul 12, 2024
4b6064f
Make parameters explicit where possible and remove indexing into the …
bagibence Jul 12, 2024
85b366d
added regr testing
BalzaniEdoardo Jul 12, 2024
599c390
merged conflicts
BalzaniEdoardo Jul 12, 2024
7d73a39
improved test data generation
BalzaniEdoardo Jul 12, 2024
212753f
linted conftest.py
BalzaniEdoardo Jul 12, 2024
280221c
added test for tree utils
BalzaniEdoardo Jul 12, 2024
a911fbe
updated description of tree_slice
BalzaniEdoardo Jul 12, 2024
cce74f2
add context manager to test
BalzaniEdoardo Jul 12, 2024
b3c0bf3
remove unused imports
BalzaniEdoardo Jul 12, 2024
41d5d01
uniform srtepsize
BalzaniEdoardo Jul 12, 2024
5759d18
drop unused line
BalzaniEdoardo Jul 12, 2024
1f3d659
FIXED TOL
BalzaniEdoardo Jul 12, 2024
6faa685
Reformat with black
bagibence Jul 15, 2024
66c73ac
Initialize loss_log with zeros instead of empty and run isort
bagibence Jul 15, 2024
ff09dce
Fix flake8's complaints
bagibence Jul 17, 2024
ae99cfc
Add a few tests for SVRG
bagibence Jul 18, 2024
ccbd41f
Add integration tests for (Prox)SVRG
bagibence Jul 19, 2024
b9644bc
Increment iter_num after each xk_update and comment out loss logging
bagibence Jul 19, 2024
549fdfc
Revert the iter_num increment frequency
bagibence Jul 19, 2024
4e66c9e
Test that GLM.fit runs with (Prox)SVRG
bagibence Jul 19, 2024
7ba0a3b
Test that calls to SVRG.update with a naive implementation of the res…
bagibence Jul 19, 2024
0816f13
Remove asserts and update test
bagibence Jul 19, 2024
be9ea0f
Change iter_num incrementation again and update the test for it
bagibence Jul 19, 2024
ee7b6e4
Change iter_num again to mean number of epochs and adjust test
bagibence Jul 19, 2024
bb3a2bf
Copy tree_utils from jaxopt
bagibence Jul 19, 2024
cb626a1
Add test for SVRG._xk_update
bagibence Jul 19, 2024
13f7b43
Use tree_slice of y as well
bagibence Jul 22, 2024
d97bd9b
Update tests
bagibence Jul 22, 2024
12fd6b7
fixed pop glm proximal gradient
BalzaniEdoardo Jul 22, 2024
32f8b02
Switch from X,y to *args in ProxSVRG. Leave prox_lamba explicit
bagibence Jul 23, 2024
6306313
Add ProxSVRG._run to handle prox_lambda better
bagibence Jul 23, 2024
953ef6f
fixing prox_lasso
BalzaniEdoardo Jul 23, 2024
9b3eea3
use our own lasso
BalzaniEdoardo Jul 23, 2024
93961be
added test for prox_lasso
BalzaniEdoardo Jul 23, 2024
5a10d8f
Test SVRG._xk_update with our own prox_lasso instead of jaxopt's
bagibence Jul 23, 2024
2344895
Test that GLM.fit converges to the same as sklearn's PoissonRegressor…
bagibence Jul 23, 2024
a4f588c
Remove prints and spaces
bagibence Jul 23, 2024
7838aab
Test that GLM.update with a naive SVRG outer loop gives the same as G…
bagibence Jul 23, 2024
c4e1a7a
Start working on the Prox-SVRG solver
bagibence Jun 21, 2024
f4dbb0e
Adapt SVRG and ProxSVRG for use with nemos and allow their use in reg…
bagibence Jun 24, 2024
6549266
Make ProxSVRG inherit from SVRG and implement early stopping
bagibence Jun 24, 2024
763be7b
Speed up (Prox)SVRG with jit, scan, while_loop
bagibence Jun 25, 2024
c967fb0
Remove unnecessary jits
bagibence Jun 25, 2024
5bb47cf
Make prox required
bagibence Jun 25, 2024
33d6c7b
Simplify from scan to fori_loop in update
bagibence Jun 25, 2024
4e2393d
Tiny changes (whitespace and tree_zeros_like)
bagibence Jun 25, 2024
25c44b9
Save the initial naive implementation of (Prox)SVRG in solvers_naive
bagibence Jun 25, 2024
0574d93
Add batch_size and type annotations for (Prox)SVRG
bagibence Jun 25, 2024
8278f45
Rename lr to stepsize in (Prox)SVRG
bagibence Jun 25, 2024
d881321
Move stepsize into state, redifine error, adjust m based on batch_size
bagibence Jun 26, 2024
3c2d560
Add a (commented out) update method that implements just the inner lo…
bagibence Jun 26, 2024
4c5bf27
Change SVRG.update to update parameters on every data point
bagibence Jun 27, 2024
245bdaa
Change epoch_num back to iter_num
bagibence Jun 27, 2024
6553737
Pass args to SVRG.init_state to calculate N, xs, df_xs
bagibence Jun 27, 2024
5f629a7
Separately define run and update in SVRG and ProxSVRG
bagibence Jun 27, 2024
6bcdb3f
Switch inheritance between SVRG and ProxSVRG
bagibence Jun 27, 2024
b0d8b93
Minor renaming and sanity check
bagibence Jun 27, 2024
1f12c46
Remove stale and unreachable code
bagibence Jun 27, 2024
e0a5d3f
Update solvers to work with mini-batch updates
bagibence Jun 28, 2024
26481e9
Add x_av to SVRGState and use it in the return value of ProxSVRG.update
bagibence Jun 28, 2024
6850f7e
Log the loss after each update
bagibence Jun 28, 2024
7461caf
Move the loss logging into the run method
bagibence Jun 28, 2024
d9539d4
Have separate _update_per_batch and _update_per_point methods
bagibence Jul 2, 2024
e068b3b
Add batch_size again and have different update methods
bagibence Jul 8, 2024
e26af13
Use different method for run based on batch size
bagibence Jul 8, 2024
ec24269
Potentially pre-generate random indices instead of one-by-one
bagibence Jul 10, 2024
ebfe409
Add docstrings and clean up ProxSVRG
bagibence Jul 11, 2024
b52b71d
Switch to new-style RNG keys and remove the uint32 conversion
bagibence Jul 11, 2024
7c8e7d5
Change default convergence tolerance in (Prox)SVRG
bagibence Jul 12, 2024
a599cd9
Add yet another way of defining the error
bagibence Jul 12, 2024
d94edf5
Add missing import and change the error calculation to the previous one
bagibence Jul 12, 2024
5343a15
added regr testing
BalzaniEdoardo Jul 12, 2024
2044c3f
Make parameters explicit where possible and remove indexing into the …
bagibence Jul 12, 2024
5c600db
improved test data generation
BalzaniEdoardo Jul 12, 2024
cd47403
linted conftest.py
BalzaniEdoardo Jul 12, 2024
a668acf
added test for tree utils
BalzaniEdoardo Jul 12, 2024
00df370
updated description of tree_slice
BalzaniEdoardo Jul 12, 2024
761ab73
add context manager to test
BalzaniEdoardo Jul 12, 2024
f0be599
remove unused imports
BalzaniEdoardo Jul 12, 2024
194b432
uniform srtepsize
BalzaniEdoardo Jul 12, 2024
ecd4ef4
drop unused line
BalzaniEdoardo Jul 12, 2024
e17a495
FIXED TOL
BalzaniEdoardo Jul 12, 2024
d10e507
Initialize loss_log with zeros instead of empty and run isort
bagibence Jul 15, 2024
41b627f
Fix flake8's complaints
bagibence Jul 17, 2024
bc32794
Add a few tests for SVRG
bagibence Jul 18, 2024
846ed43
Add integration tests for (Prox)SVRG
bagibence Jul 19, 2024
8012176
Increment iter_num after each xk_update and comment out loss logging
bagibence Jul 19, 2024
a75f55b
Revert the iter_num increment frequency
bagibence Jul 19, 2024
7fb3d9d
Test that GLM.fit runs with (Prox)SVRG
bagibence Jul 19, 2024
f751c76
Test that calls to SVRG.update with a naive implementation of the res…
bagibence Jul 19, 2024
ad118c7
Remove asserts and update test
bagibence Jul 19, 2024
21273eb
Change iter_num incrementation again and update the test for it
bagibence Jul 19, 2024
4d8fb3f
Change iter_num again to mean number of epochs and adjust test
bagibence Jul 19, 2024
adbe0a2
Copy tree_utils from jaxopt
bagibence Jul 19, 2024
56f1ff3
Add test for SVRG._xk_update
bagibence Jul 19, 2024
8f743df
Use tree_slice of y as well
bagibence Jul 22, 2024
1ca062f
Update tests
bagibence Jul 22, 2024
f51f6d2
fixed pop glm proximal gradient
BalzaniEdoardo Jul 22, 2024
3004c14
Switch from X,y to *args in ProxSVRG. Leave prox_lamba explicit
bagibence Jul 23, 2024
eac538e
Add ProxSVRG._run to handle prox_lambda better
bagibence Jul 23, 2024
0d673cc
fixing prox_lasso
BalzaniEdoardo Jul 23, 2024
457397e
use our own lasso
BalzaniEdoardo Jul 23, 2024
d58cb16
added test for prox_lasso
BalzaniEdoardo Jul 23, 2024
2ae4a13
Test SVRG._xk_update with our own prox_lasso instead of jaxopt's
bagibence Jul 23, 2024
a885cda
Test that GLM.fit converges to the same as sklearn's PoissonRegressor…
bagibence Jul 23, 2024
3de8780
Test that GLM.update with a naive SVRG outer loop gives the same as G…
bagibence Jul 23, 2024
231c0a6
Bit of tidying up in ProxSVRG
bagibence Jul 26, 2024
82fa1a6
Make BaseRegressor aware of nemos.solvers
bagibence Jul 26, 2024
efb97c9
Allow using SVRG and ProxSVRG with regularizers
bagibence Jul 26, 2024
83b093c
Update tests
bagibence Jul 26, 2024
7b0ff57
reconciled history
BalzaniEdoardo Jul 26, 2024
06ed5c7
updated reg
BalzaniEdoardo Jul 26, 2024
cc11af4
fixed error message and test
BalzaniEdoardo Jul 26, 2024
710f484
linting (with partial flake8 on test)
BalzaniEdoardo Jul 26, 2024
9fffdd1
bugfixed mask creation
BalzaniEdoardo Jul 26, 2024
52e9c2e
fixed ellipsis
BalzaniEdoardo Jul 26, 2024
500760a
fixed flake8
BalzaniEdoardo Jul 26, 2024
81a5f0f
removed ellipsis
BalzaniEdoardo Jul 26, 2024
cf56b89
removed ellipsis
BalzaniEdoardo Jul 26, 2024
ff9b9b9
Only store the loss function in BaseRegressor, not the solver
bagibence Jul 26, 2024
2dbe45a
Remove unused solvers_naive.py
bagibence Aug 2, 2024
1f44561
Remove logging the loss in ProxSVRG
bagibence Aug 2, 2024
f327cac
Remove x_av and x_sum in ProxSVRG
bagibence Aug 2, 2024
3f39005
Change default random key from 0 to 123
bagibence Aug 2, 2024
e54726b
Remove unused error calculations and add docstring for the used one i…
bagibence Aug 2, 2024
6926bdc
Apply docstring spelling suggestions from code review
bagibence Aug 2, 2024
cb610dd
Start improving citations
bagibence Aug 5, 2024
4a66ac7
Correct references
bagibence Aug 5, 2024
e8ddf3f
Remove unused imports
bagibence Aug 5, 2024
01a39b4
Remove iteration through args when using `tree_slice`
bagibence Aug 5, 2024
335860a
Remove type hints from the docstrings
bagibence Aug 5, 2024
666afa2
Update description of *args in ProxSVRG methods
bagibence Aug 5, 2024
46653f9
Type hint parameters as Pytree
bagibence Aug 5, 2024
d3a904f
... finishing the previous commit
bagibence Aug 5, 2024
0285b78
Add "Raises" sections in (Prox)SVRG
bagibence Aug 5, 2024
2bad7ac
Correct default random key in SVRG tests
bagibence Aug 5, 2024
e1293f4
Attempt to use more descriptive variable names
bagibence Aug 5, 2024
167c8dd
Fix example formatting in (Prox)SVRG docstrings
bagibence Aug 6, 2024
ca354fe
Make (Prox)SVRG.update docstring more precise and generic
bagibence Aug 6, 2024
41aa456
Remove duplicate comment
bagibence Aug 6, 2024
bdfc35b
Add docstring for SVRGState and update references
bagibence Aug 7, 2024
e5904aa
renamed fileds
BalzaniEdoardo Aug 8, 2024
214c9da
Change variable naming to be more descriptive
bagibence Aug 8, 2024
ce50fbd
Fix references in the docstrings
bagibence Aug 8, 2024
bfc4cfc
Remove init_full_gradient argument from SVRG.init_state
bagibence Aug 8, 2024
46fe498
merged development
BalzaniEdoardo Aug 10, 2024
cd4660b
flake8 fixes
BalzaniEdoardo Aug 10, 2024
61478a4
merged with origin
BalzaniEdoardo Aug 10, 2024
8c74bd9
fixed a deprecation warning
BalzaniEdoardo Aug 10, 2024
5f33bd6
merged dev
BalzaniEdoardo Aug 13, 2024
493f090
removed unused arg
BalzaniEdoardo Aug 13, 2024
3987254
Update src/nemos/solvers.py
BalzaniEdoardo Aug 13, 2024
0c3aabb
removed line
BalzaniEdoardo Aug 13, 2024
ca0b51a
partial solution to references
BalzaniEdoardo Aug 13, 2024
4eb3925
fix references
BalzaniEdoardo Aug 14, 2024
bdb1d3d
linted
BalzaniEdoardo Aug 14, 2024
1537645
merged
BalzaniEdoardo Aug 14, 2024
d39ea73
Merge branch 'development' into svrg
BalzaniEdoardo Aug 14, 2024
f5528bc
Update src/nemos/solvers.py
BalzaniEdoardo Aug 15, 2024
0ead1bf
Merge branch 'development' into svrg
BalzaniEdoardo Aug 15, 2024
a318e33
improved docstrings
BalzaniEdoardo Aug 15, 2024
0d539d2
linted
BalzaniEdoardo Aug 15, 2024
30 changes: 15 additions & 15 deletions src/nemos/basis.py
@@ -1350,7 +1350,7 @@ def _check_n_basis_min(self) -> None:

class MSplineBasis(SplineBasis):
r"""
M-spline[$^1$](#references) basis functions for modeling and data transformation.
M-spline[$^{[1]}$](#references) basis functions for modeling and data transformation.

M-splines are a type of spline basis function used for smooth curve fitting
and data representation. They are positive and integrate to one, making them
@@ -1394,8 +1394,8 @@ class MSplineBasis(SplineBasis):
>>> sample_points = linspace(0, 1, 100)
>>> basis_functions = mspline_basis(sample_points)

References
----------
# References
------------
[1] Ramsay, J. O. (1988). Monotone regression splines in action. Statistical science,
3(4), 425-441.

Expand Down Expand Up @@ -1517,7 +1517,7 @@ def evaluate_on_grid(self, n_samples: int) -> Tuple[NDArray, NDArray]:

class BSplineBasis(SplineBasis):
"""
B-spline[$^1$](#references) 1-dimensional basis functions.
B-spline[$^{[1]}$](#references) 1-dimensional basis functions.

Parameters
----------
@@ -1546,9 +1546,9 @@ class BSplineBasis(SplineBasis):
Spline order.


References
----------
1. Prautzsch, H., Boehm, W., Paluszny, M. (2002). B-spline representation. In: Bézier and B-Spline Techniques.
# References
------------
[1] Prautzsch, H., Boehm, W., Paluszny, M. (2002). B-spline representation. In: Bézier and B-Spline Techniques.
Mathematics and Visualization. Springer, Berlin, Heidelberg. https://doi.org/10.1007/978-3-662-04919-8_5

"""
@@ -1779,7 +1779,7 @@ def evaluate_on_grid(self, n_samples: int) -> Tuple[NDArray, NDArray]:
class RaisedCosineBasisLinear(Basis):
"""Represent linearly-spaced raised cosine basis functions.

This implementation is based on the cosine bumps used by Pillow et al.[$^1$](#references)
This implementation is based on the cosine bumps used by Pillow et al.[$^{[1]}$](#references)
to uniformly tile the internal points of the domain.

Parameters
@@ -1801,9 +1801,9 @@ class RaisedCosineBasisLinear(Basis):
Only used in "conv" mode. Additional keyword arguments that are passed to
`nemos.convolve.create_convolutional_predictor`

References
----------
1. Pillow, J. W., Paninski, L., Uzzel, V. J., Simoncelli, E. P., & J.,
# References
------------
[1] Pillow, J. W., Paninski, L., Uzzel, V. J., Simoncelli, E. P., & J.,
C. E. (2005). Prediction and decoding of retinal ganglion cell responses
with a probabilistic spiking model. Journal of Neuroscience, 25(47),
11003–11013. http://dx.doi.org/10.1523/jneurosci.3305-05.2005
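
For readers unfamiliar with the construction: a "cosine bump" here is a half-period raised cosine, shifted copies of which tile the domain. A standalone sketch of the standard formula follows (illustrative only; not code from this PR):

```python
# One raised cosine bump in its standard form; shifting `center`
# produces the linearly spaced tiling described above. Illustrative only.
import jax.numpy as jnp

def raised_cosine_bump(x: jnp.ndarray, center: float, width: float) -> jnp.ndarray:
    # 0.5 * (1 + cos(phase)) with the phase clipped to [-pi, pi], so each
    # bump is smooth, nonnegative, and zero outside its support.
    phase = jnp.clip((x - center) * jnp.pi / width, -jnp.pi, jnp.pi)
    return 0.5 * (1.0 + jnp.cos(phase))

x = jnp.linspace(0.0, 1.0, 200)
bump = raised_cosine_bump(x, center=0.5, width=0.25)
```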
@@ -1964,7 +1964,7 @@ class RaisedCosineBasisLog(RaisedCosineBasisLinear):
"""Represent log-spaced raised cosine basis functions.

Similar to `RaisedCosineBasisLinear` but the basis functions are log-spaced.
This implementation is based on the cosine bumps used by Pillow et al.[$^1$](#references)
This implementation is based on the cosine bumps used by Pillow et al.[$^{[1]}$](#references)
to uniformly tile the internal points of the domain.

Parameters
@@ -1994,9 +1994,9 @@ class RaisedCosineBasisLog(RaisedCosineBasisLinear):
Only used in "conv" mode. Additional keyword arguments that are passed to
`nemos.convolve.create_convolutional_predictor`

References
----------
1. Pillow, J. W., Paninski, L., Uzzel, V. J., Simoncelli, E. P., & J.,
# References
------------
[1] Pillow, J. W., Paninski, L., Uzzel, V. J., Simoncelli, E. P., & J.,
C. E. (2005). Prediction and decoding of retinal ganglion cell responses
with a probabilistic spiking model. Journal of Neuroscience, 25(47),
11003–11013. http://dx.doi.org/10.1523/jneurosci.3305-05.2005
14 changes: 7 additions & 7 deletions src/nemos/observation_models.py
@@ -267,8 +267,8 @@ def pseudo_r2(
) -> jnp.ndarray:
r"""Pseudo-$R^2$ calculation for a GLM.

Compute the pseudo-$R^2$ metric for the GLM, as defined by McFadden et al.[$^1$](#references)
or by Cohen et al.[$^2$](#references).
Compute the pseudo-$R^2$ metric for the GLM, as defined by McFadden et al.[$^{[1]}$](#references)
or by Cohen et al.[$^{[2]}$](#references).

This metric evaluates the goodness-of-fit of the model relative to a null (baseline) model that assumes a
constant mean for the observations. While the pseudo-$R^2$ is bounded between 0 and 1 for the training set,
@@ -311,13 +311,13 @@ def pseudo_r2(
sample, i.e. the maximum value that the likelihood could possibly achieve). $D_M$ and $D_0$ are
the model and the null deviance, $D_i = -2 \left[ \log(L_s) - \log(L_i) \right]$ for $i=M,0$.


References
----------
1. McFadden D (1979). Quantitative methods for analysing travel behavior of individuals: Some recent
# References
------------
[1] McFadden D (1979). Quantitative methods for analysing travel behavior of individuals: Some recent
developments. In D. A. Hensher & P. R. Stopher (Eds.), *Behavioural travel modelling* (pp. 279-318).
London: Croom Helm.
2. Jacob Cohen, Patricia Cohen, Steven G. West, Leona S. Aiken.

[2] Jacob Cohen, Patricia Cohen, Steven G. West, Leona S. Aiken.
*Applied Multiple Regression/Correlation Analysis for the Behavioral Sciences*.
3rd edition. Routledge, 2002. p.502. ISBN 978-0-8058-2223-6. (May 2012)
"""
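
Written out from the deviance definition in the docstring above, the two cited variants take their standard forms, with $L_M$, $L_0$, $L_s$ the model, null, and saturated likelihoods:

```latex
% McFadden's pseudo-R^2 compares model and null log-likelihoods directly:
R^2_{\mathrm{McFadden}} = 1 - \frac{\log L_M}{\log L_0}

% Cohen's variant is the fraction of the null deviance explained, using
% D_i = -2\left[\log(L_s) - \log(L_i)\right] for i = M, 0:
R^2_{\mathrm{Cohen}} = \frac{D_0 - D_M}{D_0}
```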
2 changes: 1 addition & 1 deletion src/nemos/simulation.py
@@ -60,7 +60,7 @@ def difference_of_gammas(

References
----------
1. [SciPy Docs - "scipy.stats.gamma"](https://docs.scipy.org/doc/
[1] [SciPy Docs - "scipy.stats.gamma"](https://docs.scipy.org/doc/
scipy/reference/generated/scipy.stats.gamma.html)
"""
# check that the gamma parameters are positive (scipy returns
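
As the name suggests, this function builds a biphasic filter as the difference of two gamma densities; a minimal sketch per the SciPy reference above, with all parameter values hypothetical:

```python
# Minimal sketch of a difference-of-gammas filter; illustrative values only.
import numpy as np
from scipy.stats import gamma

t = np.linspace(0.0, 1.0, 200)
excitatory = gamma.pdf(t, a=2.0, scale=0.05)  # fast positive lobe
inhibitory = gamma.pdf(t, a=4.0, scale=0.10)  # slower negative lobe
coupling_filter = excitatory - 0.8 * inhibitory
```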
39 changes: 18 additions & 21 deletions src/nemos/solvers.py
@@ -1,5 +1,5 @@
from functools import partial
from typing import Any, Callable, NamedTuple, Optional, Union
from typing import Callable, NamedTuple, Optional, Union

import jax
import jax.flatten_util
@@ -30,15 +30,15 @@ class SVRGState(NamedTuple):
Step size of the individual gradient steps.
reference_point :
Anchor/reference/snapshot point where the full gradient is calculated in the SVRG algorithm.
Corresponds to `x_{s}` in the pseudocode in [1]
Corresponds to `x_{s}` in the pseudocode[$^{[1]}$](#references).
full_grad_at_reference_point :
Full gradient at the anchor/reference point.

References
----------
1. [Gower, Robert M., Mark Schmidt, Francis Bach, and Peter Richtárik.
"Variance-Reduced Methods for Machine Learning." arXiv preprint arXiv:2010.00892 (2020).
](https://arxiv.org/abs/2010.00892)
# References
------------
[1] [Gower, Robert M., Mark Schmidt, Francis Bach, and Peter Richtárik.
"Variance-Reduced Methods for Machine Learning." arXiv preprint arXiv:2010.00892 (2020).
](https://arxiv.org/abs/2010.00892)
"""

iter_num: int
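
For readers skimming the diff, here is a schematic of what such a state carries. The fields mirror the docstring above; the `key` field and the initial values are assumptions inferred from the commit history (new-style RNG keys, default key changed from 0 to 123).

```python
# Schematic sketch of the solver state; not the package's actual code.
from typing import NamedTuple, Optional, Tuple

import jax
import jax.numpy as jnp


class SketchSVRGState(NamedTuple):
    iter_num: int           # epochs completed so far
    key: jax.Array          # PRNG key for sampling mini-batches (assumed field)
    stepsize: float         # step size of the individual gradient steps
    reference_point: Tuple  # anchor x_s where the full gradient is computed
    full_grad_at_reference_point: Optional[Tuple]  # full gradient at the anchor


init_params = (jnp.zeros((5,)), jnp.zeros(()))  # (W, b) for a GLM
state = SketchSVRGState(
    iter_num=0,
    key=jax.random.key(123),  # the PR switches the default key from 0 to 123
    stepsize=1e-3,
    reference_point=init_params,
    full_grad_at_reference_point=None,
)
```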
@@ -88,13 +88,15 @@ class ProxSVRG:

References
----------
1. [Gower, Robert M., Mark Schmidt, Francis Bach, and Peter Richtárik.
[1] [Gower, Robert M., Mark Schmidt, Francis Bach, and Peter Richtárik.
"Variance-Reduced Methods for Machine Learning." arXiv preprint arXiv:2010.00892 (2020).
](https://arxiv.org/abs/2010.00892)
2. [Xiao, Lin, and Tong Zhang.

[2] [Xiao, Lin, and Tong Zhang.
"A proximal stochastic gradient method with progressive variance reduction."
SIAM Journal on Optimization 24.4 (2014): 2057-2075.](https://arxiv.org/abs/1403.4699v1)
3. [Johnson, Rie, and Tong Zhang.

[3] [Johnson, Rie, and Tong Zhang.
"Accelerating stochastic gradient descent using predictive variance reduction."
Advances in neural information processing systems 26 (2013).
](https://proceedings.neurips.cc/paper/2013/hash/ac1dd209cbcc5e5d1c6e28598e8cbbe8-Abstract.html)
@@ -122,7 +124,6 @@ def __init__(
def init_state(
self,
init_params: Pytree,
hyperparams_prox: Any,
*args,
) -> SVRGState:
"""
@@ -133,9 +134,6 @@ def init_state(
init_params :
Pytree containing the initial parameters.
For GLMs it's a tuple of (W, b)
hyperparams_prox :
Parameters of the proximal operator, in our case the regularization strength.
Not used here, but required to be consistent with the jaxopt API.
args:
Positional arguments passed to loss function `fun` and its gradient (e.g. `fun(params, *args)`),
most likely input and output data.
@@ -384,7 +382,6 @@ def run(
# initialize the state, including the full gradient at the initial parameters
init_state = self.init_state(
init_params,
prox_lambda,
*args,
)
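
A hypothetical end-to-end sketch of the API change above: `init_state` no longer receives `hyperparams_prox`, while `run` still threads the regularization strength through to the proximal operator. The constructor and argument order shown are assumptions based on this diff and jaxopt conventions, not a verbatim nemos example.

```python
# Hypothetical usage sketch; signatures are assumptions, not the verbatim API.
import jax.numpy as jnp
from jaxopt.prox import prox_lasso  # later commits swap in nemos' own prox_lasso

from nemos.solvers import ProxSVRG


def loss(params, X, y):
    W, b = params
    return jnp.mean((X @ W + b - y) ** 2)


X, y = jnp.ones((100, 5)), jnp.zeros((100,))
init_params = (jnp.zeros((5,)), jnp.zeros(()))
solver = ProxSVRG(loss, prox_lasso, batch_size=1, stepsize=1e-3)

# After this change, init_state takes only the parameters and the data...
state = solver.init_state(init_params, X, y)
# ...while run still takes the regularization strength explicitly.
params, state = solver.run(init_params, 0.1, X, y)  # 0.1 = prox_lambda
```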

@@ -535,7 +532,6 @@ def _update_per_random_samples(
N = n_points_per_arg.pop()

m = (N + self.batch_size - 1) // self.batch_size # number of iterations
# m = N

def inner_loop_body(_, carry):
params, key = carry
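
The `m` computed above is just the ceiling of `N / batch_size`: the number of inner-loop steps needed for the mini-batches to cover roughly all N points once per epoch. A quick check of the integer idiom:

```python
# (N + b - 1) // b is ceiling division without going through floats.
import math

N, batch_size = 1000, 32
m = (N + batch_size - 1) // batch_size
assert m == math.ceil(N / batch_size) == 32  # 31 full batches + 1 partial batch
```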
@@ -625,12 +621,14 @@ class SVRG(ProxSVRG):

References
----------
1. [Gower, Robert M., Mark Schmidt, Francis Bach, and Peter Richtárik.
[1] [Gower, Robert M., Mark Schmidt, Francis Bach, and Peter Richtárik.
"Variance-Reduced Methods for Machine Learning." arXiv preprint arXiv:2010.00892 (2020).
](https://arxiv.org/abs/2010.00892)
2. [Xiao, Lin, and Tong Zhang. "A proximal stochastic gradient method with progressive variance reduction."

[2] [Xiao, Lin, and Tong Zhang. "A proximal stochastic gradient method with progressive variance reduction."
SIAM Journal on Optimization 24.4 (2014): 2057-2075.](https://arxiv.org/abs/1403.4699v1)
3. [Johnson, Rie, and Tong Zhang. "Accelerating stochastic gradient descent using predictive variance reduction."

[3] [Johnson, Rie, and Tong Zhang. "Accelerating stochastic gradient descent using predictive variance reduction."
Advances in neural information processing systems 26 (2013).
](https://proceedings.neurips.cc/paper/2013/hash/ac1dd209cbcc5e5d1c6e28598e8cbbe8-Abstract.html)
"""
@@ -662,7 +660,6 @@ def init_state(self, init_params: Pytree, *args, **kwargs) -> SVRGState:
----------
init_params :
pytree containing the initial parameters.
For GLMs it's a tuple of (W, b)
args:
Positional arguments passed to loss function `fun` and its gradient (e.g. `fun(params, *args)`),
most likely input and output data.
@@ -680,7 +677,7 @@ def init_state(self, init_params: Pytree, *args, **kwargs) -> SVRGState:
Initialized optimizer state
"""
# substitute None for prox_lambda
return super().init_state(init_params, None, *args, **kwargs)
return super().init_state(init_params, *args, **kwargs)

@partial(jit, static_argnums=(0,))
def update(self, params: Pytree, state: SVRGState, *args, **kwargs) -> OptStep:
6 changes: 3 additions & 3 deletions src/nemos/utils.py
@@ -383,9 +383,9 @@ def row_wise_kron(
This function computes the row-wise Kronecker product between dense matrices A and C
using JAX for automatic differentiation and GPU acceleration.

References
----------
1. Petersen, Kaare Brandt, and Michael Syskind Pedersen. "The matrix cookbook."
# References
------------
[1] Petersen, Kaare Brandt, and Michael Syskind Pedersen. "The matrix cookbook."
Technical University of Denmark 7.15 (2008): 510.
"""
if transpose:
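
The row-wise Kronecker product pairs row i of A with row i of C, so each output row is `kron(A[i], C[i])`. A minimal JAX sketch of the operation (illustrative; not nemos' actual implementation):

```python
# Minimal sketch of a row-wise Kronecker product via vmap; illustrative only.
import jax
import jax.numpy as jnp


def row_wise_kron_sketch(A: jnp.ndarray, C: jnp.ndarray) -> jnp.ndarray:
    # Map jnp.kron over matching rows of A and C.
    return jax.vmap(jnp.kron)(A, C)


A = jnp.arange(6.0).reshape(2, 3)
C = jnp.arange(4.0).reshape(2, 2)
K = row_wise_kron_sketch(A, C)
assert K.shape == (2, 6)  # each output row has 3 * 2 = 6 entries
```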
29 changes: 0 additions & 29 deletions tests/test_basis.py
@@ -519,10 +519,6 @@ def test_identifiability_constraint_apply(self):
assert np.allclose(X.mean(axis=0), np.zeros(X.shape[1]))
assert X.shape[1] == bas.n_basis_funcs

def test_conv_args_error(self):
with pytest.raises(ValueError, match="args should only be set"):
self.cls(5, 10, mode="eval")

def test_conv_kwargs_error(self):
with pytest.raises(ValueError, match="kwargs should only be set"):
self.cls(5, mode="eval", test="hi")
@@ -1015,10 +1011,6 @@ def test_identifiability_constraint_apply(self):
assert np.allclose(X.mean(axis=0), np.zeros(X.shape[1]))
assert X.shape[1] == bas.n_basis_funcs - 1

def test_conv_args_error(self):
with pytest.raises(ValueError, match="args should only be set"):
self.cls(5, 10, mode="eval")

def test_conv_kwargs_error(self):
with pytest.raises(ValueError, match="kwargs should only be set"):
self.cls(5, mode="eval", test="hi")
@@ -1509,10 +1501,6 @@ def test_identifiability_constraint_apply(self):
assert np.allclose(X.mean(axis=0), np.zeros(X.shape[1]))
assert X.shape[1] == bas.n_basis_funcs - 1

def test_conv_args_error(self):
with pytest.raises(ValueError, match="args should only be set"):
self.cls(5, 10, mode="eval")

def test_conv_kwargs_error(self):
with pytest.raises(ValueError, match="kwargs should only be set"):
self.cls(5, mode="eval", test="hi")
@@ -2066,15 +2054,10 @@ def test_identifiability_constraint_apply(self):
assert np.allclose(X.mean(axis=0), np.zeros(X.shape[1]))
assert X.shape[1] == bas.n_basis_funcs

def test_conv_args_error(self):
with pytest.raises(ValueError, match="args should only be set"):
self.cls(5, [1, 2, 3, 4, 5], 10, mode="eval")

def test_conv_kwargs_error(self):
with pytest.raises(ValueError, match="kwargs should only be set"):
self.cls(5, decay_rates=[1, 2, 3, 4, 5], mode="eval", test="hi")


def test_transformer_get_params(self):
bas = self.cls(5, decay_rates=[1, 2, 3, 4, 5])
bas_transformer = bas.to_transformer()
@@ -2507,10 +2490,6 @@ def test_identifiability_constraint_apply(self):
assert np.allclose(X.mean(axis=0), np.zeros(X.shape[1]))
assert X.shape[1] == bas.n_basis_funcs - 1

def test_conv_args_error(self):
with pytest.raises(ValueError, match="args should only be set"):
self.cls(5, 10, mode="eval")

def test_conv_kwargs_error(self):
with pytest.raises(ValueError, match="kwargs should only be set"):
self.cls(5, mode="eval", test="hi")
@@ -3039,10 +3018,6 @@ def test_identifiability_constraint_apply(self):
assert np.allclose(X.mean(axis=0), np.zeros(X.shape[1]))
assert X.shape[1] == bas.n_basis_funcs - 1

def test_conv_args_error(self):
with pytest.raises(ValueError, match="args should only be set"):
self.cls(5, 10, mode="eval")

def test_conv_kwargs_error(self):
with pytest.raises(ValueError, match="kwargs should only be set"):
self.cls(5, mode="eval", test="hi")
@@ -3749,12 +3724,8 @@ def test_compute_features_returns_expected_number_of_basis(
)
if eval_basis.shape[1] != basis_a_obj.n_basis_funcs * basis_b_obj.n_basis_funcs:
raise ValueError(
# <<<<<<< HEAD
"Dimensions do not agree: The number of basis should match the first dimension of the "
"fit_transformed basis."
# =======
# "Dimensions do not agree: The number of basis should match the first dimension of the output features."
# >>>>>>> development
f"The number of basis is {n_basis_a * n_basis_b}",
f"The first dimension of the output features is {eval_basis.shape[1]}",
)