Add (Prox-)SVRG as a solver for GLMs #184

Merged: 198 commits, Aug 15, 2024
Changes from 194 commits

Commits
f1de528
Start working on the Prox-SVRG solver
bagibence Jun 21, 2024
ad45bc8
Adapt SVRG and ProxSVRG for use with nemos and allow their use in reg…
bagibence Jun 24, 2024
1c7a9d9
Make ProxSVRG inherit from SVRG and implement early stopping
bagibence Jun 24, 2024
7f5d27b
Speed up (Prox)SVRG with jit, scan, while_loop
bagibence Jun 25, 2024
ef1983d
Remove unnecessary jits
bagibence Jun 25, 2024
3bfa251
Make prox required
bagibence Jun 25, 2024
df751dd
Simplify from scan to fori_loop in update
bagibence Jun 25, 2024
50ea86f
Tiny changes (whitespace and tree_zeros_like)
bagibence Jun 25, 2024
f016f10
Save the initial naive implementation of (Prox)SVRG in solvers_naive
bagibence Jun 25, 2024
56f49a2
Add batch_size and type annotations for (Prox)SVRG
bagibence Jun 25, 2024
47429f9
Rename lr to stepsize in (Prox)SVRG
bagibence Jun 25, 2024
d09fc8d
Move stepsize into state, redifine error, adjust m based on batch_size
bagibence Jun 26, 2024
901adba
Add a (commented out) update method that implements just the inner lo…
bagibence Jun 26, 2024
ac10fa4
Change SVRG.update to update parameters on every data point
bagibence Jun 27, 2024
4b4d3b5
Change epoch_num back to iter_num
bagibence Jun 27, 2024
8905008
Pass args to SVRG.init_state to calculate N, xs, df_xs
bagibence Jun 27, 2024
32dcf6c
Separately define run and update in SVRG and ProxSVRG
bagibence Jun 27, 2024
eb520f1
Fix wrong argument passed to solver.init_state
bagibence Jun 27, 2024
a25e9af
Switch inheritance between SVRG and ProxSVRG
bagibence Jun 27, 2024
4e89a5f
Store the solver in the regularizer
bagibence Jun 27, 2024
372b233
Minor renaming and sanity check
bagibence Jun 27, 2024
34ec663
Remove stale and unreachable code
bagibence Jun 27, 2024
1c483d0
Update solvers to work with mini-batch updates
bagibence Jun 28, 2024
30c7e59
Add x_av to SVRGState and use it in the return value of ProxSVRG.update
bagibence Jun 28, 2024
a48fe7e
Log the loss after each update
bagibence Jun 28, 2024
fa95695
Move the loss logging into the run method
bagibence Jun 28, 2024
1b855e5
Have separate _update_per_batch and _update_per_point methods
bagibence Jul 2, 2024
ccd554e
Add batch_size again and have different update methods
bagibence Jul 8, 2024
811e48b
Use different method for run based on batch size
bagibence Jul 8, 2024
f1618ae
Potentially pre-generate random indices instead of one-by-one
bagibence Jul 10, 2024
f7825cc
Add docstrings and clean up ProxSVRG
bagibence Jul 11, 2024
efc507f
Change how solvers are found: first look in solvers, then in jaxopt
bagibence Jul 11, 2024
e489a15
Switch to new-style RNG keys and remove the uint32 conversion
bagibence Jul 11, 2024
f6b579a
Change default convergence tolerance in (Prox)SVRG
bagibence Jul 12, 2024
3bce956
Add yet another way of defining the error
bagibence Jul 12, 2024
75ec116
Add missing import and change the error calculation to the previous one
bagibence Jul 12, 2024
4b6064f
Make parameters explicit where possible and remove indexing into the …
bagibence Jul 12, 2024
85b366d
added regr testing
BalzaniEdoardo Jul 12, 2024
599c390
merged conflicts
BalzaniEdoardo Jul 12, 2024
7d73a39
improved test data generation
BalzaniEdoardo Jul 12, 2024
212753f
linted conftest.py
BalzaniEdoardo Jul 12, 2024
280221c
added test for tree utils
BalzaniEdoardo Jul 12, 2024
a911fbe
updated description of tree_slice
BalzaniEdoardo Jul 12, 2024
cce74f2
add context manager to test
BalzaniEdoardo Jul 12, 2024
b3c0bf3
remove unused imports
BalzaniEdoardo Jul 12, 2024
41d5d01
uniform srtepsize
BalzaniEdoardo Jul 12, 2024
5759d18
drop unused line
BalzaniEdoardo Jul 12, 2024
1f3d659
FIXED TOL
BalzaniEdoardo Jul 12, 2024
6faa685
Reformat with black
bagibence Jul 15, 2024
66c73ac
Initialize loss_log with zeros instead of empty and run isort
bagibence Jul 15, 2024
ff09dce
Fix flake8's complaints
bagibence Jul 17, 2024
ae99cfc
Add a few tests for SVRG
bagibence Jul 18, 2024
ccbd41f
Add integration tests for (Prox)SVRG
bagibence Jul 19, 2024
b9644bc
Increment iter_num after each xk_update and comment out loss logging
bagibence Jul 19, 2024
549fdfc
Revert the iter_num increment frequency
bagibence Jul 19, 2024
4e66c9e
Test that GLM.fit runs with (Prox)SVRG
bagibence Jul 19, 2024
7ba0a3b
Test that calls to SVRG.update with a naive implementation of the res…
bagibence Jul 19, 2024
0816f13
Remove asserts and update test
bagibence Jul 19, 2024
be9ea0f
Change iter_num incrementation again and update the test for it
bagibence Jul 19, 2024
ee7b6e4
Change iter_num again to mean number of epochs and adjust test
bagibence Jul 19, 2024
bb3a2bf
Copy tree_utils from jaxopt
bagibence Jul 19, 2024
cb626a1
Add test for SVRG._xk_update
bagibence Jul 19, 2024
13f7b43
Use tree_slice of y as well
bagibence Jul 22, 2024
d97bd9b
Update tests
bagibence Jul 22, 2024
12fd6b7
fixed pop glm proximal gradient
BalzaniEdoardo Jul 22, 2024
32f8b02
Switch from X,y to *args in ProxSVRG. Leave prox_lamba explicit
bagibence Jul 23, 2024
6306313
Add ProxSVRG._run to handle prox_lambda better
bagibence Jul 23, 2024
953ef6f
fixing prox_lasso
BalzaniEdoardo Jul 23, 2024
9b3eea3
use our own lasso
BalzaniEdoardo Jul 23, 2024
93961be
added test for prox_lasso
BalzaniEdoardo Jul 23, 2024
5a10d8f
Test SVRG._xk_update with our own prox_lasso instead of jaxopt's
bagibence Jul 23, 2024
2344895
Test that GLM.fit converges to the same as sklearn's PoissonRegressor…
bagibence Jul 23, 2024
a4f588c
Remove prints and spaces
bagibence Jul 23, 2024
7838aab
Test that GLM.update with a naive SVRG outer loop gives the same as G…
bagibence Jul 23, 2024
c4e1a7a
Start working on the Prox-SVRG solver
bagibence Jun 21, 2024
f4dbb0e
Adapt SVRG and ProxSVRG for use with nemos and allow their use in reg…
bagibence Jun 24, 2024
6549266
Make ProxSVRG inherit from SVRG and implement early stopping
bagibence Jun 24, 2024
763be7b
Speed up (Prox)SVRG with jit, scan, while_loop
bagibence Jun 25, 2024
c967fb0
Remove unnecessary jits
bagibence Jun 25, 2024
5bb47cf
Make prox required
bagibence Jun 25, 2024
33d6c7b
Simplify from scan to fori_loop in update
bagibence Jun 25, 2024
4e2393d
Tiny changes (whitespace and tree_zeros_like)
bagibence Jun 25, 2024
25c44b9
Save the initial naive implementation of (Prox)SVRG in solvers_naive
bagibence Jun 25, 2024
0574d93
Add batch_size and type annotations for (Prox)SVRG
bagibence Jun 25, 2024
8278f45
Rename lr to stepsize in (Prox)SVRG
bagibence Jun 25, 2024
d881321
Move stepsize into state, redifine error, adjust m based on batch_size
bagibence Jun 26, 2024
3c2d560
Add a (commented out) update method that implements just the inner lo…
bagibence Jun 26, 2024
4c5bf27
Change SVRG.update to update parameters on every data point
bagibence Jun 27, 2024
245bdaa
Change epoch_num back to iter_num
bagibence Jun 27, 2024
6553737
Pass args to SVRG.init_state to calculate N, xs, df_xs
bagibence Jun 27, 2024
5f629a7
Separately define run and update in SVRG and ProxSVRG
bagibence Jun 27, 2024
6bcdb3f
Switch inheritance between SVRG and ProxSVRG
bagibence Jun 27, 2024
b0d8b93
Minor renaming and sanity check
bagibence Jun 27, 2024
1f12c46
Remove stale and unreachable code
bagibence Jun 27, 2024
e0a5d3f
Update solvers to work with mini-batch updates
bagibence Jun 28, 2024
26481e9
Add x_av to SVRGState and use it in the return value of ProxSVRG.update
bagibence Jun 28, 2024
6850f7e
Log the loss after each update
bagibence Jun 28, 2024
7461caf
Move the loss logging into the run method
bagibence Jun 28, 2024
d9539d4
Have separate _update_per_batch and _update_per_point methods
bagibence Jul 2, 2024
e068b3b
Add batch_size again and have different update methods
bagibence Jul 8, 2024
e26af13
Use different method for run based on batch size
bagibence Jul 8, 2024
ec24269
Potentially pre-generate random indices instead of one-by-one
bagibence Jul 10, 2024
ebfe409
Add docstrings and clean up ProxSVRG
bagibence Jul 11, 2024
b52b71d
Switch to new-style RNG keys and remove the uint32 conversion
bagibence Jul 11, 2024
7c8e7d5
Change default convergence tolerance in (Prox)SVRG
bagibence Jul 12, 2024
a599cd9
Add yet another way of defining the error
bagibence Jul 12, 2024
d94edf5
Add missing import and change the error calculation to the previous one
bagibence Jul 12, 2024
5343a15
added regr testing
BalzaniEdoardo Jul 12, 2024
2044c3f
Make parameters explicit where possible and remove indexing into the …
bagibence Jul 12, 2024
5c600db
improved test data generation
BalzaniEdoardo Jul 12, 2024
cd47403
linted conftest.py
BalzaniEdoardo Jul 12, 2024
a668acf
added test for tree utils
BalzaniEdoardo Jul 12, 2024
00df370
updated description of tree_slice
BalzaniEdoardo Jul 12, 2024
761ab73
add context manager to test
BalzaniEdoardo Jul 12, 2024
f0be599
remove unused imports
BalzaniEdoardo Jul 12, 2024
194b432
uniform srtepsize
BalzaniEdoardo Jul 12, 2024
ecd4ef4
drop unused line
BalzaniEdoardo Jul 12, 2024
e17a495
FIXED TOL
BalzaniEdoardo Jul 12, 2024
d10e507
Initialize loss_log with zeros instead of empty and run isort
bagibence Jul 15, 2024
41b627f
Fix flake8's complaints
bagibence Jul 17, 2024
bc32794
Add a few tests for SVRG
bagibence Jul 18, 2024
846ed43
Add integration tests for (Prox)SVRG
bagibence Jul 19, 2024
8012176
Increment iter_num after each xk_update and comment out loss logging
bagibence Jul 19, 2024
a75f55b
Revert the iter_num increment frequency
bagibence Jul 19, 2024
7fb3d9d
Test that GLM.fit runs with (Prox)SVRG
bagibence Jul 19, 2024
f751c76
Test that calls to SVRG.update with a naive implementation of the res…
bagibence Jul 19, 2024
ad118c7
Remove asserts and update test
bagibence Jul 19, 2024
21273eb
Change iter_num incrementation again and update the test for it
bagibence Jul 19, 2024
4d8fb3f
Change iter_num again to mean number of epochs and adjust test
bagibence Jul 19, 2024
adbe0a2
Copy tree_utils from jaxopt
bagibence Jul 19, 2024
56f1ff3
Add test for SVRG._xk_update
bagibence Jul 19, 2024
8f743df
Use tree_slice of y as well
bagibence Jul 22, 2024
1ca062f
Update tests
bagibence Jul 22, 2024
f51f6d2
fixed pop glm proximal gradient
BalzaniEdoardo Jul 22, 2024
3004c14
Switch from X,y to *args in ProxSVRG. Leave prox_lamba explicit
bagibence Jul 23, 2024
eac538e
Add ProxSVRG._run to handle prox_lambda better
bagibence Jul 23, 2024
0d673cc
fixing prox_lasso
BalzaniEdoardo Jul 23, 2024
457397e
use our own lasso
BalzaniEdoardo Jul 23, 2024
d58cb16
added test for prox_lasso
BalzaniEdoardo Jul 23, 2024
2ae4a13
Test SVRG._xk_update with our own prox_lasso instead of jaxopt's
bagibence Jul 23, 2024
a885cda
Test that GLM.fit converges to the same as sklearn's PoissonRegressor…
bagibence Jul 23, 2024
3de8780
Test that GLM.update with a naive SVRG outer loop gives the same as G…
bagibence Jul 23, 2024
231c0a6
Bit of tidying up in ProxSVRG
bagibence Jul 26, 2024
82fa1a6
Make BaseRegressor aware of nemos.solvers
bagibence Jul 26, 2024
efb97c9
Allow using SVRG and ProxSVRG with regularizers
bagibence Jul 26, 2024
83b093c
Update tests
bagibence Jul 26, 2024
7b0ff57
reconciled history
BalzaniEdoardo Jul 26, 2024
06ed5c7
updated reg
BalzaniEdoardo Jul 26, 2024
cc11af4
fixed error message and test
BalzaniEdoardo Jul 26, 2024
710f484
linting (with partial flake8 on test)
BalzaniEdoardo Jul 26, 2024
9fffdd1
bugfixed mask creation
BalzaniEdoardo Jul 26, 2024
52e9c2e
fixed ellipsis
BalzaniEdoardo Jul 26, 2024
500760a
fixed flake8
BalzaniEdoardo Jul 26, 2024
81a5f0f
removed ellipsis
BalzaniEdoardo Jul 26, 2024
cf56b89
removed ellipsis
BalzaniEdoardo Jul 26, 2024
ff9b9b9
Only store the loss function in BaseRegressor, not the solver
bagibence Jul 26, 2024
2dbe45a
Remove unused solvers_naive.py
bagibence Aug 2, 2024
1f44561
Remove logging the loss in ProxSVRG
bagibence Aug 2, 2024
f327cac
Remove x_av and x_sum in ProxSVRG
bagibence Aug 2, 2024
3f39005
Change default random key from 0 to 123
bagibence Aug 2, 2024
e54726b
Remove unused error calculations and add docstring for the used one i…
bagibence Aug 2, 2024
6926bdc
Apply docstring spelling suggestions from code review
bagibence Aug 2, 2024
cb610dd
Start improving citations
bagibence Aug 5, 2024
4a66ac7
Correct references
bagibence Aug 5, 2024
e8ddf3f
Remove unused imports
bagibence Aug 5, 2024
01a39b4
Remove iteration through args when using `tree_slice`
bagibence Aug 5, 2024
335860a
Remove type hints from the docstrings
bagibence Aug 5, 2024
666afa2
Update description of *args in ProxSVRG methods
bagibence Aug 5, 2024
46653f9
Type hint parameters as Pytree
bagibence Aug 5, 2024
d3a904f
... finishing the previous commit
bagibence Aug 5, 2024
0285b78
Add "Raises" sections in (Prox)SVRG
bagibence Aug 5, 2024
2bad7ac
Correct default random key in SVRG tests
bagibence Aug 5, 2024
e1293f4
Attempt to use more descriptive variable names
bagibence Aug 5, 2024
167c8dd
Fix example formatting in (Prox)SVRG docstrings
bagibence Aug 6, 2024
ca354fe
Make (Prox)SVRG.update docstring more precise and generic
bagibence Aug 6, 2024
41aa456
Remove duplicate comment
bagibence Aug 6, 2024
bdfc35b
Add docstring for SVRGState and update references
bagibence Aug 7, 2024
e5904aa
renamed fileds
BalzaniEdoardo Aug 8, 2024
214c9da
Change variable naming to be more descriptive
bagibence Aug 8, 2024
ce50fbd
Fix references in the docstrings
bagibence Aug 8, 2024
bfc4cfc
Remove init_full_gradient argument from SVRG.init_state
bagibence Aug 8, 2024
46fe498
merged development
BalzaniEdoardo Aug 10, 2024
cd4660b
flake8 fixes
BalzaniEdoardo Aug 10, 2024
61478a4
merged with origin
BalzaniEdoardo Aug 10, 2024
8c74bd9
fixed a deprecation warning
BalzaniEdoardo Aug 10, 2024
5f33bd6
merged dev
BalzaniEdoardo Aug 13, 2024
493f090
removed unused arg
BalzaniEdoardo Aug 13, 2024
3987254
Update src/nemos/solvers.py
BalzaniEdoardo Aug 13, 2024
0c3aabb
removed line
BalzaniEdoardo Aug 13, 2024
ca0b51a
partial solution to references
BalzaniEdoardo Aug 13, 2024
4eb3925
fix references
BalzaniEdoardo Aug 14, 2024
bdb1d3d
linted
BalzaniEdoardo Aug 14, 2024
1537645
merged
BalzaniEdoardo Aug 14, 2024
d39ea73
Merge branch 'development' into svrg
BalzaniEdoardo Aug 14, 2024
f5528bc
Update src/nemos/solvers.py
BalzaniEdoardo Aug 15, 2024
0ead1bf
Merge branch 'development' into svrg
BalzaniEdoardo Aug 15, 2024
a318e33
improved docstrings
BalzaniEdoardo Aug 15, 2024
0d539d2
linted
BalzaniEdoardo Aug 15, 2024
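
For context, the update these commits implement is the standard (Prox-)SVRG step (Johnson & Zhang, 2013; Xiao & Zhang, 2014), which the history above tracks through the anchor point `xs` and its full gradient `df_xs`:

$$
x_{k+1} = \operatorname{prox}_{\eta g}\Big( x_k - \eta \big( \nabla f_{i_k}(x_k) - \nabla f_{i_k}(x_s) + \nabla f(x_s) \big) \Big)
$$

where $f$ is the unpenalized GLM loss, $g$ is the term handled by the proximal operator, $i_k$ indexes a randomly drawn data point or mini-batch, and the anchor $x_s$ together with the full gradient $\nabla f(x_s)$ is refreshed once per epoch. With $g \equiv 0$ the proximal step is the identity and plain SVRG is recovered.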
71 changes: 54 additions & 17 deletions src/nemos/base_regressor.py
@@ -14,7 +14,7 @@
import jaxopt
from numpy.typing import ArrayLike, NDArray

from . import utils, validation
from . import solvers, utils, validation
from ._regularizer_builder import AVAILABLE_REGULARIZERS, create_regularizer
from .base_class import Base
from .regularizer import Regularizer, UnRegularized
@@ -218,18 +218,20 @@ def solver_kwargs(self):
@solver_kwargs.setter
def solver_kwargs(self, solver_kwargs: dict):
"""Setter for the solver_kwargs attribute."""
self._check_solver_kwargs(self.solver_name, solver_kwargs)
self._check_solver_kwargs(
self._get_solver_class(self.solver_name), solver_kwargs
)
self._solver_kwargs = solver_kwargs

@staticmethod
def _check_solver_kwargs(solver_name, solver_kwargs):
def _check_solver_kwargs(solver_class, solver_kwargs):
"""
Check if provided solver keyword arguments are valid.

Parameters
----------
solver_name :
Name of the solver.
solver_class :
Class of the solver.
solver_kwargs :
Additional keyword arguments for the solver.

@@ -238,11 +240,11 @@ def _check_solver_kwargs(solver_name, solver_kwargs):
NameError
If any of the solver keyword arguments are not valid.
"""
solver_args = inspect.getfullargspec(getattr(jaxopt, solver_name)).args
solver_args = inspect.getfullargspec(solver_class).args
undefined_kwargs = set(solver_kwargs.keys()).difference(solver_args)
if undefined_kwargs:
raise NameError(
f"kwargs {undefined_kwargs} in solver_kwargs not a kwarg for jaxopt.{solver_name}!"
f"kwargs {undefined_kwargs} in solver_kwargs not a kwarg for {solver_class.__name__}!"
)

def instantiate_solver(self, *args) -> BaseRegressor:
@@ -253,10 +255,10 @@ def instantiate_solver(self, *args) -> BaseRegressor:
that initialize the solver state, update the model parameters, and run the optimization
as attributes.

This method creates a solver instance from jaxopt library, tailored to the specific loss
function and regularization approach defined by the Regularizer instance. It also handles
the proximal operator if required for the optimization method. The returned functions are
directly usable in optimization loops, simplifying the syntax by pre-setting
This method creates a solver instance from nemos.solvers or the jaxopt library, tailored to
the specific loss function and regularization approach defined by the Regularizer instance.
It also handles the proximal operator if required for the optimization method. The returned
functions are directly usable in optimization loops, simplifying the syntax by pre-setting
common arguments like regularization strength and other hyperparameters.

Parameters
@@ -281,7 +283,7 @@ def instantiate_solver(self, *args) -> BaseRegressor:
# only use penalized loss if not using proximal gradient descent
# In proximal method you must use the unpenalized loss independently
# of what regularizer you are using.
if self.solver_name != "ProximalGradient":
if self.solver_name not in ("ProximalGradient", "ProxSVRG"):
loss = self.regularizer.penalized_loss(
self._predict_and_compute_loss, self.regularizer_strength
)
@@ -295,7 +297,7 @@ def instantiate_solver(self, *args) -> BaseRegressor:
utils.assert_is_callable(loss, "loss")

# some parsing to make sure solver gets instantiated properly
if self.solver_name == "ProximalGradient":
if self.solver_name in ("ProximalGradient", "ProxSVRG"):
if "prox" in self.solver_kwargs:
raise ValueError(
"Proximal operator specification is not permitted. "
@@ -315,7 +317,11 @@ def instantiate_solver(self, *args) -> BaseRegressor:
) = self._inspect_solver_kwargs(solver_kwargs)

# instantiate the solver
solver = getattr(jaxopt, self.solver_name)(fun=loss, **solver_init_kwargs)
solver = self._get_solver_class(self.solver_name)(
fun=loss, **solver_init_kwargs
)

self._solver_loss_fun_ = loss

def solver_run(
init_params: Tuple[DESIGN_INPUT_TYPE, jnp.ndarray], *run_args: jnp.ndarray
@@ -327,10 +333,9 @@ def solver_update(params, state, *run_args, **run_kwargs) -> jaxopt.OptStep:
params, state, *args, *run_args, **solver_update_kwargs, **run_kwargs
)

def solver_init_state(params, state, *run_args, **run_kwargs) -> NamedTuple:
def solver_init_state(params, *run_args, **run_kwargs) -> NamedTuple:
return solver.init_state(
params,
state,
*run_args,
**run_kwargs,
**solver_init_state_kwargs,
@@ -372,7 +377,7 @@ def _inspect_solver_kwargs(

if solver_kwargs:
# instantiate a solver to then inspect the params of its various functions
solver = getattr(jaxopt, self.solver_name)
solver = self._get_solver_class(self.solver_name)

for key, value in solver_kwargs.items():
if key in inspect.getfullargspec(solver.run).args:
@@ -540,3 +545,35 @@ def initialize_state(
) -> Union[Any, NamedTuple]:
"""Initialize the state of the solver for running fit and update."""
pass

@staticmethod
def _get_solver_class(solver_name: str):
"""
Find a solver class first looking in nemos.solvers, then in jaxopt.

Parameters
----------
solver_name : str
Name of the solver class to load.

Returns
-------
solver_class :
Solver class ready to be instantiated.

Raises
------
AttributeError
If a solver class with that name is not found.
"""
try:
solver_class = getattr(solvers, solver_name)
except AttributeError:
try:
solver_class = getattr(jaxopt, solver_name)
except AttributeError:
raise AttributeError(
f"Could not find {solver_name} in nemos.solvers or jaxopt"
)

return solver_class
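
A minimal usage sketch of the lookup above (the names and module paths follow this diff; it assumes `nemos.solvers` exports only the new `SVRG`/`ProxSVRG` classes, so other names still resolve to jaxopt):

```python
import jaxopt

from nemos.base_regressor import BaseRegressor

# "SVRG" is defined in nemos.solvers, so the new implementation wins ...
svrg_cls = BaseRegressor._get_solver_class("SVRG")

# ... while names nemos does not define fall back to jaxopt.
assert BaseRegressor._get_solver_class("GradientDescent") is jaxopt.GradientDescent

# Unknown names raise from the second lookup.
try:
    BaseRegressor._get_solver_class("NotASolver")
except AttributeError as err:
    print(err)  # Could not find NotASolver in nemos.solvers or jaxopt
```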
30 changes: 15 additions & 15 deletions src/nemos/basis.py
@@ -1350,7 +1350,7 @@ def _check_n_basis_min(self) -> None:

class MSplineBasis(SplineBasis):
r"""
M-spline[$^1$](#references) basis functions for modeling and data transformation.
M-spline[$^{[1]}$](#references) basis functions for modeling and data transformation.

M-splines are a type of spline basis function used for smooth curve fitting
and data representation. They are positive and integrate to one, making them
@@ -1394,8 +1394,8 @@ class MSplineBasis(SplineBasis):
>>> sample_points = linspace(0, 1, 100)
>>> basis_functions = mspline_basis(sample_points)

References
----------
# References
------------
[1] Ramsay, J. O. (1988). Monotone regression splines in action. Statistical science,
3(4), 425-441.

@@ -1517,7 +1517,7 @@ def evaluate_on_grid(self, n_samples: int) -> Tuple[NDArray, NDArray]:

class BSplineBasis(SplineBasis):
"""
B-spline[$^1$](#references) 1-dimensional basis functions.
B-spline[$^{[1]}$](#references) 1-dimensional basis functions.

Parameters
----------
@@ -1546,9 +1546,9 @@ class BSplineBasis(SplineBasis):
Spline order.


References
----------
1. Prautzsch, H., Boehm, W., Paluszny, M. (2002). B-spline representation. In: Bézier and B-Spline Techniques.
# References
------------
[1] Prautzsch, H., Boehm, W., Paluszny, M. (2002). B-spline representation. In: Bézier and B-Spline Techniques.
Mathematics and Visualization. Springer, Berlin, Heidelberg. https://doi.org/10.1007/978-3-662-04919-8_5

"""
@@ -1779,7 +1779,7 @@ def evaluate_on_grid(self, n_samples: int) -> Tuple[NDArray, NDArray]:
class RaisedCosineBasisLinear(Basis):
"""Represent linearly-spaced raised cosine basis functions.

This implementation is based on the cosine bumps used by Pillow et al.[$^1$](#references)
This implementation is based on the cosine bumps used by Pillow et al.[$^{[1]}$](#references)
to uniformly tile the internal points of the domain.

Parameters
@@ -1801,9 +1801,9 @@ class RaisedCosineBasisLinear(Basis):
Only used in "conv" mode. Additional keyword arguments that are passed to
`nemos.convolve.create_convolutional_predictor`

References
----------
1. Pillow, J. W., Paninski, L., Uzzel, V. J., Simoncelli, E. P., & J.,
# References
------------
[1] Pillow, J. W., Paninski, L., Uzzel, V. J., Simoncelli, E. P., & J.,
C. E. (2005). Prediction and decoding of retinal ganglion cell responses
with a probabilistic spiking model. Journal of Neuroscience, 25(47),
11003–11013. http://dx.doi.org/10.1523/jneurosci.3305-05.2005
@@ -1964,7 +1964,7 @@ class RaisedCosineBasisLog(RaisedCosineBasisLinear):
"""Represent log-spaced raised cosine basis functions.

Similar to `RaisedCosineBasisLinear` but the basis functions are log-spaced.
This implementation is based on the cosine bumps used by Pillow et al.[$^1$](#references)
This implementation is based on the cosine bumps used by Pillow et al.[$^{[1]}$](#references)
to uniformly tile the internal points of the domain.

Parameters
@@ -1994,9 +1994,9 @@ class RaisedCosineBasisLog(RaisedCosineBasisLinear):
Only used in "conv" mode. Additional keyword arguments that are passed to
`nemos.convolve.create_convolutional_predictor`

References
----------
1. Pillow, J. W., Paninski, L., Uzzel, V. J., Simoncelli, E. P., & J.,
# References
------------
[1] Pillow, J. W., Paninski, L., Uzzel, V. J., Simoncelli, E. P., & J.,
C. E. (2005). Prediction and decoding of retinal ganglion cell responses
with a probabilistic spiking model. Journal of Neuroscience, 25(47),
11003–11013. http://dx.doi.org/10.1523/jneurosci.3305-05.2005
35 changes: 18 additions & 17 deletions src/nemos/glm.py
@@ -622,19 +622,6 @@ def fit(
else:
data = X

# check if mask has been set when using group lasso
# if mask has not been set, use a single group as default
if isinstance(self.regularizer, GroupLasso):
if self.regularizer.mask is None:
warnings.warn(
UserWarning(
"Mask has not been set. Defaulting to a single group for all parameters. "
"Please see the documentation on GroupLasso regularization for defining a "
"mask."
)
)
self.regularizer.mask = jnp.ones((1, data.shape[1]))

self.initialize_state(data, y, init_params)

params, state = self.solver_run(init_params, data, y)
@@ -882,13 +869,27 @@ def initialize_state(
NamedTuple
The initialized solver state
"""
# set up the solver init/run/update attrs
self.instantiate_solver()

if isinstance(X, FeaturePytree):
data = X.data
else:
data = X

# check if mask has been set when using group lasso
# if mask has not been set, use a single group as default
if isinstance(self.regularizer, GroupLasso):
if self.regularizer.mask is None:
warnings.warn(
UserWarning(
"Mask has not been set. Defaulting to a single group for all parameters. "
"Please see the documentation on GroupLasso regularization for defining a "
"mask."
)
)
self.regularizer.mask = jnp.ones((1, data.shape[1]))

# set up the solver init/run/update attrs
self.instantiate_solver()

opt_state = self.solver_init_state(init_params, data, y)
return opt_state

@@ -1311,7 +1312,7 @@ def _check_mask(self, X, y, params):
axis_2=1,
err_message="Inconsistent number of neurons. "
f"feature_mask has {jax.tree_util.tree_map(lambda m: m.shape[neural_axis], self.feature_mask)} neurons, "
f"model coefficients have {jax.tree_util.tree_map(lambda x: x.shape[1], X)} instead!",
f"model coefficients have {jax.tree_util.tree_map(lambda x: x.shape[1], params[0])} instead!",
)

@cast_to_jax
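
For illustration, the defaulting behavior this change relocates, restated as a self-contained sketch (`_default_group_lasso_mask` is a hypothetical helper, not part of the PR, and the warning text is abbreviated):

```python
import warnings

import jax.numpy as jnp

def _default_group_lasso_mask(mask, n_features: int):
    # No user-supplied mask: assign every feature to a single group,
    # i.e. a (1, n_features) matrix of ones, and warn about it.
    if mask is None:
        warnings.warn(
            "Mask has not been set. Defaulting to a single group.", UserWarning
        )
        return jnp.ones((1, n_features))
    return mask

mask = _default_group_lasso_mask(None, n_features=5)  # (1, 5) array of ones
```

Because `initialize_state` serves both `fit` and the incremental `update` path, performing the defaulting here (rather than in `fit` only) means both entry points see a valid mask.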
14 changes: 7 additions & 7 deletions src/nemos/observation_models.py
@@ -267,8 +267,8 @@ def pseudo_r2(
) -> jnp.ndarray:
r"""Pseudo-$R^2$ calculation for a GLM.

Compute the pseudo-$R^2$ metric for the GLM, as defined by McFadden et al.[$^1$](#references)
or by Cohen et al.[$^2$](#references).
Compute the pseudo-$R^2$ metric for the GLM, as defined by McFadden et al.[$^{[1]}$](#references)
or by Cohen et al.[$^{[2]}$](#references).

This metric evaluates the goodness-of-fit of the model relative to a null (baseline) model that assumes a
constant mean for the observations. While the pseudo-$R^2$ is bounded between 0 and 1 for the training set,
@@ -311,13 +311,13 @@
sample, i.e. the maximum value that the likelihood could possibly achieve). $D_M$ and $D_0$ are
the model and the null deviance, $D_i = -2 \left[ \log(L_s) - \log(L_i) \right]$ for $i=M,0$.


References
----------
1. McFadden D (1979). Quantitative methods for analysing travel behavior of individuals: Some recent
# References
------------
[1] McFadden D (1979). Quantitative methods for analysing travel behavior of individuals: Some recent
developments. In D. A. Hensher & P. R. Stopher (Eds.), *Behavioural travel modelling* (pp. 279-318).
London: Croom Helm.
2. Jacob Cohen, Patricia Cohen, Steven G. West, Leona S. Aiken.

[2] Jacob Cohen, Patricia Cohen, Steven G. West, Leona S. Aiken.
*Applied Multiple Regression/Correlation Analysis for the Behavioral Sciences*.
3rd edition. Routledge, 2002. p.502. ISBN 978-0-8058-2223-6. (May 2012)
"""
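
For reference, the two definitions the docstring contrasts can be written compactly (standard forms, consistent with the deviance notation above):

$$
R^2_{\text{McFadden}} = 1 - \frac{\log L_M}{\log L_0},
\qquad
R^2_{\text{Cohen}} = \frac{D_0 - D_M}{D_0},
$$

with $D_i = -2\left[\log(L_s) - \log(L_i)\right]$ for $i = M, 0$, where $L_M$, $L_0$, and $L_s$ are the likelihoods of the fitted, null, and saturated models.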
43 changes: 41 additions & 2 deletions src/nemos/proximal_operator.py
@@ -24,7 +24,7 @@
[1] Parikh, Neal, and Stephen Boyd. *"Proximal Algorithms, ser. Foundations and Trends (r) in Optimization."* (2013).
"""

from typing import Tuple
from typing import Any, Optional, Tuple

import jax
import jax.numpy as jnp
@@ -132,6 +132,7 @@ def prox_group_lasso(

"""
weights, intercepts = params
shape = weights.shape
# divide the reg strength by the number of neurons
regularizer_strength /= intercepts.shape[0]
# add an extra dim if not 2D, do nothing otherwise.
@@ -143,4 +144,42 @@
# Avoid shrinkage of features that do not belong to any group
# by setting the shrinkage factor to 1.
not_regularized = jnp.outer(jnp.ones(factor.shape[0]), 1 - mask.sum(axis=0))
return jnp.squeeze(weights * (factor @ mask + not_regularized)).T, intercepts
return (weights * (factor @ mask + not_regularized)).T.reshape(shape), intercepts


def prox_lasso(x: Any, l1reg: Optional[Any] = None, scaling: float = 1.0) -> Any:
r"""Proximal operator for the l1 norm, i.e., soft-thresholding operator.

Minimizes the following function:

$$
\underset{y}{\text{argmin}} ~ \frac{1}{2} ||x - y||\_2^2
+ \text{scaling} \cdot \text{l1reg} \cdot ||y||\_1
$$

When `l1reg` is a pytree, the weights are applied coordinate-wise.

Parameters
----------
x :
Input pytree.
l1reg :
Regularization strength, float or pytree with the same structure as `x`. Default is None.
scaling : float, optional
A scaling factor. Default is 1.0.

Returns
-------
:
Output pytree with the same structure as `x`.
"""
if l1reg is None:
l1reg = 1.0

if jnp.isscalar(l1reg):
l1reg = jax.tree_util.tree_map(lambda y: l1reg * jnp.ones_like(y), x)

def fun(u, v):
return jnp.sign(u) * jax.nn.relu(jnp.abs(u) - v * scaling)

return jax.tree_util.tree_map(fun, x, l1reg)
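
A quick numerical check of the soft-thresholding operator (usage sketch; the import path follows the diff header above):

```python
import jax.numpy as jnp

from nemos.proximal_operator import prox_lasso

x = jnp.array([0.5, -2.0, 0.05])
shrunk = prox_lasso(x, l1reg=0.1)
# Each coordinate is shrunk toward zero by l1reg and clipped at zero:
# sign(u) * relu(|u| - 0.1) -> [0.4, -1.9, 0.0]
print(shrunk)
```

Since the operator is applied leaf-wise with `tree_map`, `x` can equally be a pytree such as the `(weights, intercepts)` tuple used elsewhere in nemos, with `l1reg` given as a matching pytree to, e.g., exempt intercepts from shrinkage.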