From 1fe1caf73288b4c7bbdc4b3d151469ee0fc87d0e Mon Sep 17 00:00:00 2001 From: Donnie Erb <55961724+derb12@users.noreply.github.com> Date: Sat, 8 Apr 2023 19:00:25 -0400 Subject: [PATCH 01/56] FEAT: Add 2D versions of some morphological algorithms Currently implemented versions are mor, imor, rolling_ball, and tophat. Note that this is all experimental at this point. Design decision: no functional interface will be provided for the 2D versions. --- pybaselines/__init__.py | 1 + pybaselines/two_d/_algorithm_setup.py | 448 ++++++++++++++++++++++++++ pybaselines/two_d/api.py | 62 ++++ pybaselines/two_d/morphological.py | 318 ++++++++++++++++++ pybaselines/two_d/smooth.py | 10 + 5 files changed, 839 insertions(+) create mode 100644 pybaselines/two_d/_algorithm_setup.py create mode 100644 pybaselines/two_d/api.py create mode 100644 pybaselines/two_d/morphological.py create mode 100644 pybaselines/two_d/smooth.py diff --git a/pybaselines/__init__.py b/pybaselines/__init__.py index 0e4de78..c8448ac 100644 --- a/pybaselines/__init__.py +++ b/pybaselines/__init__.py @@ -101,3 +101,4 @@ ) from .api import Baseline +from .two_d.api import Baseline2D diff --git a/pybaselines/two_d/_algorithm_setup.py b/pybaselines/two_d/_algorithm_setup.py new file mode 100644 index 0000000..fca792a --- /dev/null +++ b/pybaselines/two_d/_algorithm_setup.py @@ -0,0 +1,448 @@ +# -*- coding: utf-8 -*- +"""Setup code for the various algorithm types in pybaselines. + +Created on April 8, 2023 +@author: Donald Erb + +""" + +from contextlib import contextmanager +from functools import partial, wraps + +import numpy as np +from scipy.ndimage import grey_opening + +from .._validation import ( + _check_array, _check_half_window, _check_sized_array, _yx_arrays +) +from ..utils import _inverted_sort, pad_edges, relative_difference + + +class _Algorithm2D: + """ + A base class for all 2D algorithm types. + + Contains setup methods for all algorithm types to make more complex algorithms + easier to set up. + + Attributes + ---------- + poly_order : int + The last polynomial order used for a polynomial algorithm. Initially is -1, denoting + that no polynomial fitting has been performed. + pspline : PSpline or None + The PSpline object for setting up and solving penalized spline algorithms. Is None + if no penalized spline setup has been performed (typically done in :meth:`._setup_spline`). + vandermonde : numpy.ndarray or None + The Vandermonde matrix for solving polynomial equations. Is None if no polynomial + setup has been performed (typically done in :meth:`._setup_polynomial`). + whittaker_system : PenalizedSystem or None + The PenalizedSystem object for setting up and solving Whittaker-smoothing-based + algorithms. Is None if no Whittaker setup has been performed (typically done in + :meth:`_setup_whittaker`). + x : numpy.ndarray or None + The x-values for the object. If initialized with None, then `x` is initialized the + first function call to have the same length as the input `data` and has min and max + values of -1 and 1, respectively. + x_domain : numpy.ndarray + The minimum and maximum values of `x`. If `x_data` is None during initialization, then + set to numpy.ndarray([-1, 1]). + + """ + + def __init__(self, x_data=None, z_data=None, check_finite=True, output_dtype=None): + """ + Initializes the algorithm object. + + Parameters + ---------- + x_data : array-like, shape (N,), optional + The x-values of the measured data. 
Default is None, which will create an
+            array from -1 to 1 during the first function call with length equal to the
+            input data length.
+        z_data : array-like, shape (N,), optional
+            The z-values of the measured data. Default is None, which will create an
+            array from -1 to 1 during the first function call with length equal to the
+            input data length.
+        check_finite : bool, optional
+            If True (default), will raise an error if any values in input data are not finite.
+            Setting to False will skip the check. Note that errors may occur if
+            `check_finite` is False and the input data contains non-finite values.
+        output_dtype : type or numpy.dtype, optional
+            The dtype to cast the output array. Default is None, which uses the typing
+            of the input data.
+
+        Notes
+        -----
+        Unlike `_Algorithm`, `_Algorithm2D` does not sort input data.
+
+        """
+        if x_data is None:
+            self.x = None
+            self.x_domain = np.array([-1., 1.])
+            self._len = None
+        else:
+            self.x = _check_array(x_data, check_finite=check_finite)
+            self._len = len(self.x)
+            self.x_domain = np.polynomial.polyutils.getdomain(self.x)
+
+        if z_data is None:
+            self.z = None
+            self.z_domain = np.array([-1., 1.])
+            self._len = None
+        else:
+            self.z = _check_array(z_data, check_finite=check_finite)
+            self._len = len(self.z)
+            self.z_domain = np.polynomial.polyutils.getdomain(self.z)
+
+        # sorting is not used for 2D data, but _override_x expects these attributes
+        self._sort_order = None
+        self._inverted_order = None
+        self.whittaker_system = None
+        self.vandermonde = None
+        self.poly_order = -1
+        self.pspline = None
+        self._check_finite = check_finite
+        self._dtype = output_dtype
+
+    def _return_results(self, baseline, params, dtype, sort_keys=(), axis=-1):
+        """
+        Re-orders the input baseline and parameters based on the x ordering.
+
+        If `self._sort_order` is None, then no reordering is performed.
+
+        Parameters
+        ----------
+        baseline : numpy.ndarray, shape (N,)
+            The baseline output by the baseline function.
+        params : dict
+            The parameter dictionary output by the baseline function.
+        dtype : type or numpy.dtype
+            The desired output dtype for the baseline.
+        sort_keys : Iterable, optional
+            An iterable of keys corresponding to the values in `params` that need
+            re-ordering. Default is ().
+        axis : int, optional
+            The axis of the input which defines each unique set of data. Default is -1.
+
+        Returns
+        -------
+        baseline : numpy.ndarray, shape (N,)
+            The input `baseline` after re-ordering and setting to the desired dtype.
+        params : dict
+            The input `params` after re-ordering the values for `sort_keys`.
+
+        """
+        baseline = baseline.astype(dtype, copy=False)
+
+        return baseline, params
+
+    @classmethod
+    def _register(cls, func=None, *, dtype=None, order=None, ensure_1d=True, axis=-1):
+        """
+        Wraps a baseline function to validate inputs and correct outputs.
+
+        The input data is converted to a numpy array, validated to ensure the length is
+        consistent, and ordered to match the input x ordering. The outputs are corrected
+        to ensure proper inverted sort ordering and dtype.
+
+        Parameters
+        ----------
+        func : Callable, optional
+            The function that is being decorated. Default is None, which returns a partial function.
+        dtype : type or numpy.dtype, optional
+            The dtype to cast the output array. Default is None, which uses the typing of `array`.
+        order : {None, 'C', 'F'}, optional
+            The order for the output array. Default is None, which will use the default array
+            ordering. Other valid options are 'C' for C ordering or 'F' for Fortran ordering.
+ ensure_1d : bool, optional + If True (default), will raise an error if the shape of `array` is not a one dimensional + array with shape (N,) or a two dimensional array with shape (N, 1) or (1, N). + axis : int, optional + The axis of the input on which to check its length. Default is -1. + + Returns + ------- + numpy.ndarray + The calculated baseline. + dict + A dictionary of parameters output by the baseline function. + + """ + if func is None: + return partial( + cls._register, dtype=dtype, order=order, ensure_1d=ensure_1d, axis=axis + ) + + @wraps(func) + def inner(self, data=None, *args, **kwargs): + if self.x is None: + if data is None: + raise TypeError('"data" and "x_data" cannot both be None') + reset_x = False + input_y = True + y, self.x = _yx_arrays( + data, check_finite=self._check_finite, dtype=dtype, order=order, + ensure_1d=False, axis=axis + ) + self._len = y.shape[axis] + else: + reset_x = True + if data is not None: + input_y = True + y = _check_sized_array( + data, self._len, check_finite=self._check_finite, dtype=dtype, order=order, + ensure_1d=False, axis=axis, name='data' + ) + else: + y = data + input_y = False + # update self.x just to ensure dtype and order are correct + x_dtype = self.x.dtype + self.x = _check_array( + self.x, dtype=dtype, order=order, check_finite=False, ensure_1d=False + ) + + if input_y and self._dtype is None: + output_dtype = y.dtype + else: + output_dtype = self._dtype + + baseline, params = func(self, y, *args, **kwargs) + if reset_x: + self.x = np.array(self.x, dtype=x_dtype, copy=False) + + return self._return_results(baseline, params, output_dtype, axis) + + return inner + + @contextmanager + def _override_x(self, new_x, new_sort_order=None): + """ + Temporarily sets the x-values for the object to a different array. + + Useful when fitting extensions of the x attribute. + + Parameters + ---------- + new_x : numpy.ndarray + The x values to temporarily use. + new_sort_order : [type], optional + The sort order for the new x values. Default is None, which will not sort. + + Yields + ------ + pybaselines._algorithm_setup._Algorithm + The _Algorithm object with the new x attribute. + + """ + old_x = self.x + old_len = self._len + old_x_domain = self.x_domain + old_sort_order = self._sort_order + old_inverted_order = self._inverted_order + # also have to reset any sized attributes to force recalculation for new x + old_poly_order = self.poly_order + old_vandermonde = self.vandermonde + old_whittaker_system = self.whittaker_system + old_pspline = self.pspline + + try: + self.x = _check_array(new_x, check_finite=self._check_finite) + self._len = len(self.x) + self.x_domain = np.polynomial.polyutils.getdomain(self.x) + self._sort_order = new_sort_order + if self._sort_order is not None: + self._inverted_order = _inverted_sort(self._sort_order) + else: + self._inverted_order = None + + self.vandermonde = None + self.poly_order = -1 + self.whittaker_system = None + self.pspline = None + + yield self + + finally: + self.x = old_x + self._len = old_len + self.x_domain = old_x_domain + self._sort_order = old_sort_order + self._inverted_order = old_inverted_order + self.vandermonde = old_vandermonde + self.poly_order = old_poly_order + self.whittaker_system = old_whittaker_system + self.pspline = old_pspline + + def _setup_morphology(self, y, half_window=None, **window_kwargs): + """ + Sets the starting parameters for morphology-based methods. 
+ + Parameters + ---------- + y : numpy.ndarray, shape (N,) + The y-values of the measured data, already converted to a numpy + array by :meth:`._register`. + half_window : int, optional + The half-window used for the morphology functions. If a value is input, + then that value will be used. Default is None, which will optimize the + half-window size using pybaselines.morphological.optimize_window. + **window_kwargs + Keyword arguments to pass to :func:`.optimize_window`. + Possible items are: + + * 'increment': int + The step size for iterating half windows. Default is 1. + * 'max_hits': int + The number of consecutive half windows that must produce the same + morphological opening before accepting the half window as the + optimum value. Default is 3. + * 'window_tol': float + The tolerance value for considering two morphological openings as + equivalent. Default is 1e-6. + * 'max_half_window': int + The maximum allowable half-window size. If None (default), will be + set to (len(data) - 1) / 2. + * 'min_half_window': int + The minimum half-window size. If None (default), will be set to 1. + + Returns + ------- + y : numpy.ndarray, shape (N,) + The y-values of the measured data, converted to a numpy array. + output_half_window : int + The accepted half window size. + + Notes + ----- + Ensures that window size is odd since morphological operations operate in + the range [-output_half_window, ..., output_half_window]. + + Half windows are dealt with rather than full window sizes to clarify their + usage. SciPy morphology operations deal with full window sizes. + + """ + if half_window is not None: + output_half_window = _check_half_window(half_window) + else: + output_half_window = _optimize_window(y, **window_kwargs) + + return y, output_half_window + + def _setup_smooth(self, y, half_window=0, allow_zero=True, **pad_kwargs): + """ + Sets the starting parameters for doing smoothing-based algorithms. + + Parameters + ---------- + y : numpy.ndarray, shape (N,) + The y-values of the measured data, already converted to a numpy + array by :meth:`._register`. + half_window : int, optional + The half-window used for the smoothing functions. Used + to pad the left and right edges of the data to reduce edge + effects. Default is 0, which provides no padding. + allow_zero : bool, optional + If True (default), allows `half_window` to be 0; otherwise, `half_window` + must be at least 1. + **pad_kwargs + Additional keyword arguments to pass to :func:`.pad_edges` for padding + the edges of the data to prevent edge effects from smoothing. + + Returns + ------- + numpy.ndarray, shape (``N + 2 * half_window``,) + The padded array of data. + + """ + hw = _check_half_window(half_window, allow_zero) + return pad_edges(y, hw, **pad_kwargs) + + def _setup_misc(self, y): + """ + Sets the starting parameters for doing miscellaneous algorithms. + + Parameters + ---------- + y : numpy.ndarray, shape (N,) + The y-values of the measured data, already converted to a numpy + array by :meth:`._register`. + + Returns + ------- + y : numpy.ndarray, shape (N,) + The y-values of the measured data, converted to a numpy array. + + Notes + ----- + Since the miscellaneous functions are not related, the only use of this + function is for aliasing the input `data` to `y`. + + """ + return y + + +# TODO maybe just make a way to merge the 1D and 2D versions +def _optimize_window(data, increment=1, max_hits=3, window_tol=1e-6, + max_half_window=None, min_half_window=None): + """ + Optimizes the morphological half-window size. 
+ + Parameters + ---------- + data : array-like, shape (N,) + The measured data values. + increment : int, optional + The step size for iterating half windows. Default is 1. + max_hits : int, optional + The number of consecutive half windows that must produce the same + morphological opening before accepting the half window as the optimum + value. Default is 3. + window_tol : float, optional + The tolerance value for considering two morphological openings as + equivalent. Default is 1e-6. + max_half_window : int, optional + The maximum allowable half-window size. If None (default), will be set + to (len(data) - 1) / 2. + min_half_window : int, optional + The minimum half-window size. If None (default), will be set to 1. + + Returns + ------- + half_window : int + The optimized half window size. + + Notes + ----- + May only provide good results for some morphological algorithms, so use with + caution. + + References + ---------- + Perez-Pueyo, R., et al. Morphology-Based Automated Baseline Removal for + Raman Spectra of Artistic Pigments. Applied Spectroscopy, 2010, 64, 595-600. + + """ + y = np.asarray(data) + if max_half_window is None: + max_half_window = (y.shape[0] - 1) // 2 + if min_half_window is None: + min_half_window = 1 + + # TODO would it be better to allow padding the data? + opening = grey_opening(y, [2 * min_half_window + 1, 2 * min_half_window + 1]) + hits = 0 + best_half_window = min_half_window + for half_window in range(min_half_window + increment, max_half_window, increment): + new_opening = grey_opening(y, [half_window * 2 + 1, half_window * 2 + 1]) + if relative_difference(opening, new_opening) < window_tol: + if hits == 0: + # keep just the first window that fits tolerance + best_half_window = half_window - increment + hits += 1 + if hits >= max_hits: + half_window = best_half_window + break + elif hits: + hits = 0 + opening = new_opening + + return max(half_window, 1) # ensure half window is at least 1 diff --git a/pybaselines/two_d/api.py b/pybaselines/two_d/api.py new file mode 100644 index 0000000..d40909d --- /dev/null +++ b/pybaselines/two_d/api.py @@ -0,0 +1,62 @@ +# -*- coding: utf-8 -*- +"""The main entry point for using the object oriented api of pybaselines. + +Created on April 8, 2023 +@author: Donald Erb + +""" + +from .morphological import _Morphological +from .smooth import _Smooth + + +class Baseline2D( + _Morphological, _Smooth +): + """ + A class for all 2D baseline correction algorithms. + + Contains all available baseline correction algorithms in pybaselines as methods to + allow a single interface for easier usage. + + Parameters + ---------- + x_data : array-like, shape (N,), optional + The x-values of the measured data. Default is None, which will create an + array from -1 to 1 during the first function call with length equal to the + input data length. + z_data : array-like, shape (N,), optional + The z-values of the measured data. Default is None, which will create an + array from -1 to 1 during the first function call with length equal to the + input data length. + check_finite : bool, optional + If True (default), will raise an error if any values in input data are not finite. + Setting to False will skip the check. Note that errors may occur if + `check_finite` is False and the input data contains non-finite values. + output_dtype : type or numpy.dtype, optional + The dtype to cast the output array. Default is None, which uses the typing + of the input data. 
+ + Attributes + ---------- + poly_order : int + The last polynomial order used for a polynomial algorithm. Initially is -1, denoting + that no polynomial fitting has been performed. + pspline : pybaselines._spline_utils.PSpline or None + The PSpline object for setting up and solving penalized spline algorithms. Is None + if no penalized spline setup has been performed. + vandermonde : numpy.ndarray or None + The Vandermonde matrix for solving polynomial equations. Is None if no polynomial + setup has been performed. + whittaker_system : pybaselines._banded_utils.PenalizedSystem or None + The PenalizedSystem object for setting up and solving Whittaker-smoothing-based + algorithms. Is None if no Whittaker setup has been performed. + x : numpy.ndarray or None + The x-values for the object. If initialized with None, then `x` is initialized the + first function call to have the same length as the input `data` and has min and max + values of -1 and 1, respectively. + x_domain : numpy.ndarray + The minimum and maximum values of `x`. If `x_data` is None during initialization, then + set to numpy.ndarray([-1, 1]). + + """ diff --git a/pybaselines/two_d/morphological.py b/pybaselines/two_d/morphological.py new file mode 100644 index 0000000..f7e67e6 --- /dev/null +++ b/pybaselines/two_d/morphological.py @@ -0,0 +1,318 @@ +# -*- coding: utf-8 -*- +"""Morphological techniques for fitting baselines to experimental data. + +Created on April 8, 2023 +@author: Donald Erb + +""" + +import numpy as np +from scipy.ndimage import grey_dilation, grey_erosion, grey_opening, uniform_filter + +from ._algorithm_setup import _Algorithm2D +from ..utils import ( + relative_difference +) + + +class _Morphological(_Algorithm2D): + """A base class for all morphological algorithms.""" + + @_Algorithm2D._register + def mor(self, data, half_window=None, **window_kwargs): + """ + A Morphological based (Mor) baseline algorithm. + + Parameters + ---------- + data : array-like, shape (N,) + The y-values of the measured data, with N data points. + half_window : int, optional + The half-window used for the morphology functions. If a value is input, + then that value will be used. Default is None, which will optimize the + half-window size using :func:`.optimize_window` and `window_kwargs`. + **window_kwargs + Values for setting the half window used for the morphology operations. + Items include: + + * 'increment': int + The step size for iterating half windows. Default is 1. + * 'max_hits': int + The number of consecutive half windows that must produce the same + morphological opening before accepting the half window as the + optimum value. Default is 1. + * 'window_tol': float + The tolerance value for considering two morphological openings as + equivalent. Default is 1e-6. + * 'max_half_window': int + The maximum allowable window size. If None (default), will be set + to (len(data) - 1) / 2. + * 'min_half_window': int + The minimum half-window size. If None (default), will be set to 1. + + Returns + ------- + baseline : numpy.ndarray, shape (N,) + The calculated baseline. + dict + A dictionary with the following items: + + * 'half_window': int + The half window used for the morphological calculations. + + References + ---------- + Perez-Pueyo, R., et al. Morphology-Based Automated Baseline Removal for + Raman Spectra of Artistic Pigments. Applied Spectroscopy, 2010, 64, 595-600. 
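+
+        Examples
+        --------
+        A minimal sketch of the intended call pattern on synthetic 2D data
+        (the half-window value here is arbitrary):
+
+        >>> import numpy as np
+        >>> from pybaselines import Baseline2D
+        >>> y = 10 + np.random.default_rng(0).normal(0, 0.1, (60, 60))
+        >>> baseline, params = Baseline2D().mor(y, half_window=5)
+        >>> baseline.shape == y.shape
+        True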
+ + """ + y, half_wind = self._setup_morphology(data, half_window, **window_kwargs) + opening = grey_opening(y, [2 * half_wind + 1, 2 * half_wind + 1]) + baseline = np.minimum(opening, _avg_opening(y, half_wind, opening)) + + return baseline, {'half_window': half_wind} + + @_Algorithm2D._register + def imor(self, data, half_window=None, tol=1e-3, max_iter=200, **window_kwargs): + """ + An Improved Morphological based (IMor) baseline algorithm. + + Parameters + ---------- + data : array-like, shape (N,) + The y-values of the measured data, with N data points. + half_window : int, optional + The half-window used for the morphology functions. If a value is input, + then that value will be used. Default is None, which will optimize the + half-window size using :func:`.optimize_window` and `window_kwargs`. + tol : float, optional + The exit criteria. Default is 1e-3. + max_iter : int, optional + The maximum number of iterations. Default is 200. + **window_kwargs + Values for setting the half window used for the morphology operations. + Items include: + + * 'increment': int + The step size for iterating half windows. Default is 1. + * 'max_hits': int + The number of consecutive half windows that must produce the same + morphological opening before accepting the half window as the + optimum value. Default is 1. + * 'window_tol': float + The tolerance value for considering two morphological openings as + equivalent. Default is 1e-6. + * 'max_half_window': int + The maximum allowable window size. If None (default), will be set + to (len(data) - 1) / 2. + * 'min_half_window': int + The minimum half-window size. If None (default), will be set to 1. + + Returns + ------- + baseline : numpy.ndarray, shape (N,) + The calculated baseline. + params : dict + A dictionary with the following items: + + * 'half_window': int + The half window used for the morphological calculations. + * 'tol_history': numpy.ndarray + An array containing the calculated tolerance values for + each iteration. The length of the array is the number of iterations + completed. If the last value in the array is greater than the input + `tol` value, then the function did not converge. + + References + ---------- + Dai, L., et al. An Automated Baseline Correction Method Based on Iterative + Morphological Operations. Applied Spectroscopy, 2018, 72(5), 731-739. + + """ + y, half_wind = self._setup_morphology(data, half_window, **window_kwargs) + baseline = y + tol_history = np.empty(max_iter + 1) + for i in range(max_iter + 1): + baseline_new = np.minimum(y, _avg_opening(baseline, half_wind)) + calc_difference = relative_difference(baseline, baseline_new) + tol_history[i] = calc_difference + if calc_difference < tol: + break + baseline = baseline_new + + params = {'half_window': half_wind, 'tol_history': tol_history[:i + 1]} + return baseline, params + + @_Algorithm2D._register + def rolling_ball(self, data, half_window=None, smooth_half_window=None, + pad_kwargs=None, **window_kwargs): + """ + The rolling ball baseline algorithm. + + Applies a minimum and then maximum moving window, and subsequently smooths the + result, giving a baseline that resembles rolling a ball across the data. + + Parameters + ---------- + data : array-like, shape (N,) + The y-values of the measured data, with N data points. + half_window : int, optional + The half-window used for the morphology functions. If a value is input, + then that value will be used. 
Default is None, which will optimize the + half-window size using :func:`.optimize_window` and `window_kwargs`. + smooth_half_window : int, optional + The half-window to use for smoothing the data after performing the + morphological operation. Default is None, which will use the same + value as used for the morphological operation. + pad_kwargs : dict, optional + A dictionary of keyword arguments to pass to :func:`.pad_edges` for + padding the edges of the data to prevent edge effects from the moving average. + **window_kwargs + Values for setting the half window used for the morphology operations. + Items include: + + * 'increment': int + The step size for iterating half windows. Default is 1. + * 'max_hits': int + The number of consecutive half windows that must produce the same + morphological opening before accepting the half window as the + optimum value. Default is 1. + * 'window_tol': float + The tolerance value for considering two morphological openings as + equivalent. Default is 1e-6. + * 'max_half_window': int + The maximum allowable window size. If None (default), will be set + to (len(data) - 1) / 2. + * 'min_half_window': int + The minimum half-window size. If None (default), will be set to 1. + + Returns + ------- + baseline : numpy.ndarray, shape (N,) + The calculated baseline. + dict + A dictionary with the following items: + + * 'half_window': int or numpy.ndarray(int) + The half window or array of half windows used for the + morphological calculations. + + Notes + ----- + To use a changing window size for either the morphological or smoothing + operations, the half windows must be arrays. Otherwise, the size of the + rolling ball is assumed to be constant. + + References + ---------- + Kneen, M.A., et al. Algorithm for fitting XRF, SEM and PIXE X-ray spectra + backgrounds. Nuclear Instruments and Methods in Physics Research B, 1996, + 109, 209-213. + + Liland, K., et al. Optimal Choice of Baseline Correction for Multivariate + Calibration of Spectra. Applied Spectroscopy, 2010, 64(9), 1007-1016. + + """ + y, half_wind = self._setup_morphology(data, half_window, **window_kwargs) + if smooth_half_window is None: + smooth_half_window = half_wind + + rough_baseline = grey_opening(y, [2 * half_wind + 1, 2 * half_wind + 1]) + baseline = uniform_filter( + rough_baseline, [2 * smooth_half_window + 1, 2 * smooth_half_window + 1] + ) + + return baseline, {'half_window': half_wind} + + @_Algorithm2D._register + def tophat(self, data, half_window=None, **window_kwargs): + """ + Estimates the baseline using a top-hat transformation (morphological opening). + + Parameters + ---------- + data : array-like, shape (N,) + The y-values of the measured data, with N data points. + half_window : int, optional + The half-window used for the morphological opening. If a value is input, + then that value will be used. Default is None, which will optimize the + half-window size using :func:`.optimize_window` and `window_kwargs`. + **window_kwargs + Values for setting the half window used for the morphology operations. + Items include: + + * 'increment': int + The step size for iterating half windows. Default is 1. + * 'max_hits': int + The number of consecutive half windows that must produce the same + morphological opening before accepting the half window as the + optimum value. Default is 1. + * 'window_tol': float + The tolerance value for considering two morphological openings as + equivalent. Default is 1e-6. + * 'max_half_window': int + The maximum allowable window size. 
If None (default), will be set + to (len(data) - 1) / 2. + * 'min_half_window': int + The minimum half-window size. If None (default), will be set to 1. + + Returns + ------- + baseline : numpy.ndarray, shape (N,) + The calculated baseline. + dict + A dictionary with the following items: + + * 'half_window': int + The half window used for the morphological calculations. + + Notes + ----- + The actual top-hat transformation is defined as `data - opening(data)`, where + `opening` is the morphological opening operation. This function, however, returns + `opening(data)`, since that is technically the baseline defined by the operation. + + References + ---------- + Perez-Pueyo, R., et al. Morphology-Based Automated Baseline Removal for + Raman Spectra of Artistic Pigments. Applied Spectroscopy, 2010, 64, 595-600. + + """ + y, half_wind = self._setup_morphology(data, half_window, **window_kwargs) + baseline = grey_opening(y, [2 * half_wind + 1, 2 * half_wind + 1]) + + return baseline, {'half_window': half_wind} + + +def _avg_opening(y, half_window, opening=None): + """ + Averages the dilation and erosion of a morphological opening on data. + + Parameters + ---------- + y : numpy.ndarray, shape (N,) + The array of the measured data. + half_window : int, optional + The half window size to use for the operations. + opening : numpy.ndarray, optional + The output of scipy.ndimage.grey_opening(y, window_size). Default is + None, which will compute the value. + + Returns + ------- + numpy.ndarray, shape (N,) + The average of the dilation and erosion of the opening. + + References + ---------- + Perez-Pueyo, R., et al. Morphology-Based Automated Baseline Removal for + Raman Spectra of Artistic Pigments. Applied Spectroscopy, 2010, 64 595-600. + + """ + window_size = 2 * half_window + 1 + if opening is None: + opening = grey_opening(y, [window_size, window_size]) + return 0.5 * ( + grey_dilation(opening, [window_size, window_size]) + + grey_erosion(opening, [window_size, window_size]) + ) diff --git a/pybaselines/two_d/smooth.py b/pybaselines/two_d/smooth.py new file mode 100644 index 0000000..1e5517a --- /dev/null +++ b/pybaselines/two_d/smooth.py @@ -0,0 +1,10 @@ +# -*- coding: utf-8 -*- +"""Smoothing-based techniques for fitting baselines to experimental data. + +Created on April 8, 2023 +@author: Donald Erb + +""" + +class _Smooth: + pass \ No newline at end of file From e7810281279dd8ae50bdd11acb4c9082b28f0a83 Mon Sep 17 00:00:00 2001 From: Donnie Erb <55961724+derb12@users.noreply.github.com> Date: Sun, 16 Apr 2023 20:27:54 -0400 Subject: [PATCH 02/56] FEAT: Add 2D versions for most polynomial algorithms Implemented the poly, modpoly, imodpoly, penalized_poly, and goldindec algorithms (didn't have to actually do anything outside of the polynomial setup). Had to skip several validations, so need to add that back in later. 
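
An illustrative usage sketch (synthetic data; x_data and z_data must
currently be given since the polynomial setup builds the Vandermonde
matrix from self.x and self.z):

    import numpy as np
    from pybaselines import Baseline2D

    rng = np.random.default_rng(0)
    y = 5 + rng.normal(0, 0.1, (40, 40))  # synthetic 2D surface
    fitter = Baseline2D(x_data=np.arange(40), z_data=np.arange(40))
    baseline, params = fitter.poly(y, poly_order=2)
    # baseline is reshaped back to (40, 40); params['weights'] is currently
    # returned flattened since reshape_keys is not yet applied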
--- pybaselines/polynomial.py | 2 +- pybaselines/two_d/_algorithm_setup.py | 112 +++- pybaselines/two_d/_validation.py | 391 ++++++++++++ pybaselines/two_d/api.py | 3 +- pybaselines/two_d/polynomial.py | 886 ++++++++++++++++++++++++++ 5 files changed, 1388 insertions(+), 6 deletions(-) create mode 100644 pybaselines/two_d/_validation.py create mode 100644 pybaselines/two_d/polynomial.py diff --git a/pybaselines/polynomial.py b/pybaselines/polynomial.py index eb33bab..f1791c3 100644 --- a/pybaselines/polynomial.py +++ b/pybaselines/polynomial.py @@ -245,7 +245,7 @@ def modpoly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, @_Algorithm._register(sort_keys=('weights',)) def imodpoly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, - use_original=False, mask_initial_peaks=True, return_coef=False, num_std=1): + use_original=False, mask_initial_peaks=True, return_coef=False, num_std=1.): """ The improved modofied polynomial (IModPoly) baseline algorithm. diff --git a/pybaselines/two_d/_algorithm_setup.py b/pybaselines/two_d/_algorithm_setup.py index fca792a..13211b0 100644 --- a/pybaselines/two_d/_algorithm_setup.py +++ b/pybaselines/two_d/_algorithm_setup.py @@ -12,8 +12,8 @@ import numpy as np from scipy.ndimage import grey_opening -from .._validation import ( - _check_array, _check_half_window, _check_sized_array, _yx_arrays +from ._validation import ( + _check_array, _check_half_window, _check_optional_array, _check_sized_array, _yx_arrays ) from ..utils import _inverted_sort, pad_edges, relative_difference @@ -135,7 +135,8 @@ def _return_results(self, baseline, params, dtype, sort_keys=(), axis=-1): return baseline, params @classmethod - def _register(cls, func=None, *, dtype=None, order=None, ensure_1d=True, axis=-1): + def _register(cls, func=None, *, sort_keys=(), dtype=None, order=None, ensure_1d=True, + axis=-1, reshape_baseline=False, reshape_keys=()): """ Wraps a baseline function to validate inputs and correct outputs. @@ -147,6 +148,9 @@ def _register(cls, func=None, *, dtype=None, order=None, ensure_1d=True, axis=-1 ---------- func : Callable, optional The function that is being decorated. Default is None, which returns a partial function. + sort_keys : tuple, optional + The keys within the output parameter dictionary that will need sorting to match the + sort order of :attr:`.x`. Default is (). dtype : type or numpy.dtype, optional The dtype to cast the output array. Default is None, which uses the typing of `array`. order : {None, 'C', 'F'}, optional @@ -157,6 +161,13 @@ def _register(cls, func=None, *, dtype=None, order=None, ensure_1d=True, axis=-1 array with shape (N,) or a two dimensional array with shape (N, 1) or (1, N). axis : int, optional The axis of the input on which to check its length. Default is -1. + reshape_baseline : bool, optional + If True, will reshape the output baseline back into the shape of the input data. If + False (default), will not modify the output baseline shape. + reshape_keys : tuple, optional + The keys within the output parameter dictionary that will need reshaped to match the + shape of the data. For example, used to convert weights for polynomials from 1D back + into the original shape. Default is (). 
Returns ------- @@ -168,11 +179,13 @@ def _register(cls, func=None, *, dtype=None, order=None, ensure_1d=True, axis=-1 """ if func is None: return partial( - cls._register, dtype=dtype, order=order, ensure_1d=ensure_1d, axis=axis + cls._register, dtype=dtype, order=order, ensure_1d=ensure_1d, axis=axis, + reshape_baseline=reshape_baseline, reshape_keys=reshape_keys ) @wraps(func) def inner(self, data=None, *args, **kwargs): + """ # TODO add back in later if self.x is None: if data is None: raise TypeError('"data" and "x_data" cannot both be None') @@ -200,12 +213,18 @@ def inner(self, data=None, *args, **kwargs): self.x, dtype=dtype, order=order, check_finite=False, ensure_1d=False ) + """ + y = data; input_y = True; reset_x = False; x_dtype = None # TODO remove later + if input_y and self._dtype is None: output_dtype = y.dtype else: output_dtype = self._dtype + y_shape = y.shape # TODO remove later and move somewhere else baseline, params = func(self, y, *args, **kwargs) + if reshape_baseline: + baseline = baseline.reshape(y_shape) if reset_x: self.x = np.array(self.x, dtype=x_dtype, copy=False) @@ -272,6 +291,91 @@ def _override_x(self, new_x, new_sort_order=None): self.whittaker_system = old_whittaker_system self.pspline = old_pspline + def _setup_polynomial(self, y, weights=None, poly_order=2, calc_vander=False, + calc_pinv=False, copy_weights=False): + """ + Sets the starting parameters for doing polynomial fitting. + + Parameters + ---------- + y : numpy.ndarray, shape (N,) + The y-values of the measured data, already converted to a numpy + array by :meth:`._register`. + weights : array-like, shape (N,), optional + The weighting array. If None (default), then will be an array with + size equal to N and all values set to 1. + poly_order : int, optional + The polynomial order. Default is 2. + calc_vander : bool, optional + If True, will calculate and the Vandermonde matrix. Default is False. + calc_pinv : bool, optional + If True, and if `return_vander` is True, will calculate and return the + pseudo-inverse of the Vandermonde matrix. Default is False. + copy_weights : boolean, optional + If True, will copy the array of input weights. Only needed if the + algorithm changes the weights in-place. Default is False. + + Returns + ------- + y : numpy.ndarray, shape (N,) + The y-values of the measured data, converted to a numpy array. + weight_array : numpy.ndarray, shape (N,) + The weight array for fitting a polynomial to the data. + pseudo_inverse : numpy.ndarray + Only returned if `calc_pinv` is True. The pseudo-inverse of the + Vandermonde matrix, calculated with singular value decomposition (SVD). + + Raises + ------ + ValueError + Raised if `calc_pinv` is True and `calc_vander` is False. + + Notes + ----- + If x_data is given, its domain is reduced from ``[min(x_data), max(x_data)]`` + to [-1., 1.] to improve the numerical stability of calculations; since the + Vandermonde matrix goes from ``x**0`` to ``x^**poly_order``, large values of + x would otherwise cause difficulty when doing least squares minimization. 
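+
+        Examples
+        --------
+        A rough sketch of the flattened 2D Vandermonde layout that this method
+        builds (standalone equivalent, for illustration only):
+
+        >>> import numpy as np
+        >>> x = np.linspace(-1, 1, 3)
+        >>> z = np.linspace(-1, 1, 4)
+        >>> poly_order = 2
+        >>> vander = np.polynomial.polynomial.polyvander2d(
+        ...     *np.meshgrid(x, z), [poly_order, poly_order]
+        ... ).reshape((-1, (poly_order + 1) * (poly_order + 1)))
+        >>> vander.shape  # (len(x) * len(z), (poly_order + 1)**2)
+        (12, 9)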
+ + """ + weight_array = _check_optional_array( + y.shape, weights, copy_input=copy_weights, check_finite=self._check_finite, ensure_1d=False # TODO change y.shape to self._len or self._shape + ) + weight_array = weight_array.ravel() + # TODO + #if self._sort_order is not None and weights is not None: + # weight_array = weight_array[self._sort_order] + + if calc_vander: + if self.vandermonde is None or poly_order > self.poly_order: + mapped_x = np.polynomial.polyutils.mapdomain( + self.x, self.x_domain, np.array([-1., 1.]) + ) + mapped_z = np.polynomial.polyutils.mapdomain( + self.z, self.z_domain, np.array([-1., 1.]) + ) + # rearrange the vandermonde such that it matches the typical A c = b where b + # is the flattened version of y and c are the coefficients + self.vandermonde = np.polynomial.polynomial.polyvander2d( + *np.meshgrid(mapped_x, mapped_z), [poly_order, poly_order] + ).reshape((-1, (poly_order + 1) * (poly_order + 1))) + + elif poly_order < self.poly_order: + pass #self.vandermonde = self.vandermonde[:, :poly_order + 1] + self.poly_order = poly_order + + if not calc_pinv: + return y, weight_array + elif not calc_vander: + raise ValueError('if calc_pinv is True, then calc_vander must also be True') + + if weights is None: + pseudo_inverse = np.linalg.pinv(self.vandermonde) + else: + pseudo_inverse = np.linalg.pinv(np.sqrt(weight_array)[:, None] * self.vandermonde) + + return y.ravel(), weight_array, pseudo_inverse + def _setup_morphology(self, y, half_window=None, **window_kwargs): """ Sets the starting parameters for morphology-based methods. diff --git a/pybaselines/two_d/_validation.py b/pybaselines/two_d/_validation.py new file mode 100644 index 0000000..f34aa1e --- /dev/null +++ b/pybaselines/two_d/_validation.py @@ -0,0 +1,391 @@ +# -*- coding: utf-8 -*- +"""Code for validating inputs. + +Created on April 16, 2023 +@author: Donald Erb + +""" + +import numpy as np + + +def _check_scalar(data, desired_length, fill_scalar=False, **asarray_kwargs): + """ + Checks if the input is scalar and potentially coerces it to the desired length. + + Only intended for one dimensional data. + + Parameters + ---------- + data : array-like + Either a scalar value or an array. Array-like inputs with only 1 item will also + be considered scalar. + desired_length : int + If `data` is an array, `desired_length` is the length the array must have. If `data` + is a scalar and `fill_scalar` is True, then `desired_length` is the length of the output. + fill_scalar : bool, optional + If True and `data` is a scalar, then will output an array with a length of + `desired_length`. Default is False, which leaves scalar values unchanged. + **asarray_kwargs : dict + Additional keyword arguments to pass to :func:`numpy.asarray`. + + Returns + ------- + output : numpy.ndarray or numpy.number + The array of values or the single array scalar, depending on the input parameters. + is_scalar : bool + True if the input was a scalar value or had a length of 1; otherwise, is False. + + Raises + ------ + ValueError + Raised if `data` is not a scalar and its length is not equal to `desired_length`. 
+ + """ + output = np.asarray(data, **asarray_kwargs) + ndim = output.ndim + if not ndim: + is_scalar = True + else: + if ndim > 1: # coerce to 1d shape + output = output.reshape(-1) + len_output = len(output) + if len_output == 1: + is_scalar = True + output = np.asarray(output[0], **asarray_kwargs) + else: + is_scalar = False + + if is_scalar: + if fill_scalar: + output = np.full(desired_length, output) + else: + # index with an empty tuple to get the single scalar while maintaining the numpy dtype + output = output[()] + elif len_output != desired_length: + raise ValueError(f'desired length was {desired_length} but instead got {len_output}') + + return output, is_scalar + + +def _check_scalar_variable(value, allow_zero=False, variable_name='lam', **asarray_kwargs): + """ + Ensures the input is a scalar value. + + Parameters + ---------- + value : float or array-like + The value to check. + allow_zero : bool, optional + If False (default), only allows `value` > 0. If True, allows `value` >= 0. + variable_name : str, optional + The name displayed if an error occurs. Default is 'lam'. + **asarray_kwargs : dict + Additional keyword arguments to pass to :func:`numpy.asarray`. + + Returns + ------- + output : float + The verified scalar value. + + Raises + ------ + ValueError + Raised if `value` is less than or equal to 0 if `allow_zero` is False or + less than 0 if `allow_zero` is True. + + """ + output = _check_scalar(value, 1, fill_scalar=False, **asarray_kwargs)[0] + if allow_zero: + operation = np.less + text = 'greater than or equal to' + else: + operation = np.less_equal + text = 'greater than' + if np.any(operation(output, 0)): + raise ValueError(f'{variable_name} must be {text} 0') + + # use an empty tuple to get the single scalar value + return output + + +def _check_array(array, dtype=None, order=None, check_finite=False, ensure_1d=True): + """ + Validates the shape and values of the input array and controls the output parameters. + + Parameters + ---------- + array : array-like + The input array to check. + dtype : type or numpy.dtype, optional + The dtype to cast the output array. Default is None, which uses the typing of `array`. + order : {None, 'C', 'F'}, optional + The order for the output array. Default is None, which will use the default array + ordering. Other valid options are 'C' for C ordering or 'F' for Fortran ordering. + check_finite : bool, optional + If True, will raise an error if any values in `array` are not finite. Default is False, + which skips the check. + ensure_1d : bool, optional + If True (default), will raise an error if the shape of `array` is not a one dimensional + array with shape (N,) or a two dimensional array with shape (N, 1) or (1, N). + + Returns + ------- + output : numpy.ndarray + The array after performing all validations. + + Raises + ------ + ValueError + Raised if `ensure_1d` is True and `array` does not have a shape of (N,) or + (N, 1) or (1, N). + + Notes + ----- + If `ensure_1d` is True and `array` has a shape of (N, 1) or (1, N), it is reshaped to + (N,) for better compatibility for all functions. 
+ + """ + if check_finite: + array_func = np.asarray_chkfinite + else: + array_func = np.asarray + output = array_func(array, dtype=dtype, order=order) + if ensure_1d: + output = np.array(output, copy=False, ndmin=1) + dimensions = output.ndim + if dimensions == 2 and 1 in output.shape: + output = output.reshape(-1) + elif dimensions != 1: + raise ValueError('must be a one dimensional array') + + return output + + +def _check_sized_array(array, length, dtype=None, order=None, check_finite=False, + ensure_1d=True, axis=-1, name='weights'): + """ + Validates the input array and ensures its length is correct. + + Parameters + ---------- + array : array-like + The input array to check. + length : int + The length that the input should have on the specified `axis`. + dtype : type or numpy.dtype, optional + The dtype to cast the output array. Default is None, which uses the typing of `array`. + order : {None, 'C', 'F'}, optional + The order for the output array. Default is None, which will use the default array + ordering. Other valid options are 'C' for C ordering or 'F' for Fortran ordering. + check_finite : bool, optional + If True, will raise an error if any values if `array` are not finite. Default is False, + which skips the check. + ensure_1d : bool, optional + If True (default), will raise an error if the shape of `array` is not a one dimensional + array with shape (N,) or a two dimensional array with shape (N, 1) or (1, N). + axis : int, optional + The axis of the input on which to check its length. Default is -1. + name : str, optional + The name for the variable if an exception is raised. Default is 'weights'. + + Returns + ------- + output : numpy.ndarray + The array after performing all validations. + + Raises + ------ + ValueError + Raised if `array` does not match `length` on the given `axis`. + + """ + output = _check_array( + array, dtype=dtype, order=order, check_finite=check_finite, ensure_1d=ensure_1d + ) + if output.shape[axis] != length: + raise ValueError( + f'length mismatch for {name}; expected {length} but got {output.shape[axis]}' + ) + return output + + +def _yx_arrays(data, x_data=None, check_finite=False, dtype=None, order=None, ensure_1d=True, + axis=-1): + """ + Converts input data into numpy arrays and provides x data if none is given. + + Parameters + ---------- + data : array-like, shape (N,) + The y-values of the measured data, with N data points. + x_data : array-like, shape (N,), optional + The x-values of the measured data. Default is None, which will create an + array from -1. to 1. with N points. + check_finite : bool, optional + If True, will raise an error if any values if `array` are not finite. Default is False, + which skips the check. + dtype : type or numpy.dtype, optional + The dtype to cast the output array. Default is None, which uses the typing of `array`. + order : {None, 'C', 'F'}, optional + The order for the output array. Default is None, which will use the default array + ordering. Other valid options are 'C' for C ordering or 'F' for Fortran ordering. + ensure_1d : bool, optional + If True (default), will raise an error if the shape of `array` is not a one dimensional + array with shape (N,) or a two dimensional array with shape (N, 1) or (1, N). + axis : int, optional + The axis of the input on which to check its length. Default is -1. + + Returns + ------- + y : numpy.ndarray, shape (N,) + A numpy array of the y-values of the measured data. 
+ x : numpy.ndarray, shape (N,) + A numpy array of the x-values of the measured data, or a created array. + + Notes + ----- + Does not change the scale/domain of the input `x_data` if it is given, only + converts it to an array. + + """ + y = _check_array( + data, dtype=dtype, order=order, check_finite=check_finite, ensure_1d=ensure_1d + ) + len_y = y.shape[axis] + if x_data is None: + x = np.linspace(-1, 1, len_y) + else: + x = _check_sized_array( + x_data, len_y, dtype=dtype, order=order, check_finite=check_finite, + ensure_1d=True, axis=0, name='x_data' + ) + + return y, x + + +def _check_lam(lam, allow_zero=False): + """ + Ensures the regularization parameter `lam` is a scalar greater than 0. + + Parameters + ---------- + lam : float or array-like + The regularization parameter, lambda, used in Whittaker smoothing and + penalized splines. + allow_zero : bool + If False (default), only allows `lam` values > 0. If True, allows `lam` >= 0. + + Returns + ------- + float + The scalar `lam` value. + + Raises + ------ + ValueError + Raised if `lam` is less than or equal to 0. + + Notes + ----- + Array-like `lam` values could be permitted, but they require using the full + banded penalty matrix. Many functions use only half of the penalty matrix due + to its symmetry; that symmetry is broken when using an array for `lam`, so allowing + an array `lam` would change how the system is solved. Further, array-like `lam` + values with large changes in scale cause some instability and/or discontinuities + when using Whittaker smoothing or penalized splines. Thus, it is easier and better + to only allow scalar `lam` values. + + TODO will maybe change this in the future to allow array-like `lam`, and the + solver will be determined based on that; however, until then, want to ensure users + don't unknowingly use an array-like `lam` when it doesn't work. + NOTE for future: if multiplying an array `lam` with the penalties in banded format, + do not reverse the order (ie. keep it like the output of sparse.dia.data), multiply + by the array, and then shift the rows based on the difference order (same procedure + as done for aspls). That will give the same output as + ``(diags(lam) @ D.T @ D).todia().data[::-1]``. + + """ + return _check_scalar_variable(lam, allow_zero) + + +def _check_half_window(half_window, allow_zero=False): + """ + Ensures the half-window is an integer and has an appropriate value. + + Parameters + ---------- + half_window : int, optional + The half-window used for the smoothing functions. Used + to pad the left and right edges of the data to reduce edge + effects. Default is 0, which provides no padding. + allow_zero : bool, optional + If True, allows `half_window` to be 0; otherwise, `half_window` + must be at least 1. Default is False. + + Returns + ------- + output_half_window : int + The verified half-window value. + + Raises + ------ + TypeError + Raised if the integer converted `half_window` is not equal to the input + `half_window`. + + """ + output_half_window = _check_scalar_variable( + half_window, allow_zero, 'half_window', dtype=np.intp + ) + if output_half_window != half_window: + raise TypeError('half_window must be an integer') + + return output_half_window + + +def _check_optional_array(data_size, array=None, dtype=None, order=None, check_finite=False, + copy_input=False, name='weights', ensure_1d=True): + """ + Validates the length of the input array or creates an array of ones if no input is given. 
+ + Parameters + ---------- + data_size : int + The length that the input should have. + array : array-like, shape (`data_size`), optional + The array to validate. Default is None, which will create an array of ones with length + equal to `data_size`. + copy_input : bool, optional + If True, returns a copy of the input `array` if it is not None. Default is False. + dtype : type or numpy.dtype, optional + The dtype to cast the output array. Default is None, which uses the typing of `array`. + order : {None, 'C', 'F'}, optional + The order for the output array. Default is None, which will use the default array + ordering. Other valid options are 'C' for C ordering or 'F' for Fortran ordering. + check_finite : bool, optional + If True, will raise an error if any values if `array` are not finite. Default is False, + which skips the check. + name : str, optional + The name for the variable if an exception is raised. Default is 'weights'. + ensure_1d : bool, optional + If True (default), will raise an error if the shape of `array` is not a one dimensional + array with shape (N,) or a two dimensional array with shape (N, 1) or (1, N). + + Returns + ------- + output_array : numpy.ndarray, shape (`data_size`) + The validated array or the new ones array. + + """ + if array is None: + output_array = np.ones(data_size) + else: + output_array = _check_sized_array( + array, data_size, dtype=dtype, order=order, check_finite=check_finite, + name=name, ensure_1d=ensure_1d, axis=slice(None) # TODO change axis later + ) + if copy_input: + output_array = output_array.copy() + + return output_array diff --git a/pybaselines/two_d/api.py b/pybaselines/two_d/api.py index d40909d..3bdb6a5 100644 --- a/pybaselines/two_d/api.py +++ b/pybaselines/two_d/api.py @@ -7,11 +7,12 @@ """ from .morphological import _Morphological +from .polynomial import _Polynomial from .smooth import _Smooth class Baseline2D( - _Morphological, _Smooth + _Morphological, _Polynomial, _Smooth ): """ A class for all 2D baseline correction algorithms. diff --git a/pybaselines/two_d/polynomial.py b/pybaselines/two_d/polynomial.py new file mode 100644 index 0000000..f31a206 --- /dev/null +++ b/pybaselines/two_d/polynomial.py @@ -0,0 +1,886 @@ +# -*- coding: utf-8 -*- +"""Polynomial techniques for fitting baselines to experimental data. + +Created on April 16, 2023 +@author: Donald Erb + + +The function penalized_poly was adapted from MATLAB code from +https://www.mathworks.com/matlabcentral/fileexchange/27429-background-correction +(accessed March 18, 2021), which was licensed under the BSD-2-clause below. + +License: 2-clause BSD + +Copyright (c) 2012, Vincent Mazet +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the distribution + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + + +The function loess was adapted from code from https://gist.github.com/agramfort/850437 +(accessed March 25, 2021), which was licensed under the BSD-3-clause below. + +# Authors: Alexandre Gramfort +# +# License: BSD (3-clause) +Copyright (c) 2015, Alexandre Gramfort +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +""" + +import numpy as np + +from .. import _weighting +from ._algorithm_setup import _Algorithm2D +from ..utils import ( + _MIN_FLOAT, _convert_coef, relative_difference +) + + +class _Polynomial(_Algorithm2D): + """A base class for all polynomial algorithms.""" + + @_Algorithm2D._register( + sort_keys=('weights',), reshape_baseline=True, reshape_keys=('weights',) + ) + def poly(self, data, poly_order=2, weights=None, return_coef=False): + """ + Computes a polynomial that fits the baseline of the data. + + Parameters + ---------- + data : array-like, shape (N,) + The y-values of the measured data, with N data points. + poly_order : int, optional + The polynomial order for fitting the baseline. Default is 2. + weights : array-like, shape (N,), optional + The weighting array. If None (default), then will be an array with + size equal to N and all values set to 1. + return_coef : bool, optional + If True, will convert the polynomial coefficients for the fit baseline to + a form that fits the input x_data and return them in the params dictionary. + Default is False, since the conversion takes time. + + Returns + ------- + baseline : numpy.ndarray, shape (N,) + The calculated baseline. 
+ params : dict
+ A dictionary with the following items:
+
+ * 'weights': numpy.ndarray, shape (N,)
+ The weight array used for fitting the data.
+ * 'coef': numpy.ndarray, shape (poly_order + 1,)
+ Only if `return_coef` is True. The array of polynomial parameters
+ for the baseline, in increasing order. Can be used to create a
+ polynomial using numpy.polynomial.polynomial.Polynomial().
+
+ Notes
+ -----
+ To only fit regions without peaks, supply a weight array with zero values
+ at the indices where peaks are located.
+
+ """
+ y, weight_array, pseudo_inverse = self._setup_polynomial(
+ data, weights, poly_order, calc_vander=True, calc_pinv=True
+ )
+ sqrt_w = np.sqrt(weight_array)
+
+ coef = pseudo_inverse @ (sqrt_w * y)
+ baseline = self.vandermonde @ coef
+ params = {'weights': weight_array}
+ if return_coef:
+ params['coef'] = _convert_coef(coef, self.x_domain)
+
+ return baseline, params
+
+ @_Algorithm2D._register(
+ sort_keys=('weights',), reshape_baseline=True, reshape_keys=('weights',)
+ )
+ def modpoly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None,
+ use_original=False, mask_initial_peaks=False, return_coef=False):
+ """
+ The modified polynomial (ModPoly) baseline algorithm.
+
+ Parameters
+ ----------
+ data : array-like, shape (N,)
+ The y-values of the measured data, with N data points.
+ poly_order : int, optional
+ The polynomial order for fitting the baseline. Default is 2.
+ tol : float, optional
+ The exit criteria. Default is 1e-3.
+ max_iter : int, optional
+ The maximum number of iterations. Default is 250.
+ weights : array-like, shape (N,), optional
+ The weighting array. If None (default), then will be an array with
+ size equal to N and all values set to 1.
+ use_original : bool, optional
+ If False (default), will compare the baseline of each iteration with
+ the y-values of that iteration [8]_ when choosing minimum values. If True,
+ will compare the baseline with the original y-values given by `data` [9]_.
+ mask_initial_peaks : bool, optional
+ If True, will mask any data where the initial baseline fit + the standard
+ deviation of the residual is less than measured data [10]_. Default is False.
+ return_coef : bool, optional
+ If True, will convert the polynomial coefficients for the fit baseline to
+ a form that fits the input x_data and return them in the params dictionary.
+ Default is False, since the conversion takes time.
+
+ Returns
+ -------
+ baseline : numpy.ndarray, shape (N,)
+ The calculated baseline.
+ params : dict
+ A dictionary with the following items:
+
+ * 'weights': numpy.ndarray, shape (N,)
+ The weight array used for fitting the data.
+ * 'tol_history': numpy.ndarray
+ An array containing the calculated tolerance values for
+ each iteration. The length of the array is the number of iterations
+ completed. If the last value in the array is greater than the input
+ `tol` value, then the function did not converge.
+ * 'coef': numpy.ndarray, shape (poly_order + 1,)
+ Only if `return_coef` is True. The array of polynomial parameters
+ for the baseline, in increasing order. Can be used to create a
+ polynomial using numpy.polynomial.polynomial.Polynomial().
+
+ Notes
+ -----
+ Algorithm originally developed in [9]_ and then slightly modified in [8]_.
+
+ References
+ ----------
+ .. [8] Gan, F., et al.
Baseline correction by improved iterative polynomial
+ fitting with automatic threshold. Chemometrics and Intelligent
+ Laboratory Systems, 2006, 82, 59-65.
+ .. [9] Lieber, C., et al. Automated method for subtraction of fluorescence
+ from biological raman spectra. Applied Spectroscopy, 2003, 57(11),
+ 1363-1367.
+ .. [10] Zhao, J., et al. Automated Autofluorescence Background Subtraction
+ Algorithm for Biomedical Raman Spectroscopy, Applied Spectroscopy,
+ 2007, 61(11), 1225-1232.
+
+ """
+ y, weight_array, pseudo_inverse = self._setup_polynomial(
+ data, weights, poly_order, calc_vander=True, calc_pinv=True, copy_weights=True
+ )
+ sqrt_w = np.sqrt(weight_array)
+ if use_original:
+ y0 = y
+
+ coef = pseudo_inverse @ (sqrt_w * y)
+ baseline = self.vandermonde @ coef
+ if mask_initial_peaks:
+ # use baseline + deviation since without deviation, half of y should be above baseline
+ weight_array[baseline + np.std(y - baseline) < y] = 0
+ sqrt_w = np.sqrt(weight_array)
+ pseudo_inverse = np.linalg.pinv(sqrt_w[:, None] * self.vandermonde)
+
+ tol_history = np.empty(max_iter)
+ for i in range(max_iter):
+ baseline_old = baseline
+ y = np.minimum(y0 if use_original else y, baseline)
+ coef = pseudo_inverse @ (sqrt_w * y)
+ baseline = self.vandermonde @ coef
+ calc_difference = relative_difference(baseline_old, baseline)
+ tol_history[i] = calc_difference
+ if calc_difference < tol:
+ break
+
+ params = {'weights': weight_array, 'tol_history': tol_history[:i + 1]}
+ if return_coef:
+ params['coef'] = _convert_coef(coef, self.x_domain)
+
+ return baseline, params
+
+ @_Algorithm2D._register(
+ sort_keys=('weights',), reshape_baseline=True, reshape_keys=('weights',)
+ )
+ def imodpoly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None,
+ use_original=False, mask_initial_peaks=True, return_coef=False, num_std=1.):
+ """
+ The improved modified polynomial (IModPoly) baseline algorithm.
+
+ Parameters
+ ----------
+ data : array-like, shape (N,)
+ The y-values of the measured data, with N data points.
+ poly_order : int, optional
+ The polynomial order for fitting the baseline. Default is 2.
+ tol : float, optional
+ The exit criteria. Default is 1e-3.
+ max_iter : int, optional
+ The maximum number of iterations. Default is 250.
+ weights : array-like, shape (N,), optional
+ The weighting array. If None (default), then will be an array with
+ size equal to N and all values set to 1.
+ use_original : bool, optional
+ If False (default), will compare the baseline of each iteration with
+ the y-values of that iteration [11]_ when choosing minimum values. If True,
+ will compare the baseline with the original y-values given by `data` [12]_.
+ mask_initial_peaks : bool, optional
+ If True (default), will mask any data where the initial baseline fit +
+ the standard deviation of the residual is less than measured data [13]_.
+ return_coef : bool, optional
+ If True, will convert the polynomial coefficients for the fit baseline to
+ a form that fits the input x_data and return them in the params dictionary.
+ Default is False, since the conversion takes time.
+ num_std : float, optional
+ The number of standard deviations to include when thresholding. Default
+ is 1. Must be greater than or equal to 0.
+
+ Returns
+ -------
+ baseline : numpy.ndarray, shape (N,)
+ The calculated baseline.
+ params : dict
+ A dictionary with the following items:
+
+ * 'weights': numpy.ndarray, shape (N,)
+ The weight array used for fitting the data.
+ * 'tol_history': numpy.ndarray + An array containing the calculated tolerance values for + each iteration. The length of the array is the number of iterations + completed. If the last value in the array is greater than the input + `tol` value, then the function did not converge. + * 'coef': numpy.ndarray, shape (poly_order + 1,) + Only if `return_coef` is True. The array of polynomial parameters + for the baseline, in increasing order. Can be used to create a + polynomial using numpy.polynomial.polynomial.Polynomial(). + + Raises + ------ + ValueError + Raised if `num_std` is less than 0. + + Notes + ----- + Algorithm originally developed in [13]_. + + References + ---------- + .. [11] Gan, F., et al. Baseline correction by improved iterative polynomial + fitting with automatic threshold. Chemometrics and Intelligent + Laboratory Systems, 2006, 82, 59-65. + .. [12] Lieber, C., et al. Automated method for subtraction of fluorescence + from biological raman spectra. Applied Spectroscopy, 2003, 57(11), + 1363-1367. + .. [13] Zhao, J., et al. Automated Autofluorescence Background Subtraction + Algorithm for Biomedical Raman Spectroscopy, Applied Spectroscopy, + 2007, 61(11), 1225-1232. + + """ + if num_std < 0: + raise ValueError('num_std must be greater than or equal to 0') + + y, weight_array, pseudo_inverse = self._setup_polynomial( + data, weights, poly_order, calc_vander=True, calc_pinv=True, copy_weights=True + ) + sqrt_w = np.sqrt(weight_array) + if use_original: + y0 = y + + coef = pseudo_inverse @ (sqrt_w * y) + baseline = self.vandermonde @ coef + deviation = np.std(y - baseline) + if mask_initial_peaks: + weight_array[baseline + deviation < y] = 0 + sqrt_w = np.sqrt(weight_array) + pseudo_inverse = np.linalg.pinv(sqrt_w[:, None] * self.vandermonde) + + tol_history = np.empty(max_iter) + for i in range(max_iter): + y = np.minimum(y0 if use_original else y, baseline + num_std * deviation) + coef = pseudo_inverse @ (sqrt_w * y) + baseline = self.vandermonde @ coef + new_deviation = np.std(y - baseline) + # use new_deviation as dividing term in relative difference + calc_difference = relative_difference(new_deviation, deviation) + tol_history[i] = calc_difference + if calc_difference < tol: + break + deviation = new_deviation + + params = {'weights': weight_array, 'tol_history': tol_history[:i + 1]} + if return_coef: + params['coef'] = _convert_coef(coef, self.x_domain) + + return baseline, params + + # adapted from + # https://www.mathworks.com/matlabcentral/fileexchange/27429-background-correction; + # see license above + @_Algorithm2D._register( + sort_keys=('weights',), reshape_baseline=True, reshape_keys=('weights',) + ) + def penalized_poly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, + cost_function='asymmetric_truncated_quadratic', threshold=None, + alpha_factor=0.99, return_coef=False): + """ + Fits a polynomial baseline using a non-quadratic cost function. + + The non-quadratic cost functions penalize residuals with larger values, + giving a more robust fit compared to normal least-squares. + + Parameters + ---------- + data : array-like, shape (N,) + The y-values of the measured data, with N data points. + poly_order : int, optional + The polynomial order for fitting the baseline. Default is 2. + tol : float, optional + The exit criteria. Default is 1e-3. + max_iter : int, optional + The maximum number of iterations. Default is 250. + weights : array-like, shape (N,), optional + The weighting array. 
If None (default), then will be an array with + size equal to N and all values set to 1. + cost_function : str, optional + The non-quadratic cost function to minimize. Must indicate symmetry of the + method by appending 'a' or 'asymmetric' for asymmetric loss, and 's' or + 'symmetric' for symmetric loss. Default is 'asymmetric_truncated_quadratic'. + Available methods, and their associated reference, are: + + * 'asymmetric_truncated_quadratic'[14]_ + * 'symmetric_truncated_quadratic'[14]_ + * 'asymmetric_huber'[14]_ + * 'symmetric_huber'[14]_ + * 'asymmetric_indec'[15]_ + * 'symmetric_indec'[15]_ + + threshold : float, optional + The threshold value for the loss method, where the function goes from + quadratic loss (such as used for least squares) to non-quadratic. For + symmetric loss methods, residual values with absolute value less than + threshold will have quadratic loss. For asymmetric loss methods, residual + values less than the threshold will have quadratic loss. Default is None, + which sets `threshold` to one-tenth of the standard deviation of the input + data. + alpha_factor : float, optional + A value between 0 and 1 that controls the value of the penalty. Default is + 0.99. Typically should not need to change this value. + return_coef : bool, optional + If True, will convert the polynomial coefficients for the fit baseline to + a form that fits the input x_data and return them in the params dictionary. + Default is False, since the conversion takes time. + + Returns + ------- + baseline : numpy.ndarray, shape (N,) + The calculated baseline. + params : dict + A dictionary with the following items: + + * 'weights': numpy.ndarray, shape (N,) + The weight array used for fitting the data. + * 'tol_history': numpy.ndarray + An array containing the calculated tolerance values for + each iteration. The length of the array is the number of iterations + completed. If the last value in the array is greater than the input + `tol` value, then the function did not converge. + * 'coef': numpy.ndarray, shape (poly_order + 1,) + Only if `return_coef` is True. The array of polynomial parameters + for the baseline, in increasing order. Can be used to create a + polynomial using numpy.polynomial.polynomial.Polynomial(). + + Raises + ------ + ValueError + Raised if `alpha_factor` is not between 0 and 1. + + Notes + ----- + In baseline literature, this procedure is sometimes called "backcor". + + References + ---------- + .. [14] Mazet, V., et al. Background removal from spectra by designing and + minimising a non-quadratic cost function. Chemometrics and Intelligent + Laboratory Systems, 2005, 76(2), 121-133. + .. [15] Liu, J., et al. Goldindec: A Novel Algorithm for Raman Spectrum Baseline + Correction. Applied Spectroscopy, 2015, 69(7), 834-842. 
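+
+ Examples
+ --------
+ A minimal, illustrative call on synthetic data (the 2D input handling
+ is still experimental, so treat this as a sketch rather than a reference):
+
+ >>> import numpy as np
+ >>> from pybaselines import Baseline2D
+ >>> x = np.linspace(-1, 1, 100)
+ >>> z = np.linspace(-1, 1, 50)
+ >>> X, Z = np.meshgrid(x, z, indexing='ij')
+ >>> y = 0.5 + 0.1 * X + 0.2 * Z**2 + np.exp(-(X**2 + Z**2) / 0.01)
+ >>> baseline, params = Baseline2D(x, z).penalized_poly(y, poly_order=2)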
+ + """ + if not 0 < alpha_factor <= 1: + raise ValueError('alpha_factor must be between 0 and 1') + symmetric_loss, method = _identify_loss_method(cost_function) + loss_function = { + 'huber': _huber_loss, + 'truncated_quadratic': _truncated_quadratic_loss, + 'indec': _indec_loss + }[method] + + y, weight_array, pseudo_inverse = self._setup_polynomial( + data, weights, poly_order, calc_vander=True, calc_pinv=True + ) + if threshold is None: + threshold = np.std(y) / 10 + loss_kwargs = { + 'threshold': threshold, 'alpha_factor': alpha_factor, 'symmetric': symmetric_loss + } + + sqrt_w = np.sqrt(weight_array) + y = sqrt_w * y + + coef = pseudo_inverse @ y + baseline = self.vandermonde @ coef + tol_history = np.empty(max_iter) + for i in range(max_iter): + baseline_old = baseline + coef = pseudo_inverse @ (y + loss_function(y - sqrt_w * baseline, **loss_kwargs)) + baseline = self.vandermonde @ coef + calc_difference = relative_difference(baseline_old, baseline) + tol_history[i] = calc_difference + if calc_difference < tol: + break + + params = {'weights': weight_array, 'tol_history': tol_history[:i + 1]} + if return_coef: + params['coef'] = _convert_coef(coef, self.x_domain) + + return baseline, params + + @_Algorithm2D._register( + sort_keys=('weights',), reshape_baseline=True, reshape_keys=('weights',) + ) + def goldindec(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, + cost_function='asymmetric_indec', peak_ratio=0.5, alpha_factor=0.99, + tol_2=1e-3, tol_3=1e-6, max_iter_2=100, return_coef=False): + """ + Fits a polynomial baseline using a non-quadratic cost function. + + The non-quadratic cost functions penalize residuals with larger values, + giving a more robust fit compared to normal least-squares. + + Parameters + ---------- + data : array-like, shape (N,) + The y-values of the measured data, with N data points. + poly_order : int, optional + The polynomial order for fitting the baseline. Default is 2. + tol : float, optional + The exit criteria for the fitting with a given threshold value. Default is 1e-3. + max_iter : int, optional + The maximum number of iterations for fitting a threshold value. Default is 250. + weights : array-like, shape (N,), optional + The weighting array. If None (default), then will be an array with + size equal to N and all values set to 1. + cost_function : str, optional + The non-quadratic cost function to minimize. Unlike :func:`.penalized_poly`, + this function only works with asymmetric cost functions, so the symmetry prefix + ('a' or 'asymmetric') is optional (eg. 'indec' and 'a_indec' are the same). Default + is 'asymmetric_indec'. Available methods, and their associated reference, are: + + * 'asymmetric_indec'[25]_ + * 'asymmetric_truncated_quadratic'[26]_ + * 'asymmetric_huber'[26]_ + + peak_ratio : float, optional + A value between 0 and 1 that designates how many points in the data belong + to peaks. Values are valid within ~10% of the actual peak ratio. Default is 0.5. + alpha_factor : float, optional + A value between 0 and 1 that controls the value of the penalty. Default is + 0.99. Typically should not need to change this value. + tol_2 : float, optional + The exit criteria for the difference between the optimal up-down ratio (number of + points above 0 in the residual compared to number of points below 0) and the up-down + ratio for a given threshold value. Default is 1e-3. + tol_3 : float, optional + The exit criteria for the relative change in the threshold value. Default is 1e-6. 
+ max_iter_2 : int, optional
+ The number of iterations for iterating between different threshold values.
+ Default is 100.
+ return_coef : bool, optional
+ If True, will convert the polynomial coefficients for the fit baseline to
+ a form that fits the input x_data and return them in the params dictionary.
+ Default is False, since the conversion takes time.
+
+ Returns
+ -------
+ baseline : numpy.ndarray, shape (N,)
+ The calculated baseline.
+ params : dict
+ A dictionary with the following items:
+
+ * 'weights': numpy.ndarray, shape (N,)
+ The weight array used for fitting the data.
+ * 'tol_history': numpy.ndarray, shape (J, K)
+ An array containing the calculated tolerance values for each iteration
+ of both threshold values and fit values. Index 0 contains the tolerance
+ values for the difference in up-down ratios, index 1 contains the tolerance
+ values for the relative change in the threshold, and indices >= 2 contain
+ the tolerance values for each fit. All values that were not used in fitting
+ have values of 0. Shape J is 2 plus the number of iterations for the
+ threshold to converge (related to `max_iter_2`, `tol_2`, `tol_3`), and
+ shape K is the maximum of the number of iterations for the threshold and
+ the maximum number of iterations for all of the fits of the various
+ threshold values (related to `max_iter` and `tol`).
+ * 'threshold' : float
+ The optimal threshold value. Could be used in :func:`.penalized_poly`
+ for fitting other similar data.
+ * 'coef': numpy.ndarray, shape (poly_order + 1,)
+ Only if `return_coef` is True. The array of polynomial parameters
+ for the baseline, in increasing order. Can be used to create a
+ polynomial using numpy.polynomial.polynomial.Polynomial().
+
+ Raises
+ ------
+ ValueError
+ Raised if `alpha_factor` or `peak_ratio` are not between 0 and 1, or if the
+ specified cost function is symmetric.
+
+ References
+ ----------
+ .. [25] Liu, J., et al. Goldindec: A Novel Algorithm for Raman Spectrum Baseline
+ Correction. Applied Spectroscopy, 2015, 69(7), 834-842.
+ .. [26] Mazet, V., et al. Background removal from spectra by designing and
+ minimising a non-quadratic cost function. Chemometrics and Intelligent
+ Laboratory Systems, 2005, 76(2), 121-133.
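+
+ Examples
+ --------
+ A rough usage sketch on synthetic data (values here are arbitrary and the
+ 2D API is experimental):
+
+ >>> import numpy as np
+ >>> from pybaselines import Baseline2D
+ >>> x = np.linspace(-1, 1, 100)
+ >>> z = np.linspace(-1, 1, 50)
+ >>> X, Z = np.meshgrid(x, z, indexing='ij')
+ >>> y = 1 + 0.5 * X + np.exp(-(X**2 + Z**2) / 0.02)
+ >>> baseline, params = Baseline2D(x, z).goldindec(y, poly_order=2, peak_ratio=0.2)
+ >>> best_threshold = params['threshold']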
+ + """ + if not 0 < alpha_factor <= 1: + raise ValueError('alpha_factor must be between 0 and 1') + elif not 0 < peak_ratio < 1: + raise ValueError('peak_ratio must be between 0 and 1') + try: + symmetric_loss, method = _identify_loss_method(cost_function) + except ValueError: # do not require a prefix since cost must be asymmetric + symmetric_loss, method = _identify_loss_method('a_' + cost_function) + if symmetric_loss: + # symmetric cost functions don't work due to how the up-down ratio vs + # peak_ratio function was created in the reference; in theory, could simulate + # spectra with both positive and negative peaks following the reference + # and build another empirical function, but would likely need to also + # add other parameters detailing the percent of positive vs negative peaks, + # etc., so it's not worth the effort + raise ValueError('goldindec only works for asymmetric cost functions') + + loss_function = { + 'huber': _huber_loss, + 'truncated_quadratic': _truncated_quadratic_loss, + 'indec': _indec_loss + }[method] + y, weight_array, pseudo_inverse = self._setup_polynomial( + data, weights, poly_order, calc_vander=True, calc_pinv=True + ) + up_down_ratio_goal = ( + 0.7679 + 11.2358 * peak_ratio - 39.7064 * peak_ratio**2 + 92.3583 * peak_ratio**3 + ) + # TODO reference states threshold must be <= 2 for half-quadratic minimization to + # be valid for indec cost function, and normalized y so that threshold is always <= 2; + # however, it seems to work fine without normalization; just be aware in case errors + # occur, may have to normalize y in both this function and penalized_poly + sqrt_w = np.sqrt(weight_array) + y_fit = sqrt_w * y + + coef = pseudo_inverse @ y_fit + initial_baseline = self.vandermonde @ coef + + a = 0 + # reference used b=1, but normalized y before fitting; instead, set b as max of + # initial residual + b = abs((y - initial_baseline).max()) + threshold = a + 0.618 * (b - a) + loss_kwargs = { + 'threshold': threshold, 'alpha_factor': alpha_factor, + 'symmetric': symmetric_loss + } + # have to use zeros rather than empty for tol_history since each inner fit may + # have a different number of iterations + tol_history = np.zeros((max_iter_2 + 2, max(max_iter, max_iter_2))) + j_max = 0 + for i in range(max_iter_2): + baseline = initial_baseline + for j in range(max_iter): + baseline_old = baseline + coef = pseudo_inverse @ ( + y_fit + loss_function(y_fit - sqrt_w * baseline, **loss_kwargs) + ) + baseline = self.vandermonde @ coef + calc_difference = relative_difference(baseline_old, baseline) + tol_history[i + 2, j] = calc_difference + if calc_difference < tol: + break + if j > j_max: + j_max = j + + up_count = (y > baseline).sum() + up_down_ratio = up_count / max(1, self._len - up_count) + calc_difference = up_down_ratio - up_down_ratio_goal + tol_history[0, i] = calc_difference + if calc_difference > tol_2: + a = threshold + elif calc_difference < -tol_2: + b = threshold + else: + break + threshold = a + 0.618 * (b - a) + # this exit criteria was not stated in the reference, but the change in threshold + # becomes zero fairly quickly, so need to also exit rather than needlessly + # continuing to calculate with the same threshold value + calc_difference = relative_difference(loss_kwargs['threshold'], threshold) + tol_history[1, i] = calc_difference + if calc_difference < tol_3: + break + loss_kwargs['threshold'] = threshold + + params = { + 'weights': weight_array, 'tol_history': tol_history[:i + 3, :max(i, j_max) + 1], + 'threshold': 
loss_kwargs['threshold']
+ }
+ if return_coef:
+ params['coef'] = _convert_coef(coef, self.x_domain)
+
+ return baseline, params
+
+
+# adapted from (https://www.mathworks.com/matlabcentral/fileexchange/27429-background-correction);
+# see license above
+def _huber_loss(residual, threshold=1.0, alpha_factor=0.99, symmetric=True):
+ """
+ The Huber non-quadratic cost function.
+
+ Parameters
+ ----------
+ residual : numpy.ndarray, shape (N,)
+ The residual array.
+ threshold : float, optional
+ Any residual values below the threshold are given quadratic loss.
+ Default is 1.0.
+ alpha_factor : float, optional
+ The scale between 0 and 1 to multiply the cost function's alpha_max
+ value (see Notes below). Default is 0.99.
+ symmetric : bool, optional
+ If True (default), the cost function is symmetric and applies the same
+ weighting for positive and negative values. If False, will apply weights
+ asymmetrically so that only positive weights are given the non-quadratic
+ weighting and negative weights have normal, quadratic weighting.
+
+ Returns
+ -------
+ weights : numpy.ndarray, shape (N,)
+ The weight array.
+
+ Notes
+ -----
+ The returned result is
+
+ -residual + alpha_factor * alpha_max * phi'(residual)
+
+ where phi'(x) is the derivative of the huber loss function, phi(x).
+
+ References
+ ----------
+ Mazet, V., et al. Background removal from spectra by designing and
+ minimising a non-quadratic cost function. Chemometrics and Intelligent
+ Laboratory Systems, 2005, 76(2), 121-133.
+
+ """
+ alpha = alpha_factor * 0.5 # alpha_max for huber is 0.5
+ if symmetric:
+ mask = (np.abs(residual) < threshold)
+ weights = (
+ mask * residual * (2 * alpha - 1)
+ + (~mask) * 2 * alpha * threshold * np.sign(residual)
+ )
+ else:
+ mask = (residual < threshold)
+ weights = (
+ mask * residual * (2 * alpha - 1)
+ + (~mask) * (2 * alpha * threshold - residual)
+ )
+ return weights
+
+
+# adapted from (https://www.mathworks.com/matlabcentral/fileexchange/27429-background-correction);
+# see license above
+def _truncated_quadratic_loss(residual, threshold=1.0, alpha_factor=0.99, symmetric=True):
+ """
+ The Truncated-Quadratic non-quadratic cost function.
+
+ Parameters
+ ----------
+ residual : numpy.ndarray, shape (N,)
+ The residual array.
+ threshold : float, optional
+ Any residual values below the threshold are given quadratic loss.
+ Default is 1.0.
+ alpha_factor : float, optional
+ The scale between 0 and 1 to multiply the cost function's alpha_max
+ value (see Notes below). Default is 0.99.
+ symmetric : bool, optional
+ If True (default), the cost function is symmetric and applies the same
+ weighting for positive and negative values. If False, will apply weights
+ asymmetrically so that only positive weights are given the non-quadratic
+ weighting and negative weights have normal, quadratic weighting.
+
+ Returns
+ -------
+ weights : numpy.ndarray, shape (N,)
+ The weight array.
+
+ Notes
+ -----
+ The returned result is
+
+ -residual + alpha_factor * alpha_max * phi'(residual)
+
+ where phi'(x) is the derivative of the truncated quadratic function, phi(x).
+
+ References
+ ----------
+ Mazet, V., et al. Background removal from spectra by designing and
+ minimising a non-quadratic cost function. Chemometrics and Intelligent
+ Laboratory Systems, 2005, 76(2), 121-133.
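+
+ Examples
+ --------
+ A small behavioral sketch (no exact outputs shown since they depend on
+ `alpha_factor`):
+
+ >>> import numpy as np
+ >>> residual = np.array([-2.0, -0.5, 0.5, 2.0])
+ >>> # residuals within +-threshold get the quadratic-region correction,
+ >>> # residual * (2 * alpha - 1); the rest are truncated and contribute
+ >>> # -residual to the returned array
+ >>> weights = _truncated_quadratic_loss(residual, threshold=1.0, symmetric=True)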
+
+ """
+ alpha = alpha_factor * 0.5 # alpha_max for truncated quadratic is 0.5
+ if symmetric:
+ mask = (np.abs(residual) < threshold)
+ else:
+ mask = (residual < threshold)
+ return mask * residual * (2 * alpha - 1) - (~mask) * residual
+
+
+def _indec_loss(residual, threshold=1.0, alpha_factor=0.99, symmetric=True):
+ """
+ The Indec non-quadratic cost function.
+
+ Parameters
+ ----------
+ residual : numpy.ndarray, shape (N,)
+ The residual array.
+ threshold : float, optional
+ Any residual values below the threshold are given quadratic loss.
+ Default is 1.0.
+ alpha_factor : float, optional
+ The scale between 0 and 1 to multiply the cost function's alpha_max
+ value (see Notes below). Default is 0.99.
+ symmetric : bool, optional
+ If True (default), the cost function is symmetric and applies the same
+ weighting for positive and negative values. If False, will apply weights
+ asymmetrically so that only positive weights are given the non-quadratic
+ weighting and negative weights have normal, quadratic weighting.
+
+ Returns
+ -------
+ weights : numpy.ndarray, shape (N,)
+ The weight array.
+
+ Notes
+ -----
+ The returned result is
+
+ -residual + alpha_factor * alpha_max * phi'(residual)
+
+ where phi'(x) is the derivative of the Indec function, phi(x).
+
+ References
+ ----------
+ Liu, J., et al. Goldindec: A Novel Algorithm for Raman Spectrum Baseline
+ Correction. Applied Spectroscopy, 2015, 69(7), 834-842.
+
+ Mazet, V., et al. Background removal from spectra by designing and
+ minimising a non-quadratic cost function. Chemometrics and Intelligent
+ Laboratory Systems, 2005, 76(2), 121-133.
+
+ """
+ alpha = alpha_factor * 0.5 # alpha_max for indec is 0.5
+ if symmetric:
+ mask = (np.abs(residual) < threshold)
+ multiple = np.sign(residual)
+ else:
+ mask = (residual < threshold)
+ # multiple=1 is same as sign(residual) since residual is always > 0
+ # for asymmetric case, but this allows not doing the sign calculation
+ multiple = 1
+ weights = (
+ mask * residual * (2 * alpha - 1)
+ - (~mask) * (
+ residual + alpha * multiple * threshold**3 / np.maximum(2 * residual**2, _MIN_FLOAT)
+ )
+ )
+ return weights
+
+
+def _identify_loss_method(loss_method):
+ """
+ Identifies the symmetry for the given loss method.
+
+ Parameters
+ ----------
+ loss_method : str
+ The loss method to use. Should have the symmetry identifier as
+ the prefix.
+
+ Returns
+ -------
+ symmetric : bool
+ True if `loss_method` had 's_' or 'symmetric_' as the prefix, else False.
+ str
+ The input `loss_method` value without the first section that indicated
+ the symmetry.
+
+ Raises
+ ------
+ ValueError
+ Raised if the loss method does not have the correct form.
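+
+ Examples
+ --------
+ The symmetry prefix is split off and mapped to a boolean (these outputs
+ follow directly from the parsing below):
+
+ >>> _identify_loss_method('a_huber')
+ (False, 'huber')
+ >>> _identify_loss_method('symmetric_truncated_quadratic')
+ (True, 'truncated_quadratic')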
+ + """ + prefix, *split_method = loss_method.lower().split('_') + if prefix not in ('a', 's', 'asymmetric', 'symmetric') or not split_method: + raise ValueError('must specify loss function symmetry by appending "a_" or "s_"') + if prefix in ('a', 'asymmetric'): + symmetric = False + else: + symmetric = True + return symmetric, '_'.join(split_method) From 8031abc73096ea8b83f3b7012d75b77717ba19b4 Mon Sep 17 00:00:00 2001 From: Donnie Erb <55961724+derb12@users.noreply.github.com> Date: Sat, 22 Apr 2023 10:12:42 -0400 Subject: [PATCH 03/56] FEAT: Added 2D version of quant_reg --- pybaselines/two_d/_algorithm_setup.py | 4 +- pybaselines/two_d/polynomial.py | 104 ++++++++++++++++++++++++++ 2 files changed, 106 insertions(+), 2 deletions(-) diff --git a/pybaselines/two_d/_algorithm_setup.py b/pybaselines/two_d/_algorithm_setup.py index 13211b0..8888f67 100644 --- a/pybaselines/two_d/_algorithm_setup.py +++ b/pybaselines/two_d/_algorithm_setup.py @@ -363,7 +363,7 @@ def _setup_polynomial(self, y, weights=None, poly_order=2, calc_vander=False, elif poly_order < self.poly_order: pass #self.vandermonde = self.vandermonde[:, :poly_order + 1] self.poly_order = poly_order - + y = y.ravel() if not calc_pinv: return y, weight_array elif not calc_vander: @@ -374,7 +374,7 @@ def _setup_polynomial(self, y, weights=None, poly_order=2, calc_vander=False, else: pseudo_inverse = np.linalg.pinv(np.sqrt(weight_array)[:, None] * self.vandermonde) - return y.ravel(), weight_array, pseudo_inverse + return y, weight_array, pseudo_inverse def _setup_morphology(self, y, half_window=None, **window_kwargs): """ diff --git a/pybaselines/two_d/polynomial.py b/pybaselines/two_d/polynomial.py index f31a206..6e3d535 100644 --- a/pybaselines/two_d/polynomial.py +++ b/pybaselines/two_d/polynomial.py @@ -490,6 +490,110 @@ def penalized_poly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=Non return baseline, params + @_Algorithm2D._register( + sort_keys=('weights',), reshape_baseline=True, reshape_keys=('weights',) + ) + def quant_reg(self, data, poly_order=2, quantile=0.05, tol=1e-6, max_iter=250, + weights=None, eps=None, return_coef=False): + """ + Approximates the baseline of the data using quantile regression. + + Parameters + ---------- + data : array-like, shape (N,) + The y-values of the measured data, with N data points. + poly_order : int, optional + The polynomial order for fitting the baseline. Default is 2. + quantile : float, optional + The quantile at which to fit the baseline. Default is 0.05. + tol : float, optional + The exit criteria. Default is 1e-6. For extreme quantiles (`quantile` < 0.01 + or `quantile` > 0.99), may need to use a lower value to get a good fit. + max_iter : int, optional + The maximum number of iterations. Default is 250. For extreme quantiles + (`quantile` < 0.01 or `quantile` > 0.99), may need to use a higher value to + ensure convergence. + weights : array-like, shape (N,), optional + The weighting array. If None (default), then will be an array with + size equal to N and all values set to 1. + eps : float, optional + A small value added to the square of the residual to prevent dividing by 0. + Default is None, which uses the square of the maximum-absolute-value of the + fit each iteration multiplied by 1e-6. + return_coef : bool, optional + If True, will convert the polynomial coefficients for the fit baseline to + a form that fits the input `x_data` and return them in the params dictionary. + Default is False, since the conversion takes time. 
+
+ Returns
+ -------
+ baseline : numpy.ndarray, shape (N,)
+ The calculated baseline.
+ params : dict
+ A dictionary with the following items:
+
+ * 'weights': numpy.ndarray, shape (N,)
+ The weight array used for fitting the data.
+ * 'tol_history': numpy.ndarray
+ An array containing the calculated tolerance values for
+ each iteration. The length of the array is the number of iterations
+ completed. If the last value in the array is greater than the input
+ `tol` value, then the function did not converge.
+ * 'coef': numpy.ndarray, shape (poly_order + 1,)
+ Only if `return_coef` is True. The array of polynomial parameters
+ for the baseline, in increasing order. Can be used to create a
+ polynomial using numpy.polynomial.polynomial.Polynomial().
+
+ Raises
+ ------
+ ValueError
+ Raised if `quantile` is not between 0 and 1.
+
+ Notes
+ -----
+ Application of quantile regression for baseline fitting is described in [23]_.
+
+ Performs quantile regression using iteratively reweighted least squares (IRLS)
+ as described in [24]_.
+
+ References
+ ----------
+ .. [23] Komsta, Ł. Comparison of Several Methods of Chromatographic
+ Baseline Removal with a New Approach Based on Quantile Regression.
+ Chromatographia, 2011, 73, 721-731.
+ .. [24] Schnabel, S., et al. Simultaneous estimation of quantile curves using
+ quantile sheets. AStA Advances in Statistical Analysis, 2013, 97, 77-87.
+
+ """
+ # TODO provide a way to estimate best poly_order based on AIC like in Komsta? could be
+ # useful for all polynomial methods; maybe could be an optimizer function
+ if not 0 < quantile < 1:
+ raise ValueError('quantile must be between 0 and 1.')
+
+ y, weight_array = self._setup_polynomial(data, weights, poly_order, calc_vander=True)
+ # estimate first iteration using least squares
+ sqrt_w = np.sqrt(weight_array)
+ coef = np.linalg.lstsq(self.vandermonde * sqrt_w[:, None], y * sqrt_w, None)[0]
+ baseline = self.vandermonde @ coef
+ tol_history = np.empty(max_iter)
+ for i in range(max_iter):
+ baseline_old = baseline
+ sqrt_w = np.sqrt(_weighting._quantile(y, baseline, quantile, eps))
+ coef = np.linalg.lstsq(self.vandermonde * sqrt_w[:, None], y * sqrt_w, None)[0]
+ baseline = self.vandermonde @ coef
+ # relative_difference(baseline_old, baseline, 1) gives nearly same result and
+ # the l2 norm is faster to calculate, so use that instead of l1 norm
+ calc_difference = relative_difference(baseline_old, baseline)
+ tol_history[i] = calc_difference
+ if calc_difference < tol:
+ break
+
+ params = {'weights': sqrt_w**2, 'tol_history': tol_history[:i + 1]}
+ if return_coef:
+ params['coef'] = _convert_coef(coef, self.x_domain)
+
+ return baseline, params
+
 @_Algorithm2D._register(
 sort_keys=('weights',), reshape_baseline=True, reshape_keys=('weights',)
 )

From 8de57b023fd3ffaf7046c61ba11622fbc24f503f Mon Sep 17 00:00:00 2001
From: Donnie Erb <55961724+derb12@users.noreply.github.com>
Date: Tue, 25 Apr 2023 20:19:32 -0400
Subject: [PATCH 04/56] FEAT: Yeehaw finally figured out 2D penalized splines

Someone should give Paul Eilers the Nobel prize, dude is the GOAT.

On a more serious note, the internals of the PSpline2D class will most
likely change, but the external calls within the baseline algorithms
should remain the same.

Implemented the 2D versions of irsqr, pspline_asls, pspline_airpls,
pspline_arpls, pspline_iarpls, and pspline_psalsa.
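For reference, the underlying math is Eilers' tensor-product P-spline:
build a 1D B-spline basis along each dimension, combine them with
Kronecker products, and solve the penalized normal equations. A minimal
standalone sketch of that solve (function names and defaults here are
illustrative only, not the actual PSpline2D internals):

    import numpy as np
    from scipy import sparse
    from scipy.interpolate import BSpline
    from scipy.sparse.linalg import spsolve

    def bspline_basis(x, num_knots=25, degree=3):
        # evenly spaced interior knots, with the endpoints repeated so
        # the basis spans [x.min(), x.max()]
        inner = np.linspace(x.min(), x.max(), num_knots)
        knots = np.concatenate((
            np.repeat(inner[0], degree), inner, np.repeat(inner[-1], degree)
        ))
        return sparse.csr_matrix(BSpline.design_matrix(x, knots, degree))

    def pspline_2d(x, z, y, lam=(1e3, 1e3), diff_order=2):
        # y has shape (len(x), len(z)); rows of kron(Bx, Bz) then line up
        # with y.ravel() in C order
        basis_x = bspline_basis(x)
        basis_z = bspline_basis(z)
        basis = sparse.kron(basis_x, basis_z)
        num_x, num_z = basis_x.shape[1], basis_z.shape[1]
        diff_x = sparse.csc_matrix(np.diff(np.eye(num_x), diff_order, axis=0))
        diff_z = sparse.csc_matrix(np.diff(np.eye(num_z), diff_order, axis=0))
        penalty = (
            lam[0] * sparse.kron(diff_x.T @ diff_x, sparse.identity(num_z))
            + lam[1] * sparse.kron(sparse.identity(num_x), diff_z.T @ diff_z)
        )
        # solve (B.T @ B + P) c = B.T @ y; weights would scale both sides
        coef = spsolve((basis.T @ basis + penalty).tocsc(), basis.T @ y.ravel())
        return (basis @ coef).reshape(y.shape)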
--- pybaselines/two_d/_algorithm_setup.py | 91 +++- pybaselines/two_d/_spline_utils.py | 217 ++++++++++ pybaselines/two_d/api.py | 5 +- pybaselines/two_d/spline.py | 570 ++++++++++++++++++++++++++ 4 files changed, 877 insertions(+), 6 deletions(-) create mode 100644 pybaselines/two_d/_spline_utils.py create mode 100644 pybaselines/two_d/spline.py diff --git a/pybaselines/two_d/_algorithm_setup.py b/pybaselines/two_d/_algorithm_setup.py index 8888f67..2a3626b 100644 --- a/pybaselines/two_d/_algorithm_setup.py +++ b/pybaselines/two_d/_algorithm_setup.py @@ -8,14 +8,14 @@ from contextlib import contextmanager from functools import partial, wraps +import warnings import numpy as np from scipy.ndimage import grey_opening -from ._validation import ( - _check_array, _check_half_window, _check_optional_array, _check_sized_array, _yx_arrays -) -from ..utils import _inverted_sort, pad_edges, relative_difference +from ..utils import ParameterWarning, _inverted_sort, pad_edges, relative_difference +from ._spline_utils import PSpline2D +from ._validation import _check_array, _check_half_window, _check_optional_array, _check_scalar class _Algorithm2D: @@ -376,6 +376,89 @@ def _setup_polynomial(self, y, weights=None, poly_order=2, calc_vander=False, return y, weight_array, pseudo_inverse + def _setup_spline(self, y, weights=None, spline_degree=3, num_knots=10, + penalized=True, diff_order=3, lam=1, make_basis=True, allow_lower=True, + reverse_diags=False, copy_weights=False): + """ + Sets the starting parameters for doing spline fitting. + + Parameters + ---------- + y : numpy.ndarray, shape (N,) + The y-values of the measured data, already converted to a numpy + array by :meth:`._register`. + weights : array-like, shape (N,), optional + The weighting array. If None (default), then will be an array with + size equal to N and all values set to 1. + spline_degree : int, optional + The degree of the spline. Default is 3, which is a cubic spline. + num_knots : int, optional + The number of interior knots for the splines. Default is 10. + penalized : bool, optional + Whether the basis matrix should be for a penalized spline or a regular + B-spline. Default is True, which creates the basis for a penalized spline. + diff_order : int, optional + The integer differential order for the spline penalty; must be greater than 0. + Default is 3. Only used if `penalized` is True. + lam : float, optional + The smoothing parameter, lambda. Typical values are between 10 and + 1e8, but it strongly depends on the number of knots and the difference order. + Default is 1. + make_basis : bool, optional + If True (default), will create the matrix containing the spline basis functions. + allow_lower : boolean, optional + If True (default), will include only the lower non-zero diagonals of + the squared difference matrix. If False, will include all non-zero diagonals. + reverse_diags : boolean, optional + If True, will reverse the order of the diagonals of the penalty matrix. + Default is False. + copy_weights : boolean, optional + If True, will copy the array of input weights. Only needed if the + algorithm changes the weights in-place. Default is False. + + Returns + ------- + y : numpy.ndarray, shape (N,) + The y-values of the measured data, converted to a numpy array. + weight_array : numpy.ndarray, shape (N,) + The weight array for fitting the spline to the data. + + Warns + ----- + ParameterWarning + Raised if `diff_order` is greater than 4. 
+ + Notes + ----- + `degree` is used instead of `order` like for polynomials since the order of a spline + is defined by convention as ``degree + 1``. + + """ + weight_array = _check_optional_array( + y.shape, weights, copy_input=copy_weights, check_finite=self._check_finite, ensure_1d=False # TODO change y.shape to self._len or self._shape + ) + weight_array = weight_array.ravel() + # TODO + #if self._sort_order is not None and weights is not None: + # weight_array = weight_array[self._sort_order] + diff_order = _check_scalar(diff_order, 2, True)[0] + if make_basis: + if (diff_order > 4).any(): + warnings.warn( + ('differential orders greater than 4 can have numerical issues;' + ' consider using a differential order of 2 or 3 instead'), + ParameterWarning, stacklevel=2 + ) + + if self.pspline is None or not self.pspline.same_basis(num_knots, spline_degree): + self.pspline = PSpline2D( + self.x, self.z, num_knots, spline_degree, self._check_finite, lam, diff_order + ) + else: + self.pspline.reset_penalty_diagonals(lam, diff_order) + + return y.ravel(), weight_array + def _setup_morphology(self, y, half_window=None, **window_kwargs): """ Sets the starting parameters for morphology-based methods. diff --git a/pybaselines/two_d/_spline_utils.py b/pybaselines/two_d/_spline_utils.py new file mode 100644 index 0000000..6242856 --- /dev/null +++ b/pybaselines/two_d/_spline_utils.py @@ -0,0 +1,217 @@ +# -*- coding: utf-8 -*- +"""Helper functions for using splines. + +Created on April 25, 2023 +@author: Donald Erb + +""" + +import numpy as np +from scipy import sparse +from scipy.sparse.linalg import spsolve + +from .._banded_utils import difference_matrix +from .._spline_utils import _spline_basis, _spline_knots +from ._validation import _check_array, _check_lam, _check_scalar + + +class PSpline2D: + """ + A Penalized Spline, which penalizes the difference of the spline coefficients. + + Penalized splines (P-Splines) are solved with the following equation + ``(B.T @ W @ B + P) c = B.T @ W @ y`` where `c` is the spline coefficients, `B` is the + spline basis, the weights are the diagonal of `W`, the penalty is `P`, and `y` is the + fit data. The penalty `P` is usually in the form ``lam * D.T @ D``, where `lam` is a + penalty factor and `D` is the matrix version of the finite difference operator. + + Attributes + ---------- + basis : scipy.sparse.csr.csr_matrix, shape (N, M) + The spline basis. Has a shape of (`N,` `M`), where `N` is the number of points + in `x`, and `M` is the number of basis functions (equal to ``K - spline_degree - 1`` + or equivalently ``num_knots + spline_degree - 1``). + coef : None or numpy.ndarray, shape (M,) + The spline coefficients. Is None if :meth:`.solve_pspline` has not been called + at least once. + knots : numpy.ndarray, shape (K,) + The knots for the spline. Has a shape of `K`, which is equal to + ``num_knots + 2 * spline_degree``. + num_knots : int + The number of internal knots (including the endpoints). The total number of knots + for the spline, `K`, is equal to ``num_knots + 2 * spline_degree``. + spline_degree : int + The degree of the spline (eg. a cubic spline would have a `spline_degree` of 3). + x : numpy.ndarray, shape (N,) + The x-values for the spline. + + References + ---------- + Eilers, P., et al. Fast and compact smoothing on large multidimensional grids. Computational + Statistics and Data Analysis, 2006, 50(1), 61-76. 
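+
+ Examples
+ --------
+ A rough construction-and-solve sketch (array sizes are arbitrary, and the
+ internals may change as noted in the commit message):
+
+ >>> import numpy as np
+ >>> x = np.linspace(-1, 1, 30)
+ >>> z = np.linspace(-1, 1, 20)
+ >>> y = np.random.default_rng(0).normal(0.1, 0.01, (30, 20))
+ >>> pspline = PSpline2D(x, z, num_knots=10, spline_degree=3, lam=1e2)
+ >>> fit = pspline.solve_pspline(y.ravel(), weights=np.ones(y.size))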
+ + """ + + def __init__(self, x, z, num_knots=100, spline_degree=3, check_finite=False, lam=1, + diff_order=2): + """ + Initializes the penalized spline by calculating the basis and penalty. + + Parameters + ---------- + x : array-like, shape (N,) + The x-values for the spline. + z : array-like, shape (L,) + The z-values for the spline. + num_knots : int or Sequence(int, int), optional + The number of internal knots for the spline, including the endpoints. + Default is 100. + spline_degree : int or Sequence(int, int), optional + The degree of the spline. Default is 3, which is a cubic spline. + check_finite : bool, optional + If True, will raise an error if any values in `x` are not finite. Default + is False, which skips the check. + lam : float or Sequence(float, float), optional + The penalty factor applied to the difference matrix. Larger values produce + smoother results. Must be greater than 0. Default is 1. + diff_order : int or Sequence(int, int), optional + The difference order of the penalty. Default is 2 (second order difference). + + Raises + ------ + ValueError + Raised if `spline_degree` is less than 0 or if `diff_order` is less than 1 + or greater than or equal to the number of spline basis functions + (``num_knots + spline_degree - 1``). + + """ + self.x = _check_array(x, dtype=float, check_finite=check_finite, ensure_1d=True) + self.z = _check_array(z, dtype=float, check_finite=check_finite, ensure_1d=True) + self.shape = (len(x), len(z)) + + self.num_knots = _check_scalar(num_knots, 2, True)[0] + self.diff_order = _check_scalar(diff_order, 2, True)[0] + self.spline_degree = _check_scalar(spline_degree, 2, True)[0] + self.lam = [_check_lam(val) for val in _check_scalar(lam, 2, True)[0]] + + self.knots_1 = _spline_knots(self.x, self.num_knots[0], self.spline_degree[0], True) + self.basis_1 = _spline_basis(self.x, self.knots_1, self.spline_degree[0]) + + self.knots_2 = _spline_knots(self.z, self.num_knots[1], self.spline_degree[1], True) + self.basis_2 = _spline_basis(self.z, self.knots_2, self.spline_degree[1]) + self._num_bases = np.array([self.basis_1.shape[1], self.basis_2.shape[1]]) + + self.basis = sparse.kron(self.basis_2, self.basis_1) + self.coef = None + + D1 = difference_matrix(self._num_bases[0], self.diff_order[0]) + D2 = difference_matrix(self._num_bases[1], self.diff_order[1]) + + P1 = self.lam[0] * sparse.kron(D1.T @ D1, sparse.identity(self._num_bases[1])) + P2 = self.lam[1] * sparse.kron(sparse.identity(self._num_bases[0]), D2.T @ D2) + self.penalty = P1 + P2 + + if (self.diff_order >= self._num_bases).any(): + raise ValueError(( + 'the difference order must be less than the number of basis ' + 'functions, which is the number of knots + spline degree - 1' + )) + elif (self.spline_degree < 0).any(): + raise ValueError('spline degree must be greater than or equal to 0') + + def same_basis(self, num_knots=100, spline_degree=3): + """ + Sees if the current basis is equivalent to the input number of knots of spline degree. + + Parameters + ---------- + num_knots : int, optional + The number of knots for the new spline. Default is 100. + spline_degree : int, optional + The degree of the new spline. Default is 3. + + Returns + ------- + bool + True if the input number of knots and spline degree are equivalent to the current + spline basis of the object. 
+ + """ + return False # TODO will need to check both basis matrices + + def reset_penalty_diagonals(self, lam=1, diff_order=2, allow_lower=True, reverse_diags=False): + """ + Resets the penalty diagonals of the system and all of the attributes. + + Useful for reusing the penalty diagonals without having to recalculate the spline basis. + + Parameters + ---------- + lam : float, optional + The penalty factor applied to the difference matrix. Larger values produce + smoother results. Must be greater than 0. Default is 1. + diff_order : int, optional + The difference order of the penalty. Default is 2 (second order difference). + allow_lower : bool, optional + If True (default), will allow only using the lower bands of the penalty matrix, + which allows using :func:`scipy.linalg.solveh_banded` instead of the slightly + slower :func:`scipy.linalg.solve_banded`. + reverse_diags : bool, optional + If True, will reverse the order of the diagonals of the squared difference + matrix. If False (default), will never reverse the diagonals. + + Notes + ----- + `allow_pentapy` is always set to False since the time needed to go from a lower to full + banded matrix and shifting the rows removes any speedup from using pentapy's solver. It + also reduces the complexity of setting up the equations. + + Adds padding to the penalty diagonals to accomodate the different shapes of the spline + basis and the penalty to speed up calculations when the two are added. + + """ + + def solve_pspline(self, y, weights, penalty=None, rhs_extra=None): + """ + Solves the coefficients for a weighted penalized spline. + + Solves the linear equation ``(B.T @ W @ B + P) c = B.T @ W @ y`` for the spline + coefficients, `c`, given the spline basis, `B`, the weights (diagonal of `W`), the + penalty `P`, and `y`, and returns the resulting spline, ``B @ c``. Attempts to + calculate ``B.T @ W @ B`` and ``B.T @ W @ y`` as a banded system to speed up + the calculation. + + Parameters + ---------- + y : numpy.ndarray, shape (N,) + The y-values for fitting the spline. + weights : numpy.ndarray, shape (N,) + The weights for each y-value. + penalty : numpy.ndarray, shape (D, N) + The finite difference penalty matrix, in LAPACK's lower banded format (see + :func:`scipy.linalg.solveh_banded`) if `lower_only` is True or the full banded + format (see :func:`scipy.linalg.solve_banded`) if `lower_only` is False. + rhs_extra : float or numpy.ndarray, shape (N,), optional + If supplied, `rhs_extra` will be added to the right hand side (``B.T @ W @ y``) + of the equation before solving. Default is None, which adds nothing. + + Returns + ------- + numpy.ndarray, shape (N,) + The spline, corresponding to ``B @ c``, where `c` are the solved spline + coefficients and `B` is the spline basis. 
+ + """ + # TODO investigate whether the other algorithm in Eilers's paper is more efficient + # memory- or time-wise + CWT = self.basis.multiply( + np.repeat( + weights, self._num_bases[0] * self._num_bases[1] + ).reshape(self.shape[0] * self.shape[1], -1) + ).T + CWC = CWT @ self.basis + CWy = CWT @ y + + self.coef = spsolve(CWC + self.penalty, CWy) + + return self.basis @ self.coef diff --git a/pybaselines/two_d/api.py b/pybaselines/two_d/api.py index 3bdb6a5..5085ba5 100644 --- a/pybaselines/two_d/api.py +++ b/pybaselines/two_d/api.py @@ -9,10 +9,11 @@ from .morphological import _Morphological from .polynomial import _Polynomial from .smooth import _Smooth +from .spline import _Spline class Baseline2D( - _Morphological, _Polynomial, _Smooth + _Morphological, _Polynomial, _Smooth, _Spline ): """ A class for all 2D baseline correction algorithms. @@ -26,7 +27,7 @@ class Baseline2D( The x-values of the measured data. Default is None, which will create an array from -1 to 1 during the first function call with length equal to the input data length. - z_data : array-like, shape (N,), optional + z_data : array-like, shape (L,), optional The z-values of the measured data. Default is None, which will create an array from -1 to 1 during the first function call with length equal to the input data length. diff --git a/pybaselines/two_d/spline.py b/pybaselines/two_d/spline.py new file mode 100644 index 0000000..0c10fbd --- /dev/null +++ b/pybaselines/two_d/spline.py @@ -0,0 +1,570 @@ +# -*- coding: utf-8 -*- +"""Functions for fitting baselines using splines. + +Created on April 25, 2023 +@author: Donald Erb + +""" + +import warnings + +import numpy as np + +from .. import _weighting +from ..utils import ParameterWarning, relative_difference +from ._algorithm_setup import _Algorithm2D + + +class _Spline(_Algorithm2D): + """A base class for all spline algorithms.""" + + @_Algorithm2D._register( + sort_keys=('weights',), reshape_baseline=True, reshape_keys=('weights',) + ) + def irsqr(self, data, lam=1e3, quantile=0.05, num_knots=25, spline_degree=3, + diff_order=3, max_iter=100, tol=1e-6, weights=None, eps=None): + """ + Iterative Reweighted Spline Quantile Regression (IRSQR). + + Fits the baseline using quantile regression with penalized splines. + + Parameters + ---------- + data : array-like, shape (N,) + The y-values of the measured data, with N data points. Must not + contain missing data (NaN) or Inf. + lam : float, optional + The smoothing parameter. Larger values will create smoother baselines. + Default is 1e3. + quantile : float, optional + The quantile at which to fit the baseline. Default is 0.05. + num_knots : int, optional + The number of knots for the spline. Default is 25. + spline_degree : int, optional + The degree of the spline. Default is 3, which is a cubic spline. + diff_order : int, optional + The order of the differential matrix. Must be greater than 0. Default is 3 + (third order differential matrix). Typical values are 3, 2, or 1. + max_iter : int, optional + The max number of fit iterations. Default is 100. + tol : float, optional + The exit criteria. Default is 1e-6. + weights : array-like, shape (N,), optional + The weighting array. If None (default), then the initial weights + will be an array with size equal to N and all values set to 1. + eps : float, optional + A small value added to the square of the residual to prevent dividing by 0. + Default is None, which uses the square of the maximum-absolute-value of the + fit each iteration multiplied by 1e-6. 
+ + Returns + ------- + baseline : numpy.ndarray, shape (N,) + The calculated baseline. + params : dict + A dictionary with the following items: + + * 'weights': numpy.ndarray, shape (N,) + The weight array used for fitting the data. + * 'tol_history': numpy.ndarray + An array containing the calculated tolerance values for + each iteration. The length of the array is the number of iterations + completed. If the last value in the array is greater than the input + `tol` value, then the function did not converge. + + Raises + ------ + ValueError + Raised if quantile is not between 0 and 1. + + References + ---------- + Han, Q., et al. Iterative Reweighted Quantile Regression Using Augmented Lagrangian + Optimization for Baseline Correction. 2018 5th International Conference on Information + Science and Control Engineering (ICISCE), 2018, 280-284. + + """ + if not 0 < quantile < 1: + raise ValueError('quantile must be between 0 and 1') + + y, weight_array = self._setup_spline( + data, weights, spline_degree, num_knots, True, diff_order, lam + ) + old_coef = np.zeros(self.pspline._num_bases[0] * self.pspline._num_bases[1]) + tol_history = np.empty(max_iter + 1) + for i in range(max_iter + 1): + baseline = self.pspline.solve_pspline(y, weight_array) + calc_difference = relative_difference(old_coef, self.pspline.coef) + tol_history[i] = calc_difference + if calc_difference < tol: + break + old_coef = self.pspline.coef + weight_array = _weighting._quantile(y, baseline, quantile, eps) + + params = {'weights': weight_array, 'tol_history': tol_history[:i + 1]} + + return baseline, params + + @_Algorithm2D._register( + sort_keys=('weights',), reshape_baseline=True, reshape_keys=('weights',) + ) + def pspline_asls(self, data, lam=1e3, p=1e-2, num_knots=25, spline_degree=3, diff_order=2, + max_iter=50, tol=1e-3, weights=None): + """ + A penalized spline version of the asymmetric least squares (AsLS) algorithm. + + Parameters + ---------- + data : array-like, shape (N,) + The y-values of the measured data, with N data points. Must not + contain missing data (NaN) or Inf. + lam : float, optional + The smoothing parameter. Larger values will create smoother baselines. + Default is 1e3. + p : float, optional + The penalizing weighting factor. Must be between 0 and 1. Values greater + than the baseline will be given `p` weight, and values less than the baseline + will be given `p - 1` weight. Default is 1e-2. + num_knots : int, optional + The number of knots for the spline. Default is 25. + spline_degree : int, optional + The degree of the spline. Default is 3, which is a cubic spline. + diff_order : int, optional + The order of the differential matrix. Must be greater than 0. Default is 2 + (second order differential matrix). Typical values are 2 or 1. + max_iter : int, optional + The max number of fit iterations. Default is 50. + tol : float, optional + The exit criteria. Default is 1e-3. + weights : array-like, shape (N,), optional + The weighting array. If None (default), then the initial weights + will be an array with size equal to N and all values set to 1. + x_data : array-like, shape (N,), optional + The x-values of the measured data. Default is None, which will create an + array from -1 to 1 with N points. + + Returns + ------- + baseline : numpy.ndarray, shape (N,) + The calculated baseline. + params : dict + A dictionary with the following items: + + * 'weights': numpy.ndarray, shape (N,) + The weight array used for fitting the data. 
+ * 'tol_history': numpy.ndarray + An array containing the calculated tolerance values for + each iteration. The length of the array is the number of iterations + completed. If the last value in the array is greater than the input + `tol` value, then the function did not converge. + + Raises + ------ + ValueError + Raised if `p` is not between 0 and 1. + + See Also + -------- + pybaselines.whittaker.asls + + References + ---------- + Eilers, P. A Perfect Smoother. Analytical Chemistry, 2003, 75(14), 3631-3636. + + Eilers, P., et al. Baseline correction with asymmetric least squares smoothing. + Leiden University Medical Centre Report, 2005, 1(1). + + Eilers, P., et al. Splines, knots, and penalties. Wiley Interdisciplinary + Reviews: Computational Statistics, 2010, 2(6), 637-653. + + """ + if not 0 < p < 1: + raise ValueError('p must be between 0 and 1') + + y, weight_array = self._setup_spline( + data, weights, spline_degree, num_knots, True, diff_order, lam + ) + tol_history = np.empty(max_iter + 1) + for i in range(max_iter + 1): + baseline = self.pspline.solve_pspline(y, weight_array) + new_weights = _weighting._asls(y, baseline, p) + calc_difference = relative_difference(weight_array, new_weights) + tol_history[i] = calc_difference + if calc_difference < tol: + break + weight_array = new_weights + + params = {'weights': weight_array, 'tol_history': tol_history[:i + 1]} + + return baseline, params + + @_Algorithm2D._register( + sort_keys=('weights',), reshape_baseline=True, reshape_keys=('weights',) + ) + def pspline_airpls(self, data, lam=1e3, num_knots=25, spline_degree=3, + diff_order=2, max_iter=50, tol=1e-3, weights=None): + """ + A penalized spline version of the airPLS algorithm. + + Parameters + ---------- + data : array-like, shape (N,) + The y-values of the measured data, with N data points. Must not + contain missing data (NaN) or Inf. + lam : float, optional + The smoothing parameter. Larger values will create smoother baselines. + Default is 1e3. + num_knots : int, optional + The number of knots for the spline. Default is 25. + spline_degree : int, optional + The degree of the spline. Default is 3, which is a cubic spline. + diff_order : int, optional + The order of the differential matrix. Must be greater than 0. Default is 2 + (second order differential matrix). Typical values are 2 or 1. + max_iter : int, optional + The max number of fit iterations. Default is 50. + tol : float, optional + The exit criteria. Default is 1e-3. + weights : array-like, shape (N,), optional + The weighting array. If None (default), then the initial weights + will be an array with size equal to N and all values set to 1. + + Returns + ------- + baseline : numpy.ndarray, shape (N,) + The calculated baseline. + params : dict + A dictionary with the following items: + + * 'weights': numpy.ndarray, shape (N,) + The weight array used for fitting the data. + * 'tol_history': numpy.ndarray + An array containing the calculated tolerance values for + each iteration. The length of the array is the number of iterations + completed. If the last value in the array is greater than the input + `tol` value, then the function did not converge. + + See Also + -------- + pybaselines.whittaker.airpls + + References + ---------- + Zhang, Z.M., et al. Baseline correction using adaptive iteratively + reweighted penalized least squares. Analyst, 2010, 135(5), 1138-1146. + + Eilers, P., et al. Splines, knots, and penalties. Wiley Interdisciplinary + Reviews: Computational Statistics, 2010, 2(6), 637-653. 
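+
+ Examples
+ --------
+ An illustrative call on synthetic data (a sketch; the 2D input handling
+ is still experimental):
+
+ >>> import numpy as np
+ >>> from pybaselines import Baseline2D
+ >>> x = np.linspace(-1, 1, 100)
+ >>> z = np.linspace(-1, 1, 50)
+ >>> X, Z = np.meshgrid(x, z, indexing='ij')
+ >>> y = 2 + 0.5 * Z + np.exp(-(X**2 + Z**2) / 0.02)
+ >>> baseline, params = Baseline2D(x, z).pspline_airpls(y, lam=1e3, num_knots=25)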
+ + """ + y, weight_array = self._setup_spline( + data, weights, spline_degree, num_knots, True, diff_order, lam, copy_weights=True + ) + + y_l1_norm = np.abs(y).sum() + tol_history = np.empty(max_iter + 1) + for i in range(1, max_iter + 2): + try: + output = self.pspline.solve_pspline(y, weight_array) + except np.linalg.LinAlgError: + warnings.warn( + ('error occurred during fitting, indicating that "tol"' + ' is too low, "max_iter" is too high, or "lam" is too high'), + ParameterWarning + ) + i -= 1 # reduce i so that output tol_history indexing is correct + break + else: + baseline = output + + residual = y - baseline + neg_mask = residual < 0 + neg_residual = residual[neg_mask] + if len(neg_residual) < 2: + # exit if there are < 2 negative residuals since all points or all but one + # point would get a weight of 0, which fails the solver + warnings.warn( + ('almost all baseline points are below the data, indicating that "tol"' + ' is too low and/or "max_iter" is too high'), ParameterWarning + ) + i -= 1 # reduce i so that output tol_history indexing is correct + break + + residual_l1_norm = abs(neg_residual.sum()) + calc_difference = residual_l1_norm / y_l1_norm + tol_history[i - 1] = calc_difference + if calc_difference < tol: + break + # only use negative residual in exp to avoid exponential overflow warnings + # and accidently creating a weight of nan (inf * 0 = nan) + weight_array[neg_mask] = np.exp(i * neg_residual / residual_l1_norm) + weight_array[~neg_mask] = 0 + + params = {'weights': weight_array, 'tol_history': tol_history[:i]} + + return baseline, params + + @_Algorithm2D._register( + sort_keys=('weights',), reshape_baseline=True, reshape_keys=('weights',) + ) + def pspline_arpls(self, data, lam=1e3, num_knots=25, spline_degree=3, diff_order=2, + max_iter=50, tol=1e-3, weights=None): + """ + A penalized spline version of the arPLS algorithm. + + Parameters + ---------- + data : array-like, shape (N,) + The y-values of the measured data, with N data points. Must not + contain missing data (NaN) or Inf. + lam : float, optional + The smoothing parameter. Larger values will create smoother baselines. + Default is 1e3. + num_knots : int, optional + The number of knots for the spline. Default is 25. + spline_degree : int, optional + The degree of the spline. Default is 3, which is a cubic spline. + diff_order : int, optional + The order of the differential matrix. Must be greater than 0. Default is 2 + (second order differential matrix). Typical values are 2 or 1. + max_iter : int, optional + The max number of fit iterations. Default is 50. + tol : float, optional + The exit criteria. Default is 1e-3. + weights : array-like, shape (N,), optional + The weighting array. If None (default), then the initial weights + will be an array with size equal to N and all values set to 1. + + Returns + ------- + baseline : numpy.ndarray, shape (N,) + The calculated baseline. + params : dict + A dictionary with the following items: + + * 'weights': numpy.ndarray, shape (N,) + The weight array used for fitting the data. + * 'tol_history': numpy.ndarray + An array containing the calculated tolerance values for + each iteration. The length of the array is the number of iterations + completed. If the last value in the array is greater than the input + `tol` value, then the function did not converge. + + See Also + -------- + pybaselines.whittaker.arpls + + References + ---------- + Baek, S.J., et al. Baseline correction using asymmetrically reweighted + penalized least squares smoothing. 
Analyst, 2015, 140, 250-257.
+
+        Eilers, P., et al. Splines, knots, and penalties. Wiley Interdisciplinary
+        Reviews: Computational Statistics, 2010, 2(6), 637-653.
+
+        """
+        y, weight_array = self._setup_spline(
+            data, weights, spline_degree, num_knots, True, diff_order, lam
+        )
+        tol_history = np.empty(max_iter + 1)
+        for i in range(max_iter + 1):
+            baseline = self.pspline.solve_pspline(y, weight_array)
+            new_weights = _weighting._arpls(y, baseline)
+            calc_difference = relative_difference(weight_array, new_weights)
+            tol_history[i] = calc_difference
+            if calc_difference < tol:
+                break
+            weight_array = new_weights
+
+        params = {'weights': weight_array, 'tol_history': tol_history[:i + 1]}
+
+        return baseline, params
+
+    @_Algorithm2D._register(
+        sort_keys=('weights',), reshape_baseline=True, reshape_keys=('weights',)
+    )
+    def pspline_iarpls(self, data, lam=1e3, num_knots=25, spline_degree=3, diff_order=2,
+                       max_iter=50, tol=1e-3, weights=None):
+        """
+        A penalized spline version of the IarPLS algorithm.
+
+        Parameters
+        ----------
+        data : array-like, shape (N,)
+            The y-values of the measured data, with N data points. Must not
+            contain missing data (NaN) or Inf.
+        lam : float, optional
+            The smoothing parameter. Larger values will create smoother baselines.
+            Default is 1e3.
+        num_knots : int, optional
+            The number of knots for the spline. Default is 25.
+        spline_degree : int, optional
+            The degree of the spline. Default is 3, which is a cubic spline.
+        diff_order : int, optional
+            The order of the differential matrix. Must be greater than 0. Default is 2
+            (second order differential matrix). Typical values are 2 or 1.
+        max_iter : int, optional
+            The max number of fit iterations. Default is 50.
+        tol : float, optional
+            The exit criteria. Default is 1e-3.
+        weights : array-like, shape (N,), optional
+            The weighting array. If None (default), then the initial weights
+            will be an array with size equal to N and all values set to 1.
+
+        Returns
+        -------
+        baseline : numpy.ndarray, shape (N,)
+            The calculated baseline.
+        params : dict
+            A dictionary with the following items:
+
+            * 'weights': numpy.ndarray, shape (N,)
+                The weight array used for fitting the data.
+            * 'tol_history': numpy.ndarray
+                An array containing the calculated tolerance values for
+                each iteration. The length of the array is the number of iterations
+                completed. If the last value in the array is greater than the input
+                `tol` value, then the function did not converge.
+
+        See Also
+        --------
+        pybaselines.whittaker.iarpls
+
+        References
+        ----------
+        Ye, J., et al. Baseline correction method based on improved asymmetrically
+        reweighted penalized least squares for Raman spectrum. Applied Optics, 2020,
+        59, 10933-10943.
+
+        Eilers, P., et al. Splines, knots, and penalties. Wiley Interdisciplinary
+        Reviews: Computational Statistics, 2010, 2(6), 637-653.
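+
+        Examples
+        --------
+        A rough usage sketch (not a tested doctest); the synthetic `y` stands in for
+        measured 2D data with shape ``(len(z), len(x))``:
+
+        >>> import numpy as np
+        >>> from pybaselines import Baseline2D
+        >>> x = np.linspace(-20, 20, 80)
+        >>> z = np.linspace(-20, 20, 60)
+        >>> y = 5 + np.random.default_rng(0).normal(0, 0.1, (len(z), len(x)))
+        >>> baseline, params = Baseline2D(x, z).pspline_iarpls(y, lam=1e3)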
+ + """ + y, weight_array = self._setup_spline( + data, weights, spline_degree, num_knots, True, diff_order, lam + ) + tol_history = np.empty(max_iter + 1) + for i in range(1, max_iter + 2): + baseline = self.pspline.solve_pspline(y, weight_array) + new_weights = _weighting._iarpls(y, baseline, i) + calc_difference = relative_difference(weight_array, new_weights) + tol_history[i - 1] = calc_difference + if not np.isfinite(calc_difference): + # catches nan, inf and -inf due to exp(i) being too high or if there + # are too few negative residuals; no way to catch both conditions before + # new_weights calculation since it is hard to estimate if + # (exp(i) / std) * residual will overflow; check calc_difference rather + # than checking new_weights since non-finite values rarely occur and + # checking a scalar is faster; cannot use np.errstate since it is not 100% reliable + warnings.warn( + ('nan and/or +/- inf occurred in weighting calculation, likely meaning ' + '"tol" is too low and/or "max_iter" is too high'), ParameterWarning + ) + break + elif calc_difference < tol: + break + weight_array = new_weights + + params = {'weights': weight_array, 'tol_history': tol_history[:i]} + + return baseline, params + + @_Algorithm2D._register( + sort_keys=('weights',), reshape_baseline=True, reshape_keys=('weights',) + ) + def pspline_psalsa(self, data, lam=1e3, p=0.5, k=None, num_knots=25, spline_degree=3, + diff_order=2, max_iter=50, tol=1e-3, weights=None): + """ + A penalized spline version of the psalsa algorithm. + + Parameters + ---------- + data : array-like, shape (N,) + The y-values of the measured data, with N data points. Must not + contain missing data (NaN) or Inf. + lam : float, optional + The smoothing parameter. Larger values will create smoother baselines. + Default is 1e3. + p : float, optional + The penalizing weighting factor. Must be between 0 and 1. Values greater + than the baseline will be given `p` weight, and values less than the baseline + will be given `p - 1` weight. Default is 0.5. + k : float, optional + A factor that controls the exponential decay of the weights for baseline + values greater than the data. Should be approximately the height at which + a value could be considered a peak. Default is None, which sets `k` to + one-tenth of the standard deviation of the input data. A large k value + will produce similar results to :meth:`.asls`. + num_knots : int, optional + The number of knots for the spline. Default is 25. + spline_degree : int, optional + The degree of the spline. Default is 3, which is a cubic spline. + diff_order : int, optional + The order of the differential matrix. Must be greater than 0. Default is 2 + (second order differential matrix). Typical values are 2 or 1. + max_iter : int, optional + The max number of fit iterations. Default is 50. + tol : float, optional + The exit criteria. Default is 1e-3. + weights : array-like, shape (N,), optional + The weighting array. If None (default), then the initial weights + will be an array with size equal to N and all values set to 1. + + Returns + ------- + baseline : numpy.ndarray, shape (N,) + The calculated baseline. + params : dict + A dictionary with the following items: + + * 'weights': numpy.ndarray, shape (N,) + The weight array used for fitting the data. + * 'tol_history': numpy.ndarray + An array containing the calculated tolerance values for + each iteration. The length of the array is the number of iterations + completed. 
If the last value in the array is greater than the input + `tol` value, then the function did not converge. + + Raises + ------ + ValueError + Raised if `p` is not between 0 and 1. + + See Also + -------- + pybaselines.whittaker.psalsa + + References + ---------- + Oller-Moreno, S., et al. Adaptive Asymmetric Least Squares baseline estimation + for analytical instruments. 2014 IEEE 11th International Multi-Conference on + Systems, Signals, and Devices, 2014, 1-5. + + Eilers, P., et al. Splines, knots, and penalties. Wiley Interdisciplinary + Reviews: Computational Statistics, 2010, 2(6), 637-653. + + """ + if not 0 < p < 1: + raise ValueError('p must be between 0 and 1') + + y, weight_array = self._setup_spline( + data, weights, spline_degree, num_knots, True, diff_order, lam + ) + if k is None: + k = np.std(y) / 10 + tol_history = np.empty(max_iter + 1) + for i in range(max_iter + 1): + baseline = self.pspline.solve_pspline(y, weight_array) + new_weights = _weighting._psalsa(y, baseline, p, k, len(y)) # TODO replace len(y) with self._shape or whatever + calc_difference = relative_difference(weight_array, new_weights) + tol_history[i] = calc_difference + if calc_difference < tol: + break + weight_array = new_weights + + params = {'weights': weight_array, 'tol_history': tol_history[:i + 1]} + + return baseline, params From e82cbb23d9bc312444be2c9002d538d922427153 Mon Sep 17 00:00:00 2001 From: Donnie Erb <55961724+derb12@users.noreply.github.com> Date: Wed, 26 Apr 2023 21:00:37 -0400 Subject: [PATCH 05/56] OTHER: Use a more efficient method for 2D psplines Switched to Eilers's alternate method of solving psplines, which is more memory efficient as long as the number of knots is relatively low; also about 50-90% faster than the previous method. --- pybaselines/two_d/_algorithm_setup.py | 4 +-- pybaselines/two_d/_spline_utils.py | 45 ++++++++++++++++++--------- pybaselines/two_d/spline.py | 26 +++++----------- 3 files changed, 40 insertions(+), 35 deletions(-) diff --git a/pybaselines/two_d/_algorithm_setup.py b/pybaselines/two_d/_algorithm_setup.py index 2a3626b..377e620 100644 --- a/pybaselines/two_d/_algorithm_setup.py +++ b/pybaselines/two_d/_algorithm_setup.py @@ -437,7 +437,7 @@ def _setup_spline(self, y, weights=None, spline_degree=3, num_knots=10, weight_array = _check_optional_array( y.shape, weights, copy_input=copy_weights, check_finite=self._check_finite, ensure_1d=False # TODO change y.shape to self._len or self._shape ) - weight_array = weight_array.ravel() + weight_array = weight_array # TODO #if self._sort_order is not None and weights is not None: # weight_array = weight_array[self._sort_order] @@ -457,7 +457,7 @@ def _setup_spline(self, y, weights=None, spline_degree=3, num_knots=10, else: self.pspline.reset_penalty_diagonals(lam, diff_order) - return y.ravel(), weight_array + return y, weight_array def _setup_morphology(self, y, half_window=None, **window_kwargs): """ diff --git a/pybaselines/two_d/_spline_utils.py b/pybaselines/two_d/_spline_utils.py index 6242856..7c1ab77 100644 --- a/pybaselines/two_d/_spline_utils.py +++ b/pybaselines/two_d/_spline_utils.py @@ -101,7 +101,11 @@ def __init__(self, x, z, num_knots=100, spline_degree=3, check_finite=False, lam self.basis_2 = _spline_basis(self.z, self.knots_2, self.spline_degree[1]) self._num_bases = np.array([self.basis_1.shape[1], self.basis_2.shape[1]]) - self.basis = sparse.kron(self.basis_2, self.basis_1) + el = np.ones((self._num_bases[0], 1)) + ek = np.ones((self._num_bases[1], 1)) + self._G = 
sparse.kron(self.basis_1, el.T).multiply(sparse.kron(el.T, self.basis_1))
+        self._G2 = sparse.kron(self.basis_2, ek.T).multiply(sparse.kron(ek.T, self.basis_2))
+        # _G and _G2 are the row-wise Kronecker products of each basis with itself,
+        # used in solve_pspline to build the weighted normal matrix without forming
+        # the full kron(basis_2, basis_1) product
+
         self.coef = None
 
         D1 = difference_matrix(self._num_bases[0], self.diff_order[0])
@@ -201,17 +205,30 @@ def solve_pspline(self, y, weights, penalty=None, rhs_extra=None):
             The spline, corresponding to ``B @ c``, where `c` are the solved spline
             coefficients and `B` is the spline basis.
 
+        Notes
+        -----
+        Uses the more efficient algorithm from Eilers's paper, although the memory usage
+        is higher than the straightforward method when the number of knots is high; however,
+        it is significantly faster and more memory efficient when the number of knots is lower,
+        which will be the more typical use case.
+
         """
-        # TODO investigate whether the other algorithm in Eilers's paper is more efficient
-        # memory- or time-wise
-        CWT = self.basis.multiply(
-            np.repeat(
-                weights, self._num_bases[0] * self._num_bases[1]
-            ).reshape(self.shape[0] * self.shape[1], -1)
-        ).T
-        CWC = CWT @ self.basis
-        CWy = CWT @ y
-
-        self.coef = spsolve(CWC + self.penalty, CWy)
-
-        return self.basis @ self.coef
+        # do not save intermediate results since they are memory intensive for a high number of knots
+        F = np.transpose(
+            (self._G2.T @ weights @ self._G).reshape(
+                (self._num_bases[1], self._num_bases[1], self._num_bases[0], self._num_bases[0])
+            ),
+            [0, 2, 1, 3]
+        ).reshape(
+            (self._num_bases[0] * self._num_bases[1], self._num_bases[0] * self._num_bases[1])
+        )
+
+        self.coef = spsolve(
+            sparse.csr_matrix(F) + self.penalty,
+            (self.basis_2.T @ (weights * y) @ self.basis_1).flatten(),
+            'NATURAL'
+        ).reshape(self._num_bases[1], self._num_bases[0])
+
+        output = self.basis_2 @ self.coef @ self.basis_1.T
+
+        return output
diff --git a/pybaselines/two_d/spline.py b/pybaselines/two_d/spline.py
index 0c10fbd..7f1aa7e 100644
--- a/pybaselines/two_d/spline.py
+++ b/pybaselines/two_d/spline.py
@@ -18,9 +18,7 @@ class _Spline(_Algorithm2D):
     """A base class for all spline algorithms."""
 
-    @_Algorithm2D._register(
-        sort_keys=('weights',), reshape_baseline=True, reshape_keys=('weights',)
-    )
+    @_Algorithm2D._register(sort_keys=('weights',))
     def irsqr(self, data, lam=1e3, quantile=0.05, num_knots=25, spline_degree=3,
               diff_order=3, max_iter=100, tol=1e-6, weights=None, eps=None):
         """
@@ -105,9 +103,7 @@ def irsqr(self, data, lam=1e3, quantile=0.05, num_knots=25, spline_degree=3,
 
         return baseline, params
 
-    @_Algorithm2D._register(
-        sort_keys=('weights',), reshape_baseline=True, reshape_keys=('weights',)
-    )
+    @_Algorithm2D._register(sort_keys=('weights',))
     def pspline_asls(self, data, lam=1e3, p=1e-2, num_knots=25, spline_degree=3, diff_order=2,
                      max_iter=50, tol=1e-3, weights=None):
         """
@@ -198,9 +194,7 @@ def pspline_asls(self, data, lam=1e3, p=1e-2, num_knots=25, spline_degree=3, dif
 
         return baseline, params
 
-    @_Algorithm2D._register(
-        sort_keys=('weights',), reshape_baseline=True, reshape_keys=('weights',)
-    )
+    @_Algorithm2D._register(sort_keys=('weights',))
     def pspline_airpls(self, data, lam=1e3, num_knots=25, spline_degree=3,
                        diff_order=2, max_iter=50, tol=1e-3, weights=None):
         """
@@ -304,9 +298,7 @@ def pspline_airpls(self, data, lam=1e3, num_knots=25, spline_degree=3,
 
         return baseline, params
 
-    @_Algorithm2D._register(
-        sort_keys=('weights',), reshape_baseline=True, reshape_keys=('weights',)
-    )
+    @_Algorithm2D._register(sort_keys=('weights',))
     def pspline_arpls(self, data, lam=1e3, num_knots=25, spline_degree=3, diff_order=2,
                       max_iter=50, tol=1e-3, weights=None):
         """
@@ -380,9 +372,7 @@ def pspline_arpls(self, data, lam=1e3, num_knots=25, spline_degree=3, diff_order
 
         return baseline, params
 
-    @_Algorithm2D._register(
-        sort_keys=('weights',), reshape_baseline=True, reshape_keys=('weights',)
-    )
+    @_Algorithm2D._register(sort_keys=('weights',))
     def pspline_iarpls(self, data, lam=1e3, num_knots=25, spline_degree=3, diff_order=2,
                        max_iter=50, tol=1e-3, weights=None):
         """
@@ -472,9 +462,7 @@ def pspline_iarpls(self, data, lam=1e3, num_knots=25, spline_degree=3, diff_orde
 
         return baseline, params
 
-    @_Algorithm2D._register(
-        sort_keys=('weights',), reshape_baseline=True, reshape_keys=('weights',)
-    )
+    @_Algorithm2D._register(sort_keys=('weights',))
     def pspline_psalsa(self, data, lam=1e3, p=0.5, k=None, num_knots=25, spline_degree=3,
                        diff_order=2, max_iter=50, tol=1e-3, weights=None):
         """
@@ -558,7 +546,7 @@ def pspline_psalsa(self, data, lam=1e3, p=0.5, k=None, num_knots=25, spline_degr
         tol_history = np.empty(max_iter + 1)
         for i in range(max_iter + 1):
             baseline = self.pspline.solve_pspline(y, weight_array)
-            new_weights = _weighting._psalsa(y, baseline, p, k, len(y))  # TODO replace len(y) with self._shape or whatever
+            new_weights = _weighting._psalsa(y, baseline, p, k, y.shape)  # TODO replace y.shape with self._shape or whatever
             calc_difference = relative_difference(weight_array, new_weights)
             tol_history[i] = calc_difference
             if calc_difference < tol:
From e8a3bf92bd51c60ac08a176d1970ed6c8a091682 Mon Sep 17 00:00:00 2001
From: Donnie Erb <55961724+derb12@users.noreply.github.com>
Date: Fri, 28 Apr 2023 20:46:39 -0400
Subject: [PATCH 06/56] FEAT: Implemented 2D version of mixture_model

---
 pybaselines/two_d/spline.py | 338 +++++++++++++++++++++++++++++++++++-
 1 file changed, 337 insertions(+), 1 deletion(-)

diff --git a/pybaselines/two_d/spline.py b/pybaselines/two_d/spline.py
index 7f1aa7e..8e0137b 100644
--- a/pybaselines/two_d/spline.py
+++ b/pybaselines/two_d/spline.py
@@ -6,18 +6,199 @@
 
 """
 
+from functools import partial
+from math import ceil
 import warnings
 
 import numpy as np
+from scipy.optimize import curve_fit
 
 from .. import _weighting
-from ..utils import ParameterWarning, relative_difference
+from ..utils import ParameterWarning, gaussian, relative_difference, _MIN_FLOAT
 from ._algorithm_setup import _Algorithm2D
+from .._compat import _HAS_NUMBA, jit
 
 
 class _Spline(_Algorithm2D):
     """A base class for all spline algorithms."""
 
+    @_Algorithm2D._register(sort_keys=('weights',))
+    def mixture_model(self, data, lam=1e3, p=1e-2, num_knots=25, spline_degree=3, diff_order=3,
+                      max_iter=50, tol=1e-3, weights=None, symmetric=False, num_bins=None):
+        """
+        Considers the data as a mixture model composed of noise and peaks.
+
+        Weights are iteratively assigned by calculating the probability each value in
+        the residual belongs to a normal distribution representing the noise.
+
+        Parameters
+        ----------
+        data : array-like, shape (N,)
+            The y-values of the measured data, with N data points. Must not
+            contain missing data (NaN) or Inf.
+        lam : float, optional
+            The smoothing parameter. Larger values will create smoother baselines.
+            Default is 1e3.
+        p : float, optional
+            The penalizing weighting factor. Must be between 0 and 1. Values greater
+            than the baseline will be given `p` weight, and values less than the baseline
+            will be given `p - 1` weight. Used to set the initial weights before performing
+            expectation-maximization. Default is 1e-2.
+        num_knots : int, optional
+            The number of knots for the spline. Default is 25.
+ spline_degree : int, optional + The degree of the spline. Default is 3, which is a cubic spline. + diff_order : int, optional + The order of the differential matrix. Must be greater than 0. Default is 3 + (third order differential matrix). Typical values are 2 or 3. + max_iter : int, optional + The max number of fit iterations. Default is 50. + tol : float, optional + The exit criteria. Default is 1e-3. + weights : array-like, shape (N,), optional + The weighting array. If None (default), then the initial weights + will be an array with size equal to N and all values set to 1, and then + two iterations of reweighted least-squares are performed to provide starting + weights for the expectation-maximization of the mixture model. + symmetric : bool, optional + If False (default), the total mixture model will be composed of one normal + distribution for the noise and one uniform distribution for positive non-noise + residuals. If True, an additional uniform distribution will be added to the + mixture model for negative non-noise residuals. Only need to set `symmetric` + to True when peaks are both positive and negative. + num_bins : int, optional + The number of bins to use when transforming the residuals into a probability + density distribution. Default is None, which uses ``ceil(sqrt(N))``. + + Returns + ------- + baseline : numpy.ndarray, shape (N,) + The calculated baseline. + params : dict + A dictionary with the following items: + + * 'weights': numpy.ndarray, shape (N,) + The weight array used for fitting the data. + * 'tol_history': numpy.ndarray + An array containing the calculated tolerance values for + each iteration. The length of the array is the number of iterations + completed. If the last value in the array is greater than the input + `tol` value, then the function did not converge. + + Raises + ------ + ValueError + Raised if p is not between 0 and 1. + + References + ---------- + de Rooi, J., et al. Mixture models for baseline estimation. Chemometric and + Intelligent Laboratory Systems, 2012, 117, 56-60. 
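+
+        Examples
+        --------
+        A rough usage sketch (not a tested doctest); `y` is synthetic data standing in
+        for a measured 2D signal with shape ``(len(z), len(x))``:
+
+        >>> import numpy as np
+        >>> from pybaselines import Baseline2D
+        >>> x = np.linspace(-20, 20, 80)
+        >>> z = np.linspace(-20, 20, 60)
+        >>> y = 5 + np.random.default_rng(0).normal(0, 0.1, (len(z), len(x)))
+        >>> baseline, params = Baseline2D(x, z).mixture_model(y, lam=1e3, p=1e-2)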
+ + """ + if not 0 < p < 1: + raise ValueError('p must be between 0 and 1') + + y, weight_array = self._setup_spline( + data, weights, spline_degree, num_knots, True, diff_order, lam + ) + # scale y between -1 and 1 so that the residual fit is more numerically stable + y_domain = np.polynomial.polyutils.getdomain(y.flatten()) + y = np.polynomial.polyutils.mapdomain(y, y_domain, np.array([-1., 1.])) + + if weights is not None: + baseline = self.pspline.solve_pspline(y, weight_array) + else: + # perform 2 iterations: first is a least-squares fit and second is initial + # reweighted fit; 2 fits are needed to get weights to have a decent starting + # distribution for the expectation-maximization + if symmetric and not 0.2 < p < 0.8: + # p values far away from 0.5 with symmetric=True give bad initial weights + # for the expectation maximization + warnings.warn( + 'should use a p value closer to 0.5 when symmetric is True', + ParameterWarning, stacklevel=2 + ) + for _ in range(2): + baseline = self.pspline.solve_pspline(y, weight_array) + weight_array = _weighting._asls(y, baseline, p) + + # now perform the expectation-maximization + # TODO not sure if there is a better way to do this than transforming + # the residual into a histogram, fitting the histogram, and then assigning + # weights based on the bins; actual expectation-maximization uses log(probability) + # directly estimates sigma from that, and then calculates the percentages, maybe + # that would be faster/more stable? + if num_bins is None: + num_bins = ceil(np.sqrt(y.size)) + + # uniform probability density distribution for positive residuals, constant + # from 0 to max(residual), and 0 for residuals < 0 + pos_uniform_pdf = np.empty(num_bins) + tol_history = np.empty(max_iter + 1) + residual = y - baseline + + # the 0.2 * std(residual) is an "okay" starting sigma estimate + fit_params = [0.5, np.log10(0.2 * np.std(residual))] + bounds = [[0, -np.inf], [1, np.inf]] + if symmetric: + fit_params.append(0.25) + bounds[0].append(0) + bounds[1].append(1) + # create a second uniform pdf for negative residual values + neg_uniform_pdf = np.empty(num_bins) + else: + neg_uniform_pdf = None + + # convert bounds to numpy array since curve_fit will use np.asarray each iteration + bounds = np.array(bounds) + for i in range(max_iter + 1): + residual_hist, bin_edges, bin_mapping = _mapped_histogram(residual, num_bins) + # average bin edges to get better x-values for fitting + bins = 0.5 * (bin_edges[:-1] + bin_edges[1:]) + pos_uniform_mask = bins < 0 + pos_uniform_pdf[~pos_uniform_mask] = 1 / max(abs(residual.max()), 1e-6) + pos_uniform_pdf[pos_uniform_mask] = 0 + if symmetric: + neg_uniform_mask = bins > 0 + neg_uniform_pdf[~neg_uniform_mask] = 1 / max(abs(residual.min()), 1e-6) + neg_uniform_pdf[neg_uniform_mask] = 0 + + fit_func = partial( + _mixture_pdf, pos_uniform=pos_uniform_pdf, neg_uniform=neg_uniform_pdf + ) + # use dogbox method since trf gives RuntimeWarnings from nans appearing + # somehow during optimization; trf is also prone to failure when symmetric=True + fit_params = curve_fit( + fit_func, bins, residual_hist, p0=fit_params, bounds=bounds, + check_finite=False, method='dogbox' + )[0] + sigma = 10**fit_params[1] + gaus_pdf = fit_params[0] * gaussian(bins, 1 / (sigma * np.sqrt(2 * np.pi)), 0, sigma) + posterior_prob = gaus_pdf / np.maximum(fit_func(bins, *fit_params), _MIN_FLOAT) + # need to clip since a bad initial start can erroneously set the sum of the fractions + # of each distribution to > 1 + np.clip(posterior_prob, 0, 1, 
out=posterior_prob) + new_weights = posterior_prob[bin_mapping].reshape(y.shape) # TODO replace with self._shape + + calc_difference = relative_difference(weight_array, new_weights) + tol_history[i] = calc_difference + if calc_difference < tol: + break + + weight_array = new_weights + baseline = self.pspline.solve_pspline(y, weight_array) + residual = y - baseline + + # TODO could potentially return a BSpline object from scipy.interpolate + # using knots, spline degree, and coef, but would need to allow user to + # input the x-values for it to be useful + params = {'weights': weight_array, 'tol_history': tol_history[:i + 1]} + + baseline = np.polynomial.polyutils.mapdomain(baseline, np.array([-1., 1.]), y_domain) + + return baseline, params + @_Algorithm2D._register(sort_keys=('weights',)) def irsqr(self, data, lam=1e3, quantile=0.05, num_knots=25, spline_degree=3, diff_order=3, max_iter=100, tol=1e-6, weights=None, eps=None): @@ -556,3 +737,158 @@ def pspline_psalsa(self, data, lam=1e3, p=0.5, k=None, num_knots=25, spline_degr params = {'weights': weight_array, 'tol_history': tol_history[:i + 1]} return baseline, params + + +@jit(nopython=True, cache=True) +def _numba_mapped_histogram(data, num_bins, histogram): + """ + Creates a normalized histogram of the data and a mapping of the indices, using one pass. + + Parameters + ---------- + data : numpy.ndarray, shape (N,) + The data to be made into a histogram. + num_bins : int + The number of bins for the histogram. + histogram : numpy.ndarray + An array of zeros that will be modified inplace into the histogram. + + Returns + ------- + bins : numpy.ndarray, shape (`num_bins` + 1) + The bin edges for the histogram. Follows numpy's implementation such that + each bin is inclusive on the left edge and exclusive on the right edge, except + for the last bin which is inclusive on both edges. + bin_mapping : numpy.ndarray, shape (N,) + An array of integers that maps each item in `data` to its index within `histogram`. + + Notes + ----- + `histogram` is modified inplace and converted to a probability density function + (total area = 1) after the counting. + + """ + num_data = data.shape[0] + bins = np.linspace(data.min(), data.max(), num_bins + 1) + bin_mapping = np.empty(num_data, dtype=np.intp) + bin_frequency = num_bins / (bins[-1] - bins[0]) + bin_0 = bins[0] + last_index = num_bins - 1 + # TODO this seems like it would work in parallel, but it instead slows down + for i in range(num_data): + index = int((data[i] - bin_0) * bin_frequency) + if index == num_bins: + histogram[last_index] += 1 + bin_mapping[i] = last_index + else: + histogram[index] += 1 + bin_mapping[i] = index + + # normalize histogram such that area=1 so that it is a probability density function + histogram /= (num_data * (bins[1] - bins[0])) + + return bins, bin_mapping + + +def _mapped_histogram(data, num_bins): + """ + Creates a histogram of the data and a mapping of the indices. + + Parameters + ---------- + data : numpy.ndarray, shape (N,) + The data to be made into a histogram. + num_bins : int + The number of bins for the histogram. + + Returns + ------- + histogram : numpy.ndarray, shape (`num_bins`) + The histogram of the data, normalized so that its area is 1. + bins : numpy.ndarray, shape (`num_bins` + 1) + The bin edges for the histogram. Follows numpy's implementation such that + each bin is inclusive on the left edge and exclusive on the right edge, except + for the last bin which is inclusive on both edges. 
+
+    bin_mapping : numpy.ndarray, shape (N,)
+        An array of integers that maps each item in `data` to its index within `histogram`.
+
+    Notes
+    -----
+    If numba is installed, the histogram and bin mapping can both be created in
+    one pass, which is faster.
+
+    """
+    if _HAS_NUMBA:
+        # create zeros array outside of numba function since numba's implementation
+        # of np.zeros is much slower than numpy's (https://github.com/numba/numba/issues/7259)
+        histogram = np.zeros(num_bins)
+        bins, bin_mapping = _numba_mapped_histogram(data.flatten(), num_bins, histogram)
+    else:
+        histogram, bins = np.histogram(data, num_bins, density=True)
+        # leave out last bin edge to account for extra index; leave out first
+        # bin edge since np.searchsorted finds indices where bin[i-1] <= val < bin[i]
+        # while the desired indices are bin[i] <= val < bin[i + 1]
+        bin_mapping = np.searchsorted(bins[1:-1], data, 'right')
+
+    return histogram, bins, bin_mapping
+
+
+def _mixture_pdf(x, n, sigma, n_2=0, pos_uniform=None, neg_uniform=None):
+    """
+    The probability density function of a Gaussian and one or two uniform distributions.
+
+    Parameters
+    ----------
+    x : numpy.ndarray, shape (N,)
+        The x-values of the distribution.
+    n : float
+        The fraction of the distribution belonging to the Gaussian.
+    sigma : float
+        Log10 of the standard deviation of the Gaussian distribution.
+    n_2 : float, optional
+        If `neg_uniform` or `pos_uniform` is None, then `n_2` is just an unused input.
+        Otherwise, it is the fraction of the distribution belonging to the positive
+        uniform distribution. Default is 0.
+    pos_uniform : numpy.ndarray, shape (N,), optional
+        The array of the positive uniform distribution. Default is None.
+    neg_uniform : numpy.ndarray, shape (N,), optional
+        The array of the negative uniform distribution. Default is None.
+
+    Returns
+    -------
+    numpy.ndarray
+        The total probability density function for the mixture model.
+
+    Notes
+    -----
+    Defining `sigma` as ``log10(actual sigma)`` allows not bounding `sigma` during
+    optimization and allows it to more easily fit different scales.
+
+    References
+    ----------
+    de Rooi, J., et al. Mixture models for baseline estimation. Chemometric and
+    Intelligent Laboratory Systems, 2012, 117, 56-60.
+
+    """
+    # no error handling for when both pos_uniform and neg_uniform are None since this
+    # is an internal function
+    if neg_uniform is None:
+        n1 = n
+        n2 = 1 - n
+        n3 = 0
+        neg_uniform = 0
+    elif pos_uniform is None:  # never actually used, but nice to have for the future
+        n1 = n
+        n2 = 0
+        n3 = 1 - n
+        pos_uniform = 0
+    else:
+        n1 = n
+        n2 = n_2
+        n3 = 1 - n - n_2
+
+    actual_sigma = 10**sigma
+    # the gaussian should be area-normalized, so set height accordingly
+    height = 1 / max(actual_sigma * np.sqrt(2 * np.pi), _MIN_FLOAT)
+
+    return n1 * gaussian(x, height, 0, actual_sigma) + n2 * pos_uniform + n3 * neg_uniform
From 09ebad27fcc0f075b90eab647ce56aafc2fdc76c Mon Sep 17 00:00:00 2001
From: Donnie Erb <55961724+derb12@users.noreply.github.com>
Date: Sat, 6 May 2023 20:19:29 -0400
Subject: [PATCH 07/56] MAINT: Fix removal of necessary modules

Foolish error from switching branches without thinking.

---
 pybaselines/optimizers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pybaselines/optimizers.py b/pybaselines/optimizers.py
index d5e3c5c..74ad4b8 100644
--- a/pybaselines/optimizers.py
+++ b/pybaselines/optimizers.py
@@ -16,7 +16,7 @@
 from . 
import classification, misc, morphological, polynomial, smooth, spline, whittaker from ._algorithm_setup import _Algorithm, _class_wrapper, _sort_array from ._validation import _check_optional_array -from .utils import _check_scalar, _get_edges, gaussian +from .utils import _check_scalar, _get_edges, gaussian, whittaker_smooth class _Optimizers(_Algorithm): From c554df1c564a4211e00a1df76f62e8aa8a2b7113 Mon Sep 17 00:00:00 2001 From: Donnie Erb <55961724+derb12@users.noreply.github.com> Date: Sun, 3 Sep 2023 18:23:15 -0400 Subject: [PATCH 08/56] FEAT: Implemented some 2D Whittaker baselines Added 2D versions of asls, airpls, arpls, iarpls, and psalsa. Extremely experimental. Note: currently using sparse implementation; a banded implementation is ~5 times faster but uses significantly more memory, so going with the sparse solution for now. --- pybaselines/_validation.py | 14 +- pybaselines/two_d/_algorithm_setup.py | 88 +++++- pybaselines/two_d/_spline_utils.py | 2 +- pybaselines/two_d/_validation.py | 391 ----------------------- pybaselines/two_d/_whittaker_utils.py | 203 ++++++++++++ pybaselines/two_d/api.py | 3 +- pybaselines/two_d/whittaker.py | 440 ++++++++++++++++++++++++++ 7 files changed, 737 insertions(+), 404 deletions(-) delete mode 100644 pybaselines/two_d/_validation.py create mode 100644 pybaselines/two_d/_whittaker_utils.py create mode 100644 pybaselines/two_d/whittaker.py diff --git a/pybaselines/_validation.py b/pybaselines/_validation.py index bc45180..13d3902 100644 --- a/pybaselines/_validation.py +++ b/pybaselines/_validation.py @@ -348,14 +348,14 @@ def _check_half_window(half_window, allow_zero=False): def _check_optional_array(data_size, array=None, dtype=None, order=None, check_finite=False, - copy_input=False, name='weights'): + copy_input=False, name='weights', ensure_1d=True, axis=-1): """ Validates the length of the input array or creates an array of ones if no input is given. Parameters ---------- - data_size : int - The length that the input should have. + data_size : int or Container[int, int] + The shape that the input should have. array : array-like, shape (`data_size`), optional The array to validate. Default is None, which will create an array of ones with length equal to `data_size`. @@ -371,6 +371,12 @@ def _check_optional_array(data_size, array=None, dtype=None, order=None, check_f which skips the check. name : str, optional The name for the variable if an exception is raised. Default is 'weights'. + ensure_1d : bool, optional + If True (default), will raise an error if the shape of `array` is not a one dimensional + array with shape (N,) or a two dimensional array with shape (N, 1) or (1, N). If False, + will ignore the shape of `array`. + axis : int, optional + The axis of the input on which to check its length. Default is -1. 
Returns ------- @@ -383,7 +389,7 @@ def _check_optional_array(data_size, array=None, dtype=None, order=None, check_f else: output_array = _check_sized_array( array, data_size, dtype=dtype, order=order, check_finite=check_finite, - ensure_1d=True, name=name + ensure_1d=ensure_1d, name=name, axis=axis ) if copy_input: output_array = output_array.copy() diff --git a/pybaselines/two_d/_algorithm_setup.py b/pybaselines/two_d/_algorithm_setup.py index 377e620..1c3e1f2 100644 --- a/pybaselines/two_d/_algorithm_setup.py +++ b/pybaselines/two_d/_algorithm_setup.py @@ -15,7 +15,10 @@ from ..utils import ParameterWarning, _inverted_sort, pad_edges, relative_difference from ._spline_utils import PSpline2D -from ._validation import _check_array, _check_half_window, _check_optional_array, _check_scalar +from .._validation import ( + _check_array, _check_half_window, _check_optional_array, _check_scalar, _check_sized_array +) +from ._whittaker_utils import PenalizedSystem2D class _Algorithm2D: @@ -77,22 +80,21 @@ def __init__(self, x_data=None, z_data=None, check_finite=True, output_dtype=Non Unlike `_Algorithm`, `_2DAlgorithm` does not sort input data. """ + self._len = [None, None] if x_data is None: self.x = None self.x_domain = np.array([-1., 1.]) - self._len = None else: self.x = _check_array(x_data, check_finite=check_finite) - self._len = len(self.x) + self._len[1] = len(self.x) self.x_domain = np.polynomial.polyutils.getdomain(self.x) if z_data is None: self.z = None self.z_domain = np.array([-1., 1.]) - self._len = None else: self.z = _check_array(z_data, check_finite=check_finite) - self._len = len(self.z) + self._len[0] = len(self.z) self.z_domain = np.polynomial.polyutils.getdomain(self.z) self.whittaker_system = None @@ -185,7 +187,7 @@ def _register(cls, func=None, *, sort_keys=(), dtype=None, order=None, ensure_1d @wraps(func) def inner(self, data=None, *args, **kwargs): - """ # TODO add back in later + """ if self.x is None: if data is None: raise TypeError('"data" and "x_data" cannot both be None') @@ -212,7 +214,6 @@ def inner(self, data=None, *args, **kwargs): self.x = _check_array( self.x, dtype=dtype, order=order, check_finite=False, ensure_1d=False ) - """ y = data; input_y = True; reset_x = False; x_dtype = None # TODO remove later @@ -291,6 +292,79 @@ def _override_x(self, new_x, new_sort_order=None): self.whittaker_system = old_whittaker_system self.pspline = old_pspline + def _setup_whittaker(self, y, lam=1, diff_order=2, weights=None, copy_weights=False, + allow_lower=True, reverse_diags=None): + """ + Sets the starting parameters for doing penalized least squares. + + Parameters + ---------- + y : numpy.ndarray, shape (N,) + The y-values of the measured data, already converted to a numpy + array by :meth:`._register`. + lam : float, optional + The smoothing parameter, lambda. Typical values are between 10 and + 1e8, but it strongly depends on the penalized least square method + and the differential order. Default is 1. + diff_order : int, optional + The integer differential order; must be greater than 0. Default is 2. + weights : array-like, shape (N,), optional + The weighting array. If None (default), then will be an array with + shape (N,) and all values set to 1. + copy_weights : boolean, optional + If True, will copy the array of input weights. Only needed if the + algorithm changes the weights in-place. Default is False. + allow_lower : boolean, optional + If True (default), will allow using only the lower non-zero diagonals of + the squared difference matrix. 
If False, will include all non-zero diagonals. + reverse_diags : {None, False, True}, optional + If True, will reverse the order of the diagonals of the squared difference + matrix. If False, will never reverse the diagonals. If None (default), will + only reverse the diagonals if using pentapy's solver. + + Returns + ------- + y : numpy.ndarray, shape (N,) + The y-values of the measured data, converted to a numpy array. + weight_array : numpy.ndarray, shape (N,), optional + The weighting array. + + Raises + ------ + ValueError + Raised is `diff_order` is less than 1. + + Warns + ----- + ParameterWarning + Raised if `diff_order` is greater than 3. + + """ + if diff_order < 1: + raise ValueError( + 'the difference order must be > 0 for Whittaker-smoothing-based methods' + ) + elif diff_order > 3: + warnings.warn( + ('difference orders greater than 3 can have numerical issues;' + ' consider using a difference order of 2 or 1 instead'), + ParameterWarning, stacklevel=2 + ) + weight_array = _check_optional_array( + self._len, weights, copy_input=copy_weights, check_finite=self._check_finite + ).ravel() + #if self._sort_order is not None and weights is not None: + # weight_array = weight_array[self._sort_order] + + if self.whittaker_system is not None: + self.whittaker_system.reset_diagonals(lam, diff_order, allow_lower, reverse_diags) + else: + self.whittaker_system = PenalizedSystem2D( + self._len, lam, diff_order + ) + + return y.ravel(), weight_array + def _setup_polynomial(self, y, weights=None, poly_order=2, calc_vander=False, calc_pinv=False, copy_weights=False): """ diff --git a/pybaselines/two_d/_spline_utils.py b/pybaselines/two_d/_spline_utils.py index 7c1ab77..e192492 100644 --- a/pybaselines/two_d/_spline_utils.py +++ b/pybaselines/two_d/_spline_utils.py @@ -12,7 +12,7 @@ from .._banded_utils import difference_matrix from .._spline_utils import _spline_basis, _spline_knots -from ._validation import _check_array, _check_lam, _check_scalar +from .._validation import _check_array, _check_lam, _check_scalar class PSpline2D: diff --git a/pybaselines/two_d/_validation.py b/pybaselines/two_d/_validation.py deleted file mode 100644 index f34aa1e..0000000 --- a/pybaselines/two_d/_validation.py +++ /dev/null @@ -1,391 +0,0 @@ -# -*- coding: utf-8 -*- -"""Code for validating inputs. - -Created on April 16, 2023 -@author: Donald Erb - -""" - -import numpy as np - - -def _check_scalar(data, desired_length, fill_scalar=False, **asarray_kwargs): - """ - Checks if the input is scalar and potentially coerces it to the desired length. - - Only intended for one dimensional data. - - Parameters - ---------- - data : array-like - Either a scalar value or an array. Array-like inputs with only 1 item will also - be considered scalar. - desired_length : int - If `data` is an array, `desired_length` is the length the array must have. If `data` - is a scalar and `fill_scalar` is True, then `desired_length` is the length of the output. - fill_scalar : bool, optional - If True and `data` is a scalar, then will output an array with a length of - `desired_length`. Default is False, which leaves scalar values unchanged. - **asarray_kwargs : dict - Additional keyword arguments to pass to :func:`numpy.asarray`. - - Returns - ------- - output : numpy.ndarray or numpy.number - The array of values or the single array scalar, depending on the input parameters. - is_scalar : bool - True if the input was a scalar value or had a length of 1; otherwise, is False. 
- - Raises - ------ - ValueError - Raised if `data` is not a scalar and its length is not equal to `desired_length`. - - """ - output = np.asarray(data, **asarray_kwargs) - ndim = output.ndim - if not ndim: - is_scalar = True - else: - if ndim > 1: # coerce to 1d shape - output = output.reshape(-1) - len_output = len(output) - if len_output == 1: - is_scalar = True - output = np.asarray(output[0], **asarray_kwargs) - else: - is_scalar = False - - if is_scalar: - if fill_scalar: - output = np.full(desired_length, output) - else: - # index with an empty tuple to get the single scalar while maintaining the numpy dtype - output = output[()] - elif len_output != desired_length: - raise ValueError(f'desired length was {desired_length} but instead got {len_output}') - - return output, is_scalar - - -def _check_scalar_variable(value, allow_zero=False, variable_name='lam', **asarray_kwargs): - """ - Ensures the input is a scalar value. - - Parameters - ---------- - value : float or array-like - The value to check. - allow_zero : bool, optional - If False (default), only allows `value` > 0. If True, allows `value` >= 0. - variable_name : str, optional - The name displayed if an error occurs. Default is 'lam'. - **asarray_kwargs : dict - Additional keyword arguments to pass to :func:`numpy.asarray`. - - Returns - ------- - output : float - The verified scalar value. - - Raises - ------ - ValueError - Raised if `value` is less than or equal to 0 if `allow_zero` is False or - less than 0 if `allow_zero` is True. - - """ - output = _check_scalar(value, 1, fill_scalar=False, **asarray_kwargs)[0] - if allow_zero: - operation = np.less - text = 'greater than or equal to' - else: - operation = np.less_equal - text = 'greater than' - if np.any(operation(output, 0)): - raise ValueError(f'{variable_name} must be {text} 0') - - # use an empty tuple to get the single scalar value - return output - - -def _check_array(array, dtype=None, order=None, check_finite=False, ensure_1d=True): - """ - Validates the shape and values of the input array and controls the output parameters. - - Parameters - ---------- - array : array-like - The input array to check. - dtype : type or numpy.dtype, optional - The dtype to cast the output array. Default is None, which uses the typing of `array`. - order : {None, 'C', 'F'}, optional - The order for the output array. Default is None, which will use the default array - ordering. Other valid options are 'C' for C ordering or 'F' for Fortran ordering. - check_finite : bool, optional - If True, will raise an error if any values in `array` are not finite. Default is False, - which skips the check. - ensure_1d : bool, optional - If True (default), will raise an error if the shape of `array` is not a one dimensional - array with shape (N,) or a two dimensional array with shape (N, 1) or (1, N). - - Returns - ------- - output : numpy.ndarray - The array after performing all validations. - - Raises - ------ - ValueError - Raised if `ensure_1d` is True and `array` does not have a shape of (N,) or - (N, 1) or (1, N). - - Notes - ----- - If `ensure_1d` is True and `array` has a shape of (N, 1) or (1, N), it is reshaped to - (N,) for better compatibility for all functions. 
- - """ - if check_finite: - array_func = np.asarray_chkfinite - else: - array_func = np.asarray - output = array_func(array, dtype=dtype, order=order) - if ensure_1d: - output = np.array(output, copy=False, ndmin=1) - dimensions = output.ndim - if dimensions == 2 and 1 in output.shape: - output = output.reshape(-1) - elif dimensions != 1: - raise ValueError('must be a one dimensional array') - - return output - - -def _check_sized_array(array, length, dtype=None, order=None, check_finite=False, - ensure_1d=True, axis=-1, name='weights'): - """ - Validates the input array and ensures its length is correct. - - Parameters - ---------- - array : array-like - The input array to check. - length : int - The length that the input should have on the specified `axis`. - dtype : type or numpy.dtype, optional - The dtype to cast the output array. Default is None, which uses the typing of `array`. - order : {None, 'C', 'F'}, optional - The order for the output array. Default is None, which will use the default array - ordering. Other valid options are 'C' for C ordering or 'F' for Fortran ordering. - check_finite : bool, optional - If True, will raise an error if any values if `array` are not finite. Default is False, - which skips the check. - ensure_1d : bool, optional - If True (default), will raise an error if the shape of `array` is not a one dimensional - array with shape (N,) or a two dimensional array with shape (N, 1) or (1, N). - axis : int, optional - The axis of the input on which to check its length. Default is -1. - name : str, optional - The name for the variable if an exception is raised. Default is 'weights'. - - Returns - ------- - output : numpy.ndarray - The array after performing all validations. - - Raises - ------ - ValueError - Raised if `array` does not match `length` on the given `axis`. - - """ - output = _check_array( - array, dtype=dtype, order=order, check_finite=check_finite, ensure_1d=ensure_1d - ) - if output.shape[axis] != length: - raise ValueError( - f'length mismatch for {name}; expected {length} but got {output.shape[axis]}' - ) - return output - - -def _yx_arrays(data, x_data=None, check_finite=False, dtype=None, order=None, ensure_1d=True, - axis=-1): - """ - Converts input data into numpy arrays and provides x data if none is given. - - Parameters - ---------- - data : array-like, shape (N,) - The y-values of the measured data, with N data points. - x_data : array-like, shape (N,), optional - The x-values of the measured data. Default is None, which will create an - array from -1. to 1. with N points. - check_finite : bool, optional - If True, will raise an error if any values if `array` are not finite. Default is False, - which skips the check. - dtype : type or numpy.dtype, optional - The dtype to cast the output array. Default is None, which uses the typing of `array`. - order : {None, 'C', 'F'}, optional - The order for the output array. Default is None, which will use the default array - ordering. Other valid options are 'C' for C ordering or 'F' for Fortran ordering. - ensure_1d : bool, optional - If True (default), will raise an error if the shape of `array` is not a one dimensional - array with shape (N,) or a two dimensional array with shape (N, 1) or (1, N). - axis : int, optional - The axis of the input on which to check its length. Default is -1. - - Returns - ------- - y : numpy.ndarray, shape (N,) - A numpy array of the y-values of the measured data. 
- x : numpy.ndarray, shape (N,) - A numpy array of the x-values of the measured data, or a created array. - - Notes - ----- - Does not change the scale/domain of the input `x_data` if it is given, only - converts it to an array. - - """ - y = _check_array( - data, dtype=dtype, order=order, check_finite=check_finite, ensure_1d=ensure_1d - ) - len_y = y.shape[axis] - if x_data is None: - x = np.linspace(-1, 1, len_y) - else: - x = _check_sized_array( - x_data, len_y, dtype=dtype, order=order, check_finite=check_finite, - ensure_1d=True, axis=0, name='x_data' - ) - - return y, x - - -def _check_lam(lam, allow_zero=False): - """ - Ensures the regularization parameter `lam` is a scalar greater than 0. - - Parameters - ---------- - lam : float or array-like - The regularization parameter, lambda, used in Whittaker smoothing and - penalized splines. - allow_zero : bool - If False (default), only allows `lam` values > 0. If True, allows `lam` >= 0. - - Returns - ------- - float - The scalar `lam` value. - - Raises - ------ - ValueError - Raised if `lam` is less than or equal to 0. - - Notes - ----- - Array-like `lam` values could be permitted, but they require using the full - banded penalty matrix. Many functions use only half of the penalty matrix due - to its symmetry; that symmetry is broken when using an array for `lam`, so allowing - an array `lam` would change how the system is solved. Further, array-like `lam` - values with large changes in scale cause some instability and/or discontinuities - when using Whittaker smoothing or penalized splines. Thus, it is easier and better - to only allow scalar `lam` values. - - TODO will maybe change this in the future to allow array-like `lam`, and the - solver will be determined based on that; however, until then, want to ensure users - don't unknowingly use an array-like `lam` when it doesn't work. - NOTE for future: if multiplying an array `lam` with the penalties in banded format, - do not reverse the order (ie. keep it like the output of sparse.dia.data), multiply - by the array, and then shift the rows based on the difference order (same procedure - as done for aspls). That will give the same output as - ``(diags(lam) @ D.T @ D).todia().data[::-1]``. - - """ - return _check_scalar_variable(lam, allow_zero) - - -def _check_half_window(half_window, allow_zero=False): - """ - Ensures the half-window is an integer and has an appropriate value. - - Parameters - ---------- - half_window : int, optional - The half-window used for the smoothing functions. Used - to pad the left and right edges of the data to reduce edge - effects. Default is 0, which provides no padding. - allow_zero : bool, optional - If True, allows `half_window` to be 0; otherwise, `half_window` - must be at least 1. Default is False. - - Returns - ------- - output_half_window : int - The verified half-window value. - - Raises - ------ - TypeError - Raised if the integer converted `half_window` is not equal to the input - `half_window`. - - """ - output_half_window = _check_scalar_variable( - half_window, allow_zero, 'half_window', dtype=np.intp - ) - if output_half_window != half_window: - raise TypeError('half_window must be an integer') - - return output_half_window - - -def _check_optional_array(data_size, array=None, dtype=None, order=None, check_finite=False, - copy_input=False, name='weights', ensure_1d=True): - """ - Validates the length of the input array or creates an array of ones if no input is given. 
- - Parameters - ---------- - data_size : int - The length that the input should have. - array : array-like, shape (`data_size`), optional - The array to validate. Default is None, which will create an array of ones with length - equal to `data_size`. - copy_input : bool, optional - If True, returns a copy of the input `array` if it is not None. Default is False. - dtype : type or numpy.dtype, optional - The dtype to cast the output array. Default is None, which uses the typing of `array`. - order : {None, 'C', 'F'}, optional - The order for the output array. Default is None, which will use the default array - ordering. Other valid options are 'C' for C ordering or 'F' for Fortran ordering. - check_finite : bool, optional - If True, will raise an error if any values if `array` are not finite. Default is False, - which skips the check. - name : str, optional - The name for the variable if an exception is raised. Default is 'weights'. - ensure_1d : bool, optional - If True (default), will raise an error if the shape of `array` is not a one dimensional - array with shape (N,) or a two dimensional array with shape (N, 1) or (1, N). - - Returns - ------- - output_array : numpy.ndarray, shape (`data_size`) - The validated array or the new ones array. - - """ - if array is None: - output_array = np.ones(data_size) - else: - output_array = _check_sized_array( - array, data_size, dtype=dtype, order=order, check_finite=check_finite, - name=name, ensure_1d=ensure_1d, axis=slice(None) # TODO change axis later - ) - if copy_input: - output_array = output_array.copy() - - return output_array diff --git a/pybaselines/two_d/_whittaker_utils.py b/pybaselines/two_d/_whittaker_utils.py new file mode 100644 index 0000000..8aee2cd --- /dev/null +++ b/pybaselines/two_d/_whittaker_utils.py @@ -0,0 +1,203 @@ +# -*- coding: utf-8 -*- +"""Helper functions for working with penalized linear systems. + +Created on April 30, 2023 +@author: Donald Erb + +""" + +from scipy.sparse import identity, kron +from scipy.sparse.linalg import spsolve + +from .._banded_utils import difference_matrix +from .._validation import _check_lam, _check_scalar + + +class PenalizedSystem2D: + """ + An object for setting up and solving penalized least squares linear systems. + + Attributes + ---------- + diff_order : int + The difference order of the penalty. + lower : bool + If True, the penalty uses only the lower bands of the symmetric banded penalty. Will + use :func:`scipy.linalg.solveh_banded` for solving. If False, contains both the upper + and lower bands of the penalty and will use either :func:`scipy.linalg.solve_banded` + (if `using_pentapy` is False) or :func:`._pentapy_solver` when solving. + main_diagonal_index : int + The index of the main diagonal for `penalty`. Is updated when adding additional matrices + to the penalty, and takes into account whether the penalty is only the lower bands or + the total bands. + num_bands : int + The number of bands in the penalty. The number of bands is assumbed to be symmetric, + so the number of upper and lower bands should both be equal to `num_bands`. + original_diagonals : numpy.ndarray + The original penalty diagonals before multiplying by `lam` or adding any padding. + Maintained so that repeated computations with different `lam` values can be quickly + set up. `original_diagonals` can be either the full or lower bands of the penalty, + and may be reveresed, it depends on the set up. Reset by calling + :meth:`.reset_diagonals`. + penalty : numpy.ndarray + The current penalty. 
Originally is `original_diagonals` after multiplying by `lam` + and applying padding, but can also be changed by calling :meth:`.add_penalty`. + Reset by calling :meth:`.reset_diagonals`. + reversed : bool + If True, the penalty is reversed of the typical LAPACK banded format. Useful if + multiplying the penalty with an array since the rows get shifted, or if using pentapy's + solver. + using_pentapy : bool + If True, will use pentapy's solver when solving. + + """ + + def __init__(self, data_size, lam=1, diff_order=2, allow_lower=True, + reverse_diags=None, allow_pentapy=True, padding=0): + """ + Initializes the banded system. + + Parameters + ---------- + data_size : int + The number of data points for the system. + lam : float, optional + The penalty factor applied to the difference matrix. Larger values produce + smoother results. Must be greater than 0. Default is 1. + diff_order : int, optional + The difference order of the penalty. Default is 2 (second order difference). + allow_lower : bool, optional + If True (default), will allow only using the lower bands of the penalty matrix, + which allows using :func:`scipy.linalg.solveh_banded` instead of the slightly + slower :func:`scipy.linalg.solve_banded`. + reverse_diags : {None, False, True}, optional + If True, will reverse the order of the diagonals of the squared difference + matrix. If False, will never reverse the diagonals. If None (default), will + only reverse the diagonals if using pentapy's solver. + allow_pentapy : bool, optional + If True (default), will allow using pentapy's solver if `diff_order` is 2 + and pentapy is installed. pentapy's solver is faster than scipy's banded solvers. + padding : int, optional + The number of extra layers of zeros to add to the bottom and potentially + the top if the full bands are used. Default is 0, which adds no extra + layers. Negative `padding` is treated as equivalent to 0. + + """ + self.shape = data_size + self.original_diagonals = None + + self.diff_order = _check_scalar(diff_order, 2, True)[0] + self.lam = [_check_lam(val) for val in _check_scalar(lam, 2, True)[0]] + D1 = difference_matrix(self.shape[0], self.diff_order[0]) + D2 = difference_matrix(self.shape[1], self.diff_order[1]) + + P1 = self.lam[0] * kron(D1.T @ D1, identity(self.shape[1])) + P2 = self.lam[1] * kron(identity(self.shape[0]), D2.T @ D2) + + self.penalty = P1 + P2 + + def add_penalty(self, penalty): + """ + Updates `self.penalty` with an additional penalty and updates the bands. + + Parameters + ---------- + penalty : array-like + The additional penalty to add to `self.penalty`. + + Returns + ------- + numpy.ndarray + The updated `self.penalty`. + + """ + + def reset_diagonals(self, lam=1, diff_order=2, allow_lower=True, reverse_diags=None, + allow_pentapy=True, padding=0): + """ + Resets the diagonals of the system and all of the attributes. + + Useful for reusing the penalized system for a different `lam` value. + + Parameters + ---------- + lam : float, optional + The penalty factor applied to the difference matrix. Larger values produce + smoother results. Must be greater than 0. Default is 1. + diff_order : int, optional + The difference order of the penalty. Default is 2 (second order difference). + allow_lower : bool, optional + If True (default), will allow only using the lower bands of the penalty matrix, + which allows using :func:`scipy.linalg.solveh_banded` instead of the slightly + slower :func:`scipy.linalg.solve_banded`. 
+ reverse_diags : {None, False, True}, optional + If True, will reverse the order of the diagonals of the squared difference + matrix. If False, will never reverse the diagonals. If None (default), will + only reverse the diagonals if using pentapy's solver. + allow_pentapy : bool, optional + If True (default), will allow using pentapy's solver if `diff_order` is 2 + and pentapy is installed. pentapy's solver is faster than scipy's banded solvers. + padding : int, optional + The number of extra layers of zeros to add to the bottom and potentially + the top if the full bands are used. Default is 0, which adds no extra + layers. Negative `padding` is treated as equivalent to 0. + + """ + + def solve(self, lhs, rhs, overwrite_ab=False, overwrite_b=False, + check_finite=False, l_and_u=None, check_output=False): + """ + Solves the equation ``A @ x = rhs``, given `A` in banded format as `lhs`. + + Parameters + ---------- + lhs : array-like, shape (M, N) + The left-hand side of the equation, in banded format. `lhs` is assumed to be + some slight modification of `self.penalty` in the same format (reversed, lower, + number of bands, etc. are all the same). + rhs : array-like, shape (N,) + The right-hand side of the equation. + overwrite_ab : bool, optional + Whether to overwrite `lhs` when using :func:`scipy.linalg.solveh_banded` or + :func:`scipy.linalg.solve_banded`. Default is False. + overwrite_b : bool, optional + Whether to overwrite `rhs` when using :func:`scipy.linalg.solveh_banded` or + :func:`scipy.linalg.solve_banded`. Default is False. + check_finite : bool, optional + Whether to check if the inputs are finite when using + :func:`scipy.linalg.solveh_banded` or :func:`scipy.linalg.solve_banded`. + Default is False. + l_and_u : Container(int, int), optional + The number of lower and upper bands in `lhs` when using + :func:`scipy.linalg.solve_banded`. Default is None, which uses + (``len(lhs) // 2``, ``len(lhs) // 2``). + check_output : bool, optional + If True, will check the output for non-finite values when using + :func:`._pentapy_solver`. Default is False. + + Returns + ------- + output : numpy.ndarray, shape (N,) + The solution to the linear system, `x`. + + """ + output = spsolve(lhs, rhs, permc_spec='NATURAL') + + return output + + def reverse_penalty(self): + """ + Reverses the penalty and original diagonals for the system. + + Raises + ------ + ValueError + Raised if `self.lower` is True, since reversing the half diagonals does + not make physical sense. + + """ + if self.lower: + raise ValueError('cannot reverse diagonals when self.lower is True') + self.penalty = self.penalty[::-1] + self.original_diagonals = self.original_diagonals[::-1] + self.reversed = not self.reversed diff --git a/pybaselines/two_d/api.py b/pybaselines/two_d/api.py index 5085ba5..57d9a19 100644 --- a/pybaselines/two_d/api.py +++ b/pybaselines/two_d/api.py @@ -10,10 +10,11 @@ from .polynomial import _Polynomial from .smooth import _Smooth from .spline import _Spline +from .whittaker import _Whittaker class Baseline2D( - _Morphological, _Polynomial, _Smooth, _Spline + _Morphological, _Polynomial, _Smooth, _Spline, _Whittaker ): """ A class for all 2D baseline correction algorithms. diff --git a/pybaselines/two_d/whittaker.py b/pybaselines/two_d/whittaker.py new file mode 100644 index 0000000..3ed2e5e --- /dev/null +++ b/pybaselines/two_d/whittaker.py @@ -0,0 +1,440 @@ +# -*- coding: utf-8 -*- +"""Whittaker-smoothing-based techniques for fitting baselines to experimental data. 
+ +Created on April 30, 2023 +@author: Donald Erb + +""" + +import warnings + +import numpy as np + +from .. import _weighting +from ._algorithm_setup import _Algorithm2D +from ..utils import ( + ParameterWarning, relative_difference +) + + +class _Whittaker(_Algorithm2D): + """A base class for all Whittaker-smoothing-based algorithms.""" + + @_Algorithm2D._register(sort_keys=('weights',)) + def asls(self, data, lam=1e6, p=1e-2, diff_order=2, max_iter=50, tol=1e-3, weights=None): + """ + Fits the baseline using asymmetric least squares (AsLS) fitting. + + Parameters + ---------- + data : array-like, shape (N,) + The y-values of the measured data, with N data points. Must not + contain missing data (NaN) or Inf. + lam : float, optional + The smoothing parameter. Larger values will create smoother baselines. + Default is 1e6. + p : float, optional + The penalizing weighting factor. Must be between 0 and 1. Values greater + than the baseline will be given `p` weight, and values less than the baseline + will be given `p - 1` weight. Default is 1e-2. + diff_order : int, optional + The order of the differential matrix. Must be greater than 0. Default is 2 + (second order differential matrix). Typical values are 2 or 1. + max_iter : int, optional + The max number of fit iterations. Default is 50. + tol : float, optional + The exit criteria. Default is 1e-3. + weights : array-like, shape (N,), optional + The weighting array. If None (default), then the initial weights + will be an array with size equal to N and all values set to 1. + + Returns + ------- + baseline : numpy.ndarray, shape (N,) + The calculated baseline. + params : dict + A dictionary with the following items: + + * 'weights': numpy.ndarray, shape (N,) + The weight array used for fitting the data. + * 'tol_history': numpy.ndarray + An array containing the calculated tolerance values for + each iteration. The length of the array is the number of iterations + completed. If the last value in the array is greater than the input + `tol` value, then the function did not converge. + + Raises + ------ + ValueError + Raised if `p` is not between 0 and 1. + + References + ---------- + Eilers, P. A Perfect Smoother. Analytical Chemistry, 2003, 75(14), 3631-3636. + + Eilers, P., et al. Baseline correction with asymmetric least squares smoothing. + Leiden University Medical Centre Report, 2005, 1(1). + + """ + if not 0 < p < 1: + raise ValueError('p must be between 0 and 1') + y, weight_array = self._setup_whittaker(data, lam, diff_order, weights) + main_diag_idx = self.whittaker_system.main_diagonal_index + main_diagonal = self.whittaker_system.penalty[main_diag_idx].copy() + tol_history = np.empty(max_iter + 1) + for i in range(max_iter + 1): + self.whittaker_system.penalty[main_diag_idx] = main_diagonal + weight_array + baseline = self.whittaker_system.solve( + self.whittaker_system.penalty, weight_array * y, overwrite_b=True + ) + new_weights = _weighting._asls(y, baseline, p) + calc_difference = relative_difference(weight_array, new_weights) + tol_history[i] = calc_difference + if calc_difference < tol: + break + weight_array = new_weights + + params = {'weights': weight_array, 'tol_history': tol_history[:i + 1]} + + return baseline, params + + @_Algorithm2D._register(sort_keys=('weights',)) + def airpls(self, data, lam=1e6, diff_order=2, max_iter=50, tol=1e-3, weights=None): + """ + Adaptive iteratively reweighted penalized least squares (airPLS) baseline. 
+
+ Parameters
+ ----------
+ data : array-like
+ The y-values of the measured data, with N data points. Must not
+ contain missing data (NaN) or Inf.
+ lam : float, optional
+ The smoothing parameter. Larger values will create smoother baselines.
+ Default is 1e6.
+ diff_order : int, optional
+ The order of the differential matrix. Must be greater than 0. Default is 2
+ (second order differential matrix). Typical values are 2 or 1.
+ max_iter : int, optional
+ The max number of fit iterations. Default is 50.
+ tol : float, optional
+ The exit criteria. Default is 1e-3.
+ weights : array-like, shape (N,), optional
+ The weighting array. If None (default), then the initial weights
+ will be an array with size equal to N and all values set to 1.
+
+ Returns
+ -------
+ baseline : numpy.ndarray, shape (N,)
+ The calculated baseline.
+ params : dict
+ A dictionary with the following items:
+
+ * 'weights': numpy.ndarray, shape (N,)
+ The weight array used for fitting the data.
+ * 'tol_history': numpy.ndarray
+ An array containing the calculated tolerance values for
+ each iteration. The length of the array is the number of iterations
+ completed. If the last value in the array is greater than the input
+ `tol` value, then the function did not converge.
+
+ References
+ ----------
+ Zhang, Z.M., et al. Baseline correction using adaptive iteratively
+ reweighted penalized least squares. Analyst, 2010, 135(5), 1138-1146.
+
+ """
+ y, weight_array = self._setup_whittaker(
+ data, lam, diff_order, weights, copy_weights=True
+ )
+ y_l1_norm = np.abs(y).sum()
+ main_diag_idx = self.whittaker_system.main_diagonal_index
+ main_diagonal = self.whittaker_system.penalty[main_diag_idx].copy()
+ tol_history = np.empty(max_iter + 1)
+ # Have to have extensive error handling since the weights can all become
+ # very small due to the exp(i) term if too many iterations are performed;
+ # checking the negative residual length usually prevents any errors, but
+ # not always, so any errors from the solvers must also be caught
+ for i in range(1, max_iter + 2):
+ self.whittaker_system.penalty[main_diag_idx] = main_diagonal + weight_array
+ try:
+ output = self.whittaker_system.solve(
+ self.whittaker_system.penalty, weight_array * y, overwrite_b=True,
+ check_output=True
+ )
+ except np.linalg.LinAlgError:
+ warnings.warn(
+ ('error occurred during fitting, indicating that "tol"'
+ ' is too low, "max_iter" is too high, or "lam" is too high'),
+ ParameterWarning
+ )
+ i -= 1 # reduce i so that output tol_history indexing is correct
+ break
+ else:
+ baseline = output
+ residual = y - baseline
+ neg_mask = residual < 0
+ neg_residual = residual[neg_mask]
+ if len(neg_residual) < 2:
+ # exit if there are < 2 negative residuals since all points or all but one
+ # point would get a weight of 0, which fails the solver
+ warnings.warn(
+ ('almost all baseline points are below the data, indicating that "tol"'
+ ' is too low and/or "max_iter" is too high'), ParameterWarning
+ )
+ i -= 1 # reduce i so that output tol_history indexing is correct
+ break
+
+ residual_l1_norm = abs(neg_residual.sum())
+ calc_difference = residual_l1_norm / y_l1_norm
+ tol_history[i - 1] = calc_difference
+ if calc_difference < tol:
+ break
+ # only use negative residual in exp to avoid exponential overflow warnings
+ # and accidentally creating a weight of nan (inf * 0 = nan)
+ weight_array[neg_mask] = np.exp(i * neg_residual / residual_l1_norm)
+ weight_array[~neg_mask] = 0
+
+ params = {'weights': weight_array, 'tol_history': tol_history[:i]}
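+ # (the [:i] slice above is correct rather than [:i + 1]: the loop index
+ # starts at 1 and is decremented before breaking on a solver failure)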
+
+ return baseline, params
+
+ @_Algorithm2D._register(
+ sort_keys=('weights',), reshape_keys=('weights',), reshape_baseline=True
+ )
+ def arpls(self, data, lam=1e3, diff_order=2, max_iter=50, tol=1e-3, weights=None):
+ """
+ Asymmetrically reweighted penalized least squares smoothing (arPLS).
+
+ Parameters
+ ----------
+ data : array-like, shape (N,)
+ The y-values of the measured data, with N data points. Must not
+ contain missing data (NaN) or Inf.
+ lam : float, optional
+ The smoothing parameter. Larger values will create smoother baselines.
+ Default is 1e3.
+ diff_order : int, optional
+ The order of the differential matrix. Must be greater than 0. Default is 2
+ (second order differential matrix). Typical values are 2 or 1.
+ max_iter : int, optional
+ The max number of fit iterations. Default is 50.
+ tol : float, optional
+ The exit criteria. Default is 1e-3.
+ weights : array-like, shape (N,), optional
+ The weighting array. If None (default), then the initial weights
+ will be an array with size equal to N and all values set to 1.
+
+ Returns
+ -------
+ baseline : numpy.ndarray, shape (N,)
+ The calculated baseline.
+ params : dict
+ A dictionary with the following items:
+
+ * 'weights': numpy.ndarray, shape (N,)
+ The weight array used for fitting the data.
+ * 'tol_history': numpy.ndarray
+ An array containing the calculated tolerance values for
+ each iteration. The length of the array is the number of iterations
+ completed. If the last value in the array is greater than the input
+ `tol` value, then the function did not converge.
+
+ References
+ ----------
+ Baek, S.J., et al. Baseline correction using asymmetrically reweighted
+ penalized least squares smoothing. Analyst, 2015, 140, 250-257.
+
+ """
+ y, weight_array = self._setup_whittaker(data, lam, diff_order, weights)
+ main_diagonal = self.whittaker_system.penalty.diagonal()
+ tol_history = np.empty(max_iter + 1)
+ for i in range(max_iter + 1):
+ self.whittaker_system.penalty.setdiag(main_diagonal + weight_array)
+ baseline = self.whittaker_system.solve(
+ self.whittaker_system.penalty, weight_array * y
+ )
+ new_weights = _weighting._arpls(y, baseline)
+ calc_difference = relative_difference(weight_array, new_weights)
+ tol_history[i] = calc_difference
+ if calc_difference < tol:
+ break
+ weight_array = new_weights
+
+ params = {'weights': weight_array, 'tol_history': tol_history[:i + 1]}
+
+ return baseline, params
+
+ @_Algorithm2D._register(sort_keys=('weights',))
+ def iarpls(self, data, lam=1e5, diff_order=2, max_iter=50, tol=1e-3, weights=None):
+ """
+ Improved asymmetrically reweighted penalized least squares smoothing (IarPLS).
+
+ Parameters
+ ----------
+ data : array-like, shape (N,)
+ The y-values of the measured data, with N data points. Must not
+ contain missing data (NaN) or Inf.
+ lam : float, optional
+ The smoothing parameter. Larger values will create smoother baselines.
+ Default is 1e5.
+ diff_order : int, optional
+ The order of the differential matrix. Must be greater than 0. Default is 2
+ (second order differential matrix). Typical values are 2 or 1.
+ max_iter : int, optional
+ The max number of fit iterations. Default is 50.
+ tol : float, optional
+ The exit criteria. Default is 1e-3.
+ weights : array-like, shape (N,), optional
+ The weighting array. If None (default), then the initial weights
+ will be an array with size equal to N and all values set to 1.
+
+ Returns
+ -------
+ baseline : numpy.ndarray, shape (N,)
+ The calculated baseline.
+ params : dict
+ A dictionary with the following items:
+
+ * 'weights': numpy.ndarray, shape (N,)
+ The weight array used for fitting the data.
+ * 'tol_history': numpy.ndarray
+ An array containing the calculated tolerance values for
+ each iteration. The length of the array is the number of iterations
+ completed. If the last value in the array is greater than the input
+ `tol` value, then the function did not converge.
+
+ References
+ ----------
+ Ye, J., et al. Baseline correction method based on improved asymmetrically
+ reweighted penalized least squares for Raman spectrum. Applied Optics, 2020,
+ 59, 10933-10943.
+
+ """
+ y, weight_array = self._setup_whittaker(data, lam, diff_order, weights)
+ main_diag_idx = self.whittaker_system.main_diagonal_index
+ main_diagonal = self.whittaker_system.penalty[main_diag_idx].copy()
+ tol_history = np.empty(max_iter + 1)
+ for i in range(1, max_iter + 2):
+ self.whittaker_system.penalty[main_diag_idx] = main_diagonal + weight_array
+ baseline = self.whittaker_system.solve(
+ self.whittaker_system.penalty, weight_array * y, overwrite_b=True
+ )
+ new_weights = _weighting._iarpls(y, baseline, i)
+ calc_difference = relative_difference(weight_array, new_weights)
+ tol_history[i - 1] = calc_difference
+ if not np.isfinite(calc_difference):
+ # catches nan, inf and -inf due to exp(i) being too high or if there
+ # are too few negative residuals; no way to catch both conditions before
+ # new_weights calculation since it is hard to estimate if
+ # (exp(i) / std) * residual will overflow; check calc_difference rather
+ # than checking new_weights since non-finite values rarely occur and
+ # checking a scalar is faster; cannot use np.errstate since it is not 100% reliable
+ warnings.warn(
+ ('nan and/or +/- inf occurred in weighting calculation, likely meaning '
+ '"tol" is too low and/or "max_iter" is too high'), ParameterWarning
+ )
+ break
+ elif calc_difference < tol:
+ break
+ weight_array = new_weights
+
+ params = {'weights': weight_array, 'tol_history': tol_history[:i]}
+
+ return baseline, params
+
+ @_Algorithm2D._register(sort_keys=('weights',))
+ def psalsa(self, data, lam=1e5, p=0.5, k=None, diff_order=2, max_iter=50, tol=1e-3,
+ weights=None):
+ """
+ Peaked Signal's Asymmetric Least Squares Algorithm (psalsa).
+
+ Similar to the asymmetric least squares (AsLS) algorithm, but applies an
+ exponential decay weighting to values greater than the baseline to allow
+ using a higher `p` value to better fit noisy data.
+
+ Parameters
+ ----------
+ data : array-like, shape (N,)
+ The y-values of the measured data, with N data points. Must not
+ contain missing data (NaN) or Inf.
+ lam : float, optional
+ The smoothing parameter. Larger values will create smoother baselines.
+ Default is 1e5.
+ p : float, optional
+ The penalizing weighting factor. Must be between 0 and 1. Values greater
+ than the baseline will be given `p` weight, and values less than the baseline
+ will be given `p - 1` weight. Default is 0.5.
+ k : float, optional
+ A factor that controls the exponential decay of the weights for baseline
+ values greater than the data. Should be approximately the height at which
+ a value could be considered a peak. Default is None, which sets `k` to
+ one-tenth of the standard deviation of the input data. A large k value
+ will produce similar results to :meth:`.asls`.
+ diff_order : int, optional
+ The order of the differential matrix. Must be greater than 0. Default is 2
+ (second order differential matrix). Typical values are 2 or 1.
+ max_iter : int, optional + The max number of fit iterations. Default is 50. + tol : float, optional + The exit criteria. Default is 1e-3. + weights : array-like, shape (N,), optional + The weighting array. If None (default), then the initial weights + will be an array with size equal to N and all values set to 1. + + Returns + ------- + baseline : numpy.ndarray, shape (N,) + The calculated baseline. + params : dict + A dictionary with the following items: + + * 'weights': numpy.ndarray, shape (N,) + The weight array used for fitting the data. + * 'tol_history': numpy.ndarray + An array containing the calculated tolerance values for + each iteration. The length of the array is the number of iterations + completed. If the last value in the array is greater than the input + `tol` value, then the function did not converge. + + Raises + ------ + ValueError + Raised if `p` is not between 0 and 1. + + Notes + ----- + The exit criteria for the original algorithm was to check whether the signs + of the residuals do not change between two iterations, but the comparison of + the l2 norms of the weight arrays between iterations is used instead to be + more comparable to other Whittaker-smoothing-based algorithms. + + References + ---------- + Oller-Moreno, S., et al. Adaptive Asymmetric Least Squares baseline estimation + for analytical instruments. 2014 IEEE 11th International Multi-Conference on + Systems, Signals, and Devices, 2014, 1-5. + + """ + if not 0 < p < 1: + raise ValueError('p must be between 0 and 1') + y, weight_array = self._setup_whittaker(data, lam, diff_order, weights) + if k is None: + k = np.std(y) / 10 + main_diag_idx = self.whittaker_system.main_diagonal_index + main_diagonal = self.whittaker_system.penalty[main_diag_idx].copy() + tol_history = np.empty(max_iter + 1) + for i in range(max_iter + 1): + self.whittaker_system.penalty[main_diag_idx] = main_diagonal + weight_array + baseline = self.whittaker_system.solve( + self.whittaker_system.penalty, weight_array * y, overwrite_b=True + ) + new_weights = _weighting._psalsa(y, baseline, p, k, self._len) + calc_difference = relative_difference(weight_array, new_weights) + tol_history[i] = calc_difference + if calc_difference < tol: + break + weight_array = new_weights + + params = {'weights': weight_array, 'tol_history': tol_history[:i + 1]} + + return baseline, params From ac8f2e1481daa1ac0f6fc2322a44968eae6f0061 Mon Sep 17 00:00:00 2001 From: Donnie Erb <55961724+derb12@users.noreply.github.com> Date: Sun, 10 Sep 2023 15:40:40 -0400 Subject: [PATCH 09/56] FEAT: Allow separate polynomial orders for x and z Also allow limiting the cross terms for simpler fits. 
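
A rough sketch of the intended term selection (illustrative only; the real
filtering happens in _setup_polynomial, and the variable names here are just
for the example): the 2D Vandermonde matrix is built with polyvander2d, and
the columns of cross terms whose x or z degree exceeds max_cross are zeroed.

    import itertools
    import numpy as np

    poly_orders = (2, 2)  # (x order, z order)
    max_cross = 1
    x = np.linspace(-1, 1, 10)
    z = np.linspace(-1, 1, 15)
    # columns are ordered as (i, j) pairs for the terms x**i * z**j
    vandermonde = np.polynomial.polynomial.polyvander2d(
        *np.meshgrid(x, z), poly_orders
    ).reshape((-1, (poly_orders[0] + 1) * (poly_orders[1] + 1)))
    for idx, (i, j) in enumerate(
        itertools.product(range(poly_orders[0] + 1), range(poly_orders[1] + 1))
    ):
        # pure x or z terms (i == 0 or j == 0) are always kept
        if i != 0 and j != 0 and max(i, j) > max_cross:
            vandermonde[:, idx] = 0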
--- pybaselines/two_d/_algorithm_setup.py | 54 ++++++++++++------- pybaselines/two_d/polynomial.py | 77 ++++++++++++++++++--------- 2 files changed, 88 insertions(+), 43 deletions(-) diff --git a/pybaselines/two_d/_algorithm_setup.py b/pybaselines/two_d/_algorithm_setup.py index 1c3e1f2..d2c1162 100644 --- a/pybaselines/two_d/_algorithm_setup.py +++ b/pybaselines/two_d/_algorithm_setup.py @@ -16,7 +16,7 @@ from ..utils import ParameterWarning, _inverted_sort, pad_edges, relative_difference from ._spline_utils import PSpline2D from .._validation import ( - _check_array, _check_half_window, _check_optional_array, _check_scalar, _check_sized_array + _check_array, _check_half_window, _check_optional_array, _check_scalar, _check_scalar_variable ) from ._whittaker_utils import PenalizedSystem2D @@ -366,7 +366,7 @@ def _setup_whittaker(self, y, lam=1, diff_order=2, weights=None, copy_weights=Fa return y.ravel(), weight_array def _setup_polynomial(self, y, weights=None, poly_order=2, calc_vander=False, - calc_pinv=False, copy_weights=False): + calc_pinv=False, copy_weights=False, max_cross=None): """ Sets the starting parameters for doing polynomial fitting. @@ -378,8 +378,8 @@ def _setup_polynomial(self, y, weights=None, poly_order=2, calc_vander=False, weights : array-like, shape (N,), optional The weighting array. If None (default), then will be an array with size equal to N and all values set to 1. - poly_order : int, optional - The polynomial order. Default is 2. + poly_order : int or Container[int, int], optional + The polynomial orders for x and z. Default is 2. calc_vander : bool, optional If True, will calculate and the Vandermonde matrix. Default is False. calc_pinv : bool, optional @@ -388,6 +388,10 @@ def _setup_polynomial(self, y, weights=None, poly_order=2, calc_vander=False, copy_weights : boolean, optional If True, will copy the array of input weights. Only needed if the algorithm changes the weights in-place. Default is False. + max_cross: int, optional + The maximum degree for the cross terms. For example, if `max_cross` is 1, then + `x z**2`, `x**2 z`, and `x**2 z**2` would all be set to 0. Default is None, which + does not limit the cross terms. Returns ------- @@ -404,13 +408,6 @@ def _setup_polynomial(self, y, weights=None, poly_order=2, calc_vander=False, ValueError Raised if `calc_pinv` is True and `calc_vander` is False. - Notes - ----- - If x_data is given, its domain is reduced from ``[min(x_data), max(x_data)]`` - to [-1., 1.] to improve the numerical stability of calculations; since the - Vandermonde matrix goes from ``x**0`` to ``x^**poly_order``, large values of - x would otherwise cause difficulty when doing least squares minimization. 
- """ weight_array = _check_optional_array( y.shape, weights, copy_input=copy_weights, check_finite=self._check_finite, ensure_1d=False # TODO change y.shape to self._len or self._shape @@ -419,9 +416,13 @@ def _setup_polynomial(self, y, weights=None, poly_order=2, calc_vander=False, # TODO #if self._sort_order is not None and weights is not None: # weight_array = weight_array[self._sort_order] - + poly_orders = _check_scalar(poly_order, 2, True)[0] + if max_cross is not None: + max_cross = _check_scalar_variable( + max_cross, allow_zero=True, variable_name='max_cross' + ) if calc_vander: - if self.vandermonde is None or poly_order > self.poly_order: + if self.vandermonde is None or self._max_cross != max_cross: mapped_x = np.polynomial.polyutils.mapdomain( self.x, self.x_domain, np.array([-1., 1.]) ) @@ -431,12 +432,27 @@ def _setup_polynomial(self, y, weights=None, poly_order=2, calc_vander=False, # rearrange the vandermonde such that it matches the typical A c = b where b # is the flattened version of y and c are the coefficients self.vandermonde = np.polynomial.polynomial.polyvander2d( - *np.meshgrid(mapped_x, mapped_z), [poly_order, poly_order] - ).reshape((-1, (poly_order + 1) * (poly_order + 1))) - - elif poly_order < self.poly_order: - pass #self.vandermonde = self.vandermonde[:, :poly_order + 1] - self.poly_order = poly_order + *np.meshgrid(mapped_x, mapped_z), [poly_orders[0], poly_orders[1]] + ).reshape((-1, (poly_orders[0] + 1) * (poly_orders[1] + 1))) + + if max_cross is not None: + # probably a smarter way to accomplish this... but it works + # TODO see if there is a way to list out (i, j) coefficients + # so that I can just filter out any (i, j) pairing > max_cross + z_coefs = np.arange(poly_orders[1] + 1) + x_coefs = np.arange(poly_orders[0] + 1) + z_coefs[:min(max_cross, poly_orders[1]) + 1] = 0 + x_coefs[:min(max_cross, poly_orders[0]) + 1] = 0 + total_coefs = z_coefs + x_coefs[:, None] + # include all pure x and z terms + total_coefs[:, 0] = 0 + total_coefs[0, :] = 0 + for idx, val in enumerate(total_coefs.reshape(-1)): + if val != 0: + self.vandermonde[:, idx] = 0 + + self.poly_order = poly_orders + self._max_cross = max_cross y = y.ravel() if not calc_pinv: return y, weight_array diff --git a/pybaselines/two_d/polynomial.py b/pybaselines/two_d/polynomial.py index 6e3d535..19c1844 100644 --- a/pybaselines/two_d/polynomial.py +++ b/pybaselines/two_d/polynomial.py @@ -88,7 +88,7 @@ class _Polynomial(_Algorithm2D): @_Algorithm2D._register( sort_keys=('weights',), reshape_baseline=True, reshape_keys=('weights',) ) - def poly(self, data, poly_order=2, weights=None, return_coef=False): + def poly(self, data, poly_order=2, weights=None, return_coef=False, max_cross=None): """ Computes a polynomial that fits the baseline of the data. @@ -96,8 +96,8 @@ def poly(self, data, poly_order=2, weights=None, return_coef=False): ---------- data : array-like, shape (N,) The y-values of the measured data, with N data points. - poly_order : int, optional - The polynomial order for fitting the baseline. Default is 2. + poly_order : int or Container[int, int], optional + The polynomial orders for x and z. Default is 2. weights : array-like, shape (N,), optional The weighting array. If None (default), then will be an array with size equal to N and all values set to 1. 
@@ -105,6 +105,10 @@
 If True, will convert the polynomial coefficients for the fit baseline
 to a form that fits the input x_data and return them in the params
 dictionary. Default is False, since the conversion takes time.
+ max_cross: int, optional
+ The maximum degree for the cross terms. For example, if `max_cross` is 1, then
+ `x z**2`, `x**2 z`, and `x**2 z**2` would all be set to 0. Default is None, which
+ does not limit the cross terms.
 
 Returns
 -------
@@ -127,7 +131,7 @@
 """
 y, weight_array, pseudo_inverse = self._setup_polynomial(
- data, weights, poly_order, calc_vander=True, calc_pinv=True
+ data, weights, poly_order, calc_vander=True, calc_pinv=True, max_cross=max_cross
 )
 sqrt_w = np.sqrt(weight_array)
@@ -143,7 +147,7 @@
 sort_keys=('weights',), reshape_baseline=True, reshape_keys=('weights',)
 )
 def modpoly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None,
- use_original=False, mask_initial_peaks=False, return_coef=False):
+ use_original=False, mask_initial_peaks=False, return_coef=False, max_cross=None):
 """
 The modified polynomial (ModPoly) baseline algorithm.
@@ -154,8 +158,8 @@
 x_data : array-like, shape (N,), optional
 The x-values of the measured data. Default is None, which will create an
 array from -1 to 1 with N points.
- poly_order : int, optional
- The polynomial order for fitting the baseline. Default is 2.
+ poly_order : int or Container[int, int], optional
+ The polynomial orders for x and z. Default is 2.
 tol : float, optional
 The exit criteria. Default is 1e-3.
 max_iter : int, optional
@@ -174,6 +178,10 @@
 If True, will convert the polynomial coefficients for the fit baseline
 to a form that fits the input x_data and return them in the params
 dictionary. Default is False, since the conversion takes time.
+ max_cross: int, optional
+ The maximum degree for the cross terms. For example, if `max_cross` is 1, then
+ `x z**2`, `x**2 z`, and `x**2 z**2` would all be set to 0. Default is None, which
+ does not limit the cross terms.
 
 Returns
 -------
@@ -212,7 +220,8 @@
 """
 y, weight_array, pseudo_inverse = self._setup_polynomial(
- data, weights, poly_order, calc_vander=True, calc_pinv=True, copy_weights=True
+ data, weights, poly_order, calc_vander=True, calc_pinv=True, copy_weights=True,
+ max_cross=max_cross
 )
 sqrt_w = np.sqrt(weight_array)
 if use_original:
@@ -247,7 +256,8 @@
 sort_keys=('weights',), reshape_baseline=True, reshape_keys=('weights',)
 )
 def imodpoly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None,
- use_original=False, mask_initial_peaks=True, return_coef=False, num_std=1.):
+ use_original=False, mask_initial_peaks=True, return_coef=False,
+ num_std=1., max_cross=None):
 """
 The improved modified polynomial (IModPoly) baseline algorithm.
@@ -255,8 +265,8 @@
 ---------
 data : array-like, shape (N,)
 The y-values of the measured data, with N data points.
- poly_order : int, optional
- The polynomial order for fitting the baseline. Default is 2.
+ poly_order : int or Container[int, int], optional + The polynomial orders for x and z. Default is 2. tol : float, optional The exit criteria. Default is 1e-3. max_iter : int, optional @@ -278,6 +288,10 @@ def imodpoly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, num_std : float, optional The number of standard deviations to include when thresholding. Default is 1. Must be greater or equal to 0. + max_cross: int, optional + The maximum degree for the cross terms. For example, if `max_cross` is 1, then + `x z**2`, `x**2 z`, and `x**2 z**2` would all be set to 0. Default is None, which + does not limit the cross terms. Returns ------- @@ -324,7 +338,8 @@ def imodpoly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, raise ValueError('num_std must be greater than or equal to 0') y, weight_array, pseudo_inverse = self._setup_polynomial( - data, weights, poly_order, calc_vander=True, calc_pinv=True, copy_weights=True + data, weights, poly_order, calc_vander=True, calc_pinv=True, + copy_weights=True, max_cross=max_cross ) sqrt_w = np.sqrt(weight_array) if use_original: @@ -365,7 +380,7 @@ def imodpoly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, ) def penalized_poly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, cost_function='asymmetric_truncated_quadratic', threshold=None, - alpha_factor=0.99, return_coef=False): + alpha_factor=0.99, return_coef=False, max_cross=None): """ Fits a polynomial baseline using a non-quadratic cost function. @@ -376,8 +391,8 @@ def penalized_poly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=Non ---------- data : array-like, shape (N,) The y-values of the measured data, with N data points. - poly_order : int, optional - The polynomial order for fitting the baseline. Default is 2. + poly_order : int or Container[int, int], optional + The polynomial orders for x and z. Default is 2. tol : float, optional The exit criteria. Default is 1e-3. max_iter : int, optional @@ -413,6 +428,10 @@ def penalized_poly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=Non If True, will convert the polynomial coefficients for the fit baseline to a form that fits the input x_data and return them in the params dictionary. Default is False, since the conversion takes time. + max_cross: int, optional + The maximum degree for the cross terms. For example, if `max_cross` is 1, then + `x z**2`, `x**2 z`, and `x**2 z**2` would all be set to 0. Default is None, which + does not limit the cross terms. Returns ------- @@ -461,7 +480,7 @@ def penalized_poly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=Non }[method] y, weight_array, pseudo_inverse = self._setup_polynomial( - data, weights, poly_order, calc_vander=True, calc_pinv=True + data, weights, poly_order, calc_vander=True, calc_pinv=True, max_cross=max_cross ) if threshold is None: threshold = np.std(y) / 10 @@ -494,7 +513,7 @@ def penalized_poly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=Non sort_keys=('weights',), reshape_baseline=True, reshape_keys=('weights',) ) def quant_reg(self, data, poly_order=2, quantile=0.05, tol=1e-6, max_iter=250, - weights=None, eps=None, return_coef=False): + weights=None, eps=None, return_coef=False, max_cross=None): """ Approximates the baseline of the data using quantile regression. @@ -502,8 +521,8 @@ def quant_reg(self, data, poly_order=2, quantile=0.05, tol=1e-6, max_iter=250, ---------- data : array-like, shape (N,) The y-values of the measured data, with N data points. 
- poly_order : int, optional - The polynomial order for fitting the baseline. Default is 2. + poly_order : int or Container[int, int], optional + The polynomial orders for x and z. Default is 2. quantile : float, optional The quantile at which to fit the baseline. Default is 0.05. tol : float, optional @@ -524,6 +543,10 @@ def quant_reg(self, data, poly_order=2, quantile=0.05, tol=1e-6, max_iter=250, If True, will convert the polynomial coefficients for the fit baseline to a form that fits the input `x_data` and return them in the params dictionary. Default is False, since the conversion takes time. + max_cross: int, optional + The maximum degree for the cross terms. For example, if `max_cross` is 1, then + `x z**2`, `x**2 z`, and `x**2 z**2` would all be set to 0. Default is None, which + does not limit the cross terms. Returns ------- @@ -570,7 +593,9 @@ def quant_reg(self, data, poly_order=2, quantile=0.05, tol=1e-6, max_iter=250, if not 0 < quantile < 1: raise ValueError('quantile must be between 0 and 1.') - y, weight_array = self._setup_polynomial(data, weights, poly_order, calc_vander=True) + y, weight_array = self._setup_polynomial( + data, weights, poly_order, calc_vander=True, max_cross=max_cross + ) # estimate first iteration using least squares sqrt_w = np.sqrt(weight_array) coef = np.linalg.lstsq(self.vandermonde * sqrt_w[:, None], y * sqrt_w, None)[0] @@ -599,7 +624,7 @@ def quant_reg(self, data, poly_order=2, quantile=0.05, tol=1e-6, max_iter=250, ) def goldindec(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, cost_function='asymmetric_indec', peak_ratio=0.5, alpha_factor=0.99, - tol_2=1e-3, tol_3=1e-6, max_iter_2=100, return_coef=False): + tol_2=1e-3, tol_3=1e-6, max_iter_2=100, return_coef=False, max_cross=None): """ Fits a polynomial baseline using a non-quadratic cost function. @@ -610,8 +635,8 @@ def goldindec(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, ---------- data : array-like, shape (N,) The y-values of the measured data, with N data points. - poly_order : int, optional - The polynomial order for fitting the baseline. Default is 2. + poly_order : int or Container[int, int], optional + The polynomial orders for x and z. Default is 2. tol : float, optional The exit criteria for the fitting with a given threshold value. Default is 1e-3. max_iter : int, optional @@ -648,6 +673,10 @@ def goldindec(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, If True, will convert the polynomial coefficients for the fit baseline to a form that fits the input x_data and return them in the params dictionary. Default is False, since the conversion takes time. + max_cross: int, optional + The maximum degree for the cross terms. For example, if `max_cross` is 1, then + `x z**2`, `x**2 z`, and `x**2 z**2` would all be set to 0. Default is None, which + does not limit the cross terms. 
Returns ------- @@ -714,7 +743,7 @@ def goldindec(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, 'indec': _indec_loss }[method] y, weight_array, pseudo_inverse = self._setup_polynomial( - data, weights, poly_order, calc_vander=True, calc_pinv=True + data, weights, poly_order, calc_vander=True, calc_pinv=True, max_cross=max_cross ) up_down_ratio_goal = ( 0.7679 + 11.2358 * peak_ratio - 39.7064 * peak_ratio**2 + 92.3583 * peak_ratio**3 From 5c7149c0bd992ca5bb6fa3960ebd91eadb65821d Mon Sep 17 00:00:00 2001 From: Donnie Erb <55961724+derb12@users.noreply.github.com> Date: Sun, 10 Sep 2023 15:58:01 -0400 Subject: [PATCH 10/56] MAINT: Improve polynomial cross term matching --- pybaselines/two_d/_algorithm_setup.py | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/pybaselines/two_d/_algorithm_setup.py b/pybaselines/two_d/_algorithm_setup.py index d2c1162..9ecad1c 100644 --- a/pybaselines/two_d/_algorithm_setup.py +++ b/pybaselines/two_d/_algorithm_setup.py @@ -8,6 +8,7 @@ from contextlib import contextmanager from functools import partial, wraps +import itertools import warnings import numpy as np @@ -422,7 +423,10 @@ def _setup_polynomial(self, y, weights=None, poly_order=2, calc_vander=False, max_cross, allow_zero=True, variable_name='max_cross' ) if calc_vander: - if self.vandermonde is None or self._max_cross != max_cross: + if ( + self.vandermonde is None or self._max_cross != max_cross + or np.any(self.poly_order != poly_order) + ): mapped_x = np.polynomial.polyutils.mapdomain( self.x, self.x_domain, np.array([-1., 1.]) ) @@ -436,19 +440,12 @@ def _setup_polynomial(self, y, weights=None, poly_order=2, calc_vander=False, ).reshape((-1, (poly_orders[0] + 1) * (poly_orders[1] + 1))) if max_cross is not None: - # probably a smarter way to accomplish this... but it works - # TODO see if there is a way to list out (i, j) coefficients - # so that I can just filter out any (i, j) pairing > max_cross - z_coefs = np.arange(poly_orders[1] + 1) - x_coefs = np.arange(poly_orders[0] + 1) - z_coefs[:min(max_cross, poly_orders[1]) + 1] = 0 - x_coefs[:min(max_cross, poly_orders[0]) + 1] = 0 - total_coefs = z_coefs + x_coefs[:, None] - # include all pure x and z terms - total_coefs[:, 0] = 0 - total_coefs[0, :] = 0 - for idx, val in enumerate(total_coefs.reshape(-1)): - if val != 0: + # lists out (z_0, x_0), (z_1, x_0), etc + for idx, val in enumerate( + itertools.product(range(poly_orders[0] + 1), range(poly_orders[1] + 1)) + ): + # 0 designates pure z or x terms + if 0 not in val and any(v > max_cross for v in val): self.vandermonde[:, idx] = 0 self.poly_order = poly_orders From bd4018bcb80748b3345a32bee2eb2e0a4a69c71b Mon Sep 17 00:00:00 2001 From: Donnie Erb <55961724+derb12@users.noreply.github.com> Date: Tue, 2 Jan 2024 09:28:17 -0500 Subject: [PATCH 11/56] OTHER: Allow converting polynomial coefficients in 2D Also fixed docstrings to correctly refer to numpy's Polynomial class. 
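
The conversion applies the 1D domain-transformation matrix along each axis of
the 2D coefficient array, i.e. coef_original = T_x @ coef_mapped @ T_z.T. A
short usage sketch with made-up values (_convert_coef2d is the private helper
added in this patch):

    import numpy as np
    from pybaselines.utils import _convert_coef2d

    x = np.linspace(5., 25., 30)
    z = np.linspace(-4., 7., 40)
    # coefficients fit on the mapped [-1, 1] domains; the shape is
    # (x polynomial degree + 1, z polynomial degree + 1)
    mapped_coef = np.array([[1.0, 0.2], [0.5, -0.1]])
    coef = _convert_coef2d(
        mapped_coef,
        np.polynomial.polyutils.getdomain(x),
        np.polynomial.polyutils.getdomain(z),
    )
    # evaluates the same polynomial directly on the original domains
    X, Z = np.meshgrid(x, z)
    baseline = np.polynomial.polynomial.polyval2d(X, Z, coef)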
--- pybaselines/classification.py | 4 +- pybaselines/polynomial.py | 32 +++++------ pybaselines/two_d/polynomial.py | 26 ++++----- pybaselines/utils.py | 95 +++++++++++++++++++++++++++------ tests/test_utils.py | 90 +++++++++++++++++++++++++++++++ 5 files changed, 199 insertions(+), 48 deletions(-) diff --git a/pybaselines/classification.py b/pybaselines/classification.py index af04ce6..9bcc4b7 100644 --- a/pybaselines/classification.py +++ b/pybaselines/classification.py @@ -225,7 +225,7 @@ def dietrich(self, data, smooth_half_window=None, num_std=3.0, interp_half_windo * 'coef': numpy.ndarray, shape (poly_order,) Only if `return_coef` is True and `max_iter` is greater than 0. The array of polynomial coefficients for the baseline, in increasing order. Can be - used to create a polynomial using numpy.polynomial.polynomial.Polynomial(). + used to create a polynomial using :class:`numpy.polynomial.polynomial.Polynomial`. * 'tol_history': numpy.ndarray Only if `max_iter` is greater than 1. An array containing the calculated tolerance values for each iteration. The length of the array is the number @@ -1230,7 +1230,7 @@ def dietrich(data, x_data=None, smooth_half_window=None, num_std=3.0, * 'coef': numpy.ndarray, shape (poly_order,) Only if `return_coef` is True and `max_iter` is greater than 0. The array of polynomial coefficients for the baseline, in increasing order. Can be - used to create a polynomial using numpy.polynomial.polynomial.Polynomial(). + used to create a polynomial using :class:`numpy.polynomial.polynomial.Polynomial`. * 'tol_history': numpy.ndarray Only if `max_iter` is greater than 1. An array containing the calculated tolerance values for each iteration. The length of the array is the number diff --git a/pybaselines/polynomial.py b/pybaselines/polynomial.py index f1791c3..81df97d 100644 --- a/pybaselines/polynomial.py +++ b/pybaselines/polynomial.py @@ -120,7 +120,7 @@ def poly(self, data, poly_order=2, weights=None, return_coef=False): * 'coef': numpy.ndarray, shape (poly_order,) Only if `return_coef` is True. The array of polynomial parameters for the baseline, in increasing order. Can be used to create a - polynomial using numpy.polynomial.polynomial.Polynomial(). + polynomial using :class:`numpy.polynomial.polynomial.Polynomial`. Notes ----- @@ -192,7 +192,7 @@ def modpoly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, * 'coef': numpy.ndarray, shape (poly_order + 1,) Only if `return_coef` is True. The array of polynomial parameters for the baseline, in increasing order. Can be used to create a - polynomial using numpy.polynomial.polynomial.Polynomial(). + polynomial using :class:`numpy.polynomial.polynomial.Polynomial`. Notes ----- @@ -294,7 +294,7 @@ def imodpoly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, * 'coef': numpy.ndarray, shape (poly_order + 1,) Only if `return_coef` is True. The array of polynomial parameters for the baseline, in increasing order. Can be used to create a - polynomial using numpy.polynomial.polynomial.Polynomial(). + polynomial using :class:`numpy.polynomial.polynomial.Polynomial`. Raises ------ @@ -427,7 +427,7 @@ def penalized_poly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=Non * 'coef': numpy.ndarray, shape (poly_order + 1,) Only if `return_coef` is True. The array of polynomial parameters for the baseline, in increasing order. Can be used to create a - polynomial using numpy.polynomial.polynomial.Polynomial(). + polynomial using :class:`numpy.polynomial.polynomial.Polynomial`. 
Raises ------ @@ -580,8 +580,8 @@ def loess(self, data, fraction=0.2, total_points=None, poly_order=1, scale=3.0, * 'coef': numpy.ndarray, shape (N, poly_order + 1) Only if `return_coef` is True. The array of polynomial parameters for the baseline, in increasing order. Can be used to create a polynomial - using numpy.polynomial.polynomial.Polynomial(). If `delta` is > 0, the - coefficients for any skipped x-value will all be 0. + using :class:`numpy.polynomial.polynomial.Polynomial`. If `delta` is > 0, + the coefficients for any skipped x-value will all be 0. Raises ------ @@ -754,7 +754,7 @@ def quant_reg(self, data, poly_order=2, quantile=0.05, tol=1e-6, max_iter=250, * 'coef': numpy.ndarray, shape (poly_order + 1,) Only if `return_coef` is True. The array of polynomial parameters for the baseline, in increasing order. Can be used to create a - polynomial using numpy.polynomial.polynomial.Polynomial(). + polynomial using :class:`numpy.polynomial.polynomial.Polynomial`. Raises ------ @@ -884,7 +884,7 @@ def goldindec(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, * 'coef': numpy.ndarray, shape (poly_order + 1,) Only if `return_coef` is True. The array of polynomial parameters for the baseline, in increasing order. Can be used to create a - polynomial using numpy.polynomial.polynomial.Polynomial(). + polynomial using :class:`numpy.polynomial.polynomial.Polynomial`. Raises ------ @@ -1034,7 +1034,7 @@ def poly(data, x_data=None, poly_order=2, weights=None, return_coef=False): * 'coef': numpy.ndarray, shape (poly_order,) Only if `return_coef` is True. The array of polynomial parameters for the baseline, in increasing order. Can be used to create a - polynomial using numpy.polynomial.polynomial.Polynomial(). + polynomial using :class:`numpy.polynomial.polynomial.Polynomial`. Notes ----- @@ -1095,7 +1095,7 @@ def modpoly(data, x_data=None, poly_order=2, tol=1e-3, max_iter=250, weights=Non * 'coef': numpy.ndarray, shape (poly_order + 1,) Only if `return_coef` is True. The array of polynomial parameters for the baseline, in increasing order. Can be used to create a - polynomial using numpy.polynomial.polynomial.Polynomial(). + polynomial using :class:`numpy.polynomial.polynomial.Polynomial`. Notes ----- @@ -1170,7 +1170,7 @@ def imodpoly(data, x_data=None, poly_order=2, tol=1e-3, max_iter=250, weights=No * 'coef': numpy.ndarray, shape (poly_order + 1,) Only if `return_coef` is True. The array of polynomial parameters for the baseline, in increasing order. Can be used to create a - polynomial using numpy.polynomial.polynomial.Polynomial(). + polynomial using :class:`numpy.polynomial.polynomial.Polynomial`. Notes ----- @@ -1468,7 +1468,7 @@ def penalized_poly(data, x_data=None, poly_order=2, tol=1e-3, max_iter=250, * 'coef': numpy.ndarray, shape (poly_order + 1,) Only if `return_coef` is True. The array of polynomial parameters for the baseline, in increasing order. Can be used to create a - polynomial using numpy.polynomial.polynomial.Polynomial(). + polynomial using :class:`numpy.polynomial.polynomial.Polynomial`. Raises ------ @@ -2029,8 +2029,8 @@ def loess(data, x_data=None, fraction=0.2, total_points=None, poly_order=1, scal * 'coef': numpy.ndarray, shape (N, poly_order + 1) Only if `return_coef` is True. The array of polynomial parameters for the baseline, in increasing order. Can be used to create a polynomial - using numpy.polynomial.polynomial.Polynomial(). If `delta` is > 0, the - coefficients for any skipped x-value will all be 0. 
+ using :class:`numpy.polynomial.polynomial.Polynomial`. If `delta` is > 0, + the coefficients for any skipped x-value will all be 0. Raises ------ @@ -2124,7 +2124,7 @@ def quant_reg(data, x_data=None, poly_order=2, quantile=0.05, tol=1e-6, max_iter * 'coef': numpy.ndarray, shape (poly_order + 1,) Only if `return_coef` is True. The array of polynomial parameters for the baseline, in increasing order. Can be used to create a - polynomial using numpy.polynomial.polynomial.Polynomial(). + polynomial using :class:`numpy.polynomial.polynomial.Polynomial`. Raises ------ @@ -2230,7 +2230,7 @@ def goldindec(data, x_data=None, poly_order=2, tol=1e-3, max_iter=250, weights=N * 'coef': numpy.ndarray, shape (poly_order + 1,) Only if `return_coef` is True. The array of polynomial parameters for the baseline, in increasing order. Can be used to create a - polynomial using numpy.polynomial.polynomial.Polynomial(). + polynomial using :class:`numpy.polynomial.polynomial.Polynomial`. Raises ------ diff --git a/pybaselines/two_d/polynomial.py b/pybaselines/two_d/polynomial.py index 19c1844..c668020 100644 --- a/pybaselines/two_d/polynomial.py +++ b/pybaselines/two_d/polynomial.py @@ -78,7 +78,7 @@ from .. import _weighting from ._algorithm_setup import _Algorithm2D from ..utils import ( - _MIN_FLOAT, _convert_coef, relative_difference + _MIN_FLOAT, _convert_coef2d, relative_difference ) @@ -122,7 +122,7 @@ def poly(self, data, poly_order=2, weights=None, return_coef=False, max_cross=No * 'coef': numpy.ndarray, shape (poly_order,) Only if `return_coef` is True. The array of polynomial parameters for the baseline, in increasing order. Can be used to create a - polynomial using numpy.polynomial.polynomial.Polynomial(). + polynomial using :func:`numpy.polynomial.polynomial.polyval2d`. Notes ----- @@ -139,7 +139,7 @@ def poly(self, data, poly_order=2, weights=None, return_coef=False, max_cross=No baseline = self.vandermonde @ coef params = {'weights': weight_array} if return_coef: - params['coef'] = _convert_coef(coef, self.x_domain) + params['coef'] = _convert_coef2d(coef, self.x_domain, self.z_domain) return baseline, params @@ -200,7 +200,7 @@ def modpoly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, * 'coef': numpy.ndarray, shape (poly_order + 1,) Only if `return_coef` is True. The array of polynomial parameters for the baseline, in increasing order. Can be used to create a - polynomial using numpy.polynomial.polynomial.Polynomial(). + polynomial using :func:`numpy.polynomial.polynomial.polyval2d`. Notes ----- @@ -248,7 +248,7 @@ def modpoly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, params = {'weights': weight_array, 'tol_history': tol_history[:i + 1]} if return_coef: - params['coef'] = _convert_coef(coef, self.x_domain) + params['coef'] = _convert_coef2d(coef, self.x_domain, self.z_domain) return baseline, params @@ -310,7 +310,7 @@ def imodpoly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, * 'coef': numpy.ndarray, shape (poly_order + 1,) Only if `return_coef` is True. The array of polynomial parameters for the baseline, in increasing order. Can be used to create a - polynomial using numpy.polynomial.polynomial.Polynomial(). + polynomial using :func:`numpy.polynomial.polynomial.polyval2d`. 
Raises ------ @@ -368,7 +368,7 @@ def imodpoly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, params = {'weights': weight_array, 'tol_history': tol_history[:i + 1]} if return_coef: - params['coef'] = _convert_coef(coef, self.x_domain) + params['coef'] = _convert_coef2d(coef, self.x_domain, self.z_domain) return baseline, params @@ -450,7 +450,7 @@ def penalized_poly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=Non * 'coef': numpy.ndarray, shape (poly_order + 1,) Only if `return_coef` is True. The array of polynomial parameters for the baseline, in increasing order. Can be used to create a - polynomial using numpy.polynomial.polynomial.Polynomial(). + polynomial using :func:`numpy.polynomial.polynomial.polyval2d`. Raises ------ @@ -505,7 +505,7 @@ def penalized_poly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=Non params = {'weights': weight_array, 'tol_history': tol_history[:i + 1]} if return_coef: - params['coef'] = _convert_coef(coef, self.x_domain) + params['coef'] = _convert_coef2d(coef, self.x_domain, self.z_domain) return baseline, params @@ -565,7 +565,7 @@ def quant_reg(self, data, poly_order=2, quantile=0.05, tol=1e-6, max_iter=250, * 'coef': numpy.ndarray, shape (poly_order + 1,) Only if `return_coef` is True. The array of polynomial parameters for the baseline, in increasing order. Can be used to create a - polynomial using numpy.polynomial.polynomial.Polynomial(). + polynomial using :func:`numpy.polynomial.polynomial.polyval2d`. Raises ------ @@ -615,7 +615,7 @@ def quant_reg(self, data, poly_order=2, quantile=0.05, tol=1e-6, max_iter=250, params = {'weights': sqrt_w**2, 'tol_history': tol_history[:i + 1]} if return_coef: - params['coef'] = _convert_coef(coef, self.x_domain) + params['coef'] = _convert_coef2d(coef, self.x_domain, self.z_domain) return baseline, params @@ -703,7 +703,7 @@ def goldindec(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, * 'coef': numpy.ndarray, shape (poly_order + 1,) Only if `return_coef` is True. The array of polynomial parameters for the baseline, in increasing order. Can be used to create a - polynomial using numpy.polynomial.polynomial.Polynomial(). + polynomial using :func:`numpy.polynomial.polynomial.polyval2d`. Raises ------ @@ -811,7 +811,7 @@ def goldindec(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, 'threshold': loss_kwargs['threshold'] } if return_coef: - params['coef'] = _convert_coef(coef, self.x_domain) + params['coef'] = _convert_coef2d(coef, self.x_domain, self.z_domain) return baseline, params diff --git a/pybaselines/utils.py b/pybaselines/utils.py index 17fd2e7..eac321d 100644 --- a/pybaselines/utils.py +++ b/pybaselines/utils.py @@ -337,6 +337,48 @@ def _interp_inplace(x, y, y_start, y_end): return y +def _poly_transform_matrix(num_coefficients, original_domain): + """ + Creates the matrix that transforms polynomial coefficents from one domain to another. + + The polynomial coefficient array `d` computed with `v` can be transformed to the + coefficient array `c` computed with `x` where ``v = scale * x + offset`` by applying + ``c = T @ d``, where `T` is the transformation matrix. + + Parameters + ---------- + num_coefficients : int + The number of polynomial coefficients, ie. the polynomial degree + 1. + original_domain : Container[float, float] + The domain, [min(x), max(x)], of the original data used for fitting. 
+ + Returns + ------- + transformation : numpy.ndarray, shape (`num_coefficients`, `num_coefficients`) + The transformation matrix to convert domains. + + Notes + ----- + The calculation of the transformation matrix is based on the math from + https://stackoverflow.com/questions/141422/how-can-a-transform-a-polynomial-to-another-coordinate-system#comment57358951_142436. + + This function assumes the original coefficients were computed with the domain [-1, 1]. + + """ + offset, scale = np.polynomial.polyutils.mapparms(np.array([-1., 1.]), original_domain) + transformation = np.zeros((num_coefficients, num_coefficients)) + skip_offset = np.equal(offset, 0) # 0 raised to negative powers causes nan + for i in range(num_coefficients): + for j in range(num_coefficients): + if skip_offset: + if j == i: + transformation[i, j] = binom(j, i) * (scale)**(-j) + else: + transformation[i, j] = binom(j, i) * (scale)**(-j) * (-offset)**(j - i) + + return transformation + + def _convert_coef(coef, original_domain): """ Scales the polynomial coefficients back to the original domain of the data. @@ -348,41 +390,60 @@ def _convert_coef(coef, original_domain): Parameters ---------- - coef : array-like + coef : numpy.ndarray, shape (a,) The array of coefficients for the polynomial. Should increase in order, for example (c0, c1, c2) from `y = c0 + c1 * x + c2 * x**2`. - original_domain : array-like, shape (2,) + original_domain : Container[float, float] The domain, [min(x), max(x)], of the original data used for fitting. Returns ------- - numpy.ndarray + numpy.ndarray, shape (a,) The array of coefficients scaled for the original domain. Notes ----- - Based on https://stackoverflow.com/questions/141422/how-can-a-transform-a-polynomial-to-another-coordinate-system#comment57358951_142436. - Could slightly reduce computation time by computing offset and scale once within the _Algorithm object, but doing it this way with `original_domain` is backwards compatible and this function is probably not called enough to justify the change. """ - offset, scale = np.polynomial.polyutils.mapparms(np.array([-1, 1]), original_domain) - num_coefficients = len(coef) - transformation = np.zeros((num_coefficients, num_coefficients)) - skip_offset = np.equal(offset, 0) # 0 raised to negative powers causes nan - for i in range(num_coefficients): - for j in range(num_coefficients): - if skip_offset: - if j == i: - transformation[i, j] = binom(j, i) * (scale)**(-j) - else: - transformation[i, j] = binom(j, i) * (scale)**(-j) * (-offset)**(j - i) - + transformation = _poly_transform_matrix(coef.shape[0], original_domain) return transformation @ coef +def _convert_coef2d(coef, original_x_domain, original_z_domain): + """ + Scales the polynomial coefficients back to the original domain of the data. + + For fitting, the x-values and z-values are scaled from their original domain, + [min(x), max(x)] and [min(z), max(z)], to [-1, 1] in order to improve the numerical + stability of fitting. This function rescales the retrieved polynomial coefficients + for the fit x-values and z-values back to their original domains. + + Parameters + ---------- + coef : numpy.ndarray, shape (a, b) + The 2d array of coefficients for the polynomial. Should increase in + order. The shape should be (a, b), where a is the polynomial degree + 1 for + the x-values and b is the polynomial degree + 1 for the z-values. + original_x_domain : Container[float, float] + The domain, [min(x), max(x)], of the original x-values used for fitting. 
+ original_z_domain : Container[float, float] + The domain, [min(z), max(z)], of the original z-values used for fitting. + + Returns + ------- + numpy.ndarray, shape (a, b) + The array of coefficients scaled for the original domains. + + """ + transformation_x = _poly_transform_matrix(coef.shape[0], original_x_domain) + transformation_z = _poly_transform_matrix(coef.shape[1], original_z_domain) + + return transformation_x @ coef @ transformation_z.T + + def difference_matrix(data_size, diff_order=2, diff_format=None): """ Creates an n-order finite-difference matrix. diff --git a/tests/test_utils.py b/tests/test_utils.py index b938745..f647032 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -122,6 +122,25 @@ def test_interp_inplace(): assert_allclose(y_calc, y_actual, 1e-12) +@pytest.mark.parametrize('scale', (1., 10., 0.557)) +@pytest.mark.parametrize('num_coeffs', (1, 2, 5)) +def test_poly_transform_matrix(scale, num_coeffs): + """ + Tests the matrix that transforms polynomial coefficients from one domain to another. + + Only tests the simple cases where the offset is 0 since more complicated cases are + handled by the _convert_coef and _convert_coef2d tests. + """ + transform_matrix = np.eye(num_coeffs) + for i in range(num_coeffs): + transform_matrix[i, i] /= scale**i + + domain = np.array([-1, 1]) * scale + calc_matrix = utils._poly_transform_matrix(num_coeffs, domain) + + assert_allclose(calc_matrix, transform_matrix, atol=1e-12, rtol=1e-14) + + @pytest.mark.parametrize('x', (np.array([-5, -2, 0, 1, 8]), np.array([1, 2, 3, 4, 5]))) @pytest.mark.parametrize( 'coefs', ( @@ -146,6 +165,77 @@ def test_convert_coef(x, coefs): assert_allclose(converted_coefs, coefs, atol=1e-10) +@pytest.mark.parametrize('x', (np.linspace(-1, 1, 50), np.linspace(-13.5, 11.6, 51))) +@pytest.mark.parametrize('z', (np.linspace(-1, 1, 50), np.linspace(-13.5, 11.6, 51))) +@pytest.mark.parametrize( + 'coef', ( + np.array([ + [1, 0], + [1, 0] + ]), + np.array([ + [1, 1], + [0, 0] + ]), + np.array([ + [1, 0.1, 0.3, -0.5], + [1, 0.1, 0, 1], + [0.2, 0, 1.5, -0.3] + ]), + ) +) +def test_convert_coef2d(x, z, coef): + """ + Checks that polynomial coefficients are correctly converted to the original domain. + + Notes on the tested x and z values: Data from [-1, 1] has an offset of 0 and a scale + of 1, so the coefficients are unaffected, while the second set of values has an offset + not equal to 0 and a scale not equal to 1 so should be a good test of whether the + conversion is successful. 
+ + """ + x_domain = np.polynomial.polyutils.getdomain(x) + mapped_x = np.polynomial.polyutils.mapdomain( + x, x_domain, np.array([-1., 1.]) + ) + z_domain = np.polynomial.polyutils.getdomain(z) + mapped_z = np.polynomial.polyutils.mapdomain( + z, z_domain, np.array([-1., 1.]) + ) + X, Z = np.meshgrid(x, z) + y = np.zeros_like(x) + for i in range(coef.shape[0]): + for j in range(coef.shape[1]): + y = y + coef[i, j] * X**i * Z**j + y_flat = y.ravel() + + vandermonde = np.polynomial.polynomial.polyvander2d( + *np.meshgrid(mapped_x, mapped_z), + (coef.shape[0] - 1, coef.shape[1] - 1) + ).reshape((-1, (coef.shape[0]) * (coef.shape[1]))) + + calc_coef = np.linalg.pinv(vandermonde) @ (y_flat) + calc_y = vandermonde @ calc_coef # corresponds to mapped domain + calc_coef = calc_coef.reshape(coef.shape) + + # sanity check; use slightly higher atol than other checks since + # the fit can potentially be off by a bit + assert_allclose(calc_y, y_flat, rtol=1e-10, atol=1e-6) + + converted_coef = utils._convert_coef2d(calc_coef, x_domain, z_domain) + + mapped_X, mapped_Z = np.meshgrid(mapped_x, mapped_z) + mapped_polynomial = np.polynomial.polynomial.polyval2d(mapped_X, mapped_Z, calc_coef) + + original_polynomial = np.polynomial.polynomial.polyval2d(X, Z, converted_coef) + + # sanity check that polyval2d recreates with the mapped coefficients + assert_allclose(mapped_polynomial, calc_y.reshape(y.shape), rtol=1e-10, atol=1e-14) + + assert_allclose(original_polynomial, mapped_polynomial, rtol=1e-10, atol=1e-14) + assert_allclose(converted_coef, coef, rtol=1e-10, atol=1e-12) + + @pytest.mark.parametrize('diff_order', (0, 1, 2, 3, 4, 5)) def test_difference_matrix(diff_order): """Tests common differential matrices.""" From 84e4212cd85bb8ddfe7df46fd9b069f436fb6ada Mon Sep 17 00:00:00 2001 From: Donnie Erb <55961724+derb12@users.noreply.github.com> Date: Tue, 2 Jan 2024 17:32:22 -0500 Subject: [PATCH 12/56] OTHER: Only retain sort order if needed Baseline objects will no longer apply sorting when the input x-values were already in the correct sort order. 
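A minimal sketch of the check behind this change (illustrative values only; the helper added below in utils.py implements exactly this logic):

    import numpy as np

    x = np.linspace(-1, 1, 10)               # x-values already ascending
    sort_order = x.argsort(kind='mergesort')
    # an argsort result that is strictly increasing means the input was
    # already sorted, so no sort order needs to be retained
    already_sorted = (sort_order[1:] > sort_order[:-1]).all()
    assert already_sorted                    # _sort_order then stays None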
--- pybaselines/_algorithm_setup.py | 10 ++++++---- pybaselines/utils.py | 27 +++++++++++++++++++++++++++ tests/test_algorithm_setup.py | 2 +- tests/test_utils.py | 18 ++++++++++++++++++ 4 files changed, 52 insertions(+), 5 deletions(-) diff --git a/pybaselines/_algorithm_setup.py b/pybaselines/_algorithm_setup.py index 8b25ca0..2d04bcf 100644 --- a/pybaselines/_algorithm_setup.py +++ b/pybaselines/_algorithm_setup.py @@ -25,7 +25,9 @@ from ._validation import ( _check_array, _check_half_window, _check_optional_array, _check_sized_array, _yx_arrays ) -from .utils import ParameterWarning, _inverted_sort, optimize_window, pad_edges +from .utils import ( + ParameterWarning, _determine_sorts, _inverted_sort, optimize_window, pad_edges +) class _Algorithm: @@ -97,9 +99,9 @@ def __init__(self, x_data=None, check_finite=True, assume_sorted=False, self._sort_order = None self._inverted_order = None else: - self._sort_order = self.x.argsort(kind='mergesort') - self.x = self.x[self._sort_order] - self._inverted_order = _inverted_sort(self._sort_order) + self._sort_order, self._inverted_order = _determine_sorts(self.x) + if self._sort_order is not None: + self.x = self.x[self._sort_order] self.whittaker_system = None self.vandermonde = None diff --git a/pybaselines/utils.py b/pybaselines/utils.py index eac321d..5ee872e 100644 --- a/pybaselines/utils.py +++ b/pybaselines/utils.py @@ -589,6 +589,33 @@ def _inverted_sort(sort_order): return inverted_order +def _determine_sorts(data): + """ + Provides the arrays for sorting and inverting sorting, if needed. + + Parameters + ---------- + data : numpy.ndarray, shape (N,) + The array to potentially sort. + + Returns + ------- + output : tuple(numpy.ndarray, numpy.ndarray) or tuple(None, None) + A tuple of the index array for sorting the input array and the array + that inverts that sorting. If the input array is already sorted, then + the output will be (None, None). + + """ + sort_order = data.argsort(kind='mergesort') + skip_sorting = (sort_order[1:] > sort_order[:-1]).all() + if skip_sorting: + output = (None, None) + else: + output = (sort_order, _inverted_sort(sort_order)) + + return output + + def whittaker_smooth(data, lam=1e6, diff_order=2, weights=None, check_finite=True): """ Smooths the input data using Whittaker smoothing. 
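The helper pair added above obeys a simple round-trip invariant; a short usage sketch (private helpers, signatures as shown in this diff):

    import numpy as np
    from pybaselines.utils import _determine_sorts

    data = np.array([0., 2., 1., 3.])
    sort_order, inverted_order = _determine_sorts(data)
    assert np.array_equal(data[sort_order][inverted_order], data)  # round trip
    # an already-sorted input short-circuits to (None, None)
    assert _determine_sorts(np.arange(5.)) == (None, None)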
diff --git a/tests/test_algorithm_setup.py b/tests/test_algorithm_setup.py index c196611..87065df 100644 --- a/tests/test_algorithm_setup.py +++ b/tests/test_algorithm_setup.py @@ -466,7 +466,7 @@ def test_algorithm_class_init(input_x, check_finite, assume_sorted, output_dtype else: assert algorithm._len is None - if not assume_sorted and input_x: + if not assume_sorted and change_order and input_x: order = np.arange(len(x)) if change_order: order[sort_order] = order[sort_order][::-1] diff --git a/tests/test_utils.py b/tests/test_utils.py index f647032..98758cc 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -512,6 +512,24 @@ def test_invert_sort(seed): assert_array_equal(values, values[sort_order][inverted_order]) +@pytest.mark.parametrize('needs_sorting', (True, False)) +def test_determine_sorts(needs_sorting): + """Ensures the sort and inverted sort determinations work.""" + data = np.linspace(-1, 1, 20) + original_data = data.copy() + if needs_sorting: + data[5:10] = data[5:10][::-1] + + sort_order, inverted_order = utils._determine_sorts(data) + if not needs_sorting: + assert sort_order is None + assert inverted_order is None + else: + assert_array_equal(data[sort_order], original_data) + assert_array_equal(sort_order, data.argsort(kind='mergesort')) + assert_array_equal(data[sort_order][inverted_order], data) + + @pytest.mark.parametrize('diff_order', (1, 2, 3)) def test_whittaker_smooth(data_fixture, diff_order): """Ensures the Whittaker smoothing function performs correctly.""" From 42d74bbee7cc2b58d2888c2d722028a5ec2b06a7 Mon Sep 17 00:00:00 2001 From: Donnie Erb <55961724+derb12@users.noreply.github.com> Date: Sat, 6 Jan 2024 17:55:55 -0500 Subject: [PATCH 13/56] MAINT: Moved _sort_array to utils and added tests Also removed the `axis` keyword for several functions since the data is always assumed to be in the last dimension. --- pybaselines/_algorithm_setup.py | 67 +++++---------------------------- pybaselines/morphological.py | 4 +- pybaselines/optimizers.py | 4 +- pybaselines/spline.py | 6 +-- pybaselines/utils.py | 43 +++++++++++++++++++++ tests/test_utils.py | 23 +++++++++++ 6 files changed, 83 insertions(+), 64 deletions(-) diff --git a/pybaselines/_algorithm_setup.py b/pybaselines/_algorithm_setup.py index 2d04bcf..5d7ee17 100644 --- a/pybaselines/_algorithm_setup.py +++ b/pybaselines/_algorithm_setup.py @@ -26,7 +26,7 @@ _check_array, _check_half_window, _check_optional_array, _check_sized_array, _yx_arrays ) from .utils import ( - ParameterWarning, _determine_sorts, _inverted_sort, optimize_window, pad_edges + ParameterWarning, _determine_sorts, _inverted_sort, _sort_array, optimize_window, pad_edges ) @@ -140,7 +140,7 @@ def pentapy_solver(self, value): self.whittaker_system.pentapy_solver = value self._pentapy_solver = value - def _return_results(self, baseline, params, dtype, sort_keys=(), axis=-1, skip_sorting=False): + def _return_results(self, baseline, params, dtype, sort_keys=(), skip_sorting=False): """ Re-orders the input baseline and parameters based on the x ordering. @@ -157,8 +157,6 @@ def _return_results(self, baseline, params, dtype, sort_keys=(), axis=-1, skip_s sort_keys : Iterable, optional An iterable of keys corresponding to the values in `params` that need re-ordering. Default is (). - axis : int, optional - The axis of the input which defines each unique set of data. Default is -1. skip_sorting : bool, optional If True, will skip sorting the output baseline. The keys in `sort_keys` will still be sorted. Default is False. 
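Removing the `axis` keyword works because the data is always stored along the last axis; a sketch of the indexing convention this relies on (illustrative arrays only):

    import numpy as np

    order = np.array([2, 0, 1])
    y1 = np.array([10., 20., 30.])      # one data set
    y2 = np.arange(6.).reshape(2, 3)    # two data sets, points on the last axis
    # 1d data is indexed directly and 2d data along its last axis, so no
    # separate axis argument is ever needed
    assert np.array_equal(y1[order], np.array([30., 10., 20.]))
    assert np.array_equal(y2[:, order], np.array([[2., 0., 1.], [5., 3., 4.]]))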
@@ -177,7 +175,7 @@ def _return_results(self, baseline, params, dtype, sort_keys=(), axis=-1, skip_s
                     # assumes params are all just one dimensional arrays
                     params[key] = params[key][self._inverted_order]
         if not skip_sorting:
-            baseline = _sort_array(baseline, sort_order=self._inverted_order, axis=axis)
+            baseline = _sort_array(baseline, sort_order=self._inverted_order)
 
         baseline = baseline.astype(dtype, copy=False)
 
@@ -185,7 +183,7 @@ def _return_results(self, baseline, params, dtype, sort_keys=(), axis=-1, skip_s
 
     @classmethod
     def _register(cls, func=None, *, sort_keys=(), dtype=None, order=None, ensure_1d=True,
-                  axis=-1, skip_sorting=False):
+                  skip_sorting=False):
         """
         Wraps a baseline function to validate inputs and correct outputs.
 
@@ -208,8 +206,6 @@ def _register(cls, func=None, *, sort_keys=(), dtype=None, order=None, ensure_1d
         ensure_1d : bool, optional
             If True (default), will raise an error if the shape of `array` is not a one dimensional
             array with shape (N,) or a two dimensional array with shape (N, 1) or (1, N).
-        axis : int, optional
-            The axis of the input on which to check its length. Default is -1.
         skip_sorting : bool, optional
             If True, will skip sorting the inputs and outputs, which is useful for algorithms
             that use other algorithms so that sorting is already internally done. Default is False.
@@ -225,7 +221,7 @@ def _register(cls, func=None, *, sort_keys=(), dtype=None, order=None, ensure_1d
         if func is None:
             return partial(
                 cls._register, sort_keys=sort_keys, dtype=dtype, order=order,
                 ensure_1d=ensure_1d, skip_sorting=skip_sorting
             )
 
         @wraps(func)
@@ -237,16 +233,16 @@ def inner(self, data=None, *args, **kwargs):
                 input_y = True
                 y, self.x = _yx_arrays(
                     data, check_finite=self._check_finite, dtype=dtype, order=order,
-                    ensure_1d=ensure_1d, axis=axis
+                    ensure_1d=ensure_1d
                 )
-                self._len = y.shape[axis]
+                self._len = y.shape[-1]
             else:
                 reset_x = True
                 if data is not None:
                     input_y = True
                     y = _check_sized_array(
                         data, self._len, check_finite=self._check_finite, dtype=dtype, order=order,
-                        ensure_1d=ensure_1d, axis=axis, name='data'
+                        ensure_1d=ensure_1d, name='data'
                     )
                 else:
                     y = data
@@ -258,7 +254,7 @@ def inner(self, data=None, *args, **kwargs):
             )
 
             if input_y and not skip_sorting:
-                y = _sort_array(y, sort_order=self._sort_order, axis=axis)
+                y = _sort_array(y, sort_order=self._sort_order)
 
             if input_y and self._dtype is None:
                 output_dtype = y.dtype
@@ -269,9 +265,7 @@ def inner(self, data=None, *args, **kwargs):
             if reset_x:
                 self.x = np.array(self.x, dtype=x_dtype, copy=False)
 
-            return self._return_results(
-                baseline, params, output_dtype, sort_keys, axis, skip_sorting
-            )
+            return self._return_results(baseline, params, output_dtype, sort_keys, skip_sorting)
 
         return inner
 
@@ -815,47 +809,6 @@ def _setup_misc(self, y):
         return y
 
 
-def _sort_array(array, sort_order=None, axis=-1):
-    """
-    Sorts the input array only if given a non-None sorting order.
-
-    Parameters
-    ----------
-    array : numpy.ndarray
-        The array to sort.
-    sort_order : numpy.ndarray, optional
-        The array defining the sort order for the input array. Default is None, which
-        will not sort the input.
-    axis : int, optional
-        The axis of the input which defines each unique set of data. Default is -1.
-
-    Returns
-    -------
-    output : numpy.ndarray
-        The input array after optionally sorting.
-
-    Raises
-    ------
-    ValueError
-        Raised if the input array has more than two dimensions.
- - """ - if sort_order is None: - output = array - else: - n_dims = array.ndim - if n_dims == 1: - output = array[sort_order] - elif n_dims == 2: - axes = [..., ...] - axes[axis] = sort_order - output = array[tuple(axes)] - else: - raise ValueError('too many dimensions to sort the data') - - return output - - def _class_wrapper(klass): """ Wraps a function to call the corresponding class method instead. diff --git a/pybaselines/morphological.py b/pybaselines/morphological.py index fc2a6ab..95449d0 100644 --- a/pybaselines/morphological.py +++ b/pybaselines/morphological.py @@ -9,10 +9,10 @@ import numpy as np from scipy.ndimage import grey_closing, grey_dilation, grey_erosion, grey_opening, uniform_filter1d -from ._algorithm_setup import _Algorithm, _class_wrapper, _sort_array +from ._algorithm_setup import _Algorithm, _class_wrapper from ._validation import _check_lam from .utils import ( - _mollifier_kernel, pad_edges, padded_convolve, relative_difference + _mollifier_kernel, _sort_array, pad_edges, padded_convolve, relative_difference ) diff --git a/pybaselines/optimizers.py b/pybaselines/optimizers.py index 74ad4b8..c7d51c4 100644 --- a/pybaselines/optimizers.py +++ b/pybaselines/optimizers.py @@ -14,9 +14,9 @@ import numpy as np from . import classification, misc, morphological, polynomial, smooth, spline, whittaker -from ._algorithm_setup import _Algorithm, _class_wrapper, _sort_array +from ._algorithm_setup import _Algorithm, _class_wrapper from ._validation import _check_optional_array -from .utils import _check_scalar, _get_edges, gaussian, whittaker_smooth +from .utils import _check_scalar, _get_edges, _sort_array, gaussian, whittaker_smooth class _Optimizers(_Algorithm): diff --git a/pybaselines/spline.py b/pybaselines/spline.py index c24b029..2359419 100644 --- a/pybaselines/spline.py +++ b/pybaselines/spline.py @@ -16,14 +16,14 @@ from scipy.sparse import spdiags from . import _weighting -from ._algorithm_setup import _Algorithm, _class_wrapper, _sort_array +from ._algorithm_setup import _Algorithm, _class_wrapper from ._banded_utils import _add_diagonals, _shift_rows, diff_penalty_diagonals from ._compat import _HAS_NUMBA, jit, trapezoid from ._spline_utils import _basis_midpoints from ._validation import _check_lam, _check_optional_array from .utils import ( - _MIN_FLOAT, _mollifier_kernel, ParameterWarning, gaussian, pad_edges, padded_convolve, - relative_difference + _MIN_FLOAT, _mollifier_kernel, _sort_array, ParameterWarning, gaussian, pad_edges, + padded_convolve, relative_difference ) diff --git a/pybaselines/utils.py b/pybaselines/utils.py index 5ee872e..c5d738e 100644 --- a/pybaselines/utils.py +++ b/pybaselines/utils.py @@ -616,6 +616,49 @@ def _determine_sorts(data): return output + + +def _sort_array(array, sort_order=None): + """ + Sorts the input array only if given a non-None sorting order. + + Parameters + ---------- + array : numpy.ndarray + The array to sort. + sort_order : numpy.ndarray, optional + The array defining the sort order for the input array. Default is None, which + will not sort the input. + + Returns + ------- + output : numpy.ndarray + The input array after optionally sorting. + + Notes + ----- + For all inputs, assumes the last axis corresponds to the data that needs sorted. + + Raises + ------ + ValueError + Raised if the input array has more than two dimensions. 
+
+    """
+    if sort_order is None:
+        output = array
+    else:
+        n_dims = array.ndim
+        if n_dims == 1:
+            output = array[sort_order]
+        elif n_dims == 2:
+            output = array[:, sort_order]
+        else:
+            raise ValueError('too many dimensions to sort the data')
+
+    return output
+
+
 def whittaker_smooth(data, lam=1e6, diff_order=2, weights=None, check_finite=True):
     """
     Smooths the input data using Whittaker smoothing.
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 98758cc..ebea815 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -530,6 +530,29 @@ def test_determine_sorts(needs_sorting):
         assert_array_equal(data[sort_order][inverted_order], data)
 
 
+@pytest.mark.parametrize('two_d', (True, False))
+def test_sort_array_none(two_d):
+    """Tests the case where the sorting array is None, which should skip sorting."""
+    data = np.linspace(-1, 1, 20)
+    if two_d:
+        data = data[None, :]
+
+    assert_allclose(data, utils._sort_array(data, sort_order=None), atol=0, rtol=1e-14)
+
+
+@pytest.mark.parametrize('two_d', (True, False))
+def test_sort_array(two_d):
+    """Ensures array sorting works for 1d and 2d arrays."""
+    data = np.linspace(-1, 1, 20)
+    reversed_data = data[::-1]
+    sort_order = np.arange(len(data))[::-1]
+    if two_d:
+        data = np.array([data, data])
+        reversed_data = np.array([reversed_data, reversed_data])
+
+    assert_allclose(data, utils._sort_array(reversed_data, sort_order), atol=0, rtol=1e-14)
+
+
 @pytest.mark.parametrize('diff_order', (1, 2, 3))
 def test_whittaker_smooth(data_fixture, diff_order):
     """Ensures the Whittaker smoothing function performs correctly."""
From 451e447ca36b26ba056b5a7066349723f0e50813 Mon Sep 17 00:00:00 2001
From: Donnie Erb <55961724+derb12@users.noreply.github.com>
Date: Sat, 6 Jan 2024 18:03:52 -0500
Subject: [PATCH 14/56] MAINT: Finished code for initializing Baseline2D

Addressed initializing Baseline2D given optional x and z. Sorting in 2d
is finally supported using utils._sort_array2d, and the output
parameters are now correctly reshaped and sorted.

---
 pybaselines/_validation.py            | 80 +++++++++-
 pybaselines/two_d/_algorithm_setup.py | 212 +++++++++++++++++---------
 pybaselines/two_d/api.py              | 17 ++-
 pybaselines/utils.py                  | 45 ++++++
 tests/test_utils.py                   | 53 +++++++
 5 files changed, 332 insertions(+), 75 deletions(-)

diff --git a/pybaselines/_validation.py b/pybaselines/_validation.py
index 13d3902..ff4db54 100644
--- a/pybaselines/_validation.py
+++ b/pybaselines/_validation.py
@@ -112,7 +112,8 @@ def _check_scalar_variable(value, allow_zero=False, variable_name='lam', **asarr
     return output
 
 
-def _check_array(array, dtype=None, order=None, check_finite=False, ensure_1d=True):
+def _check_array(array, dtype=None, order=None, check_finite=False, ensure_1d=True,
+                 ensure_2d=False):
     """
     Validates the shape and values of the input array and controls the output parameters.
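The `ensure_2d` branch added in the next hunk squeezes a three dimensional array that has a length-1 axis down to two dimensions; a sketch of that reshape (illustrative shapes only):

    import numpy as np

    arr = np.zeros((1, 5, 6))                # a (1, M, N) input
    shape = np.array(arr.shape)
    flat_dims = ~np.equal(shape, 1)          # mask of the non-unit axes
    arr2d = arr.reshape(shape[flat_dims])    # -> shape (5, 6)
    assert arr2d.shape == (5, 6)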
@@ -161,6 +162,15 @@ def _check_array(array, dtype=None, order=None, check_finite=False, ensure_1d=Tr
             output = output.reshape(-1)
         elif dimensions != 1:
             raise ValueError('must be a one dimensional array')
+    elif ensure_2d:
+        output = np.array(output, copy=False, ndmin=2)
+        dimensions = output.ndim
+        if dimensions == 3 and 1 in output.shape:
+            output_shape = np.array(output.shape)
+            flat_dims = ~np.equal(output_shape, 1)
+            output = output.reshape(output_shape[flat_dims])
+        elif dimensions != 2:
+            raise ValueError('must be a two dimensional array')
 
     return output
 
@@ -206,7 +216,7 @@ def _check_sized_array(array, length, dtype=None, order=None, check_finite=False
     output = _check_array(
         array, dtype=dtype, order=order, check_finite=check_finite, ensure_1d=ensure_1d
     )
-    if output.shape[axis] != length:
+    if not np.equal(output.shape[axis], length).all():
         raise ValueError(
             f'length mismatch for {name}; expected {length} but got {output.shape[axis]}'
         )
@@ -267,6 +277,72 @@ def _yx_arrays(data, x_data=None, check_finite=False, dtype=None, order=None, en
     return y, x
 
 
+def _yxz_arrays(data, x_data=None, z_data=None, check_finite=False, dtype=None, order=None,
+                ensure_2d=True, x_axis=-1, z_axis=-2):
+    """
+    Converts input data into numpy arrays and provides x and z data if none are given.
+
+    Parameters
+    ----------
+    data : array-like, shape (M, N)
+        The y-values of the measured data, with M data points along the z-axis
+        and N data points along the x-axis.
+    x_data : array-like, shape (N,), optional
+        The x-values of the measured data. Default is None, which will create an
+        array from -1. to 1. with N points.
+    z_data : array-like, shape (M,), optional
+        The z-values of the measured data. Default is None, which will create an
+        array from -1. to 1. with M points.
+    check_finite : bool, optional
+        If True, will raise an error if any values in `array` are not finite. Default is False,
+        which skips the check.
+    dtype : type or numpy.dtype, optional
+        The dtype to cast the output array. Default is None, which uses the typing of `array`.
+    order : {None, 'C', 'F'}, optional
+        The order for the output array. Default is None, which will use the default array
+        ordering. Other valid options are 'C' for C ordering or 'F' for Fortran ordering.
+    ensure_2d : bool, optional
+        If True (default), will raise an error if the shape of `array` is not a two dimensional
+        array with shape (M, N) or a three dimensional array with shape (M, N, 1), (M, 1, N),
+        or (1, M, N).
+
+    Returns
+    -------
+    y : numpy.ndarray, shape (M, N)
+        A numpy array of the y-values of the measured data.
+    x : numpy.ndarray, shape (N,)
+        A numpy array of the x-values of the measured data, or a created array.
+    z : numpy.ndarray, shape (M,)
+        A numpy array of the z-values of the measured data, or a created array.
+
+    Notes
+    -----
+    Does not change the scale/domain of the input `x_data` or `z_data` if they
+    are given, only converts them to arrays.
+ + """ + y = _check_array( + data, dtype=dtype, order=order, check_finite=check_finite, ensure_1d=False, + ensure_2d=ensure_2d + ) + x_len = y.shape[x_axis] + z_len = y.shape[z_axis] + if x_data is None: + x = np.linspace(-1, 1, x_len) + else: + x = _check_sized_array( + x_data, x_len, dtype=dtype, order=order, check_finite=check_finite, + ensure_1d=True, axis=0, name='x_data' + ) + if z_data is None: + z = np.linspace(-1, 1, z_len) + else: + z = _check_sized_array( + z_data, z_len, dtype=dtype, order=order, check_finite=check_finite, + ensure_1d=True, axis=0, name='z_data' + ) + + return y, x, z + + def _check_lam(lam, allow_zero=False): """ Ensures the regularization parameter `lam` is a scalar greater than 0. diff --git a/pybaselines/two_d/_algorithm_setup.py b/pybaselines/two_d/_algorithm_setup.py index 9ecad1c..493baca 100644 --- a/pybaselines/two_d/_algorithm_setup.py +++ b/pybaselines/two_d/_algorithm_setup.py @@ -14,10 +14,14 @@ import numpy as np from scipy.ndimage import grey_opening -from ..utils import ParameterWarning, _inverted_sort, pad_edges, relative_difference +from ..utils import ( + ParameterWarning, _determine_sorts, _inverted_sort, _sort_array2d, pad_edges, + relative_difference +) from ._spline_utils import PSpline2D from .._validation import ( - _check_array, _check_half_window, _check_optional_array, _check_scalar, _check_scalar_variable + _check_array, _check_half_window, _check_optional_array, _check_scalar, _check_scalar_variable, + _check_sized_array, _yxz_arrays ) from ._whittaker_utils import PenalizedSystem2D @@ -34,23 +38,30 @@ class _Algorithm2D: poly_order : int The last polynomial order used for a polynomial algorithm. Initially is -1, denoting that no polynomial fitting has been performed. - pspline : PSpline or None - The PSpline object for setting up and solving penalized spline algorithms. Is None + pspline : PSpline2D or None + The PSpline2D object for setting up and solving penalized spline algorithms. Is None if no penalized spline setup has been performed (typically done in :meth:`._setup_spline`). vandermonde : numpy.ndarray or None The Vandermonde matrix for solving polynomial equations. Is None if no polynomial setup has been performed (typically done in :meth:`._setup_polynomial`). - whittaker_system : PenalizedSystem or None - The PenalizedSystem object for setting up and solving Whittaker-smoothing-based + whittaker_system : PenalizedSystem2D or None + The PenalizedSystem2D object for setting up and solving Whittaker-smoothing-based algorithms. Is None if no Whittaker setup has been performed (typically done in :meth:`_setup_whittaker`). x : numpy.ndarray or None The x-values for the object. If initialized with None, then `x` is initialized the - first function call to have the same length as the input `data` and has min and max - values of -1 and 1, respectively. + first function call to have the same size as the input `data.shape[-1]` and has min + and max values of -1 and 1, respectively. x_domain : numpy.ndarray The minimum and maximum values of `x`. If `x_data` is None during initialization, then set to numpy.ndarray([-1, 1]). + z : numpy.ndarray or None + The z-values for the object. If initialized with None, then `z` is initialized the + first function call to have the same size as the input `data.shape[-2]` and has min + and max values of -1 and 1, respectively. + z_domain : numpy.ndarray + The minimum and maximum values of `z`. If `z_data` is None during initialization, then + set to numpy.ndarray([-1, 1]). 
""" @@ -64,7 +75,7 @@ def __init__(self, x_data=None, z_data=None, check_finite=True, output_dtype=Non The x-values of the measured data. Default is None, which will create an array from -1 to 1 during the first function call with length equal to the input data length. - z_data : array-like, shape (N,), optional + z_data : array-like, shape (M,), optional The z-values of the measured data. Default is None, which will create an array from -1 to 1 during the first function call with length equal to the input data length. @@ -76,12 +87,10 @@ def __init__(self, x_data=None, z_data=None, check_finite=True, output_dtype=Non The dtype to cast the output array. Default is None, which uses the typing of the input data. - Notes - ----- - Unlike `_Algorithm`, `_2DAlgorithm` does not sort input data. - """ self._len = [None, None] + x_sort_order = None + z_sort_order = None if x_data is None: self.x = None self.x_domain = np.array([-1., 1.]) @@ -89,6 +98,9 @@ def __init__(self, x_data=None, z_data=None, check_finite=True, output_dtype=Non self.x = _check_array(x_data, check_finite=check_finite) self._len[1] = len(self.x) self.x_domain = np.polynomial.polyutils.getdomain(self.x) + x_sort_order, x_inverted_order = _determine_sorts(self.x) + if x_sort_order is not None: + self.x = self.x[x_sort_order] if z_data is None: self.z = None @@ -97,6 +109,22 @@ def __init__(self, x_data=None, z_data=None, check_finite=True, output_dtype=Non self.z = _check_array(z_data, check_finite=check_finite) self._len[0] = len(self.z) self.z_domain = np.polynomial.polyutils.getdomain(self.z) + z_sort_order, z_inverted_order = _determine_sorts(self.z) + if z_sort_order is not None: + self.z = self.z[z_sort_order] + + if x_sort_order is None and z_sort_order is None: + self._sort_order = None + self._inverted_order = None + elif x_sort_order is None: + self._sort_order = z_sort_order + self._inverted_order = z_inverted_order + elif z_sort_order is None: + self._sort_order = (..., x_sort_order) + self._inverted_order = (..., x_inverted_order) + else: + self._sort_order = (z_sort_order[:, None], x_sort_order[None, :]) + self._inverted_order = (z_inverted_order[:, None], x_inverted_order[None, :]) self.whittaker_system = None self.vandermonde = None @@ -105,7 +133,8 @@ def __init__(self, x_data=None, z_data=None, check_finite=True, output_dtype=Non self._check_finite = check_finite self._dtype = output_dtype - def _return_results(self, baseline, params, dtype, sort_keys=(), axis=-1): + def _return_results(self, baseline, params, dtype, sort_keys=(), ensure_2d=False, + reshape_baseline=False, reshape_keys=()): """ Re-orders the input baseline and parameters based on the x ordering. @@ -113,7 +142,7 @@ def _return_results(self, baseline, params, dtype, sort_keys=(), axis=-1): Parameters ---------- - baseline : numpy.ndarray, shape (N,) + baseline : numpy.ndarray, shape (M, N) The baseline output by the baseline function. params : dict The parameter dictionary output by the baseline function. @@ -122,24 +151,39 @@ def _return_results(self, baseline, params, dtype, sort_keys=(), axis=-1): sort_keys : Iterable, optional An iterable of keys corresponding to the values in `params` that need re-ordering. Default is (). - axis : int, optional - The axis of the input which defines each unique set of data. Default is -1. Returns ------- - baseline : numpy.ndarray, shape (N,) + baseline : numpy.ndarray, shape (M, N) The input `baseline` after re-ordering and setting to the desired dtype. 
         params : dict
             The input `params` after re-ordering the values for `sort_keys`.
 
         """
+        if reshape_baseline:
+            if ensure_2d:
+                baseline = baseline.reshape(self._len)
+            else:
+                baseline = baseline.reshape(-1, *self._len)
+            for key in reshape_keys:
+                if key in params:
+                    # TODO can any params be non-2d that need reshaped?
+                    params[key] = params[key].reshape(self._len)
+
+        if self._sort_order is not None:
+            for key in sort_keys:
+                if key in params:  # some parameters are conditionally output
+                    # assumes params are all two dimensional arrays
+                    params[key] = params[key][self._inverted_order]
+
+        baseline = _sort_array2d(baseline, sort_order=self._inverted_order)
         baseline = baseline.astype(dtype, copy=False)
 
         return baseline, params
 
     @classmethod
-    def _register(cls, func=None, *, sort_keys=(), dtype=None, order=None, ensure_1d=True,
-                  axis=-1, reshape_baseline=False, reshape_keys=()):
+    def _register(cls, func=None, *, sort_keys=(), dtype=None, order=None, ensure_2d=True,
+                  reshape_baseline=False, reshape_keys=()):
         """
         Wraps a baseline function to validate inputs and correct outputs.
 
@@ -159,11 +203,10 @@ def _register(cls, func=None, *, sort_keys=(), dtype=None, order=None, ensure_1d
         order : {None, 'C', 'F'}, optional
             The order for the output array. Default is None, which will use the default array
             ordering. Other valid options are 'C' for C ordering or 'F' for Fortran ordering.
-        ensure_1d : bool, optional
-            If True (default), will raise an error if the shape of `array` is not a one dimensional
-            array with shape (N,) or a two dimensional array with shape (N, 1) or (1, N).
-        axis : int, optional
-            The axis of the input on which to check its length. Default is -1.
+        ensure_2d : bool, optional
+            If True (default), will raise an error if the shape of `array` is not a two dimensional
+            array with shape (M, N) or a three dimensional array with shape (M, N, 1), (M, 1, N),
+            or (1, M, N).
         reshape_baseline : bool, optional
             If True, will reshape the output baseline back into the shape of the input data. If
             False (default), will not modify the output baseline shape.
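A sketch of the `reshape_baseline` path that `_return_results` now handles, assuming a solver that works on raveled data (hypothetical shapes):

    import numpy as np

    M, N = 4, 5
    flat_baseline = np.arange(M * N, dtype=float)   # output of a flattened solve
    baseline = flat_baseline.reshape((M, N))        # restored to the input shape
    # entries named in reshape_keys receive the same reshape before the
    # inverted sort order is applied to both the baseline and the parameters
    assert baseline.shape == (M, N)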
@@ -182,55 +225,73 @@ def _register(cls, func=None, *, sort_keys=(), dtype=None, order=None, ensure_1d """ if func is None: return partial( - cls._register, dtype=dtype, order=order, ensure_1d=ensure_1d, axis=axis, + cls._register, dtype=dtype, order=order, ensure_2d=ensure_2d, reshape_baseline=reshape_baseline, reshape_keys=reshape_keys ) @wraps(func) def inner(self, data=None, *args, **kwargs): - """ - if self.x is None: - if data is None: - raise TypeError('"data" and "x_data" cannot both be None') - reset_x = False - input_y = True - y, self.x = _yx_arrays( - data, check_finite=self._check_finite, dtype=dtype, order=order, - ensure_1d=False, axis=axis + if data is None: + # not implementing interp_pts for 2D, so data can never + # be None in 2D + raise TypeError('"data" cannot be None') + + reset_x = self.x is not None + reset_z = self.z is not None + if reset_x or reset_z: + if reset_x and reset_z: + expected_shape = self._len + axis = slice(-2, None) + elif reset_x: + expected_shape = self._len[1] + axis = -1 + else: + expected_shape = self._len[0] + axis = -2 + y = _check_sized_array( + data, expected_shape, check_finite=self._check_finite, dtype=dtype, + order=order, ensure_1d=False, axis=axis, name='data' ) - self._len = y.shape[axis] else: - reset_x = True - if data is not None: - input_y = True - y = _check_sized_array( - data, self._len, check_finite=self._check_finite, dtype=dtype, order=order, - ensure_1d=False, axis=axis, name='data' - ) - else: - y = data - input_y = False - # update self.x just to ensure dtype and order are correct + y, self.x, self.z = _yxz_arrays( + data, self.x, self.z, check_finite=self._check_finite, dtype=dtype, + order=order, ensure_2d=ensure_2d + ) + + # update self.x and/or self.z just to ensure dtype and order are correct + if reset_x: x_dtype = self.x.dtype self.x = _check_array( self.x, dtype=dtype, order=order, check_finite=False, ensure_1d=False ) - """ - y = data; input_y = True; reset_x = False; x_dtype = None # TODO remove later + else: + self._len[1] = y.shape[-1] + self.x = np.linspace(-1, 1, self._len[1]) + if reset_z: + z_dtype = self.z.dtype + self.z = _check_array( + self.z, dtype=dtype, order=order, check_finite=False, ensure_1d=False + ) + else: + self._len[0] = y.shape[-2] + self.z = np.linspace(-1, 1, self._len[0]) - if input_y and self._dtype is None: + y = _sort_array2d(y, sort_order=self._sort_order) + if self._dtype is None: output_dtype = y.dtype else: output_dtype = self._dtype - y_shape = y.shape # TODO remove later and move somewhere else baseline, params = func(self, y, *args, **kwargs) - if reshape_baseline: - baseline = baseline.reshape(y_shape) if reset_x: self.x = np.array(self.x, dtype=x_dtype, copy=False) + if reset_z: + self.z = np.array(self.z, dtype=z_dtype, copy=False) - return self._return_results(baseline, params, output_dtype, axis) + return self._return_results( + baseline, params, output_dtype, sort_keys, ensure_2d, + reshape_baseline, reshape_keys + ) return inner @@ -254,6 +315,8 @@ def _override_x(self, new_x, new_sort_order=None): The _Algorithm object with the new x attribute. 
""" + raise NotImplementedError + old_x = self.x old_len = self._len old_x_domain = self.x_domain @@ -352,11 +415,12 @@ def _setup_whittaker(self, y, lam=1, diff_order=2, weights=None, copy_weights=Fa ParameterWarning, stacklevel=2 ) weight_array = _check_optional_array( - self._len, weights, copy_input=copy_weights, check_finite=self._check_finite - ).ravel() - #if self._sort_order is not None and weights is not None: - # weight_array = weight_array[self._sort_order] - + self._len, weights, copy_input=copy_weights, check_finite=self._check_finite, + ensure_1d=False + ) + if self._sort_order is not None and weights is not None: + weight_array = weight_array[self._sort_order] + weight_array = weight_array.ravel() if self.whittaker_system is not None: self.whittaker_system.reset_diagonals(lam, diff_order, allow_lower, reverse_diags) else: @@ -409,14 +473,27 @@ def _setup_polynomial(self, y, weights=None, poly_order=2, calc_vander=False, ValueError Raised if `calc_pinv` is True and `calc_vander` is False. + Notes + ----- + Implementation note: the polynomial coefficients, `c`, from solving 2D polynomials + using ``Ac=b`` where `A` is the flattened Vandermonde and `b` is the flattened data + corresponds to the matrix below: + + np.array([ + [x^0*z^0, x^0*z^1, ..., x^0*z^n], + [x^1*z^0, x^1*z^1, ..., x^1*z^n], + [...], + [x^m*z^0, x^m*z^1, ..., x^m*z^n] + ]).flatten() + """ weight_array = _check_optional_array( - y.shape, weights, copy_input=copy_weights, check_finite=self._check_finite, ensure_1d=False # TODO change y.shape to self._len or self._shape + self._len, weights, copy_input=copy_weights, check_finite=self._check_finite, + ensure_1d=False ) + if self._sort_order is not None and weights is not None: + weight_array = weight_array[self._sort_order] weight_array = weight_array.ravel() - # TODO - #if self._sort_order is not None and weights is not None: - # weight_array = weight_array[self._sort_order] poly_orders = _check_scalar(poly_order, 2, True)[0] if max_cross is not None: max_cross = _check_scalar_variable( @@ -522,12 +599,11 @@ def _setup_spline(self, y, weights=None, spline_degree=3, num_knots=10, """ weight_array = _check_optional_array( - y.shape, weights, copy_input=copy_weights, check_finite=self._check_finite, ensure_1d=False # TODO change y.shape to self._len or self._shape + self._len, weights, copy_input=copy_weights, check_finite=self._check_finite, + ensure_1d=False ) - weight_array = weight_array - # TODO - #if self._sort_order is not None and weights is not None: - # weight_array = weight_array[self._sort_order] + if self._sort_order is not None and weights is not None: + weight_array = weight_array[self._sort_order] diff_order = _check_scalar(diff_order, 2, True)[0] if make_basis: if (diff_order > 4).any(): diff --git a/pybaselines/two_d/api.py b/pybaselines/two_d/api.py index 57d9a19..94e2644 100644 --- a/pybaselines/two_d/api.py +++ b/pybaselines/two_d/api.py @@ -28,7 +28,7 @@ class Baseline2D( The x-values of the measured data. Default is None, which will create an array from -1 to 1 during the first function call with length equal to the input data length. - z_data : array-like, shape (L,), optional + z_data : array-like, shape (M,), optional The z-values of the measured data. Default is None, which will create an array from -1 to 1 during the first function call with length equal to the input data length. @@ -45,21 +45,28 @@ class Baseline2D( poly_order : int The last polynomial order used for a polynomial algorithm. 
Initially is -1, denoting that no polynomial fitting has been performed. - pspline : pybaselines._spline_utils.PSpline or None + pspline : pybaselines.two_d._spline_utils.PSpline2D or None The PSpline object for setting up and solving penalized spline algorithms. Is None if no penalized spline setup has been performed. vandermonde : numpy.ndarray or None The Vandermonde matrix for solving polynomial equations. Is None if no polynomial setup has been performed. - whittaker_system : pybaselines._banded_utils.PenalizedSystem or None + whittaker_system : pybaselines.two_d._banded_utils.PenalizedSystem2D or None The PenalizedSystem object for setting up and solving Whittaker-smoothing-based algorithms. Is None if no Whittaker setup has been performed. x : numpy.ndarray or None The x-values for the object. If initialized with None, then `x` is initialized the - first function call to have the same length as the input `data` and has min and max - values of -1 and 1, respectively. + first function call to have the same size as the input `data.shape[-1]` and has min + and max values of -1 and 1, respectively. x_domain : numpy.ndarray The minimum and maximum values of `x`. If `x_data` is None during initialization, then set to numpy.ndarray([-1, 1]). + z : numpy.ndarray or None + The z-values for the object. If initialized with None, then `z` is initialized the + first function call to have the same size as the input `data.shape[-2]` and has min + and max values of -1 and 1, respectively. + z_domain : numpy.ndarray + The minimum and maximum values of `z`. If `z_data` is None during initialization, then + set to numpy.ndarray([-1, 1]). """ diff --git a/pybaselines/utils.py b/pybaselines/utils.py index c5d738e..7e21b83 100644 --- a/pybaselines/utils.py +++ b/pybaselines/utils.py @@ -616,6 +616,51 @@ def _determine_sorts(data): return output +def _sort_array2d(array, sort_order=None): + """ + Sorts the input 2D array only if given a non-None sorting order. + + Parameters + ---------- + array : numpy.ndarray + The array to sort. Must be two or three dimensional. + sort_order : numpy.ndarray, optional + The array(s) defining the sort order for the input array. Default is None, which + will not sort the input. + + Returns + ------- + output : numpy.ndarray + The input array after optionally sorting. + + Notes + ----- + For all inputs, assumes the last 2 axes correspond to the data that needs sorted. + + Raises + ------ + ValueError + Raised if the input array is not two or three dimensional. 
+ + """ + if sort_order is None: + output = array + else: + n_dims = array.ndim + if n_dims == 2: + output = array[sort_order] + elif n_dims == 3: + if isinstance(sort_order, tuple): + if sort_order[0] is Ellipsis: + output = array[sort_order] + else: + output = array[:, sort_order[0], sort_order[1]] + else: + output = array[:, sort_order, :] + else: + raise ValueError('too many dimensions to sort the data') + + return output def _sort_array(array, sort_order=None): diff --git a/tests/test_utils.py b/tests/test_utils.py index ebea815..6f22b4f 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -553,6 +553,59 @@ def test_sort_array(two_d): assert_allclose(data, utils._sort_array(reversed_data, sort_order), atol=0, rtol=1e-14) +@pytest.mark.parametrize('three_d', (True, False)) +def test_sort_array2d_none(three_d): + """Tests the case where the sorting array is None, which should skip sorting.""" + data = np.linspace(-1, 1, 20).reshape(5, 4) + if three_d: + data = data[None, :] + + assert_allclose(data, utils._sort_array2d(data, sort_order=None), atol=0, rtol=1e-14) + + +@pytest.mark.parametrize('sort_x', (True, False, None)) +@pytest.mark.parametrize('three_d', (True, False)) +def test_sort_array2d(three_d, sort_x): + """ + Ensures sorting for 2d data works. + + Each of the three `sort_x` cases corresponds to how _Algorithm2D will make its _sort_order + attribute if given only x, only z, and both x and z, respectively. + """ + x = np.linspace(-1, 1, 20) + z = np.linspace(-2, 2, 30) + x_sort_order = np.arange(len(x)) + z_sort_order = np.arange(len(z)) + + X, Z = np.meshgrid(x, z) + data = X + 2 * Z + + if sort_x is None: # sort both x and z, so reverse both x and z + x2 = x[::-1] + x_sort_order = x_sort_order[::-1] + z2 = z[::-1] + z_sort_order = z_sort_order[::-1] + sort_order = (z_sort_order[:, None], x_sort_order[None, :]) + elif sort_x: # sort just x, so reverse just x + x2 = x[::-1] + x_sort_order = x_sort_order[::-1] + z2 = z + sort_order = (..., x_sort_order) + else: # sort just z, so reverse just z + x2 = x + z2 = z[::-1] + z_sort_order = z_sort_order[::-1] + sort_order = z_sort_order + + X2, Z2 = np.meshgrid(x2, z2) + reversed_data = X2 + 2 * Z2 + if three_d: + data = np.array([data, data]) + reversed_data = np.array([reversed_data, reversed_data]) + + assert_allclose(data, utils._sort_array2d(reversed_data, sort_order), atol=0, rtol=1e-14) + + @pytest.mark.parametrize('diff_order', (1, 2, 3)) def test_whittaker_smooth(data_fixture, diff_order): """Ensures the Whittaker smoothing function performs correctly.""" From dbd33e032532487420032d57bdb3090d43e78179 Mon Sep 17 00:00:00 2001 From: Donnie Erb <55961724+derb12@users.noreply.github.com> Date: Mon, 8 Jan 2024 20:50:48 -0500 Subject: [PATCH 15/56] TESTS: Finished tests for two_d algorithm_setup Fixed missing sorting output params and incorrect handling of input weights for 2D. 
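The weight-handling fix in this patch relies on the same open-mesh indexing used by `_sort_array2d` above; a sketch (illustrative data, assuming the tuple form of `_sort_order` built by `_Algorithm2D`):

    import numpy as np

    z_order = np.arange(3)[::-1]
    x_order = np.arange(4)[::-1]
    weights = np.arange(12.).reshape(3, 4)
    # the (column, row) index pair broadcasts to reorder both axes at once,
    # exactly like weight_array[self._sort_order] in the setup methods
    sorted_weights = weights[(z_order[:, None], x_order[None, :])]
    assert np.array_equal(sorted_weights, weights[::-1, ::-1])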
--- pybaselines/_validation.py | 21 +- pybaselines/two_d/_algorithm_setup.py | 50 +- pybaselines/two_d/_spline_utils.py | 106 ++-- pybaselines/two_d/_whittaker_utils.py | 68 +-- pybaselines/utils.py | 37 ++ tests/conftest.py | 117 +++++ tests/test_algorithm_setup.py | 30 +- tests/test_utils.py | 2 +- tests/two_d/__init__.py | 2 + tests/two_d/test_algorithm_setup.py | 674 ++++++++++++++++++++++++++ 10 files changed, 987 insertions(+), 120 deletions(-) create mode 100644 tests/two_d/__init__.py create mode 100644 tests/two_d/test_algorithm_setup.py diff --git a/pybaselines/_validation.py b/pybaselines/_validation.py index ff4db54..cc238ed 100644 --- a/pybaselines/_validation.py +++ b/pybaselines/_validation.py @@ -388,7 +388,7 @@ def _check_lam(lam, allow_zero=False): return _check_scalar_variable(lam, allow_zero) -def _check_half_window(half_window, allow_zero=False): +def _check_half_window(half_window, allow_zero=False, two_d=False): """ Ensures the half-window is an integer and has an appropriate value. @@ -414,11 +414,20 @@ def _check_half_window(half_window, allow_zero=False): `half_window`. """ - output_half_window = _check_scalar_variable( - half_window, allow_zero, 'half_window', dtype=np.intp - ) - if output_half_window != half_window: - raise TypeError('half_window must be an integer') + if two_d: + output_half_window = _check_scalar( + half_window, 2, fill_scalar=True, dtype=np.intp + )[0] + for val in output_half_window: + _check_scalar_variable( + val, allow_zero, 'half_window' + ) + else: + output_half_window = _check_scalar_variable( + half_window, allow_zero, 'half_window', dtype=np.intp + ) + if output_half_window != half_window: + raise TypeError('half_window must be an integer') return output_half_window diff --git a/pybaselines/two_d/_algorithm_setup.py b/pybaselines/two_d/_algorithm_setup.py index 493baca..f7e93ce 100644 --- a/pybaselines/two_d/_algorithm_setup.py +++ b/pybaselines/two_d/_algorithm_setup.py @@ -15,7 +15,7 @@ from scipy.ndimage import grey_opening from ..utils import ( - ParameterWarning, _determine_sorts, _inverted_sort, _sort_array2d, pad_edges, + ParameterWarning, _determine_sorts, _inverted_sort, _sort_array2d, pad_edges2d, relative_difference ) from ._spline_utils import PSpline2D @@ -65,7 +65,8 @@ class _Algorithm2D: """ - def __init__(self, x_data=None, z_data=None, check_finite=True, output_dtype=None): + def __init__(self, x_data=None, z_data=None, check_finite=True, assume_sorted=False, + output_dtype=None): """ Initializes the algorithm object. @@ -83,6 +84,10 @@ def __init__(self, x_data=None, z_data=None, check_finite=True, output_dtype=Non If True (default), will raise an error if any values in input data are not finite. Setting to False will skip the check. Note that errors may occur if `check_finite` is False and the input data contains non-finite values. + assume_sorted : bool, optional + If False (default), will sort the input `x_data` and `z_data` values. Otherwise, + the input is assumed to be sorted. Note that some functions may raise an error + if `x_data` and `z_data` are not sorted. output_dtype : type or numpy.dtype, optional The dtype to cast the output array. Default is None, which uses the typing of the input data. 
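A usage sketch of the new `assume_sorted` keyword handled in the following hunks (the class is private, so this is illustrative only):

    import numpy as np
    from pybaselines.two_d._algorithm_setup import _Algorithm2D

    x = np.linspace(-1, 1, 30)    # grids already in ascending order
    z = np.linspace(-2, 2, 40)
    algo = _Algorithm2D(x_data=x, z_data=z, assume_sorted=True)
    assert algo._sort_order is None   # no re-indexing on later method calls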
@@ -98,9 +103,10 @@ def __init__(self, x_data=None, z_data=None, check_finite=True, output_dtype=Non self.x = _check_array(x_data, check_finite=check_finite) self._len[1] = len(self.x) self.x_domain = np.polynomial.polyutils.getdomain(self.x) - x_sort_order, x_inverted_order = _determine_sorts(self.x) - if x_sort_order is not None: - self.x = self.x[x_sort_order] + if not assume_sorted: + x_sort_order, x_inverted_order = _determine_sorts(self.x) + if x_sort_order is not None: + self.x = self.x[x_sort_order] if z_data is None: self.z = None @@ -109,9 +115,10 @@ def __init__(self, x_data=None, z_data=None, check_finite=True, output_dtype=Non self.z = _check_array(z_data, check_finite=check_finite) self._len[0] = len(self.z) self.z_domain = np.polynomial.polyutils.getdomain(self.z) - z_sort_order, z_inverted_order = _determine_sorts(self.z) - if z_sort_order is not None: - self.z = self.z[z_sort_order] + if not assume_sorted: + z_sort_order, z_inverted_order = _determine_sorts(self.z) + if z_sort_order is not None: + self.z = self.z[z_sort_order] if x_sort_order is None and z_sort_order is None: self._sort_order = None @@ -225,7 +232,7 @@ def _register(cls, func=None, *, sort_keys=(), dtype=None, order=None, ensure_2d """ if func is None: return partial( - cls._register, dtype=dtype, order=order, ensure_2d=ensure_2d, + cls._register, sort_keys=sort_keys, dtype=dtype, order=order, ensure_2d=ensure_2d, reshape_baseline=reshape_baseline, reshape_keys=reshape_keys ) @@ -289,8 +296,8 @@ def inner(self, data=None, *args, **kwargs): self.z = np.array(self.z, dtype=z_dtype, copy=False) return self._return_results( - baseline, params, output_dtype, sort_keys, ensure_2d, - reshape_baseline, reshape_keys + baseline, params, dtype=output_dtype, sort_keys=sort_keys, ensure_2d=ensure_2d, + reshape_baseline=reshape_baseline, reshape_keys=reshape_keys ) return inner @@ -404,11 +411,12 @@ def _setup_whittaker(self, y, lam=1, diff_order=2, weights=None, copy_weights=Fa Raised if `diff_order` is greater than 3. 
""" - if diff_order < 1: + diff_order = _check_scalar(diff_order, 2, True)[0] + if (diff_order < 1).any(): raise ValueError( 'the difference order must be > 0 for Whittaker-smoothing-based methods' ) - elif diff_order > 3: + elif (diff_order > 3).any(): warnings.warn( ('difference orders greater than 3 can have numerical issues;' ' consider using a difference order of 2 or 1 instead'), @@ -416,13 +424,13 @@ def _setup_whittaker(self, y, lam=1, diff_order=2, weights=None, copy_weights=Fa ) weight_array = _check_optional_array( self._len, weights, copy_input=copy_weights, check_finite=self._check_finite, - ensure_1d=False + ensure_1d=False, axis=slice(None) ) if self._sort_order is not None and weights is not None: weight_array = weight_array[self._sort_order] weight_array = weight_array.ravel() if self.whittaker_system is not None: - self.whittaker_system.reset_diagonals(lam, diff_order, allow_lower, reverse_diags) + self.whittaker_system.reset_diagonals(lam, diff_order) else: self.whittaker_system = PenalizedSystem2D( self._len, lam, diff_order @@ -489,7 +497,7 @@ def _setup_polynomial(self, y, weights=None, poly_order=2, calc_vander=False, """ weight_array = _check_optional_array( self._len, weights, copy_input=copy_weights, check_finite=self._check_finite, - ensure_1d=False + ensure_1d=False, axis=slice(None) ) if self._sort_order is not None and weights is not None: weight_array = weight_array[self._sort_order] @@ -600,7 +608,7 @@ def _setup_spline(self, y, weights=None, spline_degree=3, num_knots=10, """ weight_array = _check_optional_array( self._len, weights, copy_input=copy_weights, check_finite=self._check_finite, - ensure_1d=False + ensure_1d=False, axis=slice(None) ) if self._sort_order is not None and weights is not None: weight_array = weight_array[self._sort_order] @@ -618,7 +626,7 @@ def _setup_spline(self, y, weights=None, spline_degree=3, num_knots=10, self.x, self.z, num_knots, spline_degree, self._check_finite, lam, diff_order ) else: - self.pspline.reset_penalty_diagonals(lam, diff_order) + self.pspline.reset_penalty(lam, diff_order) return y, weight_array @@ -671,7 +679,7 @@ def _setup_morphology(self, y, half_window=None, **window_kwargs): """ if half_window is not None: - output_half_window = _check_half_window(half_window) + output_half_window = _check_half_window(half_window, two_d=True) else: output_half_window = _optimize_window(y, **window_kwargs) @@ -703,8 +711,8 @@ def _setup_smooth(self, y, half_window=0, allow_zero=True, **pad_kwargs): The padded array of data. """ - hw = _check_half_window(half_window, allow_zero) - return pad_edges(y, hw, **pad_kwargs) + hw = _check_half_window(half_window, allow_zero, two_d=False) + return pad_edges2d(y, hw, **pad_kwargs) def _setup_misc(self, y): """ diff --git a/pybaselines/two_d/_spline_utils.py b/pybaselines/two_d/_spline_utils.py index e192492..ee79cea 100644 --- a/pybaselines/two_d/_spline_utils.py +++ b/pybaselines/two_d/_spline_utils.py @@ -27,23 +27,33 @@ class PSpline2D: Attributes ---------- - basis : scipy.sparse.csr.csr_matrix, shape (N, M) - The spline basis. Has a shape of (`N,` `M`), where `N` is the number of points - in `x`, and `M` is the number of basis functions (equal to ``K - spline_degree - 1`` - or equivalently ``num_knots + spline_degree - 1``). + basis_x : scipy.sparse.csr.csr_matrix, shape (N, P) + The spline basis for x. 
+        in `x`, and `P` is the number of basis functions (equal to ``K - spline_degree[0] - 1``,
+        where `K` is the number of knots for x, or equivalently
+        ``num_knots[0] + spline_degree[0] - 1``).
+    basis_z : scipy.sparse.csr.csr_matrix, shape (M, Q)
+        The spline basis for z. Has a shape of (`M,` `Q`), where `M` is the number of points
+        in `z`, and `Q` is the number of basis functions (equal to ``L - spline_degree[1] - 1``,
+        where `L` is the number of knots for z, or equivalently
+        ``num_knots[1] + spline_degree[1] - 1``).
-    coef : None or numpy.ndarray, shape (M,)
+    coef : None or numpy.ndarray, shape (Q, P)
         The spline coefficients. Is None if :meth:`.solve_pspline` has not been
         called at least once.
-    knots : numpy.ndarray, shape (K,)
+    knots_x : numpy.ndarray, shape (K,)
-        The knots for the spline. Has a shape of `K`, which is equal to
-        ``num_knots + 2 * spline_degree``.
-    num_knots : int
-        The number of internal knots (including the endpoints). The total number of knots
-        for the spline, `K`, is equal to ``num_knots + 2 * spline_degree``.
-    spline_degree : int
-        The degree of the spline (eg. a cubic spline would have a `spline_degree` of 3).
+        The knots for the spline along x. Has a shape of `K`, which is equal to
+        ``num_knots[0] + 2 * spline_degree[0]``.
+    knots_z : numpy.ndarray, shape (L,)
+        The knots for the spline along z. Has a shape of `L`, which is equal to
+        ``num_knots[1] + 2 * spline_degree[1]``.
+    num_knots : numpy.ndarray([int, int])
+        The number of internal knots (including the endpoints) for x and z. The total number of
+        knots for the spline, `K`, is equal to ``num_knots + 2 * spline_degree``.
+    spline_degree : numpy.ndarray([int, int])
+        The degree of the spline (eg. a cubic spline would have a `spline_degree` of 3) for
+        x and z.
     x : numpy.ndarray, shape (N,)
         The x-values for the spline.
+    z : numpy.ndarray, shape (M,)
+        The z-values for the spline.
 
     References
     ----------
@@ -61,7 +71,7 @@ def __init__(self, x, z, num_knots=100, spline_degree=3, check_finite=False, lam
     ---------
     x : array-like, shape (N,)
         The x-values for the spline.
-    z : array-like, shape (L,)
+    z : array-like, shape (M,)
         The z-values for the spline.
     num_knots : int or Sequence(int, int), optional
         The number of internal knots for the spline, including the endpoints.
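The penalty that `reset_penalty` constructs later in this diff combines each direction's difference penalty through Kronecker products acting on the flattened coefficients; a standalone sketch of that structure (illustrative sizes and lam values):

    import numpy as np
    from scipy import sparse

    P, Q = 6, 5                 # number of x and z basis functions
    lam_x, lam_z = 1e2, 1e3
    Dx = sparse.csc_matrix(np.diff(np.eye(P), 2, axis=0))  # second order differences
    Dz = sparse.csc_matrix(np.diff(np.eye(Q), 2, axis=0))
    penalty = (
        lam_x * sparse.kron(Dx.T @ Dx, sparse.identity(Q))
        + lam_z * sparse.kron(sparse.identity(P), Dz.T @ Dz)
    )
    assert penalty.shape == (P * Q, P * Q)  # acts on the raveled coefficient vector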
@@ -87,41 +97,29 @@ def __init__(self, x, z, num_knots=100, spline_degree=3, check_finite=False, lam
         """
         self.x = _check_array(x, dtype=float, check_finite=check_finite, ensure_1d=True)
         self.z = _check_array(z, dtype=float, check_finite=check_finite, ensure_1d=True)
-        self.shape = (len(x), len(z))
         self.num_knots = _check_scalar(num_knots, 2, True)[0]
-        self.diff_order = _check_scalar(diff_order, 2, True)[0]
         self.spline_degree = _check_scalar(spline_degree, 2, True)[0]
-        self.lam = [_check_lam(val) for val in _check_scalar(lam, 2, True)[0]]
 
-        self.knots_1 = _spline_knots(self.x, self.num_knots[0], self.spline_degree[0], True)
-        self.basis_1 = _spline_basis(self.x, self.knots_1, self.spline_degree[0])
+        if (self.spline_degree < 0).any():
+            raise ValueError('spline degree must be greater than or equal to 0')
+
+        self.knots_x = _spline_knots(self.x, self.num_knots[0], self.spline_degree[0], True)
+        self.basis_x = _spline_basis(self.x, self.knots_x, self.spline_degree[0])
 
-        self.knots_2 = _spline_knots(self.z, self.num_knots[1], self.spline_degree[1], True)
-        self.basis_2 = _spline_basis(self.z, self.knots_2, self.spline_degree[1])
-        self._num_bases = np.array([self.basis_1.shape[1], self.basis_2.shape[1]])
+        self.knots_z = _spline_knots(self.z, self.num_knots[1], self.spline_degree[1], True)
+        self.basis_z = _spline_basis(self.z, self.knots_z, self.spline_degree[1])
+        self._num_bases = np.array((self.basis_x.shape[1], self.basis_z.shape[1]))
 
         el = np.ones((self._num_bases[0], 1))
         ek = np.ones((self._num_bases[1], 1))
-        self._G = sparse.kron(self.basis_1, el.T).multiply(sparse.kron(el.T, self.basis_1))
-        self._G2 = sparse.kron(self.basis_2, ek.T).multiply(sparse.kron(ek.T, self.basis_2))
+        self._G = sparse.kron(self.basis_x, el.T).multiply(sparse.kron(el.T, self.basis_x))
+        self._G2 = sparse.kron(self.basis_z, ek.T).multiply(sparse.kron(ek.T, self.basis_z))
         self.coef = None
-
-        D1 = difference_matrix(self._num_bases[0], self.diff_order[0])
-        D2 = difference_matrix(self._num_bases[1], self.diff_order[1])
-
-        P1 = self.lam[0] * sparse.kron(D1.T @ D1, sparse.identity(self._num_bases[1]))
-        P2 = self.lam[1] * sparse.kron(sparse.identity(self._num_bases[0]), D2.T @ D2)
-        self.penalty = P1 + P2
-
-        if (self.diff_order >= self._num_bases).any():
-            raise ValueError((
-                'the difference order must be less than the number of basis '
-                'functions, which is the number of knots + spline degree - 1'
-            ))
-        elif (self.spline_degree < 0).any():
-            raise ValueError('spline degree must be greater than or equal to 0')
+        self.reset_penalty(lam, diff_order)
 
     def same_basis(self, num_knots=100, spline_degree=3):
         """
@@ -141,11 +139,19 @@ def same_basis(self, num_knots=100, spline_degree=3):
             spline basis of the object.
 
         """
-        return False  # TODO will need to check both basis matrices
+        # TODO should give a way to update only one of the basis functions, which
+        # would also need to update the penalty
+        num_knots = _check_scalar(num_knots, 2, True)[0]
+        spline_degree = _check_scalar(spline_degree, 2, True)[0]
+
+        return (
+            np.array_equal(num_knots, self.num_knots)
+            and np.array_equal(spline_degree, self.spline_degree)
+        )
 
-    def reset_penalty_diagonals(self, lam=1, diff_order=2, allow_lower=True, reverse_diags=False):
+    def reset_penalty(self, lam=1, diff_order=2):
         """
-        Resets the penalty diagonals of the system and all of the attributes.
+        Resets the penalty of the system and all of the attributes.
Useful for reusing the penalty diagonals without having to recalculate the spline basis. @@ -174,6 +180,22 @@ def reset_penalty_diagonals(self, lam=1, diff_order=2, allow_lower=True, reverse basis and the penalty to speed up calculations when the two are added. """ + self.diff_order = _check_scalar(diff_order, 2, True)[0] + self.lam = np.array([_check_lam(val) for val in _check_scalar(lam, 2, True)[0]]) + + if (self.diff_order < 1).any(): + raise ValueError('the difference order must be > 0 for penalized splines') + elif (self.diff_order >= self._num_bases).any(): + raise ValueError(( + 'the difference order must be less than the number of basis ' + 'functions, which is the number of knots + spline degree - 1' + )) + D1 = difference_matrix(self._num_bases[0], self.diff_order[0]) + D2 = difference_matrix(self._num_bases[1], self.diff_order[1]) + + P1 = self.lam[0] * sparse.kron(D1.T @ D1, sparse.identity(self._num_bases[1])) + P2 = self.lam[1] * sparse.kron(sparse.identity(self._num_bases[0]), D2.T @ D2) + self.penalty = P1 + P2 def solve_pspline(self, y, weights, penalty=None, rhs_extra=None): """ @@ -225,10 +247,10 @@ def solve_pspline(self, y, weights, penalty=None, rhs_extra=None): self.coef = spsolve( sparse.csr_matrix(F) + self.penalty, - (self.basis_2.T @ (weights * y) @ self.basis_1).flatten(), + (self.basis_z.T @ (weights * y) @ self.basis_x).flatten(), 'NATURAL' ).reshape(self._num_bases[1], self._num_bases[0]) - output = self.basis_2 @ self.coef @ self.basis_1.T + output = self.basis_z @ self.coef @ self.basis_x.T return output diff --git a/pybaselines/two_d/_whittaker_utils.py b/pybaselines/two_d/_whittaker_utils.py index 8aee2cd..005c4e7 100644 --- a/pybaselines/two_d/_whittaker_utils.py +++ b/pybaselines/two_d/_whittaker_utils.py @@ -52,8 +52,7 @@ class PenalizedSystem2D: """ - def __init__(self, data_size, lam=1, diff_order=2, allow_lower=True, - reverse_diags=None, allow_pentapy=True, padding=0): + def __init__(self, data_size, lam=1, diff_order=2): """ Initializes the banded system. @@ -66,35 +65,10 @@ def __init__(self, data_size, lam=1, diff_order=2, allow_lower=True, smoother results. Must be greater than 0. Default is 1. diff_order : int, optional The difference order of the penalty. Default is 2 (second order difference). - allow_lower : bool, optional - If True (default), will allow only using the lower bands of the penalty matrix, - which allows using :func:`scipy.linalg.solveh_banded` instead of the slightly - slower :func:`scipy.linalg.solve_banded`. - reverse_diags : {None, False, True}, optional - If True, will reverse the order of the diagonals of the squared difference - matrix. If False, will never reverse the diagonals. If None (default), will - only reverse the diagonals if using pentapy's solver. - allow_pentapy : bool, optional - If True (default), will allow using pentapy's solver if `diff_order` is 2 - and pentapy is installed. pentapy's solver is faster than scipy's banded solvers. - padding : int, optional - The number of extra layers of zeros to add to the bottom and potentially - the top if the full bands are used. Default is 0, which adds no extra - layers. Negative `padding` is treated as equivalent to 0. 
""" - self.shape = data_size - self.original_diagonals = None - - self.diff_order = _check_scalar(diff_order, 2, True)[0] - self.lam = [_check_lam(val) for val in _check_scalar(lam, 2, True)[0]] - D1 = difference_matrix(self.shape[0], self.diff_order[0]) - D2 = difference_matrix(self.shape[1], self.diff_order[1]) - - P1 = self.lam[0] * kron(D1.T @ D1, identity(self.shape[1])) - P2 = self.lam[1] * kron(identity(self.shape[0]), D2.T @ D2) - - self.penalty = P1 + P2 + self._num_bases = data_size + self.reset_penalty(lam, diff_order) def add_penalty(self, penalty): """ @@ -111,9 +85,9 @@ def add_penalty(self, penalty): The updated `self.penalty`. """ + raise NotImplementedError - def reset_diagonals(self, lam=1, diff_order=2, allow_lower=True, reverse_diags=None, - allow_pentapy=True, padding=0): + def reset_penalty(self, lam=1, diff_order=2): """ Resets the diagonals of the system and all of the attributes. @@ -126,26 +100,22 @@ def reset_diagonals(self, lam=1, diff_order=2, allow_lower=True, reverse_diags=N smoother results. Must be greater than 0. Default is 1. diff_order : int, optional The difference order of the penalty. Default is 2 (second order difference). - allow_lower : bool, optional - If True (default), will allow only using the lower bands of the penalty matrix, - which allows using :func:`scipy.linalg.solveh_banded` instead of the slightly - slower :func:`scipy.linalg.solve_banded`. - reverse_diags : {None, False, True}, optional - If True, will reverse the order of the diagonals of the squared difference - matrix. If False, will never reverse the diagonals. If None (default), will - only reverse the diagonals if using pentapy's solver. - allow_pentapy : bool, optional - If True (default), will allow using pentapy's solver if `diff_order` is 2 - and pentapy is installed. pentapy's solver is faster than scipy's banded solvers. - padding : int, optional - The number of extra layers of zeros to add to the bottom and potentially - the top if the full bands are used. Default is 0, which adds no extra - layers. Negative `padding` is treated as equivalent to 0. """ + self.diff_order = _check_scalar(diff_order, 2, True)[0] + self.lam = [_check_lam(val) for val in _check_scalar(lam, 2, True)[0]] + + if (self.diff_order < 1).any(): + raise ValueError('the difference order must be > 0') + + D1 = difference_matrix(self._num_bases[0], self.diff_order[0]) + D2 = difference_matrix(self._num_bases[1], self.diff_order[1]) - def solve(self, lhs, rhs, overwrite_ab=False, overwrite_b=False, - check_finite=False, l_and_u=None, check_output=False): + P1 = self.lam[0] * kron(D1.T @ D1, identity(self._num_bases[1])) + P2 = self.lam[1] * kron(identity(self._num_bases[0]), D2.T @ D2) + self.penalty = P1 + P2 + + def solve(self, lhs, rhs): """ Solves the equation ``A @ x = rhs``, given `A` in banded format as `lhs`. @@ -196,6 +166,8 @@ def reverse_penalty(self): not make physical sense. """ + raise NotImplementedError + if self.lower: raise ValueError('cannot reverse diagonals when self.lower is True') self.penalty = self.penalty[::-1] diff --git a/pybaselines/utils.py b/pybaselines/utils.py index 7e21b83..640309d 100644 --- a/pybaselines/utils.py +++ b/pybaselines/utils.py @@ -87,6 +87,36 @@ def gaussian(x, height=1.0, center=0.0, sigma=1.0): return height * np.exp(-0.5 * ((x - center)**2) / max(sigma, _MIN_FLOAT)**2) +def gaussian2d(x, z, height=1.0, center_x=0.0, center_z=0.0, sigma_x=1.0, sigma_z=1.0): + """ + Generates a Gaussian distribution based on height, center, and sigma. 
+
+    Parameters
+    ----------
+    x : numpy.ndarray, shape (M, N)
+        The x-values at which to evaluate the distribution.
+    z : numpy.ndarray, shape (M, N)
+        The z-values at which to evaluate the distribution.
+    height : float, optional
+        The maximum height of the distribution. Default is 1.0.
+    center_x : float, optional
+        The center of the distribution in the x-axis. Default is 0.0.
+    center_z : float, optional
+        The center of the distribution in the z-axis. Default is 0.0.
+    sigma_x : float, optional
+        The standard deviation of the distribution in the x-axis. Default is 1.0.
+    sigma_z : float, optional
+        The standard deviation of the distribution in the z-axis. Default is 1.0.
+
+    Returns
+    -------
+    numpy.ndarray
+        The Gaussian distribution evaluated with x and z.
+
+    """
+    return height * gaussian(x, 1, center_x, sigma_x) * gaussian(z, 1, center_z, sigma_z)
+
+
 def gaussian_kernel(window_size, sigma=1.0):
     """
     Creates an area-normalized gaussian kernel for convolution.
@@ -278,6 +308,13 @@ def pad_edges(data, pad_length, mode='extrapolate',
     return padded_data
 
 
+def pad_edges2d(data, pad_length, *args, **kwargs):
+    """Pads the edges of 2D data; separate pad lengths for each axis are not yet supported."""
+    if not _check_scalar(pad_length, None)[1]:
+        raise NotImplementedError('separate pad lengths not yet supported')
+    else:
+        return pad_edges(data, pad_length, *args, **kwargs)
+
+
 def padded_convolve(data, kernel, mode='reflect', **pad_kwargs):
     """
     Pads data before convolving to reduce edge effects.
diff --git a/tests/conftest.py b/tests/conftest.py
index 210e912..89924fd 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -54,6 +54,42 @@ def gaussian(x, height=1.0, center=0.0, sigma=1.0):
     return height * np.exp(-0.5 * ((x - center)**2) / sigma**2)
 
 
+def gaussian2d(x, z, height=1.0, center_x=0.0, center_z=0.0, sigma_x=1.0, sigma_z=1.0):
+    """
+    Generates a Gaussian distribution based on height, center, and sigma.
+
+    Parameters
+    ----------
+    x : numpy.ndarray, shape (M, N)
+        The x-values at which to evaluate the distribution.
+    z : numpy.ndarray, shape (M, N)
+        The z-values at which to evaluate the distribution.
+    height : float, optional
+        The maximum height of the distribution. Default is 1.0.
+    center_x : float, optional
+        The center of the distribution in the x-axis. Default is 0.0.
+    center_z : float, optional
+        The center of the distribution in the z-axis. Default is 0.0.
+    sigma_x : float, optional
+        The standard deviation of the distribution in the x-axis. Default is 1.0.
+    sigma_z : float, optional
+        The standard deviation of the distribution in the z-axis. Default is 1.0.
+
+    Returns
+    -------
+    numpy.ndarray
+        The Gaussian distribution evaluated with x and z.
+
+    Notes
+    -----
+    This is the same code as in pybaselines.utils.gaussian2d, but
+    this removes the dependence on pybaselines so that if an error
+    with pybaselines occurs, this will be unaffected.
+
+    """
+    return height * gaussian(x, 1, center_x, sigma_x) * gaussian(z, 1, center_z, sigma_z)
+
+
 def get_data(include_noise=True, num_points=1000):
     """Creates x- and y-data for testing.
 
@@ -87,18 +123,99 @@ def get_data(include_noise=True, num_points=1000):
     return x_data, y_data
 
 
+def get_data2d(include_noise=True, num_points=(50, 60)):
+    """Creates x-, z-, and y-data for testing.
+
+    Parameters
+    ----------
+    include_noise : bool, optional
+        If True (default), will include noise with the y-data.
+    num_points : Container(int, int), optional
+        The number of data points to use for x and z, respectively. Default
+        is (50, 60), which uses different numbers so that any issues caused
+        by not having a square matrix will be seen.
+
+    Returns
+    -------
+    x_data : numpy.ndarray
+        The x-values.
+    z_data : numpy.ndarray
+        The z-values.
+    y_data : numpy.ndarray
+        The y-values.
+
+    """
+    # TODO use np.random.default_rng(0) once minimum numpy version is >= 1.17
+    np.random.seed(0)
+    x_num_points, z_num_points = num_points
+    x_data = np.linspace(1, 100, x_num_points)
+    z_data = np.linspace(1, 100, z_num_points)
+    X, Z = np.meshgrid(x_data, z_data)
+    y_data = (
+        500  # constant baseline
+        + gaussian2d(X, Z, 10, 25, 25)
+        + gaussian2d(X, Z, 20, 50, 50)
+        + gaussian2d(X, Z, 10, 75, 75)
+    )
+    if include_noise:
+        y_data += np.random.normal(0, 0.5, y_data.shape)
+
+    return x_data, z_data, y_data
+
+
+def get_2dspline_inputs(num_knots=5, spline_degree=3, lam=1, diff_order=2):
+    """Helper function to handle array-like values for simple cases in testing."""
+    if isinstance(num_knots, int):
+        num_knots_x = num_knots
+        num_knots_z = num_knots
+    else:
+        num_knots_x, num_knots_z = num_knots
+    if isinstance(spline_degree, int):
+        spline_degree_x = spline_degree
+        spline_degree_z = spline_degree
+    else:
+        spline_degree_x, spline_degree_z = spline_degree
+    if isinstance(lam, (int, float)):
+        lam_x = lam
+        lam_z = lam
+    else:
+        lam_x, lam_z = lam
+    if isinstance(diff_order, int):
+        diff_order_x = diff_order
+        diff_order_z = diff_order
+    else:
+        diff_order_x, diff_order_z = diff_order
+
+    return (
+        num_knots_x, num_knots_z, spline_degree_x, spline_degree_z,
+        lam_x, lam_z, diff_order_x, diff_order_z
+    )
+
+
 @pytest.fixture
 def small_data():
     """A small array of data for testing."""
     return np.arange(10, dtype=float)
 
 
+@pytest.fixture
+def small_data2d():
+    """A small 2D array of data for testing."""
+    return np.arange(50, dtype=float).reshape(5, 10)
+
+
 @pytest.fixture()
 def data_fixture():
     """Test fixture for creating x- and y-data for testing."""
     return get_data()
 
 
+@pytest.fixture()
+def data_fixture2d():
+    """Test fixture for creating x-, z-, and y-data for testing."""
+    return get_data2d()
+
+
 @pytest.fixture()
 def no_noise_data_fixture():
     """Test fixture that creates x- and y-data without noise for testing."""
diff --git a/tests/test_algorithm_setup.py b/tests/test_algorithm_setup.py
index 87065df..e752622 100644
--- a/tests/test_algorithm_setup.py
+++ b/tests/test_algorithm_setup.py
@@ -349,6 +349,32 @@ def test_setup_spline_negative_lam_fails(small_data):
     )
 
 
+@pytest.mark.parametrize('weight_enum', (0, 1, 2, 3))
+def test_setup_spline_weights(small_data, algorithm, weight_enum):
+    """Ensures output weight array is correct."""
+    if weight_enum == 0:
+        # no weights specified
+        weights = None
+        desired_weights = np.ones_like(small_data)
+    elif weight_enum == 1:
+        # uniform 1 weighting
+        weights = np.ones_like(small_data)
+        desired_weights = weights.copy()
+    elif weight_enum == 2:
+        # different weights for all points
+        weights = np.arange(small_data.shape[0])
+        desired_weights = np.arange(small_data.shape[0])
+    elif weight_enum == 3:
+        # different weights for all points, and weights input as a list
+        weights = np.arange(small_data.shape[0]).tolist()
+        desired_weights = np.arange(small_data.shape[0])
+
+    _, weight_array = algorithm._setup_spline(small_data, lam=1, diff_order=2, weights=weights)
+
+    assert isinstance(weight_array, np.ndarray)
+    assert_array_equal(weight_array, desired_weights)
+
+
 def test_setup_spline_array_lam(small_data):
     """Ensures a lam that is a single array passes while larger arrays fail."""
     _algorithm_setup._Algorithm(np.arange(len(small_data)))._setup_spline(small_data, lam=[1])
@@ -468,8 +494,7 @@ def 
test_algorithm_class_init(input_x, check_finite, assume_sorted, output_dtype
     if not assume_sorted and change_order and input_x:
         order = np.arange(len(x))
-        if change_order:
-            order[sort_order] = order[sort_order][::-1]
+        order[sort_order] = order[sort_order][::-1]
 
         assert_array_equal(algorithm._sort_order, order)
         assert_array_equal(algorithm._inverted_order, order.argsort())
     else:
@@ -548,6 +573,7 @@ class SubClass(_algorithm_setup._Algorithm):
         # 'a' values will be sorted and 'b' values will be kept the same
         @_algorithm_setup._Algorithm._register(sort_keys=('a',))
         def func(self, data, *args, **kwargs):
+            """For checking sorting of output parameters."""
             expected_x = np.arange(20)
             if change_order and assume_sorted:
                 expected_x[sort_indices] = expected_x[sort_indices][::-1]
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 6f22b4f..21bbba6 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -410,7 +410,7 @@ def test_pad_edges_extrapolate_windows():
     assert_allclose(output[-pad_len:], np.full(pad_len, 1.), 1e-14)
 
 
-@pytest.mark.parametrize('extrapolate_window', (0, (0, 0), (5, 0), (5, -1)))
+@pytest.mark.parametrize('extrapolate_window', (0, -2, (0, 0), (5, 0), (5, -1)))
 def test_pad_edges_extrapolate_zero_window(extrapolate_window):
     """Ensures an extrapolate_window <= 0 raises an exception."""
     with pytest.raises(ValueError):
diff --git a/tests/two_d/__init__.py b/tests/two_d/__init__.py
new file mode 100644
index 0000000..0c8cac4
--- /dev/null
+++ b/tests/two_d/__init__.py
@@ -0,0 +1,2 @@
+# -*- coding: utf-8 -*-
+"""Tests for pybaselines.two_d."""
diff --git a/tests/two_d/test_algorithm_setup.py b/tests/two_d/test_algorithm_setup.py
new file mode 100644
index 0000000..84a722c
--- /dev/null
+++ b/tests/two_d/test_algorithm_setup.py
@@ -0,0 +1,674 @@
+# -*- coding: utf-8 -*-
+"""Tests for pybaselines.two_d._algorithm_setup.
+
+@author: Donald Erb
+Created on January 5, 2024
+
+"""
+
+import numpy as np
+from numpy.testing import assert_allclose, assert_array_equal
+import pytest
+from scipy.sparse import identity, kron
+
+from pybaselines.two_d import _algorithm_setup
+from pybaselines.utils import ParameterWarning, difference_matrix
+
+from ..conftest import get_data2d, get_2dspline_inputs
+
+
+@pytest.fixture
+def algorithm(small_data2d):
+    """
+    An _Algorithm2D class with x-data set to np.arange(10) and z-data set to np.arange(5).
+
+    Returns
+    -------
+    pybaselines.two_d._algorithm_setup._Algorithm2D
+        An _Algorithm2D class for testing.
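+
+    Notes
+    -----
+    The x-data length (10) and z-data length (5) come from the shape of the
+    ``small_data2d`` fixture, ``np.arange(50, dtype=float).reshape(5, 10)``.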
+ """ + num_z, num_x = small_data2d.shape + return _algorithm_setup._Algorithm2D( + x_data=np.arange(num_x), z_data=np.arange(num_z), assume_sorted=True, check_finite=False + ) + + +@pytest.mark.parametrize('diff_order', (1, 2, 3, (2, 3))) +@pytest.mark.parametrize('lam', (1, 20, (2, 5))) +def test_setup_whittaker_diff_matrix(data_fixture2d, lam, diff_order): + """Ensures output difference matrix diagonal data is in desired format.""" + x, z, y = data_fixture2d + + algorithm = _algorithm_setup._Algorithm2D(x, z) + assert algorithm.whittaker_system is None + + _ = algorithm._setup_whittaker(y, lam=lam, diff_order=diff_order) + + *_, lam_x, lam_z, diff_order_x, diff_order_z = get_2dspline_inputs( + lam=lam, diff_order=diff_order + ) + + D1 = difference_matrix(len(z), diff_order_x) + D2 = difference_matrix(len(x), diff_order_z) + + P1 = lam_x * kron(D1.T @ D1, identity(len(x))) + P2 = lam_z * kron(identity(len(z)), D2.T @ D2) + expected_penalty = P1 + P2 + + assert_allclose( + algorithm.whittaker_system.penalty.toarray(), + expected_penalty.toarray(), + rtol=1e-12, atol=1e-12 + ) + + +@pytest.mark.parametrize('weight_enum', (0, 1, 2, 3)) +def test_setup_whittaker_weights(small_data2d, algorithm, weight_enum): + """Ensures output weight array is correct.""" + if weight_enum == 0: + # no weights specified + weights = None + desired_weights = np.ones(small_data2d.size) + elif weight_enum == 1: + # uniform 1 weighting + weights = np.ones_like(small_data2d) + desired_weights = np.ones(small_data2d.size) + elif weight_enum == 2: + # different weights for all points + weights = np.arange(small_data2d.size).reshape(small_data2d.shape) + desired_weights = np.arange(small_data2d.size) + elif weight_enum == 3: + # different weights for all points, and weights input as a list + weights = np.arange(small_data2d.size).reshape(small_data2d.shape).tolist() + desired_weights = np.arange(small_data2d.size) + + _, weight_array = algorithm._setup_whittaker( + small_data2d, lam=1, diff_order=2, weights=weights + ) + + assert isinstance(weight_array, np.ndarray) + assert_array_equal(weight_array, desired_weights) + + +def test_setup_whittaker_wrong_weight_shape(small_data2d, algorithm): + """Ensures that an exception is raised if input weights and data are different shapes.""" + weights = np.ones(np.array(small_data2d.shape) + 1) + with pytest.raises(ValueError): + algorithm._setup_whittaker(small_data2d, lam=1, diff_order=2, weights=weights) + + +@pytest.mark.parametrize('diff_order', (0, -1)) +def test_setup_whittaker_diff_matrix_fails(small_data2d, algorithm, diff_order): + """Ensures using a diff_order < 1 with _setup_whittaker raises an exception.""" + with pytest.raises(ValueError): + algorithm._setup_whittaker(small_data2d, lam=1, diff_order=diff_order) + + +@pytest.mark.parametrize('diff_order', (4, 5)) +def test_setup_whittaker_diff_matrix_warns(small_data2d, algorithm, diff_order): + """Ensures using a diff_order > 3 with _setup_whittaker raises a warning.""" + with pytest.warns(ParameterWarning): + algorithm._setup_whittaker(small_data2d, lam=1, diff_order=diff_order) + + +def test_setup_whittaker_negative_lam_fails(small_data2d, algorithm): + """Ensures a negative lam value fails.""" + with pytest.raises(ValueError): + algorithm._setup_whittaker(small_data2d, lam=-1) + + +def test_setup_whittaker_array_lam(small_data2d): + """Ensures a lam that is a single array of one or two values passes while larger arrays fail.""" + num_z, num_x = small_data2d.shape + _algorithm_setup._Algorithm2D(np.arange(num_x), 
np.arange(num_z))._setup_whittaker( + small_data2d, lam=[1] + ) + _algorithm_setup._Algorithm2D(np.arange(num_x), np.arange(num_z))._setup_whittaker( + small_data2d, lam=[1, 2] + ) + with pytest.raises(ValueError): + _algorithm_setup._Algorithm2D(np.arange(num_x), np.arange(num_z))._setup_whittaker( + small_data2d, lam=[1, 2, 3] + ) + + +@pytest.mark.parametrize('weight_enum', (0, 1, 2, 3)) +def test_setup_polynomial_weights(small_data2d, algorithm, weight_enum): + """Ensures output weight array is correctly handled.""" + if weight_enum == 0: + # no weights specified + weights = None + desired_weights = np.ones(small_data2d.size) + elif weight_enum == 1: + # uniform 1 weighting + weights = np.ones_like(small_data2d) + desired_weights = np.ones(small_data2d.size) + elif weight_enum == 2: + # different weights for all points + weights = np.arange(small_data2d.size).reshape(small_data2d.shape) + desired_weights = np.arange(small_data2d.size) + elif weight_enum == 3: + # different weights for all points, and weights input as a list + weights = np.arange(small_data2d.size).reshape(small_data2d.shape).tolist() + desired_weights = np.arange(small_data2d.size) + + _, weight_array = algorithm._setup_polynomial(small_data2d, weights=weights) + + assert isinstance(weight_array, np.ndarray) + assert_array_equal(weight_array, desired_weights) + + +def test_setup_polynomial_wrong_weight_shape(small_data2d, algorithm): + """Ensures that an exception is raised if input weights and data are different shapes.""" + weights = np.ones(np.array(small_data2d.shape) + 1) + with pytest.raises(ValueError): + algorithm._setup_polynomial(small_data2d, weights=weights) + + +@pytest.mark.parametrize('poly_order', (2, 4, (2, 4))) +@pytest.mark.parametrize('vander_enum', (0, 1, 2, 3)) +@pytest.mark.parametrize('include_pinv', (True, False)) +def test_setup_polynomial_vandermonde(small_data2d, algorithm, vander_enum, include_pinv, + poly_order): + """Ensures that the Vandermonde matrix and the pseudo-inverse matrix are correct.""" + if vander_enum == 0: + # no weights specified + weights = None + elif vander_enum == 1: + # uniform 1 weighting + weights = np.ones_like(small_data2d) + elif vander_enum == 2: + # different weights for all points + weights = np.arange(small_data2d.size).reshape(small_data2d.shape) + elif vander_enum == 3: + # different weights for all points, and weights input as a list + weights = np.arange(small_data2d.size).reshape(small_data2d.shape).tolist() + + output = algorithm._setup_polynomial( + small_data2d, weights=weights, poly_order=poly_order, calc_vander=True, + calc_pinv=include_pinv + ) + if include_pinv: + _, weight_array, pinv_matrix = output + else: + _, weight_array = output + + if isinstance(poly_order, int): + x_order = poly_order + z_order = poly_order + else: + x_order, z_order = poly_order + + mapped_x = np.polynomial.polyutils.mapdomain(algorithm.x, algorithm.x_domain, [-1, 1]) + mapped_z = np.polynomial.polyutils.mapdomain(algorithm.z, algorithm.z_domain, [-1, 1]) + desired_vander = np.polynomial.polynomial.polyvander2d( + *np.meshgrid(mapped_x, mapped_z), (x_order, z_order) + ).reshape((-1, (x_order + 1) * (z_order + 1))) + assert_allclose(desired_vander, algorithm.vandermonde, 1e-12) + + if include_pinv: + desired_pinv = np.linalg.pinv(np.sqrt(weight_array)[:, np.newaxis] * desired_vander) + assert_allclose(desired_pinv, pinv_matrix, 1e-10) + + +def test_setup_smooth_shape(small_data2d, algorithm): + """Ensures output y is correctly padded.""" + pad_length = 4 + y = 
algorithm._setup_smooth(small_data2d, pad_length, mode='edge') + assert_array_equal( + y.shape, (small_data2d.shape[0] + 2 * pad_length, small_data2d.shape[1] + 2 * pad_length) + ) + + +@pytest.mark.parametrize('num_knots', (10, 30, (20, 30))) +@pytest.mark.parametrize('spline_degree', (1, 2, 3, 4, (2, 3))) +def test_setup_spline_spline_basis(data_fixture2d, num_knots, spline_degree): + """Ensures the spline basis function is correctly created.""" + x, z, y = data_fixture2d + fitter = _algorithm_setup._Algorithm2D(x, z) + assert fitter.pspline is None + + _ = fitter._setup_spline( + y, weights=None, spline_degree=spline_degree, num_knots=num_knots + ) + + if isinstance(num_knots, int): + num_knots_x = num_knots + num_knots_z = num_knots + else: + num_knots_x, num_knots_z = num_knots + if isinstance(spline_degree, int): + spline_degree_x = spline_degree + spline_degree_z = spline_degree + else: + spline_degree_x, spline_degree_z = spline_degree + + assert_array_equal( + fitter.pspline.basis_x.shape, + (len(x), num_knots_x + spline_degree_x - 1) + ) + assert_array_equal( + fitter.pspline.basis_z.shape, + (len(z), num_knots_z + spline_degree_z - 1) + ) + + +@pytest.mark.parametrize('lam', (1, 20, (3, 10))) +@pytest.mark.parametrize('diff_order', (1, 2, 3, 4, (2, 3))) +@pytest.mark.parametrize('spline_degree', (1, 2, 3, 4, (2, 3))) +@pytest.mark.parametrize('num_knots', (20, 51, (20, 30))) +def test_setup_spline_diff_matrix(data_fixture2d, lam, diff_order, spline_degree, num_knots): + """Ensures output difference matrix diagonal data is in desired format.""" + x, z, y = data_fixture2d + + algorithm = _algorithm_setup._Algorithm2D(x, z) + _ = algorithm._setup_spline( + y, weights=None, spline_degree=spline_degree, num_knots=num_knots, + diff_order=diff_order, lam=lam + ) + + ( + num_knots_x, num_knots_z, spline_degree_x, spline_degree_z, + lam_x, lam_z, diff_order_x, diff_order_z + ) = get_2dspline_inputs( + num_knots=num_knots, spline_degree=spline_degree, lam=lam, diff_order=diff_order + ) + + num_bases_x = num_knots_x + spline_degree_x - 1 + num_bases_z = num_knots_z + spline_degree_z - 1 + + D1 = difference_matrix(num_bases_x, diff_order_x) + D2 = difference_matrix(num_bases_z, diff_order_z) + + P1 = lam_x * kron(D1.T @ D1, identity(num_bases_z)) + P2 = lam_z * kron(identity(num_bases_x), D2.T @ D2) + expected_penalty = P1 + P2 + + assert_allclose( + algorithm.pspline.penalty.toarray(), + expected_penalty.toarray(), + rtol=1e-12, atol=1e-12 + ) + + +@pytest.mark.filterwarnings('ignore::UserWarning') +@pytest.mark.parametrize('spline_degree', (1, 2, 3, 4)) +@pytest.mark.parametrize('num_knots', (5, 50, 100)) +def test_setup_spline_too_high_diff_order(small_data2d, spline_degree, num_knots): + """ + Ensures an exception is raised when the difference order is >= number of basis functions. + + The number of basis functions is equal to the number of knots + the spline degree - 1. + Tests both difference order equal to and greater than the number of basis functions. 
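+    For example, ``num_knots=5`` with ``spline_degree=3`` gives 5 + 3 - 1 = 7 basis
+    functions, so ``diff_order`` values of 7 and 8 should both raise an error.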
+ + """ + num_z, num_x = small_data2d.shape + diff_order = num_knots + spline_degree - 1 + with pytest.raises(ValueError): + _algorithm_setup._Algorithm2D(np.arange(num_x), np.arange(num_z))._setup_spline( + small_data2d, weights=None, spline_degree=spline_degree, num_knots=num_knots, + penalized=True, diff_order=diff_order + ) + + diff_order += 1 + with pytest.raises(ValueError): + _algorithm_setup._Algorithm2D(np.arange(num_x), np.arange(num_z))._setup_spline( + small_data2d, weights=None, spline_degree=spline_degree, num_knots=num_knots, + penalized=True, diff_order=diff_order + ) + + +@pytest.mark.parametrize('num_knots', (0, 1)) +def test_setup_spline_too_few_knots(small_data2d, num_knots): + """Ensures an error is raised if the number of knots is less than 2.""" + num_z, num_x = small_data2d.shape + with pytest.raises(ValueError): + _algorithm_setup._Algorithm2D(np.arange(num_x), np.arange(num_z))._setup_spline( + small_data2d, weights=None, spline_degree=3, num_knots=num_knots, + penalized=True, diff_order=1 + ) + + +def test_setup_spline_wrong_weight_shape(small_data2d): + """Ensures that an exception is raised if input weights and data are different shapes.""" + weights = np.ones(np.array(small_data2d.shape) + 1) + num_z, num_x = small_data2d.shape + with pytest.raises(ValueError): + _algorithm_setup._Algorithm2D(np.arange(num_x), np.arange(num_z))._setup_spline( + small_data2d, weights=weights + ) + + +@pytest.mark.parametrize('diff_order', (0, -1)) +def test_setup_spline_diff_matrix_fails(small_data2d, diff_order): + """Ensures using a diff_order < 1 with _setup_spline raises an exception.""" + num_z, num_x = small_data2d.shape + with pytest.raises(ValueError): + _algorithm_setup._Algorithm2D(np.arange(num_x), np.arange(num_z))._setup_spline( + small_data2d, diff_order=diff_order + ) + + +@pytest.mark.parametrize('diff_order', (5, 6)) +def test_setup_spline_diff_matrix_warns(small_data2d, diff_order): + """Ensures using a diff_order > 4 with _setup_spline raises a warning.""" + num_z, num_x = small_data2d.shape + with pytest.warns(ParameterWarning): + _algorithm_setup._Algorithm2D(np.arange(num_x), np.arange(num_z))._setup_spline( + small_data2d, diff_order=diff_order + ) + + +def test_setup_spline_negative_lam_fails(small_data2d): + """Ensures a negative lam value fails.""" + num_z, num_x = small_data2d.shape + with pytest.raises(ValueError): + _algorithm_setup._Algorithm2D(np.arange(num_x), np.arange(num_z))._setup_spline( + small_data2d, lam=-1 + ) + + +def test_setup_spline_array_lam(small_data2d): + """Ensures a lam that is a single array of one or two values passes while larger arrays fail.""" + num_z, num_x = small_data2d.shape + _algorithm_setup._Algorithm2D( + np.arange(num_x), np.arange(num_z) + )._setup_spline(small_data2d, lam=[1]) + _algorithm_setup._Algorithm2D( + np.arange(num_x), np.arange(num_z) + )._setup_spline(small_data2d, lam=[1, 2]) + with pytest.raises(ValueError): + _algorithm_setup._Algorithm2D(np.arange(num_x), np.arange(num_z))._setup_spline( + small_data2d, lam=[1, 2, 3] + ) + + +@pytest.mark.parametrize('weight_enum', (0, 1, 2, 3)) +def test_setup_spline_weights(small_data2d, algorithm, weight_enum): + """Ensures output weight array is correct.""" + if weight_enum == 0: + # no weights specified + weights = None + desired_weights = np.ones_like(small_data2d) + elif weight_enum == 1: + # uniform 1 weighting + weights = np.ones_like(small_data2d) + desired_weights = np.ones_like(small_data2d) + elif weight_enum == 2: + # different weights for all 
points + weights = np.arange(small_data2d.size).reshape(small_data2d.shape) + desired_weights = np.arange(small_data2d.size).reshape(small_data2d.shape) + elif weight_enum == 3: + # different weights for all points, and weights input as a list + weights = np.arange(small_data2d.size).reshape(small_data2d.shape).tolist() + desired_weights = np.arange(small_data2d.size).reshape(small_data2d.shape) + + _, weight_array = algorithm._setup_spline( + small_data2d, lam=1, diff_order=2, weights=weights + ) + + assert isinstance(weight_array, np.ndarray) + assert_array_equal(weight_array, desired_weights) + + +@pytest.mark.parametrize('input_x', (True, False)) +@pytest.mark.parametrize('input_z', (True, False)) +@pytest.mark.parametrize('check_finite', (True, False)) +@pytest.mark.parametrize('assume_sorted', (True, False)) +@pytest.mark.parametrize('output_dtype', (None, int, float, np.float64)) +@pytest.mark.parametrize('change_order', (True, False)) +def test_algorithm_class_init(input_x, input_z, check_finite, assume_sorted, output_dtype, + change_order): + """Tests the initialization of _Algorithm2D objects.""" + sort_order = slice(0, 10) + expected_x = None + expected_z = None + x = None + z = None + if input_x or input_z: + x_, z_, _ = get_data2d() + if input_x: + x = x_ + if input_z: + z = z_ + + if input_x: + expected_x = x.copy() + if change_order: + x[sort_order] = x[sort_order][::-1] + if assume_sorted: + expected_x[sort_order] = expected_x[sort_order][::-1] + if input_z: + expected_z = z.copy() + if change_order: + z[sort_order] = z[sort_order][::-1] + if assume_sorted: + expected_z[sort_order] = expected_z[sort_order][::-1] + + algorithm = _algorithm_setup._Algorithm2D( + x, z, check_finite=check_finite, assume_sorted=assume_sorted, output_dtype=output_dtype + ) + assert_array_equal(algorithm.x, expected_x) + assert_array_equal(algorithm.z, expected_z) + assert algorithm._check_finite == check_finite + assert algorithm._dtype == output_dtype + + expected_shape = [None, None] + if input_x: + expected_shape[1] = len(x) + if input_z: + expected_shape[0] = len(z) + assert algorithm._len == expected_shape + + if not assume_sorted and change_order and (input_x or input_z): + if input_x and input_z: + x_order = np.arange(len(x)) + z_order = np.arange(len(z)) + for order in (x_order, z_order): + order[sort_order] = order[sort_order][::-1] + + for actual, expected in zip( + algorithm._sort_order, (z_order[:, None], x_order[None, :]) + ): + assert_array_equal(actual, expected) + for actual, expected in zip( + algorithm._inverted_order, (z_order.argsort()[:, None], x_order.argsort()[None, :]) + ): + assert_array_equal(actual, expected) + elif input_x: + order = np.arange(len(x)) + order[sort_order] = order[sort_order][::-1] + assert_array_equal(algorithm._sort_order[1], order) + assert_array_equal(algorithm._inverted_order[1], order.argsort()) + assert algorithm._sort_order[0] is Ellipsis + assert algorithm._inverted_order[0] is Ellipsis + else: + order = np.arange(len(z)) + order[sort_order] = order[sort_order][::-1] + assert_array_equal(algorithm._sort_order, order) + assert_array_equal(algorithm._inverted_order, order.argsort()) + else: + assert algorithm._sort_order is None + assert algorithm._inverted_order is None + + # ensure attributes are correctly initialized + assert algorithm.poly_order == -1 + assert algorithm.pspline is None + assert algorithm.whittaker_system is None + assert algorithm.vandermonde is None + + +@pytest.mark.parametrize('assume_sorted', (True, False)) 
+@pytest.mark.parametrize('output_dtype', (None, int, float, np.float64)) +@pytest.mark.parametrize('change_order', (True, False)) +@pytest.mark.parametrize('reshape_baseline', (True, False)) +@pytest.mark.parametrize('three_d', (True, False)) +def test_algorithm_return_results(assume_sorted, output_dtype, change_order, reshape_baseline, + three_d): + """Ensures the _return_results method returns the correctly sorted outputs.""" + x, z, y = get_data2d() + baseline = np.arange(y.size).reshape(y.shape) + # 'a' values will be sorted, 'b' values will be kept the same, 'c' will be reshaped, + # and 'd' will be reshaped and then sorted + params = { + 'a': np.arange(y.size).reshape(y.shape), + 'b': np.arange(len(x)), + 'c': np.arange(y.size), + 'd': np.arange(y.size), + } + if change_order: + x = x[::-1] + z = z[::-1] + y = y[::-1, ::-1] + + expected_params = { + 'a': np.arange(y.size).reshape(y.shape), + 'b': np.arange(len(x)), + 'c': np.arange(y.size).reshape(y.shape), + 'd': np.arange(y.size).reshape(y.shape), + } + if three_d: + baseline = np.array([baseline, baseline]) + expected_baseline = baseline.copy() + if reshape_baseline: + baseline = baseline.reshape(baseline.shape[0], -1) + + if change_order and not assume_sorted: + expected_baseline = expected_baseline[..., ::-1, ::-1] + expected_params['a'] = expected_params['a'][::-1, ::-1] + expected_params['d'] = expected_params['d'][::-1, ::-1] + + algorithm = _algorithm_setup._Algorithm2D( + x, z, assume_sorted=assume_sorted, output_dtype=output_dtype, check_finite=False + ) + output, output_params = algorithm._return_results( + baseline, params, dtype=output_dtype, sort_keys=('a', 'd'), + reshape_baseline=reshape_baseline, reshape_keys=('c', 'd'), + ensure_2d=not three_d + ) + + assert_allclose(output, expected_baseline, 1e-16, 1e-16) + assert output.dtype == output_dtype + for key, value in expected_params.items(): + assert_array_equal(value, output_params[key]) + + +@pytest.mark.parametrize('assume_sorted', (True, False)) +@pytest.mark.parametrize('output_dtype', (None, int, float, np.float64)) +@pytest.mark.parametrize('change_order', (True, False)) +@pytest.mark.parametrize('list_input', (True, False)) +def test_algorithm_register(assume_sorted, output_dtype, change_order, list_input): + """ + Ensures the _register wrapper method returns the correctly sorted and shaped outputs. + + The input y-values within the wrapped function should be correctly sorted + if `assume_sorted` is False, while the output baseline should always match + the ordering of the input y-values. The output params should have an inverted + sort order to also match the ordering of the input y-values if `assume_sorted` + is False. 
+
+    """
+    x, z, y = get_data2d()
+
+    class SubClass(_algorithm_setup._Algorithm2D):
+        # 'a' values will be sorted, 'b' values will be kept the same, 'c' will be
+        # reshaped, and 'd' will be reshaped and then sorted
+        @_algorithm_setup._Algorithm2D._register(sort_keys=('a', 'd'), reshape_keys=('c', 'd'))
+        def func(self, data, *args, **kwargs):
+            """For checking sorting and reshaping output parameters."""
+            expected_input = y.copy()
+            if change_order and not assume_sorted:
+                expected_input = np.asarray(expected_input)[::-1, ::-1]
+
+            assert isinstance(data, np.ndarray)
+            assert_allclose(data, expected_input, 1e-16, 1e-16)
+
+            params = {
+                'a': np.arange(data.size).reshape(data.shape),
+                'b': np.arange(len(x)),
+                'c': np.arange(data.size),
+                'd': np.arange(data.size)
+            }
+            return 1 * data, params
+
+        @_algorithm_setup._Algorithm2D._register(reshape_baseline=True)
+        def func2(self, data, *args, **kwargs):
+            """For checking reshaping output baseline."""
+            expected_input = y.copy()
+            if change_order and not assume_sorted:
+                expected_input = np.asarray(expected_input)[::-1, ::-1]
+
+            assert isinstance(data, np.ndarray)
+            assert_allclose(data, expected_input, 1e-16, 1e-16)
+
+            return 1 * data.flatten(), {}
+
+        @_algorithm_setup._Algorithm2D._register
+        def func3(self, data, *args, **kwargs):
+            """For checking empty decorator."""
+            expected_input = y.copy()
+            if change_order and not assume_sorted:
+                expected_input = np.asarray(expected_input)[::-1, ::-1]
+
+            assert isinstance(data, np.ndarray)
+            assert_allclose(data, expected_input, 1e-16, 1e-16)
+
+            return 1 * data, {}
+
+    if change_order:
+        x = x[::-1]
+        z = z[::-1]
+        y = y[::-1, ::-1]
+    expected_params = {
+        'a': np.arange(y.size).reshape(y.shape),
+        'b': np.arange(len(x)),
+        'c': np.arange(y.size).reshape(y.shape),
+        'd': np.arange(y.size).reshape(y.shape),
+    }
+    expected_baseline = (1 * y).astype(output_dtype)
+    if list_input:
+        x = x.tolist()
+        z = z.tolist()
+        y = y.tolist()
+
+    if change_order and not assume_sorted:
+        # if assume_sorted is False, the param order should be inverted to match
+        # the input y-order
+        expected_params['a'] = expected_params['a'][::-1, ::-1]
+        expected_params['d'] = expected_params['d'][::-1, ::-1]
+
+    algorithm = SubClass(
+        x, z, assume_sorted=assume_sorted, output_dtype=output_dtype, check_finite=False
+    )
+    output, output_params = algorithm.func(y)
+
+    # baseline should always match y-order on the output; only sorted within the
+    # function
+    assert_allclose(output, expected_baseline, 1e-16, 1e-16)
+    assert isinstance(output, np.ndarray)
+    assert output.dtype == output_dtype
+    for key, value in expected_params.items():
+        assert_array_equal(value, output_params[key], err_msg=f'{key} failed')
+
+    output2, _ = algorithm.func2(y)
+    assert_allclose(output2, expected_baseline, 1e-16, 1e-16)
+    assert isinstance(output2, np.ndarray)
+    assert output2.dtype == output_dtype
+
+    output3, _ = algorithm.func3(y)
+    assert_allclose(output3, expected_baseline, 1e-16, 1e-16)
+    assert isinstance(output3, np.ndarray)
+    assert output3.dtype == output_dtype
+
+
+def test_override_x(algorithm):
+    """Ensures the `override_x` method correctly initializes with the new x values."""
+    new_len = 20
+    new_x = np.arange(new_len)
+    with pytest.raises(NotImplementedError):
+        with algorithm._override_x(new_x) as new_algorithm:
+            assert len(new_algorithm.x) == new_len
+            assert new_algorithm._len == new_len
+            assert new_algorithm.poly_order == -1
+            assert new_algorithm.vandermonde is None
+            assert new_algorithm.whittaker_system is None
+            assert new_algorithm.pspline is None

From 
f00e8f7791e2d6664eddebbf5ae2af555bb60f53 Mon Sep 17 00:00:00 2001
From: Donnie Erb <55961724+derb12@users.noreply.github.com>
Date: Mon, 8 Jan 2024 20:52:17 -0500
Subject: [PATCH 16/56] TESTS: Finished tests for PSpline2D

---
 pybaselines/two_d/_spline_utils.py |  21 +++
 tests/two_d/test_spline_utils.py   | 250 +++++++++++++++++++++++++++++
 2 files changed, 271 insertions(+)
 create mode 100644 tests/two_d/test_spline_utils.py

diff --git a/pybaselines/two_d/_spline_utils.py b/pybaselines/two_d/_spline_utils.py
index ee79cea..bc09b05 100644
--- a/pybaselines/two_d/_spline_utils.py
+++ b/pybaselines/two_d/_spline_utils.py
@@ -254,3 +254,24 @@ def solve_pspline(self, y, weights, penalty=None, rhs_extra=None):
         output = self.basis_z @ self.coef @ self.basis_x.T
 
         return output
+
+    @property
+    def tck(self):
+        """
+        The knots, spline coefficients, and spline degree to reconstruct the spline.
+
+        Convenience property for potentially reconstructing the last solved spline with
+        outside modules, although it is not clear whether SciPy has a 2D equivalent to
+        its `BSpline`.
+
+        Raises
+        ------
+        ValueError
+            Raised if `solve_pspline` has not been called yet, meaning that the spline has not
+            yet been constructed.
+
+        """
+        if self.coef is None:
+            raise ValueError('No spline coefficients, need to call "solve_pspline" first.')
+        return (
+            self.knots_x, self.knots_z, self.coef, self.spline_degree[0], self.spline_degree[1]
+        )
diff --git a/tests/two_d/test_spline_utils.py b/tests/two_d/test_spline_utils.py
new file mode 100644
index 0000000..6d12327
--- /dev/null
+++ b/tests/two_d/test_spline_utils.py
@@ -0,0 +1,250 @@
+# -*- coding: utf-8 -*-
+"""Tests for pybaselines.two_d._spline_utils.
+
+@author: Donald Erb
+Created on January 8, 2024
+
+"""
+
+import numpy as np
+from numpy.testing import assert_allclose, assert_array_equal
+import pytest
+from scipy.sparse import identity, issparse, kron
+from scipy.sparse.linalg import spsolve
+
+from pybaselines.two_d import _spline_utils
+from pybaselines.utils import difference_matrix
+
+from ..conftest import get_2dspline_inputs
+
+
+@pytest.mark.parametrize('num_knots', (10, 40, (10, 20)))
+@pytest.mark.parametrize('spline_degree', (0, 1, 2, 3, 4, 5, (2, 3)))
+@pytest.mark.parametrize('diff_order', (1, 2, 3, 4, (2, 3)))
+@pytest.mark.parametrize('lam', (1e-2, 1e2, (1e1, 1e2)))
+def test_solve_psplines(data_fixture2d, num_knots, spline_degree, diff_order, lam):
+    """
+    Tests the accuracy of the penalized spline solvers.
+
+    Uses the naive way to solve 2D P-splines from Eilers's paper as the expected result,
+    which uses the flattened `y` and weight values, while pybaselines uses the second,
+    more efficient method in Eilers's paper that directly uses the 2D `y` and weights.
+
+    References
+    ----------
+    Eilers, P., et al. Fast and compact smoothing on large multidimensional grids. Computational
+    Statistics and Data Analysis, 2006, 50(1), 61-76.
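+
+    In matrix form, the naive flattened system checked here is
+    ``(B.T @ W @ B + P) @ c = B.T @ W @ y.ravel()``, where ``B = kron(basis_z, basis_x)``,
+    ``W`` is the diagonal matrix of weights, and ``P`` is the penalty; ``solve_pspline``
+    should give the same coefficients without ever forming ``B`` explicitly.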
+
+    """
+    x, z, y = data_fixture2d
+    (
+        num_knots_x, num_knots_z, spline_degree_x, spline_degree_z,
+        lam_x, lam_z, diff_order_x, diff_order_z
+    ) = get_2dspline_inputs(num_knots, spline_degree, lam, diff_order)
+
+    knots_x = _spline_utils._spline_knots(x, num_knots_x, spline_degree_x, True)
+    basis_x = _spline_utils._spline_basis(x, knots_x, spline_degree_x)
+
+    knots_z = _spline_utils._spline_knots(z, num_knots_z, spline_degree_z, True)
+    basis_z = _spline_utils._spline_basis(z, knots_z, spline_degree_z)
+
+    num_bases = (basis_x.shape[1], basis_z.shape[1])
+    # TODO replace with np.random.default_rng when min numpy version is >= 1.17
+    weights = np.random.RandomState(0).normal(0.8, 0.05, y.size)
+    weights = np.clip(weights, 0, 1).astype(float, copy=False)
+
+    basis = kron(basis_z, basis_x)
+    CWT = basis.multiply(
+        np.repeat(weights.flatten(), num_bases[0] * num_bases[1]).reshape(len(x) * len(z), -1)
+    ).T
+    D1 = difference_matrix(num_bases[0], diff_order_x)
+    D2 = difference_matrix(num_bases[1], diff_order_z)
+
+    P1 = lam_x * kron(D1.T @ D1, identity(num_bases[1]))
+    P2 = lam_z * kron(identity(num_bases[0]), D2.T @ D2)
+    penalty = P1 + P2
+
+    expected_coeffs = spsolve(CWT @ basis + penalty, CWT @ y.flatten())
+    expected_result = basis @ expected_coeffs
+
+    pspline = _spline_utils.PSpline2D(
+        x, z, num_knots=num_knots, spline_degree=spline_degree,
+        lam=lam, diff_order=diff_order, check_finite=False
+    )
+
+    output = pspline.solve_pspline(y, weights=weights.reshape(y.shape))
+
+    assert_allclose(pspline.coef.flatten(), expected_coeffs, rtol=1e-8, atol=1e-8)
+    assert_allclose(output.flatten(), expected_result, rtol=1e-8, atol=1e-8)
+
+
+@pytest.mark.parametrize('spline_degree', (1, 2, 3, [2, 3]))
+@pytest.mark.parametrize('num_knots', (10, 50, [20, 30]))
+@pytest.mark.parametrize('diff_order', (1, 2, 3, [1, 3]))
+@pytest.mark.parametrize('lam', (5, (3, 5)))
+def test_pspline_setup(data_fixture2d, num_knots, spline_degree, diff_order, lam):
+    """
+    Ensures the PSpline2D setup is correct.
+
+    Verifies that the spline basis matrices, the penalty matrix, and the stored
+    attributes all match values computed directly from the x- and z-values.
+ + """ + x, z, y = data_fixture2d + ( + num_knots_x, num_knots_z, spline_degree_x, spline_degree_z, + lam_x, lam_z, diff_order_x, diff_order_z + ) = get_2dspline_inputs(num_knots, spline_degree, lam, diff_order) + + knots_x = _spline_utils._spline_knots(x, num_knots_x, spline_degree_x, True) + basis_x = _spline_utils._spline_basis(x, knots_x, spline_degree_x) + + knots_z = _spline_utils._spline_knots(z, num_knots_z, spline_degree_z, True) + basis_z = _spline_utils._spline_basis(z, knots_z, spline_degree_z) + + num_bases = (basis_x.shape[1], basis_z.shape[1]) + + D1 = difference_matrix(num_bases[0], diff_order_x) + D2 = difference_matrix(num_bases[1], diff_order_z) + + P1 = lam_x * kron(D1.T @ D1, identity(num_bases[1])) + P2 = lam_z * kron(identity(num_bases[0]), D2.T @ D2) + penalty = P1 + P2 + + pspline = _spline_utils.PSpline2D( + x, z, num_knots=num_knots, spline_degree=spline_degree, + lam=lam, diff_order=diff_order, check_finite=False + ) + + assert pspline.basis_x.shape == (len(x), len(knots_x) - spline_degree_x - 1) + assert pspline.basis_z.shape == (len(z), len(knots_z) - spline_degree_z - 1) + assert_array_equal(pspline._num_bases, num_bases) + + assert issparse(pspline.basis_x) + assert issparse(pspline.basis_z) + + assert_allclose(pspline.basis_x.toarray(), basis_x.toarray(), rtol=1e-12, atol=1e-12) + assert_allclose(pspline.basis_z.toarray(), basis_z.toarray(), rtol=1e-12, atol=1e-12) + assert_allclose(pspline.penalty.toarray(), penalty.toarray(), rtol=1e-12, atol=1e-12) + + assert_array_equal(pspline.diff_order, (diff_order_x, diff_order_z)) + assert_array_equal(pspline.num_knots, (num_knots_x, num_knots_z)) + assert_array_equal(pspline.spline_degree, (spline_degree_x, spline_degree_z)) + assert_array_equal(pspline.lam, (lam_x, lam_z)) + assert pspline.coef is None # None since the solve method has not been called + assert pspline.basis_x.shape == (len(x), num_knots_x + spline_degree_x - 1) + assert pspline.basis_z.shape == (len(z), num_knots_z + spline_degree_z - 1) + assert_array_equal( + pspline._num_bases, + (num_knots_x + spline_degree_x - 1, num_knots_z + spline_degree_z - 1) + ) + assert pspline.knots_x.shape == (num_knots_x + 2 * spline_degree_x,) + assert pspline.knots_z.shape == (num_knots_z + 2 * spline_degree_z,) + assert isinstance(pspline.x, np.ndarray) + assert isinstance(pspline.z, np.ndarray) + + +def test_pspline_same_basis(data_fixture2d): + """Ensures PSpline2D.same_basis works correctly.""" + x, z, y = data_fixture2d + + num_knots = (20, 30) + spline_degree = (2, 3) + + pspline = _spline_utils.PSpline2D( + x, z, num_knots=num_knots, spline_degree=spline_degree, check_finite=False + ) + + assert pspline.same_basis(num_knots, spline_degree) + assert not pspline.same_basis(num_knots[::-1], spline_degree) + assert not pspline.same_basis(num_knots, spline_degree[::-1]) + assert not pspline.same_basis(10, spline_degree) + assert not pspline.same_basis(num_knots, 1) + assert not pspline.same_basis(10, 1) + + +@pytest.mark.parametrize('diff_order', (0, [0, 0], [1, 0])) +def test_pspline_diff_order_zero_fails(data_fixture2d, diff_order): + """Ensures a difference order of 0 fails.""" + x, z, y = data_fixture2d + with pytest.raises(ValueError): + _spline_utils.PSpline2D(x, z, diff_order=diff_order) + + +@pytest.mark.parametrize('spline_degree', (-2, -1, [-1, 1], [1, -1])) +def test_pspline_negative_spline_degree_fails(data_fixture2d, spline_degree): + """Ensures a spline degree less than 0 fails.""" + x, z, y = data_fixture2d + with pytest.raises(ValueError): + 
_spline_utils.PSpline2D(x, z, spline_degree=spline_degree) + + +def test_pspline_non_finite_fails(): + """Ensure non-finite values raise an exception when check_finite is True.""" + x = np.linspace(-1, 1, 100) + z = np.linspace(-1, 1, 50) + original_x_value = x[0] + original_z_value = z[0] + for value in (np.nan, np.inf, -np.inf): + x[0] = value + with pytest.raises(ValueError): + _spline_utils.PSpline2D(x, z, check_finite=True) + x[0] = original_x_value + + for value in (np.nan, np.inf, -np.inf): + z[0] = value + with pytest.raises(ValueError): + _spline_utils.PSpline2D(x, z, check_finite=True) + z[0] = original_z_value + + +@pytest.mark.parametrize('spline_degree', (1, 2, 3, (2, 3))) +@pytest.mark.parametrize('num_knots', (10, 40, (20, 30))) +@pytest.mark.parametrize('diff_order', (1, 2, (1, 2))) +@pytest.mark.parametrize('lam', (1e-2, 1e2, (1e1, 1e2))) +def test_pspline_tck(data_fixture2d, num_knots, spline_degree, diff_order, lam): + """Ensures the tck attribute can correctly recreate the solved spline.""" + x, z, y = data_fixture2d + pspline = _spline_utils.PSpline2D( + x, z, num_knots=num_knots, spline_degree=spline_degree, diff_order=diff_order, lam=lam + ) + _ = pspline.solve_pspline(y, weights=np.ones_like(y)) + + # ensure tck is the knots, coefficients, and spline degree + assert len(pspline.tck) == 5 + knots_x, knots_z, coeffs, degree_x, degree_z = pspline.tck + + assert_allclose(knots_x, pspline.knots_x, rtol=1e-12) + assert_allclose(knots_z, pspline.knots_z, rtol=1e-12) + assert_allclose(coeffs, pspline.coef, rtol=1e-12) + if isinstance(spline_degree, int): + assert degree_x == spline_degree + assert degree_z == spline_degree + else: + assert degree_x == spline_degree[0] + assert degree_z == spline_degree[1] + + +def test_pspline_tck_none(data_fixture2d): + """Ensures an exception is raised when tck attribute is accessed without first solving once.""" + x, z, y = data_fixture2d + pspline = _spline_utils.PSpline2D(x, z) + + assert pspline.coef is None + with pytest.raises(ValueError): + pspline.tck + + +def test_pspline_tck_readonly(data_fixture2d): + """Ensures the tck attribute is read-only.""" + x, z, y = data_fixture2d + pspline = _spline_utils.PSpline2D(x, z) + + with pytest.raises(AttributeError): + pspline.tck = (1, 2, 3) + + pspline.solve_pspline(y, np.ones_like(y)) + with pytest.raises(AttributeError): + pspline.tck = (1, 2, 3) From bb48564b4633141b680a498c2e9f8ace7f7bd984 Mon Sep 17 00:00:00 2001 From: Donnie Erb <55961724+derb12@users.noreply.github.com> Date: Fri, 12 Jan 2024 18:54:22 -0500 Subject: [PATCH 17/56] TEST: Add base class for testing 2D algorithms Also added some meta tests for the polynomial and weights testers. 
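As a rough sketch (a hypothetical subclass; the concrete 2D testers and the exact
keys they check are added in later commits), a tester module is expected to use
BaseTester2D roughly like:

    from pybaselines import two_d
    from ..conftest import BaseTester2D  # within a tests/two_d module

    class TestMor2D(BaseTester2D):
        module = two_d.morphological
        algorithm_base = two_d.api.Baseline2D
        func_name = 'mor'
        checked_keys = ('half_window',)  # assumed output key, for illustration only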
--- tests/conftest.py | 186 ++++++++++++++++++++- tests/test_meta.py | 402 ++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 576 insertions(+), 12 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 89924fd..1932d1e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -150,7 +150,7 @@ def get_data2d(include_noise=True, num_points=(50, 60)): x_num_points, z_num_points = num_points x_data = np.linspace(1, 100, x_num_points) z_data = np.linspace(1, 100, z_num_points) - X, Z = np.meshgrid(x_data, z_data) + X, Z = np.meshgrid(x_data, z_data, indexing='ij') y_data = ( 500 # constant baseline + gaussian2d(X, Z, 10, 25, 25) @@ -491,12 +491,7 @@ class InputWeightsMixin: weight_keys = ('weights',) def test_input_weights(self, assertion_kwargs=None, **kwargs): - """ - Ensures arrays are correctly sorted within the function. - - Returns the output for further testing. - - """ + """Ensures input weights are correctly sorted within the function.""" # TODO replace with np.random.default_rng when min numpy version is >= 1.17 weights = np.random.RandomState(0).normal(0.8, 0.05, len(self.x)) weights = np.clip(weights, 0, 1).astype(float, copy=False) @@ -521,6 +516,9 @@ def test_input_weights(self, assertion_kwargs=None, **kwargs): assertion_kwargs['atol'] = 1e-14 for key in self.weight_keys: + assert key in regular_output_params + assert key in reverse_output_params + assert_allclose( regular_output_params[key], reverse_output_params[key][::-1], **assertion_kwargs @@ -528,3 +526,177 @@ def test_input_weights(self, assertion_kwargs=None, **kwargs): assert_allclose( regular_output, self.reverse_array(reverse_output), **assertion_kwargs ) + + +class BaseTester2D: + """ + A base class for testing all 2D algorithms. + + Attributes + ---------- + kwargs : dict + The keyword arguments that will be used as inputs for all default test cases. + + """ + + module = DummyModule + algorithm_base = DummyAlgorithm + func_name = 'func' + checked_keys = None + required_kwargs = None + three_d = False + + @classmethod + def setup_class(cls): + """Sets up the class for testing.""" + cls.x, cls.z, cls.y = get_data2d() + if cls.three_d: + cls.y = np.array((cls.y, cls.y)) + cls.algorithm = cls.algorithm_base(cls.x, cls.z, check_finite=False, assume_sorted=True) + cls.class_func = getattr(cls.algorithm, cls.func_name) + cls.kwargs = cls.required_kwargs if cls.required_kwargs is not None else {} + cls.param_keys = cls.checked_keys if cls.checked_keys is not None else [] + + @classmethod + def teardown_class(cls): + """ + Resets class attributes after testing. + + Probably not needed, but done anyway to catch changes in how pytest works. 
+ + """ + cls.x = None + cls.z = None + cls.y = None + cls.algorithm = None + cls.class_func = None + cls.kwargs = None + cls.param_keys = None + + def test_ensure_wrapped(self): + """Ensures the class method was wrapped using _Algorithm._register to control inputs.""" + assert hasattr(self.class_func, '__wrapped__') + + @pytest.mark.parametrize('new_instance', (True, False)) + def test_unchanged_data(self, new_instance, **kwargs): + """Ensures that input data is unchanged by the function.""" + x, z, y = get_data2d() + x2, z2, y2 = get_data2d() + if self.three_d: + y = np.array((y, y)) + y2 = np.array((y2, y2)) + + if new_instance: + getattr(self.algorithm_base(x_data=x, z_data=z), self.func_name)( + data=y, **self.kwargs, **kwargs + ) + compared_x = x + compared_z = z + else: + self.class_func(data=y, **self.kwargs, **kwargs) + compared_x = self.x + compared_z = self.z + + assert_array_equal(y2, y, err_msg='the y-data was changed by the algorithm') + assert_array_equal(x2, compared_x, err_msg='the x-data was changed by the algorithm') + assert_array_equal(z2, compared_z, err_msg='the z-data was changed by the algorithm') + + def test_repeated_fits(self): + """Ensures the setup is properly reset when using class api.""" + first_output = self.class_func(data=self.y, **self.kwargs) + second_output = self.class_func(data=self.y, **self.kwargs) + + assert_allclose(first_output[0], second_output[0], 1e-14) + + def test_list_input(self, **assertion_kwargs): + """Ensures that function works the same for both array and list inputs.""" + output_array = self.class_func(data=self.y, **self.kwargs) + output_list = self.class_func(data=self.y.tolist(), **self.kwargs) + + assert_allclose( + output_array[0], output_list[0], + err_msg='algorithm output is different for arrays vs lists', **assertion_kwargs + ) + for key in output_array[1]: + assert key in output_list[1] + + @pytest.mark.parametrize('has_x', (True, False)) + @pytest.mark.parametrize('has_z', (True, False)) + def test_no_xz(self, has_x, has_z, **assertion_kwargs): + """ + Ensures that function output is the same when no x and/or z is input. + + Usually only valid for evenly spaced data, such as used for testing. + + """ + if has_x and has_z: + return # the one test case that would not produce any difference so skip to save time + output_with = self.class_func(data=self.y, **self.kwargs) + + input_x = self.x if has_x else None + input_z = self.z if has_z else None + output_without = getattr( + self.algorithm_base(x_data=input_x, z_data=input_z), self.func_name + )(data=self.y, **self.kwargs) + + assert_allclose( + output_with[0], output_without[0], + err_msg='algorithm output is different with no x-values and/or z-values', + **assertion_kwargs + ) + + def test_output(self, additional_keys=None, **kwargs): + """ + Ensures that the output has the desired format. + + Ensures that output has two elements, a numpy array and a param dictionary, + and that the output baseline is the same shape as the input y-data. + + Parameters + ---------- + additional_keys : Iterable(str, ...), optional + Additional keys to check for in the output parameter dictionary. Default is None. + **kwargs + Additional keyword arguments to pass to the function. 
+ + """ + output = self.class_func(data=self.y, **self.kwargs, **kwargs) + + assert len(output) == 2, 'algorithm output should have two items' + assert isinstance(output[0], np.ndarray), 'output[0] should be a numpy ndarray' + assert isinstance(output[1], dict), 'output[1] should be a dictionary' + assert self.y.shape == output[0].shape, 'output[0] must have same shape as y-data' + + if additional_keys is not None: + total_keys = list(self.param_keys) + list(additional_keys) + else: + total_keys = self.param_keys + # check all entries in output param dictionary + for key in total_keys: + if key not in output[1]: + assert False, f'key "{key}" missing from param dictionary' + output[1].pop(key) + if output[1]: + assert False, f'unchecked keys in param dictionary: {output[1]}' + + def test_xz_ordering(self, assertion_kwargs=None, **kwargs): + """Ensures arrays are correctly sorted within the function.""" + reverse_fitter = self.algorithm_base(self.x[::-1], self.z[::-1], assume_sorted=False) + + regular_inputs_result = self.class_func(data=self.y, **self.kwargs, **kwargs)[0] + reverse_inputs_result = getattr(reverse_fitter, self.func_name)( + data=self.reverse_array(self.y), **self.kwargs, **kwargs + )[0] + + if assertion_kwargs is None: + assertion_kwargs = {} + if 'rtol' not in assertion_kwargs: + assertion_kwargs['rtol'] = 1e-10 + + assert_allclose( + regular_inputs_result, self.reverse_array(reverse_inputs_result), **assertion_kwargs + ) + + def reverse_array(self, array): + """Reverses the input along the last two dimensions.""" + return array[..., ::-1, ::-1] diff --git a/tests/test_meta.py b/tests/test_meta.py index 75d483c..ccea8c4 100644 --- a/tests/test_meta.py +++ b/tests/test_meta.py @@ -12,7 +12,10 @@ from numpy.testing import assert_allclose import pytest -from .conftest import BaseTester, BasePolyTester, dummy_wrapper, get_data +from .conftest import ( + BaseTester, BaseTester2D, BasePolyTester, InputWeightsMixin, dummy_wrapper, get_data, + get_data2d +) class DummyModule: @@ -48,6 +51,35 @@ def good_poly_func(data, x_data=None, return_coef=False, **kwargs): return baseline, params + @staticmethod + def bad_poly_func(data, x_data=None, return_coef=False, **kwargs): + """A bad polynomial algorithm.""" + params = {'a': 1} + if not return_coef: + params['coef'] = np.zeros(5) + + return np.ones_like(data), params + + @staticmethod + def good_weights_func(data, x_data=None, weights=None, **kwargs): + """A good algorithm that can take weights.""" + return np.ones_like(data), {'a': 1, 'weights': np.ones_like(data)} + + @staticmethod + def good_mask_func(data, x_data=None, weights=None, **kwargs): + """A good algorithm that can take weights and outputs them as the 'mask' key.""" + return np.ones_like(data), {'a': 1, 'mask': np.ones_like(data)} + + @staticmethod + def bad_weights_func(data, x_data=None, weights=None, **kwargs): + """An algorithm that incorrectly uses weights.""" + return np.ones_like(data), {'a': 1, 'weights': np.arange(len(data))} + + @staticmethod + def bad_weights_func_no_weights(data, x_data=None, weights=None, **kwargs): + """An algorithm that does not include weights in the output parameters.""" + return np.ones_like(data), {'a': 1} + @staticmethod def change_y(data, x_data=None): """Changes the input data values, which is unwanted.""" @@ -148,10 +180,11 @@ def different_x_ordering(data=None, x_data=None): class DummyAlgorithm: - """A dummy object to serve as a fake Algorithm subclass.""" + """A dummy object to serve as a fake Algorithm and Algorithm2D subclass.""" 
-    def __init__(self, x_data=None, *args, **kwargs):
+    def __init__(self, x_data=None, z_data=None, *args, **kwargs):
         self.x = x_data
+        self.z = z_data
         self.calls = 0

     @dummy_wrapper
@@ -171,6 +204,41 @@ def good_poly_func(self, data, return_coef=False, **kwargs):
             data=data, x_data=self.x, return_coef=return_coef, **kwargs
         )

+    @dummy_wrapper
+    def bad_poly_func(self, data, return_coef=False, **kwargs):
+        """A bad polynomial algorithm."""
+        return DummyModule.bad_poly_func(
+            data=data, x_data=self.x, return_coef=return_coef, **kwargs
+        )
+
+    @dummy_wrapper
+    def good_weights_func(self, data, weights=None, **kwargs):
+        """A good algorithm that can take weights."""
+        return DummyModule.good_weights_func(
+            data=data, x_data=self.x, weights=weights, **kwargs
+        )
+
+    @dummy_wrapper
+    def good_mask_func(self, data, weights=None, **kwargs):
+        """A good algorithm that can take weights and outputs them as the 'mask' key."""
+        return DummyModule.good_mask_func(
+            data=data, x_data=self.x, weights=weights, **kwargs
+        )
+
+    @dummy_wrapper
+    def bad_weights_func(self, data, weights=None, **kwargs):
+        """An algorithm that incorrectly uses weights."""
+        return DummyModule.bad_weights_func(
+            data=data, x_data=self.x, weights=weights, **kwargs
+        )
+
+    @dummy_wrapper
+    def bad_weights_func_no_weights(self, data, weights=None, **kwargs):
+        """An algorithm that does not include weights in the output parameters."""
+        return DummyModule.bad_weights_func_no_weights(
+            data=data, x_data=self.x, weights=weights, **kwargs
+        )
+
     @dummy_wrapper
     def change_y(self, data):
         """Changes the input data values, which is unwanted."""
@@ -180,7 +248,13 @@ def change_y(self, data):
     @dummy_wrapper
     def change_x(self, data):
         """Changes the input x-data values, which is unwanted."""
-        self.x[0] = 200000
+        self.x[0] = self.x[0] + 5
+        return data, {}
+
+    @dummy_wrapper
+    def change_z(self, data):
+        """Changes the input z-data values, which is unwanted."""
+        self.z[0] += 5
         return data, {}

     @dummy_wrapper
@@ -268,6 +342,24 @@ def different_x_ordering(self, data=None):
         """Gives different output depending on the x-value sorting."""
         return data[np.argsort(self.x)], {}

+    @dummy_wrapper
+    def different_z_ordering(self, data=None):
+        """Gives different output depending on the z-value sorting."""
+        return data[(..., np.argsort(self.z))], {}
+
+    @dummy_wrapper
+    def different_xz_output(self, data=None):
+        """Gives different output depending on the x-values and z-values."""
+        if self.x is None or self.z is None:
+            return data, {}
+        else:
+            return 10 * data, {}
+
+    @dummy_wrapper
+    def different_xz_ordering(self, data=None):
+        """Gives different output depending on the x-value and z-value sorting."""
+        return data[np.argsort(self.x)[:, None], np.argsort(self.z)[None, :]], {}
+
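The `different_xz_ordering` dummy above relies on NumPy's broadcast indexing: pairing a
column vector of row indices with a row vector of column indices gathers a fully reordered
2D array in one step. A minimal standalone sketch of the trick (made-up shapes, not part
of the patch):

    import numpy as np

    data = np.arange(12).reshape(3, 4)
    rows = np.argsort([2.0, 0.0, 1.0])       # sort order of some unsorted x-values
    cols = np.argsort([0.1, 0.4, 0.2, 0.3])  # sort order of some unsorted z-values
    # rows[:, None] is (3, 1) and cols[None, :] is (1, 4), so together they
    # broadcast to a (3, 4) index grid that permutes rows and columns at once
    reordered = data[rows[:, None], cols[None, :]]
    assert reordered.shape == data.shape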
+ """ module = DummyModule algorithm_base = DummyAlgorithm @@ -306,6 +408,12 @@ def test_setup(self): assert_allclose(self.y, np.vstack((expected_y, expected_y)), rtol=1e-14, atol=1e-14) assert self.kwargs == {} assert self.param_keys == [] + assert self.two_d + + + def test_reverse_array(self): + """Ensures the reverse_array funcion works correctly.""" + assert_allclose(self.reverse_array(self.y), self.y[..., ::-1]) class TestBaseTesterFailures(BaseTester): @@ -477,3 +585,287 @@ class TestBasePolyTesterWorks(BasePolyTester): algorithm_base = DummyAlgorithm func_name = 'good_poly_func' checked_keys = ['a'] + + +class TestBasePolyTesterFailures(BasePolyTester): + """Tests the various BasePolyTester methods for functions with incorrect output.""" + + module = DummyModule + algorithm_base = DummyAlgorithm + func_name = 'bad_poly_func' + checked_keys = ['a'] + + @pytest.mark.parametrize('return_coef', (True, False)) + def test_output(self, return_coef): + """Ensures failure if the coefficients are not correctly returned.""" + with pytest.raises(AssertionError): + super().test_output(return_coef=return_coef) + + def test_output_coefs(self): + """Ensures failure if the coefficients cannot recreate the output baseline.""" + with pytest.raises(AssertionError): + super().test_output_coefs() + + +class TestInputWeightsMixinWorks(BaseTester, InputWeightsMixin): + """Ensures a basic subclass of InputWeightsMixin works.""" + + module = DummyModule + algorithm_base = DummyAlgorithm + func_name = 'good_weights_func' + checked_keys = ['a', 'weights'] + required_kwargs = {'key': 1} + + @contextmanager + def set_func(self, func_name, checked_keys=None, weight_key=None): + """Temporarily sets a new function for the class.""" + original_name = self.func_name + original_keys = self.param_keys + original_weight_key = self.weight_keys + try: + self.__class__.func_name = func_name + self.__class__.checked_keys = checked_keys + self.__class__.weight_keys = weight_key + self.__class__.setup_class() + yield self + finally: + self.__class__.func_name = original_name + self.__class__.checked_keys = original_keys + self.__class__.weight_keys = original_weight_key + self.__class__.setup_class() + + def test_input_weights(self): + """Ensures weight testing works for different weight keys in the parameter dictionary.""" + super().test_input_weights() + with self.set_func('good_mask_func', weight_key=('mask',), checked_keys=('a', 'mask')): + super().test_input_weights() + + +class TestInputWeightsMixinFails(BaseTester, InputWeightsMixin): + """Tests the various BasePolyTester methods for functions with incorrect output.""" + + module = DummyModule + algorithm_base = DummyAlgorithm + func_name = 'bad_weights_func' + checked_keys = ['a', 'weights'] + required_kwargs = {'key': 1} + + @contextmanager + def set_func(self, func_name, checked_keys=None, weight_key=('weights',)): + """Temporarily sets a new function for the class.""" + original_name = self.func_name + original_keys = self.param_keys + original_weight_key = self.weight_keys + try: + self.__class__.func_name = func_name + self.__class__.checked_keys = checked_keys + self.__class__.weight_keys = weight_key + self.__class__.setup_class() + yield self + finally: + self.__class__.func_name = original_name + self.__class__.checked_keys = original_keys + self.__class__.weight_keys = original_weight_key + self.__class__.setup_class() + + def test_input_weights(self): + """Ensures weight testing works for different weight keys in the parameter dictionary.""" + with 
+
+
+class TestInputWeightsMixinFails(BaseTester, InputWeightsMixin):
+    """Tests the various InputWeightsMixin methods for functions with incorrect output."""
+
+    module = DummyModule
+    algorithm_base = DummyAlgorithm
+    func_name = 'bad_weights_func'
+    checked_keys = ['a', 'weights']
+    required_kwargs = {'key': 1}
+
+    @contextmanager
+    def set_func(self, func_name, checked_keys=None, weight_key=('weights',)):
+        """Temporarily sets a new function for the class."""
+        original_name = self.func_name
+        original_keys = self.param_keys
+        original_weight_key = self.weight_keys
+        try:
+            self.__class__.func_name = func_name
+            self.__class__.checked_keys = checked_keys
+            self.__class__.weight_keys = weight_key
+            self.__class__.setup_class()
+            yield self
+        finally:
+            self.__class__.func_name = original_name
+            self.__class__.checked_keys = original_keys
+            self.__class__.weight_keys = original_weight_key
+            self.__class__.setup_class()
+
+    def test_input_weights(self):
+        """Ensures weight testing fails for functions that use weights incorrectly."""
+        with pytest.raises(AssertionError):
+            super().test_input_weights()
+
+    def test_has_no_weights(self):
+        """Ensures failure occurs if the weight key is not present in the parameter dictionary."""
+        with self.set_func('bad_weights_func_no_weights', checked_keys=('a',)):
+            with pytest.raises(AssertionError):
+                super().test_input_weights()
+
+
+class TestBaseTester2DWorks(BaseTester2D):
+    """Ensures a basic subclass of BaseTester2D works."""
+
+    module = DummyModule
+    algorithm_base = DummyAlgorithm
+    func_name = 'good_func'
+    checked_keys = ['a']
+    required_kwargs = {'key': 1}
+
+    def test_setup(self):
+        """Ensures the `setup_class` class method is done correctly."""
+        expected_x, expected_z, expected_y = get_data2d()
+        assert_allclose(self.x, expected_x, rtol=1e-14, atol=1e-14)
+        assert_allclose(self.z, expected_z, rtol=1e-14, atol=1e-14)
+        assert_allclose(self.y, expected_y, rtol=1e-14, atol=1e-14)
+        assert issubclass(self.algorithm_base, DummyAlgorithm)
+        assert isinstance(self.algorithm, DummyAlgorithm)
+        assert callable(self.class_func)
+        assert self.kwargs == {'key': 1}
+        assert self.param_keys == ['a']
+        assert not self.three_d
+
+    def test_reverse_array(self):
+        """Ensures the reverse_array function works correctly."""
+        assert_allclose(self.reverse_array(self.y), self.y[..., ::-1, ::-1])
+
+
+class TestBaseTester2DWorks3d(BaseTester2D):
+    """
+    Ensures a basic subclass of BaseTester2D works for a two dimensional algorithm.
+
+    Note: this is for two dimensional algorithms that take three dimensional data, not
+    for three dimensional algorithms.
+    """
+
+    module = DummyModule
+    algorithm_base = DummyAlgorithm
+    func_name = 'good_func2'
+    three_d = True
+
+    def test_setup(self):
+        """Ensures the `setup_class` class method is done correctly."""
+        expected_x, expected_z, expected_y = get_data2d()
+        assert_allclose(self.x, expected_x, rtol=1e-14, atol=1e-14)
+        assert_allclose(self.z, expected_z, rtol=1e-14, atol=1e-14)
+        assert_allclose(self.y, np.array((expected_y, expected_y)), rtol=1e-14, atol=1e-14)
+        assert self.kwargs == {}
+        assert self.param_keys == []
+        assert self.three_d
+
+    def test_reverse_array(self):
+        """Ensures the reverse_array function works correctly."""
+        assert_allclose(self.reverse_array(self.y), self.y[..., ::-1, ::-1])
+
+
+class TestBaseTester2DFailures(BaseTester2D):
+    """Tests the various BaseTester2D methods for functions with incorrect output."""
+
+    module = DummyModule
+    algorithm_base = DummyAlgorithm
+    func_name = 'no_func'
+
+    @contextmanager
+    def set_func(self, func_name, checked_keys=None):
+        """Temporarily sets a new function for the class."""
+        original_name = self.func_name
+        original_keys = self.param_keys
+        try:
+            self.__class__.func_name = func_name
+            self.__class__.checked_keys = checked_keys
+            self.__class__.setup_class()
+            yield self
+        finally:
+            self.__class__.func_name = original_name
+            self.__class__.checked_keys = original_keys
+            self.__class__.setup_class()
+
+    def test_ensure_wrapped(self):
+        """Ensures no wrapper fails."""
+        with self.set_func('no_wrapper'):
+            with pytest.raises(AssertionError):
+                super().test_ensure_wrapped()
+
+    @pytest.mark.parametrize('new_instance', (True, False))
+    @pytest.mark.parametrize('func', ('change_x', 'change_y', 'change_z'))
+    def test_unchanged_data(self, new_instance, func):
+        """Ensures changing the x, y, or z data fails."""
+        with self.set_func(func):
+            with pytest.raises(AssertionError):
+                super().test_unchanged_data(new_instance)
+
+    def test_repeated_fits(self):
+        """Ensures failure occurs if repeated fits give different results."""
+        with self.set_func('repitition_changes'):
+            with pytest.raises(AssertionError):
+                super().test_repeated_fits()
+
+    def test_list_input(self):
+        """Ensures test fails when func gives different outputs for different input types."""
+        with self.set_func('different_output'):
+            with pytest.raises(AssertionError):
+                super().test_list_input()
+
+    @pytest.mark.parametrize('has_x', (True, False))
+    @pytest.mark.parametrize('has_z', (True, False))
+    def test_no_xz(self, has_x, has_z):
+        """Ensures failure occurs when output changes when no x or z is given."""
+        if has_x and has_z:
+            return  # the one test case that would not produce any difference, so just skip
+        with self.set_func('different_xz_output'):
+            with pytest.raises(AssertionError):
+                super().test_no_xz(has_x, has_z)
+
+    def test_output(self):
+        """Ensures failure occurs when the output is not correct."""
+        with self.set_func('single_output'):
+            with pytest.raises(AssertionError):
+                super().test_output()
+
+        with self.set_func('output_list'):
+            with pytest.raises(AssertionError):
+                super().test_output()
+
+        with self.set_func('output_nondict'):
+            with pytest.raises(AssertionError):
+                super().test_output()
+
+        with self.set_func('output_wrong_shape'):
+            with pytest.raises(AssertionError):
+                super().test_output()
+
+        # also ensure keys are checked
+        with self.set_func('good_func'):
+            with pytest.raises(AssertionError):
+                super().test_output()
+            with pytest.raises(AssertionError):
+                super().test_output(additional_keys=['b', 'c'])
+
+        with self.set_func('good_func', checked_keys=('a', 'b')):
+            with pytest.raises(AssertionError):
+                super().test_output()
+
+    @pytest.mark.parametrize('func',
+        ('different_x_ordering', 'different_z_ordering', 'different_xz_ordering')
+    )
+    def test_xz_ordering(self, func):
+        """Ensures failure when output depends on the x-value or z-value sorting."""
+        with self.set_func(func):
+            with pytest.raises(AssertionError):
+                super().test_xz_ordering()
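The failure tests above depend on the key accounting inside `BaseTester2D.test_output`:
each expected key is popped from the returned parameter dictionary, so a missing key and
a leftover key both trip an assertion. A standalone sketch of that pattern (hypothetical
dictionary contents):

    params = {'a': 1, 'weights': [1.0, 1.0]}
    for key in ('a', 'weights'):
        # a missing expected key fails here
        assert key in params, f'key "{key}" missing from param dictionary'
        params.pop(key)
    # any unexpected leftover key fails here
    assert not params, f'unchecked keys in param dictionary: {params}'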
+
+
+class TestBaseTester2DNoFunc(BaseTester2D):
+    """Ensures the BaseTester2D fails if not setup correctly."""
+
+    @pytest.mark.parametrize('new_instance', (True, False))
+    def test_unchanged_data(self, new_instance):
+        """Ensures that input data is unchanged by the function."""
+        with pytest.raises(NotImplementedError):
+            super().test_unchanged_data(new_instance)
+
+    def test_repeated_fits(self):
+        """Ensures the setup is properly reset when using class api."""
+        with pytest.raises(NotImplementedError):
+            super().test_repeated_fits()
+
+    def test_list_input(self):
+        """Ensures that function works the same for both array and list inputs."""
+        with pytest.raises(NotImplementedError):
+            super().test_list_input()
+
+    @pytest.mark.parametrize('has_x', (True, False))
+    @pytest.mark.parametrize('has_z', (True, False))
+    def test_no_xz(self, has_x, has_z):
+        """Ensures that function output is the same when no x or z is input."""
+        if has_x and has_z:
+            return  # the one test case that would not produce any difference, so just skip
+        with pytest.raises(NotImplementedError):
+            super().test_no_xz(has_x, has_z)
+
+    def test_output(self):
+        """Ensures that the output has the desired format."""
+        with pytest.raises(NotImplementedError):
+            super().test_output()
+
+    def test_xz_ordering(self):
+        """Ensures arrays are correctly sorted within the function."""
+        with pytest.raises(NotImplementedError):
+            super().test_xz_ordering()

From 8e24f745e16d787865c470b6d2189079f0d3254b Mon Sep 17 00:00:00 2001
From: Donnie Erb <55961724+derb12@users.noreply.github.com>
Date: Fri, 12 Jan 2024 18:57:25 -0500
Subject: 
[PATCH 18/56] MAINT: Change dimensionality of Baseline2D Changed from z, x to x, z to represent the rows and columns, respectively. This mixup was causing some internal discrepancies for 2D PSplines and Whittaker system. --- pybaselines/two_d/_algorithm_setup.py | 34 ++++++++++----------- pybaselines/two_d/_spline_utils.py | 15 ++++----- pybaselines/two_d/_whittaker_utils.py | 5 +-- tests/two_d/test_algorithm_setup.py | 44 +++++++++++++-------------- tests/two_d/test_spline_utils.py | 24 +++++++-------- 5 files changed, 62 insertions(+), 60 deletions(-) diff --git a/pybaselines/two_d/_algorithm_setup.py b/pybaselines/two_d/_algorithm_setup.py index f7e93ce..15e9c7a 100644 --- a/pybaselines/two_d/_algorithm_setup.py +++ b/pybaselines/two_d/_algorithm_setup.py @@ -101,7 +101,7 @@ def __init__(self, x_data=None, z_data=None, check_finite=True, assume_sorted=Fa self.x_domain = np.array([-1., 1.]) else: self.x = _check_array(x_data, check_finite=check_finite) - self._len[1] = len(self.x) + self._len[0] = len(self.x) self.x_domain = np.polynomial.polyutils.getdomain(self.x) if not assume_sorted: x_sort_order, x_inverted_order = _determine_sorts(self.x) @@ -113,7 +113,7 @@ def __init__(self, x_data=None, z_data=None, check_finite=True, assume_sorted=Fa self.z_domain = np.array([-1., 1.]) else: self.z = _check_array(z_data, check_finite=check_finite) - self._len[0] = len(self.z) + self._len[1] = len(self.z) self.z_domain = np.polynomial.polyutils.getdomain(self.z) if not assume_sorted: z_sort_order, z_inverted_order = _determine_sorts(self.z) @@ -123,15 +123,15 @@ def __init__(self, x_data=None, z_data=None, check_finite=True, assume_sorted=Fa if x_sort_order is None and z_sort_order is None: self._sort_order = None self._inverted_order = None - elif x_sort_order is None: - self._sort_order = z_sort_order - self._inverted_order = z_inverted_order elif z_sort_order is None: - self._sort_order = (..., x_sort_order) - self._inverted_order = (..., x_inverted_order) + self._sort_order = x_sort_order + self._inverted_order = x_inverted_order + elif x_sort_order is None: + self._sort_order = (..., z_sort_order) + self._inverted_order = (..., z_inverted_order) else: - self._sort_order = (z_sort_order[:, None], x_sort_order[None, :]) - self._inverted_order = (z_inverted_order[:, None], x_inverted_order[None, :]) + self._sort_order = (x_sort_order[:, None], z_sort_order[None, :]) + self._inverted_order = (x_inverted_order[:, None], z_inverted_order[None, :]) self.whittaker_system = None self.vandermonde = None @@ -250,11 +250,11 @@ def inner(self, data=None, *args, **kwargs): expected_shape = self._len axis = slice(-2, None) elif reset_x: - expected_shape = self._len[1] - axis = -1 - else: expected_shape = self._len[0] axis = -2 + else: + expected_shape = self._len[1] + axis = -1 y = _check_sized_array( data, expected_shape, check_finite=self._check_finite, dtype=dtype, order=order, ensure_1d=False, axis=axis, name='data' @@ -272,16 +272,16 @@ def inner(self, data=None, *args, **kwargs): self.x, dtype=dtype, order=order, check_finite=False, ensure_1d=False ) else: - self._len[1] = y.shape[-1] - self.x = np.linspace(-1, 1, self._len[1]) + self._len[0] = y.shape[-2] + self.x = np.linspace(-1, 1, self._len[0]) if reset_z: z_dtype = self.z.dtype self.z = _check_array( self.z, dtype=dtype, order=order, check_finite=False, ensure_1d=False ) else: - self._len[0] = y.shape[-2] - self.z = np.linspace(-1, 1, self._len[0]) + self._len[1] = y.shape[-1] + self.z = np.linspace(-1, 1, self._len[1]) y = _sort_array2d(y, 
sort_order=self._sort_order) if self._dtype is None: @@ -521,7 +521,7 @@ def _setup_polynomial(self, y, weights=None, poly_order=2, calc_vander=False, # rearrange the vandermonde such that it matches the typical A c = b where b # is the flattened version of y and c are the coefficients self.vandermonde = np.polynomial.polynomial.polyvander2d( - *np.meshgrid(mapped_x, mapped_z), [poly_orders[0], poly_orders[1]] + mapped_x[:, None], mapped_z[None, :], [poly_orders[0], poly_orders[1]] ).reshape((-1, (poly_orders[0] + 1) * (poly_orders[1] + 1))) if max_cross is not None: diff --git a/pybaselines/two_d/_spline_utils.py b/pybaselines/two_d/_spline_utils.py index bc09b05..b011846 100644 --- a/pybaselines/two_d/_spline_utils.py +++ b/pybaselines/two_d/_spline_utils.py @@ -193,8 +193,9 @@ def reset_penalty(self, lam=1, diff_order=2): D1 = difference_matrix(self._num_bases[0], self.diff_order[0]) D2 = difference_matrix(self._num_bases[1], self.diff_order[1]) - P1 = self.lam[0] * sparse.kron(D1.T @ D1, sparse.identity(self._num_bases[1])) - P2 = self.lam[1] * sparse.kron(sparse.identity(self._num_bases[0]), D2.T @ D2) + # multiplying lam by the Kronecker product is the same as multiplying just D.T @ D with lam + P1 = sparse.kron(self.lam[0] * D1.T @ D1, sparse.identity(self._num_bases[1])) + P2 = sparse.kron(sparse.identity(self._num_bases[0]), self.lam[1] * D2.T @ D2) self.penalty = P1 + P2 def solve_pspline(self, y, weights, penalty=None, rhs_extra=None): @@ -237,8 +238,8 @@ def solve_pspline(self, y, weights, penalty=None, rhs_extra=None): """ # do not save intermediate results since they are memory intensive for high number of knots F = np.transpose( - (self._G2.T @ weights @ self._G).reshape( - (self._num_bases[1], self._num_bases[1], self._num_bases[0], self._num_bases[0]) + (self._G.T @ weights @ self._G2).reshape( + (self._num_bases[0], self._num_bases[0], self._num_bases[1], self._num_bases[1]) ), [0, 2, 1, 3] ).reshape( @@ -247,11 +248,11 @@ def solve_pspline(self, y, weights, penalty=None, rhs_extra=None): self.coef = spsolve( sparse.csr_matrix(F) + self.penalty, - (self.basis_z.T @ (weights * y) @ self.basis_x).flatten(), + (self.basis_x.T @ (weights * y) @ self.basis_z).flatten(), 'NATURAL' - ).reshape(self._num_bases[1], self._num_bases[0]) + ).reshape(self._num_bases[0], self._num_bases[1]) - output = self.basis_z @ self.coef @ self.basis_x.T + output = self.basis_x @ self.coef @ self.basis_z.T return output diff --git a/pybaselines/two_d/_whittaker_utils.py b/pybaselines/two_d/_whittaker_utils.py index 005c4e7..133d4ee 100644 --- a/pybaselines/two_d/_whittaker_utils.py +++ b/pybaselines/two_d/_whittaker_utils.py @@ -111,8 +111,9 @@ def reset_penalty(self, lam=1, diff_order=2): D1 = difference_matrix(self._num_bases[0], self.diff_order[0]) D2 = difference_matrix(self._num_bases[1], self.diff_order[1]) - P1 = self.lam[0] * kron(D1.T @ D1, identity(self._num_bases[1])) - P2 = self.lam[1] * kron(identity(self._num_bases[0]), D2.T @ D2) + # multiplying lam by the Kronecker product is the same as multiplying just D.T @ D with lam + P1 = kron(self.lam[0] * D1.T @ D1, identity(self._num_bases[1])) + P2 = kron(identity(self._num_bases[0]), self.lam[1] * D2.T @ D2) self.penalty = P1 + P2 def solve(self, lhs, rhs): diff --git a/tests/two_d/test_algorithm_setup.py b/tests/two_d/test_algorithm_setup.py index 84a722c..e1e7c2b 100644 --- a/tests/two_d/test_algorithm_setup.py +++ b/tests/two_d/test_algorithm_setup.py @@ -27,7 +27,7 @@ def algorithm(small_data2d): 
pybaselines.two_d._algorithm_setup._Algorithm2D An _Algorithm2D class for testing. """ - num_z, num_x = small_data2d.shape + num_x, num_z = small_data2d.shape return _algorithm_setup._Algorithm2D( x_data=np.arange(num_x), z_data=np.arange(num_z), assume_sorted=True, check_finite=False ) @@ -48,11 +48,11 @@ def test_setup_whittaker_diff_matrix(data_fixture2d, lam, diff_order): lam=lam, diff_order=diff_order ) - D1 = difference_matrix(len(z), diff_order_x) - D2 = difference_matrix(len(x), diff_order_z) + D1 = difference_matrix(len(x), diff_order_x) + D2 = difference_matrix(len(z), diff_order_z) - P1 = lam_x * kron(D1.T @ D1, identity(len(x))) - P2 = lam_z * kron(identity(len(z)), D2.T @ D2) + P1 = lam_x * kron(D1.T @ D1, identity(len(z))) + P2 = lam_z * kron(identity(len(x)), D2.T @ D2) expected_penalty = P1 + P2 assert_allclose( @@ -119,7 +119,7 @@ def test_setup_whittaker_negative_lam_fails(small_data2d, algorithm): def test_setup_whittaker_array_lam(small_data2d): """Ensures a lam that is a single array of one or two values passes while larger arrays fail.""" - num_z, num_x = small_data2d.shape + num_x, num_z = small_data2d.shape _algorithm_setup._Algorithm2D(np.arange(num_x), np.arange(num_z))._setup_whittaker( small_data2d, lam=[1] ) @@ -202,7 +202,7 @@ def test_setup_polynomial_vandermonde(small_data2d, algorithm, vander_enum, incl mapped_x = np.polynomial.polyutils.mapdomain(algorithm.x, algorithm.x_domain, [-1, 1]) mapped_z = np.polynomial.polyutils.mapdomain(algorithm.z, algorithm.z_domain, [-1, 1]) desired_vander = np.polynomial.polynomial.polyvander2d( - *np.meshgrid(mapped_x, mapped_z), (x_order, z_order) + *np.meshgrid(mapped_x, mapped_z, indexing='ij'), (x_order, z_order) ).reshape((-1, (x_order + 1) * (z_order + 1))) assert_allclose(desired_vander, algorithm.vandermonde, 1e-12) @@ -321,7 +321,7 @@ def test_setup_spline_too_high_diff_order(small_data2d, spline_degree, num_knots @pytest.mark.parametrize('num_knots', (0, 1)) def test_setup_spline_too_few_knots(small_data2d, num_knots): """Ensures an error is raised if the number of knots is less than 2.""" - num_z, num_x = small_data2d.shape + num_x, num_z = small_data2d.shape with pytest.raises(ValueError): _algorithm_setup._Algorithm2D(np.arange(num_x), np.arange(num_z))._setup_spline( small_data2d, weights=None, spline_degree=3, num_knots=num_knots, @@ -332,7 +332,7 @@ def test_setup_spline_too_few_knots(small_data2d, num_knots): def test_setup_spline_wrong_weight_shape(small_data2d): """Ensures that an exception is raised if input weights and data are different shapes.""" weights = np.ones(np.array(small_data2d.shape) + 1) - num_z, num_x = small_data2d.shape + num_x, num_z = small_data2d.shape with pytest.raises(ValueError): _algorithm_setup._Algorithm2D(np.arange(num_x), np.arange(num_z))._setup_spline( small_data2d, weights=weights @@ -342,7 +342,7 @@ def test_setup_spline_wrong_weight_shape(small_data2d): @pytest.mark.parametrize('diff_order', (0, -1)) def test_setup_spline_diff_matrix_fails(small_data2d, diff_order): """Ensures using a diff_order < 1 with _setup_spline raises an exception.""" - num_z, num_x = small_data2d.shape + num_x, num_z = small_data2d.shape with pytest.raises(ValueError): _algorithm_setup._Algorithm2D(np.arange(num_x), np.arange(num_z))._setup_spline( small_data2d, diff_order=diff_order @@ -352,7 +352,7 @@ def test_setup_spline_diff_matrix_fails(small_data2d, diff_order): @pytest.mark.parametrize('diff_order', (5, 6)) def test_setup_spline_diff_matrix_warns(small_data2d, diff_order): """Ensures 
using a diff_order > 4 with _setup_spline raises a warning.""" - num_z, num_x = small_data2d.shape + num_x, num_z = small_data2d.shape with pytest.warns(ParameterWarning): _algorithm_setup._Algorithm2D(np.arange(num_x), np.arange(num_z))._setup_spline( small_data2d, diff_order=diff_order @@ -361,7 +361,7 @@ def test_setup_spline_diff_matrix_warns(small_data2d, diff_order): def test_setup_spline_negative_lam_fails(small_data2d): """Ensures a negative lam value fails.""" - num_z, num_x = small_data2d.shape + num_x, num_z = small_data2d.shape with pytest.raises(ValueError): _algorithm_setup._Algorithm2D(np.arange(num_x), np.arange(num_z))._setup_spline( small_data2d, lam=-1 @@ -370,7 +370,7 @@ def test_setup_spline_negative_lam_fails(small_data2d): def test_setup_spline_array_lam(small_data2d): """Ensures a lam that is a single array of one or two values passes while larger arrays fail.""" - num_z, num_x = small_data2d.shape + num_x, num_z = small_data2d.shape _algorithm_setup._Algorithm2D( np.arange(num_x), np.arange(num_z) )._setup_spline(small_data2d, lam=[1]) @@ -455,9 +455,9 @@ def test_algorithm_class_init(input_x, input_z, check_finite, assume_sorted, out expected_shape = [None, None] if input_x: - expected_shape[1] = len(x) + expected_shape[0] = len(x) if input_z: - expected_shape[0] = len(z) + expected_shape[1] = len(z) assert algorithm._len == expected_shape if not assume_sorted and change_order and (input_x or input_z): @@ -468,25 +468,25 @@ def test_algorithm_class_init(input_x, input_z, check_finite, assume_sorted, out order[sort_order] = order[sort_order][::-1] for actual, expected in zip( - algorithm._sort_order, (z_order[:, None], x_order[None, :]) + algorithm._sort_order, (x_order[:, None], z_order[None, :]) ): assert_array_equal(actual, expected) for actual, expected in zip( - algorithm._inverted_order, (z_order.argsort()[:, None], x_order.argsort()[None, :]) + algorithm._inverted_order, (x_order.argsort()[:, None], z_order.argsort()[None, :]) ): assert_array_equal(actual, expected) elif input_x: order = np.arange(len(x)) order[sort_order] = order[sort_order][::-1] + assert_array_equal(algorithm._sort_order, order) + assert_array_equal(algorithm._inverted_order, order.argsort()) + else: + order = np.arange(len(z)) + order[sort_order] = order[sort_order][::-1] assert_array_equal(algorithm._sort_order[1], order) assert_array_equal(algorithm._inverted_order[1], order.argsort()) assert algorithm._sort_order[0] is Ellipsis assert algorithm._inverted_order[0] is Ellipsis - else: - order = np.arange(len(z)) - order[sort_order] = order[sort_order][::-1] - assert_array_equal(algorithm._sort_order, order) - assert_array_equal(algorithm._inverted_order, order.argsort()) else: assert algorithm._sort_order is None assert algorithm._inverted_order is None diff --git a/tests/two_d/test_spline_utils.py b/tests/two_d/test_spline_utils.py index 6d12327..9223e10 100644 --- a/tests/two_d/test_spline_utils.py +++ b/tests/two_d/test_spline_utils.py @@ -53,7 +53,7 @@ def test_solve_psplines(data_fixture2d, num_knots, spline_degree, diff_order, la weights = np.random.RandomState(0).normal(0.8, 0.05, y.size) weights = np.clip(weights, 0, 1).astype(float, copy=False) - basis = kron(basis_z, basis_x) + basis = kron(basis_x, basis_z) CWT = basis.multiply( np.repeat(weights.flatten(), num_bases[0] * num_bases[1]).reshape(len(x) * len(z), -1) ).T @@ -74,8 +74,8 @@ def test_solve_psplines(data_fixture2d, num_knots, spline_degree, diff_order, la output = pspline.solve_pspline(y, 
weights=weights.reshape(y.shape)) - assert_allclose(pspline.coef.flatten(), expected_coeffs, rtol=1e-8, atol=1e-8) assert_allclose(output.flatten(), expected_result, rtol=1e-8, atol=1e-8) + assert_allclose(pspline.coef.flatten(), expected_coeffs, rtol=1e-8, atol=1e-8) @pytest.mark.parametrize('spline_degree', (1, 2, 3, [2, 3])) @@ -83,15 +83,7 @@ def test_solve_psplines(data_fixture2d, num_knots, spline_degree, diff_order, la @pytest.mark.parametrize('diff_order', (1, 2, 3, [1, 3])) @pytest.mark.parametrize('lam', (5, (3, 5))) def test_pspline_setup(data_fixture2d, num_knots, spline_degree, diff_order, lam): - """ - Ensure the PSpline setup is correct. - - Since `allow_pentapy` is always False for PSpline, the `lower` attribute of the - PenalizedSystem will always equal the input `allow_lower` and the `reversed` - attribute will be equal to the bool of the input `reverse_diags` input (ie. None - will also be False). - - """ + """Ensure the PSpline2D setup is correct.""" x, z, y = data_fixture2d ( num_knots_x, num_knots_z, spline_degree_x, spline_degree_z, @@ -165,7 +157,7 @@ def test_pspline_same_basis(data_fixture2d): assert not pspline.same_basis(10, 1) -@pytest.mark.parametrize('diff_order', (0, [0, 0], [1, 0])) +@pytest.mark.parametrize('diff_order', (0, -1, [0, 0], [1, 0], [0, 1], [-1, 1], [1, -1])) def test_pspline_diff_order_zero_fails(data_fixture2d, diff_order): """Ensures a difference order of 0 fails.""" x, z, y = data_fixture2d @@ -181,6 +173,14 @@ def test_pspline_negative_spline_degree_fails(data_fixture2d, spline_degree): _spline_utils.PSpline2D(x, z, spline_degree=spline_degree) +@pytest.mark.parametrize('lam', (-2, 0, [-1, 1], [1, -1], [1, 0], [0, 1])) +def test_pspline_negative_lam_fails(data_fixture2d, lam): + """Ensures a lam value less than or equal to 0 fails.""" + x, z, y = data_fixture2d + with pytest.raises(ValueError): + _spline_utils.PSpline2D(x, z, lam=lam) + + def test_pspline_non_finite_fails(): """Ensure non-finite values raise an exception when check_finite is True.""" x = np.linspace(-1, 1, 100) From 280f6b43580eddc46b02a48caa6d06f169e730a1 Mon Sep 17 00:00:00 2001 From: Donnie Erb <55961724+derb12@users.noreply.github.com> Date: Fri, 12 Jan 2024 18:58:58 -0500 Subject: [PATCH 19/56] MAINT: Fixed 2D Whittaker functions Only the 2D version of arpls was implemented correctly. All the others were just copy pasted from the 1d case and did not work properly in 2D. Wrote tests for setting up Whittaker systems. --- pybaselines/two_d/whittaker.py | 45 +++++---- tests/two_d/test_whittaker_utils.py | 143 ++++++++++++++++++++++++++++ 2 files changed, 167 insertions(+), 21 deletions(-) create mode 100644 tests/two_d/test_whittaker_utils.py diff --git a/pybaselines/two_d/whittaker.py b/pybaselines/two_d/whittaker.py index 3ed2e5e..005461e 100644 --- a/pybaselines/two_d/whittaker.py +++ b/pybaselines/two_d/whittaker.py @@ -20,7 +20,9 @@ class _Whittaker(_Algorithm2D): """A base class for all Whittaker-smoothing-based algorithms.""" - @_Algorithm2D._register(sort_keys=('weights',)) + @_Algorithm2D._register( + sort_keys=('weights',), reshape_keys=('weights',), reshape_baseline=True + ) def asls(self, data, lam=1e6, p=1e-2, diff_order=2, max_iter=50, tol=1e-3, weights=None): """ Fits the baseline using asymmetric least squares (AsLS) fitting. 
@@ -79,13 +81,12 @@ def asls(self, data, lam=1e6, p=1e-2, diff_order=2, max_iter=50, tol=1e-3, weigh if not 0 < p < 1: raise ValueError('p must be between 0 and 1') y, weight_array = self._setup_whittaker(data, lam, diff_order, weights) - main_diag_idx = self.whittaker_system.main_diagonal_index - main_diagonal = self.whittaker_system.penalty[main_diag_idx].copy() + main_diagonal = self.whittaker_system.penalty.diagonal() tol_history = np.empty(max_iter + 1) for i in range(max_iter + 1): - self.whittaker_system.penalty[main_diag_idx] = main_diagonal + weight_array + self.whittaker_system.penalty.setdiag(main_diagonal + weight_array) baseline = self.whittaker_system.solve( - self.whittaker_system.penalty, weight_array * y, overwrite_b=True + self.whittaker_system.penalty, weight_array * y ) new_weights = _weighting._asls(y, baseline, p) calc_difference = relative_difference(weight_array, new_weights) @@ -98,7 +99,9 @@ def asls(self, data, lam=1e6, p=1e-2, diff_order=2, max_iter=50, tol=1e-3, weigh return baseline, params - @_Algorithm2D._register(sort_keys=('weights',)) + @_Algorithm2D._register( + sort_keys=('weights',), reshape_keys=('weights',), reshape_baseline=True + ) def airpls(self, data, lam=1e6, diff_order=2, max_iter=50, tol=1e-3, weights=None): """ Adaptive iteratively reweighted penalized least squares (airPLS) baseline. @@ -147,19 +150,17 @@ def airpls(self, data, lam=1e6, diff_order=2, max_iter=50, tol=1e-3, weights=Non data, lam, diff_order, weights, copy_weights=True ) y_l1_norm = np.abs(y).sum() - main_diag_idx = self.whittaker_system.main_diagonal_index - main_diagonal = self.whittaker_system.penalty[main_diag_idx].copy() + main_diagonal = self.whittaker_system.penalty.diagonal() tol_history = np.empty(max_iter + 1) # Have to have extensive error handling since the weights can all become # very small due to the exp(i) term if too many iterations are performed; # checking the negative residual length usually prevents any errors, but # sometimes not so have to also catch any errors from the solvers for i in range(1, max_iter + 2): - self.whittaker_system.penalty[main_diag_idx] = main_diagonal + weight_array + self.whittaker_system.penalty.setdiag(main_diagonal + weight_array) try: output = self.whittaker_system.solve( - self.whittaker_system.penalty, weight_array * y, overwrite_b=True, - check_output=True + self.whittaker_system.penalty, weight_array * y ) except np.linalg.LinAlgError: warnings.warn( @@ -264,7 +265,9 @@ def arpls(self, data, lam=1e3, diff_order=2, max_iter=50, tol=1e-3, weights=None return baseline, params - @_Algorithm2D._register(sort_keys=('weights',)) + @_Algorithm2D._register( + sort_keys=('weights',), reshape_keys=('weights',), reshape_baseline=True + ) def iarpls(self, data, lam=1e5, diff_order=2, max_iter=50, tol=1e-3, weights=None): """ Improved asymmetrically reweighted penalized least squares smoothing (IarPLS). 
@@ -311,13 +314,12 @@ def iarpls(self, data, lam=1e5, diff_order=2, max_iter=50, tol=1e-3, weights=Non

     """
     y, weight_array = self._setup_whittaker(data, lam, diff_order, weights)
-    main_diag_idx = self.whittaker_system.main_diagonal_index
-    main_diagonal = self.whittaker_system.penalty[main_diag_idx].copy()
+    main_diagonal = self.whittaker_system.penalty.diagonal()
     tol_history = np.empty(max_iter + 1)
     for i in range(1, max_iter + 2):
-        self.whittaker_system.penalty[main_diag_idx] = main_diagonal + weight_array
+        self.whittaker_system.penalty.setdiag(main_diagonal + weight_array)
         baseline = self.whittaker_system.solve(
-            self.whittaker_system.penalty, weight_array * y, overwrite_b=True
+            self.whittaker_system.penalty, weight_array * y
         )
         new_weights = _weighting._iarpls(y, baseline, i)
         calc_difference = relative_difference(weight_array, new_weights)
@@ -342,7 +344,9 @@ def iarpls(self, data, lam=1e5, diff_order=2, max_iter=50, tol=1e-3, weights=Non

         return baseline, params

-    @_Algorithm2D._register(sort_keys=('weights',))
+    @_Algorithm2D._register(
+        sort_keys=('weights',), reshape_keys=('weights',), reshape_baseline=True
+    )
     def psalsa(self, data, lam=1e5, p=0.5, k=None, diff_order=2, max_iter=50,
                tol=1e-3, weights=None):
         """
@@ -420,13 +424,12 @@ def psalsa(self, data, lam=1e5, p=0.5, k=None, diff_order=2, max_iter=50, tol=1e
         y, weight_array = self._setup_whittaker(data, lam, diff_order, weights)
         if k is None:
             k = np.std(y) / 10
-        main_diag_idx = self.whittaker_system.main_diagonal_index
-        main_diagonal = self.whittaker_system.penalty[main_diag_idx].copy()
+        main_diagonal = self.whittaker_system.penalty.diagonal()
         tol_history = np.empty(max_iter + 1)
         for i in range(max_iter + 1):
-            self.whittaker_system.penalty[main_diag_idx] = main_diagonal + weight_array
+            self.whittaker_system.penalty.setdiag(main_diagonal + weight_array)
             baseline = self.whittaker_system.solve(
-                self.whittaker_system.penalty, weight_array * y, overwrite_b=True
+                self.whittaker_system.penalty, weight_array * y
            )
             new_weights = _weighting._psalsa(y, baseline, p, k, self._len)
             calc_difference = relative_difference(weight_array, new_weights)
diff --git a/tests/two_d/test_whittaker_utils.py b/tests/two_d/test_whittaker_utils.py
new file mode 100644
index 0000000..26e19e1
--- /dev/null
+++ b/tests/two_d/test_whittaker_utils.py
@@ -0,0 +1,143 @@
+# -*- coding: utf-8 -*-
+"""Tests for pybaselines.two_d._whittaker_utils.
+
+@author: Donald Erb
+Created on Dec. 11, 2021
+
+"""
+
+import numpy as np
+from numpy.testing import assert_allclose, assert_array_equal
+import pytest
+from scipy.sparse import identity, issparse, kron
+from scipy.sparse.linalg import spsolve
+
+from pybaselines.two_d import _spline_utils, _whittaker_utils
+from pybaselines.utils import difference_matrix
+
+from ..conftest import get_2dspline_inputs
+
+
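The tests below check PenalizedSystem2D against a penalty assembled by hand. The 2D
Whittaker penalty is a Kronecker sum of the 1D difference penalties along each dimension,
so it acts on the flattened (row-major) data. A minimal standalone sketch with made-up
sizes, mirroring the construction used in the tests:

    import numpy as np
    from scipy.sparse import identity, kron

    from pybaselines.utils import difference_matrix

    rows, cols = 5, 4
    lam_x, lam_z = 1e2, 1e3
    Dx = difference_matrix(rows, 2)  # second-order differences along the rows
    Dz = difference_matrix(cols, 2)  # second-order differences along the columns
    penalty = (
        kron(lam_x * Dx.T @ Dx, identity(cols))
        + kron(identity(rows), lam_z * Dz.T @ Dz)
    )
    assert penalty.shape == (rows * cols, rows * cols)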
+ + """ + *_, lam_x, lam_z, diff_order_x, diff_order_z = get_2dspline_inputs( + lam=lam, diff_order=diff_order + ) + + num_bases = small_data2d.shape + + D1 = difference_matrix(num_bases[0], diff_order_x) + D2 = difference_matrix(num_bases[1], diff_order_z) + + P1 = lam_x * kron(D1.T @ D1, identity(num_bases[1])) + P2 = lam_z * kron(identity(num_bases[0]), D2.T @ D2) + penalty = P1 + P2 + + penalized_system = _whittaker_utils.PenalizedSystem2D( + small_data2d.shape, lam=lam, diff_order=diff_order + ) + + # TODO replace with np.random.default_rng when min numpy version is >= 1.17 + weights = np.random.RandomState(0).normal(0.8, 0.05, small_data2d.size) + weights = np.clip(weights, 0, 1).astype(float, copy=False).ravel() + + penalty.setdiag(penalty.diagonal() + weights) + penalized_system.penalty.setdiag(penalized_system.penalty.diagonal() + weights) + + expected_result = spsolve(penalty, weights * small_data2d.flatten()) + output = penalized_system.solve(penalized_system.penalty, weights * small_data2d.flatten()) + + assert_allclose(output.flatten(), expected_result, rtol=1e-8, atol=1e-8) + + +@pytest.mark.parametrize('diff_order', (1, 2, 3, [1, 3])) +@pytest.mark.parametrize('lam', (5, (3, 5))) +def test_penalized_system_setup(small_data2d, diff_order, lam): + """Ensure the PenalizedSystem2D setup is correct.""" + *_, lam_x, lam_z, diff_order_x, diff_order_z = get_2dspline_inputs( + lam=lam, diff_order=diff_order + ) + + num_bases = small_data2d.shape + + D1 = difference_matrix(num_bases[0], diff_order_x) + D2 = difference_matrix(num_bases[1], diff_order_z) + + P1 = lam_x * kron(D1.T @ D1, identity(num_bases[1])) + P2 = lam_z * kron(identity(num_bases[0]), D2.T @ D2) + penalty = P1 + P2 + + penalized_system = _whittaker_utils.PenalizedSystem2D( + small_data2d.shape, lam=lam, diff_order=diff_order + ) + + assert_array_equal(penalized_system._num_bases, num_bases) + + assert issparse(penalized_system.penalty) + + assert_allclose(penalized_system.penalty.toarray(), penalty.toarray(), rtol=1e-12, atol=1e-12) + + assert_array_equal(penalized_system.diff_order, (diff_order_x, diff_order_z)) + assert_array_equal(penalized_system.lam, (lam_x, lam_z)) + + +@pytest.mark.parametrize('diff_order', (0, -1, [0, 0], [1, 0], [0, 1], [-1, 1], [1, -1])) +def test_penalized_system_diff_order_fails(small_data2d, diff_order): + """Ensures a difference order of less than 1 fails.""" + with pytest.raises(ValueError): + _whittaker_utils.PenalizedSystem2D(small_data2d.shape, diff_order=diff_order) + + +@pytest.mark.parametrize('lam', (-2, 0, [-1, 1], [1, -1], [1, 0], [0, 1])) +def test_penalized_system_negative_lam_fails(small_data2d, lam): + """Ensures a lam value less than or equal to 0 fails.""" + with pytest.raises(ValueError): + _whittaker_utils.PenalizedSystem2D(small_data2d.shape, lam=lam) + + +@pytest.mark.parametrize('diff_order', (1, 2, 3, [1, 3])) +@pytest.mark.parametrize('lam', (5, (3, 5))) +def test_compare_to_psplines(data_fixture2d, lam, diff_order): + """ + Ensures 2D Whittaker and PSpline outputs are the same for specific condition. + + If the number of basis functions for splines is equal to the number of data points, and + the spline degree is set to 0, then the spline basis becomes the identity function + and should produce the same analytical equation as Whittaker smoothing. + + Since the 2D PSpline case is known from Eiler's paper, and the implementation of + 2D Whittaker smoothing in pybaselines was adapted from that, need to verify the Whittaker + smoothing implementation. 
+ + """ + x, z, y = data_fixture2d + + pspline = _spline_utils.PSpline2D( + x, z, num_knots=(len(x) + 1, len(z) + 1), spline_degree=0, lam=lam, diff_order=diff_order, + check_finite=False + ) + + # sanity check to ensure it was set up correctly + assert_array_equal(pspline.basis_x.shape, (len(x), len(x))) + assert_array_equal(pspline.basis_z.shape, (len(z)), len(z)) + + whittaker_system = _whittaker_utils.PenalizedSystem2D(y.shape, lam=lam, diff_order=diff_order) + + # TODO replace with np.random.default_rng when min numpy version is >= 1.17 + weights = np.random.RandomState(0).normal(0.8, 0.05, y.shape) + weights = np.clip(weights, 0, 1).astype(float, copy=False) + + whittaker_system.penalty.setdiag(whittaker_system.penalty.diagonal() + weights.ravel()) + + spline_output = pspline.solve_pspline(y, weights=weights) + whittaker_output = whittaker_system.solve(whittaker_system.penalty, weights.ravel() * y.ravel()) + + assert_allclose(whittaker_output.reshape(y.shape), spline_output, rtol=1e-12, atol=1e-12) From 860c4999d9a16dd925222a2c38e6e2aaac9fa892 Mon Sep 17 00:00:00 2001 From: Donnie Erb <55961724+derb12@users.noreply.github.com> Date: Sat, 13 Jan 2024 14:33:57 -0500 Subject: [PATCH 20/56] TEST: Finished tests for 2D polynomial algorithms --- pybaselines/two_d/polynomial.py | 26 ++- pybaselines/utils.py | 29 ++- tests/conftest.py | 72 ++++++- tests/test_polynomial.py | 4 +- tests/test_utils.py | 9 +- tests/two_d/test_polynomial.py | 372 ++++++++++++++++++++++++++++++++ 6 files changed, 484 insertions(+), 28 deletions(-) create mode 100644 tests/two_d/test_polynomial.py diff --git a/pybaselines/two_d/polynomial.py b/pybaselines/two_d/polynomial.py index c668020..c999fa7 100644 --- a/pybaselines/two_d/polynomial.py +++ b/pybaselines/two_d/polynomial.py @@ -139,7 +139,9 @@ def poly(self, data, poly_order=2, weights=None, return_coef=False, max_cross=No baseline = self.vandermonde @ coef params = {'weights': weight_array} if return_coef: - params['coef'] = _convert_coef2d(coef, self.x_domain, self.z_domain) + params['coef'] = _convert_coef2d( + coef, self.poly_order[0], self.poly_order[1], self.x_domain, self.z_domain + ) return baseline, params @@ -248,7 +250,9 @@ def modpoly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, params = {'weights': weight_array, 'tol_history': tol_history[:i + 1]} if return_coef: - params['coef'] = _convert_coef2d(coef, self.x_domain, self.z_domain) + params['coef'] = _convert_coef2d( + coef, self.poly_order[0], self.poly_order[1], self.x_domain, self.z_domain + ) return baseline, params @@ -368,7 +372,9 @@ def imodpoly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, params = {'weights': weight_array, 'tol_history': tol_history[:i + 1]} if return_coef: - params['coef'] = _convert_coef2d(coef, self.x_domain, self.z_domain) + params['coef'] = _convert_coef2d( + coef, self.poly_order[0], self.poly_order[1], self.x_domain, self.z_domain + ) return baseline, params @@ -505,7 +511,9 @@ def penalized_poly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=Non params = {'weights': weight_array, 'tol_history': tol_history[:i + 1]} if return_coef: - params['coef'] = _convert_coef2d(coef, self.x_domain, self.z_domain) + params['coef'] = _convert_coef2d( + coef, self.poly_order[0], self.poly_order[1], self.x_domain, self.z_domain + ) return baseline, params @@ -615,7 +623,9 @@ def quant_reg(self, data, poly_order=2, quantile=0.05, tol=1e-6, max_iter=250, params = {'weights': sqrt_w**2, 'tol_history': tol_history[:i + 1]} if 
return_coef: - params['coef'] = _convert_coef2d(coef, self.x_domain, self.z_domain) + params['coef'] = _convert_coef2d( + coef, self.poly_order[0], self.poly_order[1], self.x_domain, self.z_domain + ) return baseline, params @@ -787,7 +797,7 @@ def goldindec(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, j_max = j up_count = (y > baseline).sum() - up_down_ratio = up_count / max(1, self._len - up_count) + up_down_ratio = up_count / max(1, self._len[0] * self._len[1] - up_count) calc_difference = up_down_ratio - up_down_ratio_goal tol_history[0, i] = calc_difference if calc_difference > tol_2: @@ -811,7 +821,9 @@ def goldindec(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, 'threshold': loss_kwargs['threshold'] } if return_coef: - params['coef'] = _convert_coef2d(coef, self.x_domain, self.z_domain) + params['coef'] = _convert_coef2d( + coef, self.poly_order[0], self.poly_order[1], self.x_domain, self.z_domain + ) return baseline, params diff --git a/pybaselines/utils.py b/pybaselines/utils.py index 640309d..14defe7 100644 --- a/pybaselines/utils.py +++ b/pybaselines/utils.py @@ -449,7 +449,7 @@ def _convert_coef(coef, original_domain): return transformation @ coef -def _convert_coef2d(coef, original_x_domain, original_z_domain): +def _convert_coef2d(coef, poly_degree_x, poly_degree_z, original_x_domain, original_z_domain): """ Scales the polynomial coefficients back to the original domain of the data. @@ -460,10 +460,14 @@ def _convert_coef2d(coef, original_x_domain, original_z_domain): Parameters ---------- - coef : numpy.ndarray, shape (a, b) - The 2d array of coefficients for the polynomial. Should increase in - order. The shape should be (a, b), where a is the polynomial degree + 1 for - the x-values and b is the polynomial degree + 1 for the z-values. + coef : numpy.ndarray, shape (``a * b``,) + The 1d array of coefficients for the polynomial. Should increase in + order. The shape should be (``a * b``,), where `a` is the polynomial degree + 1 for + the x-values and `b` is the polynomial degree + 1 for the z-values. + poly_degree_x : int + The polynomial degree for the x-values + poly_degree_z : int + The polynomial degree for the z-values original_x_domain : Container[float, float] The domain, [min(x), max(x)], of the original x-values used for fitting. original_z_domain : Container[float, float] @@ -472,13 +476,20 @@ def _convert_coef2d(coef, original_x_domain, original_z_domain): Returns ------- numpy.ndarray, shape (a, b) - The array of coefficients scaled for the original domains. + The 2D array of coefficients scaled for the original domains. + + Notes + ----- + Reshapes the coefficient array into the correct shape for use with + :func:`numpy.polynomial.polynomial.polyval2d`. 
""" - transformation_x = _poly_transform_matrix(coef.shape[0], original_x_domain) - transformation_z = _poly_transform_matrix(coef.shape[1], original_z_domain) + x_order = poly_degree_x + 1 + z_order = poly_degree_z + 1 + transformation_x = _poly_transform_matrix(x_order, original_x_domain) + transformation_z = _poly_transform_matrix(z_order, original_z_domain) - return transformation_x @ coef @ transformation_z.T + return transformation_x @ coef.reshape((x_order, z_order)) @ transformation_z.T def difference_matrix(data_size, diff_order=2, diff_format=None): diff --git a/tests/conftest.py b/tests/conftest.py index 1932d1e..eb29f17 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -149,7 +149,7 @@ def get_data2d(include_noise=True, num_points=(50, 60)): np.random.seed(0) x_num_points, z_num_points = num_points x_data = np.linspace(1, 100, x_num_points) - z_data = np.linspace(1, 100, z_num_points) + z_data = np.linspace(1, 120, z_num_points) X, Z = np.meshgrid(x_data, z_data, indexing='ij') y_data = ( 500 # constant baseline @@ -222,6 +222,12 @@ def no_noise_data_fixture(): return get_data(include_noise=False) +@pytest.fixture() +def no_noise_data_fixture2d(): + """Test fixture that creates x-, z-, and y-data without noise for testing.""" + return get_data2d(include_noise=False) + + def dummy_wrapper(func): """A dummy wrapper to simulate using the _Algorithm._register wrapper function.""" @wraps(func) @@ -486,23 +492,28 @@ def test_output_coefs(self): class InputWeightsMixin: - """A mixin for BaseTester for ensuring input weights are correctly sorted.""" + """A mixin for BaseTester and BaseTester2D for ensuring input weights are correctly sorted.""" weight_keys = ('weights',) def test_input_weights(self, assertion_kwargs=None, **kwargs): """Ensures input weights are correctly sorted within the function.""" # TODO replace with np.random.default_rng when min numpy version is >= 1.17 - weights = np.random.RandomState(0).normal(0.8, 0.05, len(self.x)) + weights = np.random.RandomState(0).normal(0.8, 0.05, self.y.size) weights = np.clip(weights, 0, 1).astype(float, copy=False) - reverse_fitter = self.algorithm_base(self.x[::-1], assume_sorted=False) + if hasattr(self, 'two_d'): + reverse_fitter = self.algorithm_base(self.x[::-1], assume_sorted=False) + else: + reverse_fitter = self.algorithm_base(self.x[::-1], self.z[::-1], assume_sorted=False) + weights = weights.reshape(self.y.shape) regular_output, regular_output_params = self.class_func( data=self.y, weights=weights, **self.kwargs, **kwargs ) reverse_output, reverse_output_params = getattr(reverse_fitter, self.func_name)( - data=self.reverse_array(self.y), weights=weights[::-1], **self.kwargs, **kwargs + data=self.reverse_array(self.y), weights=self.reverse_array(weights), + **self.kwargs, **kwargs ) # sanity check, x should always be sorted correctly @@ -520,7 +531,7 @@ def test_input_weights(self, assertion_kwargs=None, **kwargs): assert key in reverse_output_params assert_allclose( - regular_output_params[key], reverse_output_params[key][::-1], + regular_output_params[key], self.reverse_array(reverse_output_params[key]), **assertion_kwargs ) assert_allclose( @@ -700,3 +711,52 @@ def test_xz_ordering(self, assertion_kwargs=None, **kwargs): def reverse_array(self, array): """Reverses the input along the last two dimensions.""" return array[..., ::-1, ::-1] + + +class BasePolyTester2D(BaseTester2D): + """ + A base class for testing 2D polynomial algorithms. 
+ + Checks that the polynomial coefficients are correctly returned and that they correspond + to the polynomial used to create the baseline. + + """ + + @pytest.mark.parametrize('return_coef', (True, False)) + def test_output(self, return_coef): + """Ensures the polynomial coefficients are output if `return_coef` is True.""" + if return_coef: + additional_keys = ['coef'] + else: + additional_keys = None + super().test_output(additional_keys=additional_keys, return_coef=return_coef) + + @pytest.mark.parametrize('poly_order', (1, 2, [2, 3])) + def test_output_coefs(self, poly_order): + """ + Ensures the output coefficients can correctly reproduce the baseline. + + Checks both the manual way using the Vandermonde and directly using numpy's polyval2d. + """ + baseline, params = self.class_func( + data=self.y, poly_order=poly_order, **self.kwargs, return_coef=True + ) + + assert 'coef' in params + + if isinstance(poly_order, int): + x_order = poly_order + z_order = poly_order + else: + x_order, z_order = poly_order + + X, Z = np.meshgrid(self.x, self.z, indexing='ij') + vander = np.polynomial.polynomial.polyvander2d( + X, Z, (x_order, z_order) + ).reshape((-1, (x_order + 1) * (z_order + 1))) + + recreated_poly = (vander @ params['coef'].flatten()).reshape(self.y.shape) + assert_allclose(recreated_poly, baseline, rtol=1e-10, atol=1e-12) + + numpy_poly = np.polynomial.polynomial.polyval2d(X, Z, params['coef']) + assert_allclose(numpy_poly, baseline, rtol=1e-10, atol=1e-12) diff --git a/tests/test_polynomial.py b/tests/test_polynomial.py index 196d29c..00f0888 100644 --- a/tests/test_polynomial.py +++ b/tests/test_polynomial.py @@ -15,7 +15,7 @@ from pybaselines import polynomial from pybaselines.utils import ParameterWarning -from .conftest import BasePolyTester, InputWeightsMixin, get_data +from .conftest import BasePolyTester, InputWeightsMixin from .data import ( LOESS_X, LOESS_Y, QUANTILE_Y, STATSMODELS_LOESS_DELTA, STATSMODELS_LOESS_ITER, STATSMODELS_QUANTILES @@ -209,7 +209,6 @@ class TestLoess(IterativePolynomialTester): @pytest.mark.parametrize('use_threshold', (True, False)) def test_unchanged_data(self, use_class, use_threshold, conserve_memory, delta): """Ensures that input data is unchanged by the function.""" - x, y = get_data() super().test_unchanged_data( use_class, use_threshold=use_threshold, conserve_memory=conserve_memory, delta=delta @@ -460,7 +459,6 @@ class TestGoldindec(PolynomialTester): ) def test_unchanged_data(self, use_class, cost_function): """Ensures that input data is unchanged by the function.""" - x, y = get_data() super().test_unchanged_data(use_class, cost_function=cost_function) @pytest.mark.parametrize('cost_function', ('p_huber', '')) diff --git a/tests/test_utils.py b/tests/test_utils.py index 21bbba6..30449a7 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -216,16 +216,19 @@ def test_convert_coef2d(x, z, coef): calc_coef = np.linalg.pinv(vandermonde) @ (y_flat) calc_y = vandermonde @ calc_coef # corresponds to mapped domain - calc_coef = calc_coef.reshape(coef.shape) # sanity check; use slightly higher atol than other checks since # the fit can potentially be off by a bit assert_allclose(calc_y, y_flat, rtol=1e-10, atol=1e-6) - converted_coef = utils._convert_coef2d(calc_coef, x_domain, z_domain) + converted_coef = utils._convert_coef2d( + calc_coef, coef.shape[0] - 1, coef.shape[1] - 1, x_domain, z_domain + ) mapped_X, mapped_Z = np.meshgrid(mapped_x, mapped_z) - mapped_polynomial = np.polynomial.polynomial.polyval2d(mapped_X, mapped_Z, 
calc_coef) + mapped_polynomial = np.polynomial.polynomial.polyval2d( + mapped_X, mapped_Z, calc_coef.reshape(coef.shape) + ) original_polynomial = np.polynomial.polynomial.polyval2d(X, Z, converted_coef) diff --git a/tests/two_d/test_polynomial.py b/tests/two_d/test_polynomial.py new file mode 100644 index 0000000..1662230 --- /dev/null +++ b/tests/two_d/test_polynomial.py @@ -0,0 +1,372 @@ +# -*- coding: utf-8 -*- +"""Tests for pybaselines.polynomial. + +@author: Donald Erb +Created on March 20, 2021 + +""" + +from math import ceil + +import numpy as np +from numpy.testing import assert_allclose +import pytest + +from pybaselines.two_d import polynomial + +from ..conftest import BasePolyTester2D, InputWeightsMixin + + +class PolynomialTester(BasePolyTester2D, InputWeightsMixin): + """Base testing class for polynomial functions.""" + + module = polynomial + algorithm_base = polynomial._Polynomial + checked_keys = ('weights',) + + +class IterativePolynomialTester(PolynomialTester): + """Base testing class for iterative polynomial functions.""" + + checked_keys = ('weights', 'tol_history') + allows_zero_iteration = True # whether max_iter=0 will return an initial baseline + + def test_tol_history(self): + """Ensures the 'tol_history' item in the parameter output is correct.""" + max_iter = 5 + _, params = self.class_func(self.y, max_iter=max_iter, tol=-1) + + if self.allows_zero_iteration: + assert params['tol_history'].size == max_iter + else: + assert params['tol_history'].size == max_iter + 1 + + +class TestPoly(PolynomialTester): + """Class for testing regular polynomial baseline.""" + + func_name = 'poly' + + +class TestModPoly(IterativePolynomialTester): + """Class for testing modpoly baseline.""" + + func_name = 'modpoly' + + @pytest.mark.parametrize('new_instance', (True, False)) + @pytest.mark.parametrize('use_original', (True, False)) + @pytest.mark.parametrize('mask_initial_peaks', (True, False)) + def test_unchanged_data(self, new_instance, use_original, mask_initial_peaks): + """Ensures that input data is unchanged by the function.""" + super().test_unchanged_data( + new_instance, use_original=use_original, mask_initial_peaks=mask_initial_peaks + ) + + +class TestIModPoly(IterativePolynomialTester): + """Class for testing imodpoly baseline.""" + + func_name = 'imodpoly' + + @pytest.mark.parametrize('new_instance', (True, False)) + @pytest.mark.parametrize('use_original', (True, False)) + @pytest.mark.parametrize('mask_initial_peaks', (True, False)) + def test_unchanged_data(self, new_instance, use_original, mask_initial_peaks): + """Ensures that input data is unchanged by the function.""" + super().test_unchanged_data( + new_instance, use_original=use_original, mask_initial_peaks=mask_initial_peaks + ) + + @pytest.mark.parametrize('num_std', (-1, -0.01, 0, 1)) + def test_negative_num_std_fails(self, num_std): + """Ensures `num_std` values less than 0 raise an exception.""" + if num_std < 0: + with pytest.raises(ValueError): + self.class_func(self.y, num_std=num_std) + else: + self.class_func(self.y, num_std=num_std) + + +class TestPenalizedPoly(IterativePolynomialTester): + """Class for testing penalized_poly baseline.""" + + func_name = 'penalized_poly' + + @pytest.mark.parametrize('new_instance', (True, False)) + @pytest.mark.parametrize( + 'cost_function', + ( + 'asymmetric_truncated_quadratic', + 'symmetric_truncated_quadratic', + 'a_truncated_quadratic', # test that 'a' and 's' work as well + 's_truncated_quadratic', + 'asymmetric_huber', + 'symmetric_huber', + 
'asymmetric_indec',
+            'symmetric_indec'
+        )
+    )
+    def test_unchanged_data(self, new_instance, cost_function):
+        """Ensures that input data is unchanged by the function."""
+        super().test_unchanged_data(new_instance, cost_function=cost_function)
+
+    @pytest.mark.parametrize('cost_function', ('huber', 'p_huber', ''))
+    def test_unknown_cost_function_prefix_fails(self, cost_function):
+        """Ensures cost function with no prefix or a wrong prefix fails."""
+        with pytest.raises(ValueError):
+            self.class_func(self.y, cost_function=cost_function)
+
+    def test_unknown_cost_function_fails(self):
+        """Ensures that an unknown cost function fails."""
+        with pytest.raises(KeyError):
+            self.class_func(self.y, cost_function='a_hub')
+
+    @pytest.mark.parametrize('weight_enum', (0, 1, 2, 3))
+    def test_weighting(self, weight_enum):
+        """
+        Tests that weighting is correctly applied by comparing to other algorithms.
+
+        Weights were not included in the original penalized_poly method developed
+        in [1]_, so need to ensure that their usage in pybaselines is correct.
+
+        According to [1]_ (and independently verified), the penalized_poly function
+        with the asymmetric truncated quadratic cost function, a threshold of 0, and
+        an alpha_factor of 1 should be the same as the output of the ModPoly algorithm.
+
+        Furthermore, the penalized_poly with any symmetric cost function and a threshold
+        of infinity should equal the output of a regular polynomial fit.
+
+        Therefore, to ensure that weighting is correct for the penalized_poly, check
+        both conditions.
+
+        References
+        ----------
+        .. [1] Mazet, V., et al. Background removal from spectra by designing and
+               minimising a non-quadratic cost function. Chemometrics and Intelligent
+               Laboratory Systems, 2005, 76(2), 121–133.
+
+        """
+        if weight_enum == 0:
+            # all weights = 1
+            weights = None
+        elif weight_enum == 1:
+            # same as all weights = 1, but would cause issues if weights were
+            # incorrectly multiplied
+            weights = 2 * np.ones_like(self.y)
+        elif weight_enum == 2:
+            # binary mask, only fitting the first half of the data
+            weights = np.ones_like(self.y)
+            weights[self.x < 0.5 * (np.max(self.x) + np.min(self.x))] = 0
+        else:
+            # weight array where the two endpoints have weighting >> 1
+            weights = np.ones_like(self.y)
+            fraction = max(1, ceil(self.y.shape[0] * 0.1))
+            weights[:fraction] = 100
+            weights[-fraction:] = 100
+
+        poly_order = 2
+        tol = 1e-3
+
+        poly_baseline = polynomial._Polynomial(self.x, self.z).poly(
+            self.y, poly_order, weights=weights
+        )[0]
+        penalized_poly_1 = self.class_func(
+            self.y, poly_order, cost_function='s_huber',
+            threshold=1e10, weights=weights
+        )[0]
+
+        assert_allclose(poly_baseline, penalized_poly_1, 1e-10)
+
+        modpoly_baseline = polynomial._Polynomial(self.x, self.z).modpoly(
+            self.y, poly_order, tol=tol, weights=weights, use_original=True
+        )[0]
+        penalized_poly_2 = self.class_func(
+            self.y, poly_order, cost_function='a_truncated_quadratic',
+            threshold=0, weights=weights, alpha_factor=1, tol=tol
+        )[0]
+
+        assert_allclose(modpoly_baseline, penalized_poly_2, 1e-10)
+
+    @pytest.mark.parametrize('alpha_factor', (-0.1, 0, 1.01))
+    def test_wrong_alpha_factor_fails(self, alpha_factor):
+        """Ensures an alpha factor outside of (0, 1] fails."""
+        with pytest.raises(ValueError):
+            self.class_func(self.y, alpha_factor=alpha_factor)
+
+
+class TestQuantReg(IterativePolynomialTester):
+    """Class for testing quant_reg baseline."""
+
+    func_name = 'quant_reg'
+    required_kwargs = {'tol': 1e-9}
+
+    @pytest.mark.parametrize('quantile', (0, 1, -0.1, 1.1))
+    def test_outside_quantile_fails(self, quantile):
+        """Ensures quantile values outside of (0, 1) raise an exception."""
+        with pytest.raises(ValueError):
+            self.class_func(self.y, quantile=quantile)
+
+
+class TestGoldindec(PolynomialTester):
+    """Class for testing goldindec baseline."""
+
+    func_name = 'goldindec'
+    checked_keys = ('weights', 'tol_history', 'threshold')
+
+    @pytest.mark.parametrize('new_instance', (True, False))
+    @pytest.mark.parametrize(
+        'cost_function',
+        (
+            'asymmetric_truncated_quadratic',
+            'a_truncated_quadratic',
+            'asymmetric_huber',
+            'asymmetric_indec',
+            'indec',
+            'huber',
+            'truncated_quadratic'
+        )
+    )
+    def test_unchanged_data(self, new_instance, cost_function):
+        """Ensures that input data is unchanged by the function."""
+        super().test_unchanged_data(new_instance, cost_function=cost_function)
+
+    @pytest.mark.parametrize('cost_function', ('p_huber', ''))
+    def test_unknown_cost_function_prefix_fails(self, cost_function):
+        """Ensures cost function with no prefix or a wrong prefix fails."""
+        with pytest.raises(KeyError):
+            self.class_func(self.y, cost_function=cost_function)
+
+    @pytest.mark.parametrize('cost_function', ('s_huber', 's_indec', 'symmetric_indec'))
+    def test_symmetric_cost_function_fails(self, cost_function):
+        """Ensures a symmetric cost function fails."""
+        with pytest.raises(ValueError):
+            self.class_func(self.y, cost_function=cost_function)
+
+    def test_unknown_cost_function_fails(self):
+        """Ensures that an unknown cost function fails."""
+        with pytest.raises(KeyError):
+            self.class_func(self.y, cost_function='a_hub')
+
+    @pytest.mark.parametrize('weight_enum', (0, 1, 2, 3))
+    def test_weighting(self, weight_enum):
+        """
+        Tests that weighting is correctly applied by comparing to other algorithms.
+
+        Weights were not included in the original goldindec method, so need to ensure
+        that their usage in pybaselines is correct.
+
+        For uniform weights, the reference baseline is simply the unweighted calculation,
+        since they should be equal. For non-uniform weights, compare to the output of
+        penalized_poly, whose weighting is correctly tested, using the output optimal
+        threshold.
+
+        """
+        if weight_enum == 0:
+            # all weights = 1
+            weights = None
+            uniform_weights = True
+        elif weight_enum == 1:
+            # same as all weights = 1, but would cause issues if weights were
+            # incorrectly multiplied
+            weights = np.full_like(self.y, 2)
+            uniform_weights = True
+        elif weight_enum == 2:
+            # binary mask, only fitting the first half of the data
+            weights = np.ones_like(self.y)
+            weights[self.x < 0.5 * (np.max(self.x) + np.min(self.x))] = 0
+            uniform_weights = False
+        else:
+            # weight array where the two endpoints have weighting >> 1
+            weights = np.ones_like(self.y)
+            fraction = max(1, ceil(self.y.shape[0] * 0.1))
+            weights[:fraction] = 100
+            weights[-fraction:] = 100
+            uniform_weights = False
+
+        poly_order = 2
+        fit_baseline, params = self.class_func(self.y, poly_order=poly_order, weights=weights)
+        if uniform_weights:
+            reference_baseline = self.class_func(self.y, poly_order=poly_order)[0]
+        else:
+            reference_baseline = polynomial._Polynomial(self.x, self.z).penalized_poly(
+                self.y, poly_order=poly_order, weights=weights,
+                threshold=params['threshold'], cost_function='a_indec'
+            )[0]
+
+        assert_allclose(fit_baseline, reference_baseline)
+
+    @pytest.mark.parametrize('exit_enum', (0, 1, 2, 3))
+    def test_tol_history(self, exit_enum):
+        """
+        Ensures the 'tol_history' item in the parameter output is correct.
+
+        Since the shape of 'tol_history' is dictated by the number of iterations
+        completed for fitting each threshold value and for iterating between
+        threshold values, need to ensure each exit criteria works independently.
+
+        """
+        if exit_enum == 0:
+            # inner fitting does more iterations
+            max_iter = 15
+            tol = -1
+            max_iter_2 = 10
+            tol_2 = 0
+            tol_3 = -1
+
+            expected_shape_0 = max_iter_2 + 2
+            expected_shape_1 = max_iter
+
+        if exit_enum == 1:
+            # outer fitting does more iterations
+            max_iter = 15
+            tol = 1e6
+            max_iter_2 = 10
+            tol_2 = 0
+            tol_3 = -1
+
+            expected_shape_0 = max_iter_2 + 2
+            expected_shape_1 = max_iter_2
+
+        if exit_enum == 2:
+            # only one iteration completed; exits due to tol_2
+            max_iter = 15
+            tol = 1e6
+            max_iter_2 = 10
+            tol_2 = 1e6
+            tol_3 = -1
+
+            expected_shape_0 = 3
+            expected_shape_1 = 1
+
+        if exit_enum == 3:
+            # only one iteration completed; exits due to tol_3
+            max_iter = 15
+            tol = 1e6
+            max_iter_2 = 10
+            tol_2 = 0
+            tol_3 = 1e6
+
+            expected_shape_0 = 3
+            expected_shape_1 = 1
+
+        _, params = self.class_func(
+            self.y, max_iter=max_iter, tol=tol, max_iter_2=max_iter_2,
+            tol_2=tol_2, tol_3=tol_3
+        )
+
+        assert params['tol_history'].shape[0] == expected_shape_0
+        assert params['tol_history'].shape[1] == expected_shape_1
+
+    @pytest.mark.parametrize('alpha_factor', (-0.1, 0, 1.01))
+    def test_wrong_alpha_factor_fails(self, alpha_factor):
+        """Ensures an alpha factor outside of (0, 1] fails."""
+        with pytest.raises(ValueError):
+            self.class_func(self.y, alpha_factor=alpha_factor)
+
+    @pytest.mark.parametrize('peak_ratio', (-0.1, 0, 1, 1.01))
+    def test_wrong_peak_ratio_fails(self, peak_ratio):
+        """Ensures a peak ratio outside of (0, 1) fails."""
+        with pytest.raises(ValueError):
+            self.class_func(self.y, peak_ratio=peak_ratio)
+
+
From 88ffea513d58350937e482426d68a18634e0a532 Mon Sep 17 00:00:00 2001
From: Donnie Erb <55961724+derb12@users.noreply.github.com>
Date: Sat, 13 Jan 2024 14:35:30 -0500
Subject: [PATCH 21/56] TEST: Finished tests for 2D Whittaker algorithms

Also added the 2D versions of iasls, drpls, and aspls.
---
 pybaselines/two_d/_whittaker_utils.py |  35 ++-
 pybaselines/two_d/whittaker.py        | 319 ++++++++++++++++++++++++--
 tests/two_d/test_whittaker.py         | 286 +++++++++++++++++++++++
 3 files changed, 612 insertions(+), 28 deletions(-)
 create mode 100644 tests/two_d/test_whittaker.py

diff --git a/pybaselines/two_d/_whittaker_utils.py b/pybaselines/two_d/_whittaker_utils.py
index 133d4ee..cf6d398 100644
--- a/pybaselines/two_d/_whittaker_utils.py
+++ b/pybaselines/two_d/_whittaker_utils.py
@@ -39,16 +39,10 @@ class PenalizedSystem2D:
         set up. `original_diagonals` can be either the full or lower bands of the penalty, and may be
         reversed, depending on the set up. Reset by calling :meth:`.reset_diagonals`.
-    penalty : numpy.ndarray
+    penalty : scipy.sparse.base.spmatrix
         The current penalty. Originally is `original_diagonals` after multiplying by `lam`
         and applying padding, but can also be changed by calling :meth:`.add_penalty`.
         Reset by calling :meth:`.reset_diagonals`.
-    reversed : bool
-        If True, the penalty is reversed of the typical LAPACK banded format. Useful if
-        multiplying the penalty with an array since the rows get shifted, or if using pentapy's
-        solver.
-    using_pentapy : bool
-        If True, will use pentapy's solver when solving.
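+    main_diagonal : numpy.ndarray
+        The values along the main diagonal of `penalty` when it was created, stored
+        so that :meth:`.add_diagonal` and :meth:`.reset_diagonal` can update and
+        restore the diagonal without rebuilding the full penalty matrix.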
""" @@ -68,7 +62,7 @@ def __init__(self, data_size, lam=1, diff_order=2): """ self._num_bases = data_size - self.reset_penalty(lam, diff_order) + self.reset_diagonals(lam, diff_order) def add_penalty(self, penalty): """ @@ -87,7 +81,7 @@ def add_penalty(self, penalty): """ raise NotImplementedError - def reset_penalty(self, lam=1, diff_order=2): + def reset_diagonals(self, lam=1, diff_order=2): """ Resets the diagonals of the system and all of the attributes. @@ -115,6 +109,7 @@ def reset_penalty(self, lam=1, diff_order=2): P1 = kron(self.lam[0] * D1.T @ D1, identity(self._num_bases[1])) P2 = kron(identity(self._num_bases[0]), self.lam[1] * D2.T @ D2) self.penalty = P1 + P2 + self.main_diagonal = self.penalty.diagonal() def solve(self, lhs, rhs): """ @@ -156,6 +151,28 @@ def solve(self, lhs, rhs): return output + def add_diagonal(self, array): + """ + Adds a diagonal array to the original penalty matrix. + + Parameters + ---------- + array : numpy.ndarray + The diagonal array to add to the penalty matrix. + + Returns + ------- + scipy.sparse.base.spmatrix + The penalty matrix with the main diagonal updated. + + """ + self.penalty.setdiag(self.main_diagonal + array) + return self.penalty + + def reset_diagonal(self): + """Sets the main diagonal of the penalty matrix back to its original value.""" + self.penalty.setdiag(self.main_diagonal) + def reverse_penalty(self): """ Reverses the penalty and original diagonals for the system. diff --git a/pybaselines/two_d/whittaker.py b/pybaselines/two_d/whittaker.py index 005461e..4350737 100644 --- a/pybaselines/two_d/whittaker.py +++ b/pybaselines/two_d/whittaker.py @@ -9,12 +9,15 @@ import warnings import numpy as np +from scipy.sparse import diags from .. import _weighting from ._algorithm_setup import _Algorithm2D +from ._whittaker_utils import PenalizedSystem2D from ..utils import ( ParameterWarning, relative_difference ) +from .._validation import _check_optional_array class _Whittaker(_Algorithm2D): @@ -81,12 +84,10 @@ def asls(self, data, lam=1e6, p=1e-2, diff_order=2, max_iter=50, tol=1e-3, weigh if not 0 < p < 1: raise ValueError('p must be between 0 and 1') y, weight_array = self._setup_whittaker(data, lam, diff_order, weights) - main_diagonal = self.whittaker_system.penalty.diagonal() tol_history = np.empty(max_iter + 1) for i in range(max_iter + 1): - self.whittaker_system.penalty.setdiag(main_diagonal + weight_array) baseline = self.whittaker_system.solve( - self.whittaker_system.penalty, weight_array * y + self.whittaker_system.add_diagonal(weight_array), weight_array * y ) new_weights = _weighting._asls(y, baseline, p) calc_difference = relative_difference(weight_array, new_weights) @@ -99,6 +100,101 @@ def asls(self, data, lam=1e6, p=1e-2, diff_order=2, max_iter=50, tol=1e-3, weigh return baseline, params + @_Algorithm2D._register( + sort_keys=('weights',), reshape_keys=('weights',), reshape_baseline=True + ) + def iasls(self, data, lam=1e6, p=1e-2, lam_1=1e-4, max_iter=50, tol=1e-3, + weights=None, diff_order=2): + """ + Fits the baseline using the improved asymmetric least squares (IAsLS) algorithm. + + The algorithm consideres both the first and second derivatives of the residual. + + Parameters + ---------- + data : array-like, shape (N,) + The y-values of the measured data, with `N` data points. Must not + contain missing data (NaN) or Inf. + lam : float, optional + The smoothing parameter. Larger values will create smoother baselines. + Default is 1e6. + p : float, optional + The penalizing weighting factor. 
Must be between 0 and 1. Values greater + than the baseline will be given `p` weight, and values less than the baseline + will be given `p - 1` weight. Default is 1e-2. + lam_1 : float, optional + The smoothing parameter for the first derivative of the residual. Default is 1e-4. + max_iter : int, optional + The max number of fit iterations. Default is 50. + tol : float, optional + The exit criteria. Default is 1e-3. + weights : array-like, shape (N,), optional + The weighting array. If None (default), then the initial weights + will be set by fitting the data with a second order polynomial. + diff_order : int, optional + The order of the differential matrix. Must be greater than 1. Default is 2 + (second order differential matrix). Typical values are 2 or 3. + + Returns + ------- + baseline : numpy.ndarray, shape (N,) + The calculated baseline. + params : dict + A dictionary with the following items: + + * 'weights': numpy.ndarray, shape (N,) + The weight array used for fitting the data. + * 'tol_history': numpy.ndarray + An array containing the calculated tolerance values for + each iteration. The length of the array is the number of iterations + completed. If the last value in the array is greater than the input + `tol` value, then the function did not converge. + + Raises + ------ + ValueError + Raised if `p` is not between 0 and 1 or if `diff_order` is less than 2. + + References + ---------- + He, S., et al. Baseline correction for raman spectra using an improved + asymmetric least squares method, Analytical Methods, 2014, 6(12), 4402-4407. + + """ + if not 0 < p < 1: + raise ValueError('p must be between 0 and 1') + elif diff_order < 2: + raise ValueError('diff_order must be 2 or greater') + + if weights is None: + _, _, pseudo_inverse = self._setup_polynomial( + data, weights=None, poly_order=2, calc_vander=True, calc_pinv=True + ) + baseline = self.vandermonde @ (pseudo_inverse @ data.ravel()) + weights = _weighting._asls(data.ravel(), baseline, p).reshape(self._len) + + y, weight_array = self._setup_whittaker(data, lam, diff_order, weights) + penalized_system_1 = PenalizedSystem2D(self._len, lam_1, diff_order=1) + tol_history = np.empty(max_iter + 1) + for i in range(max_iter + 1): + penalized_system_1.add_diagonal(weight_array * weight_array) + baseline = self.whittaker_system.solve( + self.whittaker_system.penalty + penalized_system_1.penalty, + penalized_system_1.penalty * y + ) + assert y.shape == (self._len[0] * self._len[1],) + assert baseline.shape == (self._len[0] * self._len[1],) + new_weights = _weighting._asls(y, baseline, p) + calc_difference = relative_difference(weight_array, new_weights) + tol_history[i] = calc_difference + if calc_difference < tol: + break + weight_array = new_weights + + params = {'weights': weight_array, 'tol_history': tol_history[:i + 1]} + + return baseline, params + @_Algorithm2D._register( sort_keys=('weights',), reshape_keys=('weights',), reshape_baseline=True ) @@ -150,23 +246,21 @@ def airpls(self, data, lam=1e6, diff_order=2, max_iter=50, tol=1e-3, weights=Non data, lam, diff_order, weights, copy_weights=True ) y_l1_norm = np.abs(y).sum() - main_diagonal = self.whittaker_system.penalty.diagonal() tol_history = np.empty(max_iter + 1) # Have to have extensive error handling since the weights can all become # very small due to the exp(i) term if too many iterations are performed; # checking the negative residual length usually prevents any errors, but # sometimes not so have to also catch any errors from the solvers for i in range(1, 
max_iter + 2): - self.whittaker_system.penalty.setdiag(main_diagonal + weight_array) try: output = self.whittaker_system.solve( - self.whittaker_system.penalty, weight_array * y + self.whittaker_system.add_diagonal(weight_array), weight_array * y ) except np.linalg.LinAlgError: warnings.warn( ('error occurred during fitting, indicating that "tol"' ' is too low, "max_iter" is too high, or "lam" is too high'), - ParameterWarning + ParameterWarning, stacklevel=2 ) i -= 1 # reduce i so that output tol_history indexing is correct break @@ -180,7 +274,7 @@ def airpls(self, data, lam=1e6, diff_order=2, max_iter=50, tol=1e-3, weights=Non # point would get a weight of 0, which fails the solver warnings.warn( ('almost all baseline points are below the data, indicating that "tol"' - ' is too low and/or "max_iter" is too high'), ParameterWarning + ' is too low and/or "max_iter" is too high'), ParameterWarning, stacklevel=2 ) i -= 1 # reduce i so that output tol_history indexing is correct break @@ -247,12 +341,10 @@ def arpls(self, data, lam=1e3, diff_order=2, max_iter=50, tol=1e-3, weights=None """ y, weight_array = self._setup_whittaker(data, lam, diff_order, weights) - main_diagonal = self.whittaker_system.penalty.diagonal() tol_history = np.empty(max_iter + 1) for i in range(max_iter + 1): - self.whittaker_system.penalty.setdiag(main_diagonal + weight_array) baseline = self.whittaker_system.solve( - self.whittaker_system.penalty, weight_array * y + self.whittaker_system.add_diagonal(weight_array), weight_array * y ) new_weights = _weighting._arpls(y, baseline) calc_difference = relative_difference(weight_array, new_weights) @@ -265,6 +357,104 @@ def arpls(self, data, lam=1e3, diff_order=2, max_iter=50, tol=1e-3, weights=None return baseline, params + @_Algorithm2D._register( + sort_keys=('weights',), reshape_keys=('weights',), reshape_baseline=True + ) + def drpls(self, data, lam=1e5, eta=0.5, max_iter=50, tol=1e-3, weights=None, diff_order=2): + """ + Doubly reweighted penalized least squares (drPLS) baseline. + + Parameters + ---------- + data : array-like, shape (N,) + The y-values of the measured data, with N data points. Must not + contain missing data (NaN) or Inf. + lam : float, optional + The smoothing parameter. Larger values will create smoother baselines. + Default is 1e5. + eta : float + A term for controlling the value of lam; should be between 0 and 1. + Low values will produce smoother baselines, while higher values will + more aggressively fit peaks. Default is 0.5. + max_iter : int, optional + The max number of fit iterations. Default is 50. + tol : float, optional + The exit criteria. Default is 1e-3. + weights : array-like, shape (N,), optional + The weighting array. If None (default), then the initial weights + will be an array with size equal to N and all values set to 1. + diff_order : int, optional + The order of the differential matrix. Must be greater than 1. Default is 2 + (second order differential matrix). Typical values are 2 or 3. + + Returns + ------- + baseline : numpy.ndarray, shape (N,) + The calculated baseline. + params : dict + A dictionary with the following items: + + * 'weights': numpy.ndarray, shape (N,) + The weight array used for fitting the data. + * 'tol_history': numpy.ndarray + An array containing the calculated tolerance values for + each iteration. The length of the array is the number of iterations + completed. If the last value in the array is greater than the input + `tol` value, then the function did not converge. 
+ + Raises + ------ + ValueError + Raised if `eta` is not between 0 and 1 or if `diff_order` is less than 2. + + References + ---------- + Xu, D. et al. Baseline correction method based on doubly reweighted + penalized least squares, Applied Optics, 2019, 58, 3913-3920. + + """ + if not 0 <= eta <= 1: + raise ValueError('eta must be between 0 and 1') + elif diff_order < 2: + raise ValueError('diff_order must be 2 or greater') + + y, weight_array = self._setup_whittaker(data, lam, diff_order, weights) + penalized_system_1 = PenalizedSystem2D(self._len, 1, diff_order=1) + # W + P_1 + (I - eta * W) @ P_n -> P_1 + P_n + W @ (I - eta * P_n) + partial_penalty = self.whittaker_system.penalty + penalized_system_1.penalty + partial_penalty_2 = -eta * self.whittaker_system.penalty + partial_penalty_2.setdiag(partial_penalty_2.diagonal() + 1) + weight_matrix = diags(weight_array) + tol_history = np.empty(max_iter + 1) + for i in range(1, max_iter + 2): + baseline = self.whittaker_system.solve( + partial_penalty + weight_matrix @ partial_penalty_2, weight_array * y, + ) + new_weights = _weighting._drpls(y, baseline, i) + calc_difference = relative_difference(weight_array, new_weights) + tol_history[i - 1] = calc_difference + if not np.isfinite(calc_difference): + # catches nan, inf and -inf due to exp(i) being too high or if there + # are too few negative residuals; no way to catch both conditions before + # new_weights calculation since it is hard to estimate if + # (exp(i) / std) * residual will overflow; check calc_difference rather + # than checking new_weights since non-finite values rarely occur and + # checking a scalar is faster; cannot use np.errstate since it is not 100% reliable + warnings.warn( + ('nan and/or +/- inf occurred in weighting calculation, likely meaning ' + '"tol" is too low and/or "max_iter" is too high'), ParameterWarning, + stacklevel=2 + ) + break + elif calc_difference < tol: + break + weight_array = new_weights + weight_matrix.setdiag(weight_array) + + params = {'weights': weight_array, 'tol_history': tol_history[:i]} + + return baseline, params + @_Algorithm2D._register( sort_keys=('weights',), reshape_keys=('weights',), reshape_baseline=True ) @@ -314,12 +504,10 @@ def iarpls(self, data, lam=1e5, diff_order=2, max_iter=50, tol=1e-3, weights=Non """ y, weight_array = self._setup_whittaker(data, lam, diff_order, weights) - main_diagonal = self.whittaker_system.penalty.diagonal() tol_history = np.empty(max_iter + 1) for i in range(1, max_iter + 2): - self.whittaker_system.penalty.setdiag(main_diagonal + weight_array) baseline = self.whittaker_system.solve( - self.whittaker_system.penalty, weight_array * y + self.whittaker_system.add_diagonal(weight_array), weight_array * y ) new_weights = _weighting._iarpls(y, baseline, i) calc_difference = relative_difference(weight_array, new_weights) @@ -333,7 +521,8 @@ def iarpls(self, data, lam=1e5, diff_order=2, max_iter=50, tol=1e-3, weights=Non # checking a scalar is faster; cannot use np.errstate since it is not 100% reliable warnings.warn( ('nan and/or +/- inf occurred in weighting calculation, likely meaning ' - '"tol" is too low and/or "max_iter" is too high'), ParameterWarning + '"tol" is too low and/or "max_iter" is too high'), ParameterWarning, + stacklevel=2 ) break elif calc_difference < tol: @@ -344,6 +533,100 @@ def iarpls(self, data, lam=1e5, diff_order=2, max_iter=50, tol=1e-3, weights=Non return baseline, params + @_Algorithm2D._register( + sort_keys=('weights', 'alpha'), reshape_keys=('weights', 'alpha'), 
reshape_baseline=True
+    )
+    def aspls(self, data, lam=1e5, diff_order=2, max_iter=100, tol=1e-3,
+              weights=None, alpha=None):
+        """
+        Adaptive smoothness penalized least squares smoothing (asPLS).
+
+        Parameters
+        ----------
+        data : array-like, shape (N,)
+            The y-values of the measured data, with N data points. Must not
+            contain missing data (NaN) or Inf.
+        lam : float, optional
+            The smoothing parameter. Larger values will create smoother baselines.
+            Default is 1e5.
+        diff_order : int, optional
+            The order of the differential matrix. Must be greater than 0. Default is 2
+            (second order differential matrix). Typical values are 2 or 1.
+        max_iter : int, optional
+            The max number of fit iterations. Default is 100.
+        tol : float, optional
+            The exit criteria. Default is 1e-3.
+        weights : array-like, shape (N,), optional
+            The weighting array. If None (default), then the initial weights
+            will be an array with size equal to N and all values set to 1.
+        alpha : array-like, shape (N,), optional
+            An array of values that control the local value of `lam` to better
+            fit peak and non-peak regions. If None (default), then the initial values
+            will be an array with size equal to N and all values set to 1.
+
+        Returns
+        -------
+        baseline : numpy.ndarray, shape (N,)
+            The calculated baseline.
+        params : dict
+            A dictionary with the following items:
+
+            * 'weights': numpy.ndarray, shape (N,)
+                The weight array used for fitting the data.
+            * 'alpha': numpy.ndarray, shape (N,)
+                The array of alpha values used for fitting the data in the final iteration.
+            * 'tol_history': numpy.ndarray
+                An array containing the calculated tolerance values for
+                each iteration. The length of the array is the number of iterations
+                completed. If the last value in the array is greater than the input
+                `tol` value, then the function did not converge.
+
+        Notes
+        -----
+        The weighting uses an asymmetric coefficient (`k` in the asPLS paper) of 0.5 instead
+        of the 2 listed in the asPLS paper. pybaselines uses the factor of 0.5 since it
+        matches the results in Table 2 and Figure 5 of the asPLS paper closer than the
+        factor of 2 and fits noisy data much better.
+
+        References
+        ----------
+        Zhang, F., et al. Baseline correction for infrared spectra using
+        adaptive smoothness parameter penalized least squares method.
+        Spectroscopy Letters, 2020, 53(3), 222-233.
+ + """ + y, weight_array = self._setup_whittaker(data, lam, diff_order, weights) + alpha_array = _check_optional_array( + self._len, alpha, check_finite=self._check_finite, name='alpha', + ensure_1d=False, axis=slice(None) + ) + if self._sort_order is not None and alpha is not None: + alpha_array = alpha_array[self._sort_order] + + # use a sparse matrix to maintain sparsity after multiplication + alpha_matrix = diags(alpha_array.ravel(), format='csr') + tol_history = np.empty(max_iter + 1) + for i in range(max_iter + 1): + lhs = alpha_matrix * self.whittaker_system.penalty + lhs.setdiag(lhs.diagonal() + weight_array) + baseline = self.whittaker_system.solve( + lhs, weight_array * y + ) + new_weights, residual = _weighting._aspls(y, baseline) + calc_difference = relative_difference(weight_array, new_weights) + tol_history[i] = calc_difference + if calc_difference < tol: + break + weight_array = new_weights + abs_d = np.abs(residual) + alpha_array = abs_d / abs_d.max() + + params = { + 'weights': weight_array, 'alpha': alpha_array, 'tol_history': tol_history[:i + 1] + } + + return baseline, params + @_Algorithm2D._register( sort_keys=('weights',), reshape_keys=('weights',), reshape_baseline=True ) @@ -424,14 +707,12 @@ def psalsa(self, data, lam=1e5, p=0.5, k=None, diff_order=2, max_iter=50, tol=1e y, weight_array = self._setup_whittaker(data, lam, diff_order, weights) if k is None: k = np.std(y) / 10 - main_diagonal = self.whittaker_system.penalty.diagonal() tol_history = np.empty(max_iter + 1) for i in range(max_iter + 1): - self.whittaker_system.penalty.setdiag(main_diagonal + weight_array) baseline = self.whittaker_system.solve( - self.whittaker_system.penalty, weight_array * y + self.whittaker_system.add_diagonal(weight_array), weight_array * y ) - new_weights = _weighting._psalsa(y, baseline, p, k, self._len) + new_weights = _weighting._psalsa(y, baseline, p, k, self._len[0] * self._len[1]) calc_difference = relative_difference(weight_array, new_weights) tol_history[i] = calc_difference if calc_difference < tol: diff --git a/tests/two_d/test_whittaker.py b/tests/two_d/test_whittaker.py new file mode 100644 index 0000000..2e1cfb6 --- /dev/null +++ b/tests/two_d/test_whittaker.py @@ -0,0 +1,286 @@ +# -*- coding: utf-8 -*- +"""Tests for pybaselines.whittaker. 
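+
+Covers the two-dimensional Whittaker implementations in pybaselines.two_d.whittaker.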
+
+@author: Donald Erb
+Created on March 20, 2021
+
+"""
+
+import numpy as np
+import pytest
+
+from pybaselines.two_d import whittaker
+from pybaselines.utils import ParameterWarning
+
+from ..conftest import BaseTester2D, InputWeightsMixin
+
+
+class WhittakerTester(BaseTester2D, InputWeightsMixin):
+    """Base testing class for whittaker functions."""
+
+    module = whittaker
+    algorithm_base = whittaker._Whittaker
+    checked_keys = ('weights', 'tol_history')
+
+    def test_tol_history(self):
+        """Ensures the 'tol_history' item in the parameter output is correct."""
+        max_iter = 5
+        _, params = self.class_func(self.y, max_iter=max_iter, tol=-1)
+
+        assert params['tol_history'].size == max_iter + 1
+
+
+class TestAsLS(WhittakerTester):
+    """Class for testing asls baseline."""
+
+    func_name = 'asls'
+
+    @pytest.mark.parametrize('p', (-1, 2))
+    def test_outside_p_fails(self, p):
+        """Ensures p values outside of [0, 1] raise an exception."""
+        with pytest.raises(ValueError):
+            self.class_func(self.y, p=p)
+
+    @pytest.mark.parametrize('diff_order', (1, 3))
+    def test_diff_orders(self, diff_order):
+        """Ensure that other difference orders work."""
+        lam = {1: 1e2, 3: 1e10}[diff_order]
+        self.class_func(self.y, lam=lam, diff_order=diff_order)
+
+
+class TestIAsLS(WhittakerTester):
+    """Class for testing iasls baseline."""
+
+    func_name = 'iasls'
+
+    @pytest.mark.parametrize('p', (-1, 2))
+    def test_outside_p_fails(self, p):
+        """Ensures p values outside of [0, 1] raise an exception."""
+        with pytest.raises(ValueError):
+            self.class_func(self.y, p=p)
+
+    @pytest.mark.parametrize('diff_order', (2, 3))
+    def test_diff_orders(self, diff_order):
+        """Ensure that other difference orders work."""
+        lam = {2: 1e6, 3: 1e10}[diff_order]
+        self.class_func(self.y, lam=lam, diff_order=diff_order)
+
+    def test_diff_order_one_fails(self):
+        """Ensure that a difference order of 1 raises an exception."""
+        with pytest.raises(ValueError):
+            self.class_func(self.y, lam=1e2, diff_order=1)
+
+
+class TestAirPLS(WhittakerTester):
+    """Class for testing airpls baseline."""
+
+    func_name = 'airpls'
+
+    @pytest.mark.parametrize('diff_order', (1, 3))
+    def test_diff_orders(self, diff_order):
+        """Ensure that other difference orders work."""
+        lam = {1: 1e3, 3: 1e10}[diff_order]
+        self.class_func(self.y, lam=lam, diff_order=diff_order)
+
+    # ignore the RuntimeWarning that occurs from using +/- inf or nan
+    @pytest.mark.filterwarnings('ignore::RuntimeWarning')
+    def test_avoid_nonfinite_weights(self, no_noise_data_fixture2d):
+        """
+        Ensures that the function gracefully exits when errors occur.
+
+        When there are no negative residuals, which occurs when a low tol value is used with
+        a high max_iter value, the weighting function would produce values all ~0, which
+        can fail the solvers. The returned baseline should be the last iteration that was
+        successful, and thus should not contain nan or +/- inf.
+
+        Use data without noise since the lack of noise makes it easier to induce failure.
+        Set tol to -1 so that it is never reached, and set max_iter to a high value.
+        Uses np.isfinite on the dot product of the baseline since the dot product is fast,
+        would propagate the nan or inf, and will create only a single value to check
+        for finite-ness.
+
+        """
+        x, z, y = no_noise_data_fixture2d
+        with pytest.warns(ParameterWarning):
+            baseline = self.class_func(y, tol=-1, max_iter=3000)[0]
+
+        assert np.isfinite(baseline.T.dot(baseline)).all()
+
+
+class TestArPLS(WhittakerTester):
+    """Class for testing arpls baseline."""
+
+    func_name = 'arpls'
+
+    @pytest.mark.parametrize('diff_order', (1, 3))
+    def test_diff_orders(self, diff_order):
+        """Ensure that other difference orders work."""
+        lam = {1: 1e2, 3: 1e10}[diff_order]
+        self.class_func(self.y, lam=lam, diff_order=diff_order)
+
+    def test_avoid_overflow_warning(self, no_noise_data_fixture2d):
+        """
+        Ensures no warning is emitted for exponential overflow.
+
+        The weighting is 1 / (1 + exp(values)), so if values is too high,
+        exp(values) is inf, which should usually emit an overflow warning.
+        However, the resulting weight is 0, which is fine, so the warning is
+        not needed and should be avoided. This test ensures the overflow warning
+        is not emitted, and also ensures that the output is all finite, just in
+        case the weighting was not actually stable.
+
+        """
+        x, z, y = no_noise_data_fixture2d
+        with np.errstate(over='raise'):
+            baseline = self.class_func(y, tol=-1, max_iter=1000)[0]
+
+        assert np.isfinite(baseline.T.dot(baseline)).all()
+
+
+class TestDrPLS(WhittakerTester):
+    """Class for testing drpls baseline."""
+
+    func_name = 'drpls'
+
+    @pytest.mark.parametrize('eta', (-1, 2))
+    def test_outside_eta_fails(self, eta):
+        """Ensures eta values outside of [0, 1] raise an exception."""
+        with pytest.raises(ValueError):
+            self.class_func(self.y, eta=eta)
+
+    @pytest.mark.parametrize('diff_order', (2, 3))
+    def test_diff_orders(self, diff_order):
+        """Ensure that other difference orders work."""
+        lam = {2: 1e5, 3: 1e9}[diff_order]
+        self.class_func(self.y, lam=lam, diff_order=diff_order)
+
+    def test_diff_order_one_fails(self):
+        """Ensure that a difference order of 1 raises an exception."""
+        with pytest.raises(ValueError):
+            self.class_func(self.y, lam=1e2, diff_order=1)
+
+    # ignore the RuntimeWarning that occurs from using +/- inf or nan
+    @pytest.mark.filterwarnings('ignore::RuntimeWarning')
+    def test_avoid_nonfinite_weights(self, no_noise_data_fixture2d):
+        """
+        Ensures that the function gracefully exits when non-finite weights are created.
+
+        When there are no negative residuals or exp(iterations) / std is very high, both
+        of which occur when a low tol value is used with a high max_iter value, the
+        weighting function would produce non-finite values. The returned baseline should
+        be the last iteration that was successful, and thus should not contain nan or +/- inf.
+
+        Use data without noise since the lack of noise makes it easier to induce failure.
+        Set tol to -1 so that it is never reached, and set max_iter to a high value.
+        Uses np.isfinite on the dot product of the baseline since the dot product is fast,
+        would propagate the nan or inf, and will create only a single value to check
+        for finite-ness.
+
+        """
+        x, z, y = no_noise_data_fixture2d
+        with pytest.warns(ParameterWarning):
+            baseline, params = self.class_func(y, tol=-1, max_iter=1000)
+
+        assert np.isfinite(baseline.T.dot(baseline)).all()
+        # ensure last tolerance calculation was non-finite as a double-check that
+        # this test is actually doing what it should be doing
+        assert not np.isfinite(params['tol_history'][-1])
+
+
+class TestIArPLS(WhittakerTester):
+    """Class for testing iarpls baseline."""
+
+    func_name = 'iarpls'
+
+    @pytest.mark.parametrize('diff_order', (1, 3))
+    def test_diff_orders(self, diff_order):
+        """Ensure that other difference orders work."""
+        lam = {1: 1e2, 3: 1e10}[diff_order]
+        self.class_func(self.y, lam=lam, diff_order=diff_order)
+
+    # ignore the RuntimeWarning that occurs from using +/- inf or nan
+    @pytest.mark.filterwarnings('ignore::RuntimeWarning')
+    def test_avoid_nonfinite_weights(self, no_noise_data_fixture2d):
+        """
+        Ensures that the function gracefully exits when non-finite weights are created.
+
+        When there are no negative residuals or exp(iterations) / std is very high, both
+        of which occur when a low tol value is used with a high max_iter value, the
+        weighting function would produce non-finite values. The returned baseline should
+        be the last iteration that was successful, and thus should not contain nan or +/- inf.
+
+        Use data without noise since the lack of noise makes it easier to induce failure.
+        Set tol to -1 so that it is never reached, and set max_iter to a high value.
+        Uses np.isfinite on the dot product of the baseline since the dot product is fast,
+        would propagate the nan or inf, and will create only a single value to check
+        for finite-ness.
+
+        """
+        x, z, y = no_noise_data_fixture2d
+        with pytest.warns(ParameterWarning):
+            baseline, params = self.class_func(y, tol=-1, max_iter=1000)
+
+        assert np.isfinite(baseline.T.dot(baseline)).all()
+        # ensure last tolerance calculation was non-finite as a double-check that
+        # this test is actually doing what it should be doing
+        assert not np.isfinite(params['tol_history'][-1])
+
+
+class TestAsPLS(WhittakerTester):
+    """Class for testing aspls baseline."""
+
+    func_name = 'aspls'
+    checked_keys = ('weights', 'alpha', 'tol_history')
+    weight_keys = ('weights', 'alpha')
+
+    @pytest.mark.parametrize('diff_order', (1, 3))
+    def test_diff_orders(self, diff_order):
+        """Ensure that other difference orders work."""
+        lam = {1: 1e4, 3: 1e10}[diff_order]
+        self.class_func(self.y, lam=lam, diff_order=diff_order)
+
+    @pytest.mark.parametrize('alpha_enum', (0, 1))
+    def test_wrong_alpha_shape(self, alpha_enum):
+        """Ensures that an exception is raised if input alpha and data are different shapes."""
+        if alpha_enum == 0:
+            alpha = np.ones(np.array(self.y.shape) + 1)
+        else:
+            alpha = np.ones(self.y.size)
+        with pytest.raises(ValueError):
+            self.class_func(self.y, alpha=alpha)
+
+    def test_avoid_overflow_warning(self, no_noise_data_fixture2d):
+        """
+        Ensures no warning is emitted for exponential overflow.
+
+        The weighting is 1 / (1 + exp(values)), so if values is too high,
+        exp(values) is inf, which should usually emit an overflow warning.
+        However, the resulting weight is 0, which is fine, so the warning is
+        not needed and should be avoided. This test ensures the overflow warning
+        is not emitted, and also ensures that the output is all finite, just in
+        case the weighting was not actually stable.
+
+        """
+        x, z, y = no_noise_data_fixture2d
+        with np.errstate(over='raise'):
+            baseline = self.class_func(y, tol=-1, max_iter=1000)[0]
+
+        assert np.isfinite(baseline.T.dot(baseline)).all()
+
+
+class TestPsalsa(WhittakerTester):
+    """Class for testing psalsa baseline."""
+
+    func_name = 'psalsa'
+
+    @pytest.mark.parametrize('p', (-1, 2))
+    def test_outside_p_fails(self, p):
+        """Ensures p values outside of [0, 1] raise an exception."""
+        with pytest.raises(ValueError):
+            self.class_func(self.y, p=p)
+
+    @pytest.mark.parametrize('diff_order', (1, 3))
+    def test_diff_orders(self, diff_order):
+        """Ensure that other difference orders work."""
+        lam = {1: 1e2, 3: 1e10}[diff_order]
+        self.class_func(self.y, lam=lam, diff_order=diff_order)

From d77f8aecf6ee36749cff1b0eaf4d8c3300f8eba7 Mon Sep 17 00:00:00 2001
From: Donnie Erb <55961724+derb12@users.noreply.github.com>
Date: Sat, 13 Jan 2024 15:02:52 -0500
Subject: [PATCH 22/56] MAINT: Fix validation.yxz_arrays

yxz_arrays was not updated in the recent change that switched x and z
to represent rows and columns of y. Added tests to ensure the output
for the function is correct.
---
 pybaselines/_validation.py | 10 ++++----
 tests/test_validation.py   | 48 ++++++++++++++++++++++++++++++++++++--
 2 files changed, 51 insertions(+), 7 deletions(-)

diff --git a/pybaselines/_validation.py b/pybaselines/_validation.py
index cc238ed..3745a5c 100644
--- a/pybaselines/_validation.py
+++ b/pybaselines/_validation.py
@@ -278,7 +278,7 @@ def _yx_arrays(data, x_data=None, check_finite=False, dtype=None, order=None, en
 
 
 def _yxz_arrays(data, x_data=None, z_data=None, check_finite=False, dtype=None, order=None,
-                ensure_2d=True, x_axis=-1, z_axis=-2):
+                ensure_2d=True, x_axis=-2, z_axis=-1):
     """
     Converts input data into numpy arrays and provides x and z data if none are given.
 
@@ -286,10 +286,10 @@ def _yxz_arrays(data, x_data=None, z_data=None, check_finite=False, dtype=None,
     ----------
     data : array-like, shape (M, N)
         The y-values of the measured data, with N data points.
-    x_data : array-like, shape (N,), optional
+    x_data : array-like, shape (M,), optional
         The x-values of the measured data. Default is None, which will create an
         array from -1. to 1. with M points.
-    z_data : array-like, shape (M,), optional
+    z_data : array-like, shape (N,), optional
         The z-values of the measured data. Default is None, which will create an
         array from -1. to 1. with N points.
     check_finite : bool, optional
@@ -308,9 +308,9 @@ def _yxz_arrays(data, x_data=None, z_data=None, check_finite=False, dtype=None,
     -------
     y : numpy.ndarray, shape (M, N)
         A numpy array of the y-values of the measured data.
-    x : numpy.ndarray, shape (N,)
+    x : numpy.ndarray, shape (M,)
        A numpy array of the x-values of the measured data, or a created array.
-    z : numpy.ndarray, shape (M,)
+    z : numpy.ndarray, shape (N,)
        A numpy array of the z-values of the measured data, or a created array.
     Notes

diff --git a/tests/test_validation.py b/tests/test_validation.py
index 13d9c7f..360c82e 100644
--- a/tests/test_validation.py
+++ b/tests/test_validation.py
@@ -7,7 +7,7 @@
 """
 
 import numpy as np
-from numpy.testing import assert_array_equal
+from numpy.testing import assert_allclose, assert_array_equal
 import pytest
 
 from pybaselines import _validation
@@ -34,7 +34,51 @@ def test_yx_arrays_no_x(small_data):
     y, x = _validation._yx_arrays(small_data)
 
     assert isinstance(x, np.ndarray)
-    assert_array_equal(x, np.linspace(-1., 1., y.shape[0]))
+    assert_allclose(x, np.linspace(-1., 1., y.shape[0]), rtol=1e-12, atol=1e-12)
+    assert isinstance(y, np.ndarray)
+    assert_allclose(y, small_data, rtol=1e-12, atol=1e-12)
+
+@pytest.mark.parametrize('array_enum', (0, 1))
+def test_yxz_arrays_output_array(data_fixture2d, array_enum):
+    """Ensures output y, x, and z are always numpy arrays and that x and z are not scaled."""
+    x, z, y = data_fixture2d
+    if array_enum == 1:
+        x = x.tolist()
+        z = z.tolist()
+        y = y.tolist()
+
+    y_out, x_out, z_out = _validation._yxz_arrays(y, x, z)
+
+    assert isinstance(y_out, np.ndarray)
+    assert_allclose(y_out, y, rtol=1e-12, atol=1e-12)
+    assert isinstance(x_out, np.ndarray)
+    assert_allclose(x_out, x, rtol=1e-12, atol=1e-12)
+    assert isinstance(z_out, np.ndarray)
+    assert_allclose(z_out, z, rtol=1e-12, atol=1e-12)
+
+
+@pytest.mark.parametrize('has_x', (True, False))
+@pytest.mark.parametrize('has_z', (True, False))
+def test_yxz_arrays_no_xz(data_fixture2d, has_x, has_z):
+    """Ensures x and/or z arrays are created if None is input."""
+    x, z, y = data_fixture2d
+    if has_x:
+        expected_x = x
+    else:
+        x = None
+        expected_x = np.linspace(-1, 1, y.shape[0])
+    if has_z:
+        expected_z = z
+    else:
+        z = None
+        expected_z = np.linspace(-1, 1, y.shape[1])
+    y_out, x_out, z_out = _validation._yxz_arrays(y, x, z)
+
+    assert_allclose(y_out, y)
+    assert isinstance(x_out, np.ndarray)
+    assert_allclose(x_out, expected_x, rtol=1e-12, atol=1e-12)
+    assert isinstance(z_out, np.ndarray)
+    assert_allclose(z_out, expected_z, rtol=1e-12, atol=1e-12)
 
 
 @pytest.mark.parametrize('ndim', (0, 1, 2))

From d9561cdeacec840a4b604434d079ea68c50f7b8d Mon Sep 17 00:00:00 2001
From: Donnie Erb <55961724+derb12@users.noreply.github.com>
Date: Sat, 13 Jan 2024 16:31:22 -0500
Subject: [PATCH 23/56] TEST: Add tests for 2D spline and morphological algorithms

The pspline_arpls test is failing to avoid non-finite values, so that
will need to be checked later. Also added tests for the Baseline2D
class.
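
As a rough sketch, the new Baseline2D tests exercise the interface like
the following (the x, z, and y arrays here are placeholders for the
fixture data; any of the algorithm methods works the same way):

    from pybaselines.two_d.api import Baseline2D

    fitter = Baseline2D(x, z, check_finite=False, assume_sorted=True)
    baseline, params = fitter.asls(y, lam=1e6)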
--- pybaselines/_validation.py | 4 +- pybaselines/two_d/morphological.py | 12 +- pybaselines/two_d/spline.py | 6 +- tests/two_d/test_api.py | 130 ++++++++++ tests/two_d/test_morphological.py | 74 ++++++ tests/two_d/test_spline.py | 394 +++++++++++++++++++++++++++++ tests/two_d/test_whittaker.py | 8 +- 7 files changed, 612 insertions(+), 16 deletions(-) create mode 100644 tests/two_d/test_api.py create mode 100644 tests/two_d/test_morphological.py create mode 100644 tests/two_d/test_spline.py diff --git a/pybaselines/_validation.py b/pybaselines/_validation.py index 3745a5c..e2211de 100644 --- a/pybaselines/_validation.py +++ b/pybaselines/_validation.py @@ -419,9 +419,7 @@ def _check_half_window(half_window, allow_zero=False, two_d=False): half_window, 2, fill_scalar=True, dtype=np.intp )[0] for val in output_half_window: - _check_scalar_variable( - val, allow_zero, 'half_window' - ) + _check_scalar_variable(val, allow_zero, 'half_window') else: output_half_window = _check_scalar_variable( half_window, allow_zero, 'half_window', dtype=np.intp diff --git a/pybaselines/two_d/morphological.py b/pybaselines/two_d/morphological.py index f7e67e6..764cc44 100644 --- a/pybaselines/two_d/morphological.py +++ b/pybaselines/two_d/morphological.py @@ -215,11 +215,11 @@ def rolling_ball(self, data, half_window=None, smooth_half_window=None, """ y, half_wind = self._setup_morphology(data, half_window, **window_kwargs) if smooth_half_window is None: - smooth_half_window = half_wind + smooth_half_window = half_wind # TODO need to do some verification on smooth_half_window if not None - rough_baseline = grey_opening(y, [2 * half_wind + 1, 2 * half_wind + 1]) + rough_baseline = grey_opening(y, 2 * half_wind + 1) baseline = uniform_filter( - rough_baseline, [2 * smooth_half_window + 1, 2 * smooth_half_window + 1] + rough_baseline, 2 * smooth_half_window + 1 ) return baseline, {'half_window': half_wind} @@ -311,8 +311,8 @@ def _avg_opening(y, half_window, opening=None): """ window_size = 2 * half_window + 1 if opening is None: - opening = grey_opening(y, [window_size, window_size]) + opening = grey_opening(y, window_size) return 0.5 * ( - grey_dilation(opening, [window_size, window_size]) - + grey_erosion(opening, [window_size, window_size]) + grey_dilation(opening, window_size) + + grey_erosion(opening, window_size) ) diff --git a/pybaselines/two_d/spline.py b/pybaselines/two_d/spline.py index 8e0137b..b79b342 100644 --- a/pybaselines/two_d/spline.py +++ b/pybaselines/two_d/spline.py @@ -269,7 +269,7 @@ def irsqr(self, data, lam=1e3, quantile=0.05, num_knots=25, spline_degree=3, y, weight_array = self._setup_spline( data, weights, spline_degree, num_knots, True, diff_order, lam ) - old_coef = np.zeros(self.pspline._num_bases[0] * self.pspline._num_bases[1]) + old_coef = np.zeros((self.pspline._num_bases[0], self.pspline._num_bases[1])) tol_history = np.empty(max_iter + 1) for i in range(max_iter + 1): baseline = self.pspline.solve_pspline(y, weight_array) @@ -727,7 +727,7 @@ def pspline_psalsa(self, data, lam=1e3, p=0.5, k=None, num_knots=25, spline_degr tol_history = np.empty(max_iter + 1) for i in range(max_iter + 1): baseline = self.pspline.solve_pspline(y, weight_array) - new_weights = _weighting._psalsa(y, baseline, p, k, y.shape) # TODO replace y.shape with self._shape or whatever + new_weights = _weighting._psalsa(y, baseline, p, k, self._len) calc_difference = relative_difference(weight_array, new_weights) tol_history[i] = calc_difference if calc_difference < tol: @@ -822,7 +822,7 @@ def 
_mapped_histogram(data, num_bins): # create zeros array outside of numba function since numba's implementation # of np.zeros is much slower than numpy's (https://github.com/numba/numba/issues/7259) histogram = np.zeros(num_bins) - bins, bin_mapping = _numba_mapped_histogram(data.flatten(), num_bins, histogram) + bins, bin_mapping = _numba_mapped_histogram(data.ravel(), num_bins, histogram) else: histogram, bins = np.histogram(data, num_bins, density=True) # leave out last bin edge to account for extra index; leave out first diff --git a/tests/two_d/test_api.py b/tests/two_d/test_api.py new file mode 100644 index 0000000..617483c --- /dev/null +++ b/tests/two_d/test_api.py @@ -0,0 +1,130 @@ +# -*- coding: utf-8 -*- +"""Tests for pybaselines.api. + +@author: Donald Erb +Created on July 3, 2021 + +""" + +import numpy as np +from numpy.testing import assert_allclose +import pytest + +from pybaselines.two_d import ( + api, morphological, polynomial, smooth, spline, whittaker +) + +from ..conftest import get_data2d + + +_ALL_CLASSES = ( + morphological._Morphological, + polynomial._Polynomial, + smooth._Smooth, + spline._Spline, + whittaker._Whittaker +) + + +def get_public_methods(klass): + """ + Gets all public methods from a class. + + Parameters + ---------- + klass : class + The class to use. + + Returns + ------- + list[str, ...] + The list of all public methods of the input class. + + """ + return [method for method in dir(klass) if not method.startswith('_')] + + +# will be like [('asls', whittaker._Whittaker), ('modpoly', polynomial._Polynomial), ...] +_ALL_CLASSES_AND_METHODS = [] +for klass in _ALL_CLASSES: + for method in get_public_methods(klass): + _ALL_CLASSES_AND_METHODS.append((method, klass)) + + +class TestBaseline2D: + """Class for testing the Baseline2D class.""" + + algorithm_base = api.Baseline2D + + @classmethod + def setup_class(cls): + """Sets up the class for testing.""" + cls.x, cls.z, cls.y = get_data2d() + cls.algorithm = cls.algorithm_base(cls.x, cls.z, check_finite=False, assume_sorted=True) + + @classmethod + def teardown_class(cls): + """ + Resets class attributes after testing. + + Probably not needed, but done anyway to catch changes in how pytest works. 
+ + """ + cls.x = None + cls.z = None + cls.y = None + cls.algorithm = None + + @pytest.mark.parametrize('method_and_class', _ALL_CLASSES_AND_METHODS) + def test_all_methods(self, method_and_class): + """Ensures all available methods work the same when accessing through Baseline class.""" + method, baseline_class = method_and_class + # collab_pls needs 2D input data + if method == 'collab_pls': + fit_data = np.vstack((self.y, self.y)) + else: + fit_data = self.y + + # need to handle some specific methods + if method == 'optimize_extended_range': + kwargs = {'method': 'modpoly'} + elif method == 'interp_pts': + kwargs = {'baseline_points': ((5, 10), (10, 20), (90, 100))} + elif method == 'golotvin': + # have to set kwargs for golotvin or else no baseline points are found + kwargs = {'half_window': 15, 'num_std': 6} + else: + kwargs = {} + + api_baseline, api_params = getattr(self.algorithm, method)(fit_data, **kwargs) + class_baseline, class_params = getattr( + baseline_class(self.x, self.z, check_finite=False, assume_sorted=True), method + )(fit_data, **kwargs) + + assert_allclose(api_baseline, class_baseline, rtol=1e-14, atol=1e-14) + assert len(api_params.keys()) == len(class_params.keys()) + for key, value in api_params.items(): + assert key in class_params + class_value = class_params[key] + if isinstance(value, (int, float, np.ndarray, list, tuple)): + assert_allclose(value, class_value, rtol=1e-14, atol=1e-14) + else: + assert value == class_value + + def test_method_availability(self): + """Ensures all public algorithms are available through the Baseline class.""" + total_methods_list = get_public_methods(api.Baseline2D) + total_methods = set(total_methods_list) + + # ensure no repeated methods + assert len(total_methods) == len(total_methods_list) + + for klass in _ALL_CLASSES: + assert issubclass(self.algorithm_base, klass) + class_methods = set(get_public_methods(klass)) + # all individual class methods should be in Baseline + assert len(class_methods - total_methods) == 0 + total_methods = total_methods - class_methods + + # no additional methods should be available + assert len(total_methods) == 0 diff --git a/tests/two_d/test_morphological.py b/tests/two_d/test_morphological.py new file mode 100644 index 0000000..33a9ba7 --- /dev/null +++ b/tests/two_d/test_morphological.py @@ -0,0 +1,74 @@ +# -*- coding: utf-8 -*- +"""Tests for pybaselines.morphological. 
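+
+Covers the two-dimensional morphological implementations in pybaselines.two_d.morphological.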
+ +@author: Donald Erb +Created on March 20, 2021 + +""" + +import pytest + +from pybaselines.two_d import morphological + +from ..conftest import BaseTester2D + + +class MorphologicalTester(BaseTester2D): + """Base testing class for morphological functions.""" + + module = morphological + algorithm_base = morphological._Morphological + checked_keys = ('half_window',) + + +class IterativeMorphologicalTester(MorphologicalTester): + """Base testing class for iterative morphological functions.""" + + checked_keys = ('half_window', 'tol_history') + + def test_tol_history(self): + """Ensures the 'tol_history' item in the parameter output is correct.""" + max_iter = 5 + _, params = self.class_func(self.y, max_iter=max_iter, tol=-1) + + assert params['tol_history'].size == max_iter + 1 + + +class TestMor(MorphologicalTester): + """Class for testing mor baseline.""" + + func_name = 'mor' + + +class TestIMor(IterativeMorphologicalTester): + """Class for testing imor baseline.""" + + func_name = 'imor' + + +class TestRollingBall(MorphologicalTester): + """Class for testing rolling_ball baseline.""" + + func_name = 'rolling_ball' + + @pytest.mark.parametrize('new_instance', (True, False)) + @pytest.mark.parametrize('half_window', (None, 10, [10, 12])) + @pytest.mark.parametrize('smooth_half_window', (None, 0, 1)) + def test_unchanged_data(self, new_instance, half_window, smooth_half_window): + """Ensures that input data is unchanged by the function.""" + super().test_unchanged_data( + new_instance, half_window=half_window, smooth_half_window=smooth_half_window + ) + + @pytest.mark.parametrize('smooth_half_window', (None, 0, 10)) + def test_smooth_half_windows(self, smooth_half_window): + """Ensures smooth-half-window is correctly processed.""" + output = self.class_func(self.y, smooth_half_window=smooth_half_window) + + assert output[0].shape == self.y.shape + + +class TestTophat(MorphologicalTester): + """Class for testing tophat baseline.""" + + func_name = 'tophat' diff --git a/tests/two_d/test_spline.py b/tests/two_d/test_spline.py new file mode 100644 index 0000000..f8700c9 --- /dev/null +++ b/tests/two_d/test_spline.py @@ -0,0 +1,394 @@ +# -*- coding: utf-8 -*- +"""Tests for pybaselines.splines. 
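+
+Covers the two-dimensional spline implementations in pybaselines.two_d.spline.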
+
+@author: Donald Erb
+Created on March 20, 2021
+
+"""
+
+from unittest import mock
+
+import numpy as np
+from numpy.testing import assert_allclose, assert_array_equal
+import pytest
+
+from pybaselines import utils
+from pybaselines.two_d import spline, whittaker
+
+from ..conftest import BaseTester2D, InputWeightsMixin
+
+
+@pytest.mark.parametrize('use_numba', (True, False))
+def test_mapped_histogram_simple(use_numba):
+    """Compares the output with numpy and the bin_mapping, testing corner cases."""
+    num_bins = 10
+    values = np.array([0, 0.01, 1, 1.5, 8, 9, 9.1, 10])
+    expected_bin_edges = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=float)
+    expected_bin_mapping = np.array([0, 0, 1, 1, 8, 9, 9, 9], dtype=np.intp)
+
+    np_histogram, np_bin_edges = np.histogram(values, num_bins, density=True)
+    assert_allclose(np_bin_edges, expected_bin_edges, rtol=0, atol=1e-12)
+
+    with mock.patch.object(spline, '_HAS_NUMBA', use_numba):
+        histogram, bin_edges, bin_mapping = spline._mapped_histogram(values, num_bins)
+
+    assert_allclose(histogram, np_histogram)
+    assert_allclose(bin_edges, np_bin_edges)
+    assert_array_equal(bin_mapping, expected_bin_mapping)
+
+
+@pytest.mark.parametrize('rng_seed', (0, 1))
+@pytest.mark.parametrize('num_bins', (10, 100, 1000))
+@pytest.mark.parametrize('use_numba', (True, False))
+def test_mapped_histogram(rng_seed, num_bins, use_numba):
+    """Compares the output with numpy and the bin_mapping with a naive version."""
+    # TODO replace with np.random.default_rng when min numpy version is >= 1.17
+    rng = np.random.RandomState(rng_seed)
+    values = rng.normal(0, 20, 1000)
+    np_histogram, np_bin_edges = np.histogram(values, num_bins, density=True)
+    with mock.patch.object(spline, '_HAS_NUMBA', use_numba):
+        histogram, bin_edges, bin_mapping = spline._mapped_histogram(values, num_bins)
+
+    assert_allclose(histogram, np_histogram)
+    assert_allclose(bin_edges, np_bin_edges)
+
+    expected_bin_mapping = np.zeros_like(values)
+    for i, left_bin in enumerate(bin_edges[:-1]):
+        mask = (values >= left_bin) & (values < bin_edges[i + 1])
+        expected_bin_mapping[mask] = i
+    expected_bin_mapping[values >= bin_edges[-1]] = num_bins - 1
+
+    assert_array_equal(bin_mapping, expected_bin_mapping)
+
+
+@pytest.mark.parametrize('fraction_pos', (0, 0.4))
+@pytest.mark.parametrize('fraction_neg', (0, 0.3))
+def test_mixture_pdf(fraction_pos, fraction_neg):
+    """Ensures the probability density function for the Gaussian-uniform mixture model is right."""
+    x = np.linspace(-5, 10, 1000)
+    actual_sigma = 0.5
+    sigma = np.log10(actual_sigma)
+    # the gaussian should be area-normalized, so set height accordingly
+    height = 1 / (actual_sigma * np.sqrt(2 * np.pi))
+    expected_gaussian = utils.gaussian(x, height, 0, actual_sigma)
+
+    fraction_gaus = 1 - fraction_pos - fraction_neg
+    if fraction_pos > 0:
+        pos_uniform = np.zeros_like(x)
+        pos_uniform[x >= 0] = 1 / abs(x.max())
+    elif fraction_neg > 0:
+        pos_uniform = None
+    else:
+        pos_uniform = 0
+
+    if fraction_neg > 0:
+        neg_uniform = np.zeros_like(x)
+        neg_uniform[x <= 0] = 1 / abs(x.min())
+    elif fraction_pos > 0:
+        neg_uniform = None
+    else:
+        neg_uniform = 0
+
+    output_pdf = spline._mixture_pdf(
+        x, fraction_gaus, sigma, fraction_pos, pos_uniform, neg_uniform
+    )
+
+    # now ensure neg_uniform and pos_uniform are not None
+    if pos_uniform is None:
+        pos_uniform = 0
+    if neg_uniform is None:
+        neg_uniform = 0
+
+    expected_pdf = (
+        fraction_gaus * expected_gaussian
+        + fraction_pos * pos_uniform
+        + fraction_neg * neg_uniform
+    )
+
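+    # the expected pdf is the convex combination of the three component pdfs,
+    # expected_pdf = fraction_gaus * gaussian + fraction_pos * pos_uniform
+    # + fraction_neg * neg_uniform; since the fractions sum to 1 and each
+    # component is area-normalized, the mixture should also integrate to ~1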
+    assert_allclose(expected_pdf, output_pdf, 1e-12, 1e-12)
+    # ensure pdf has an area of 1, i.e., total probability is 100%; accuracy is
+    # limited by number of x-values
+    assert_allclose(1.0, np.trapz(output_pdf, x), 1e-3)
+
+
+def compare_pspline_whittaker(pspline_class, whittaker_func, data, lam=1e5,
+                              test_rtol=1e-6, test_atol=1e-12, **kwargs):
+    """
+    Compares the output of the penalized spline (P-spline) versions of Whittaker functions.
+
+    The number of knots for the P-splines is set to ``data.shape + 1`` and the spline
+    degree is set to 0; the result is that the spline basis becomes the identity matrix,
+    and the P-spline version should give the same output as the Whittaker version if
+    the weighting and linear systems were correctly set up.
+
+    """
+    whittaker_output = getattr(
+        whittaker._Whittaker(pspline_class.x, pspline_class.z), whittaker_func
+    )(data, lam=lam, **kwargs)[0]
+
+    num_knots = np.array(data.shape) + 1
+    if hasattr(pspline_class, 'class_func'):
+        spline_output = pspline_class.class_func(
+            data, lam=lam, num_knots=num_knots, spline_degree=0, **kwargs
+        )[0]
+    else:
+        spline_output = pspline_class._call_func(
+            data, lam=lam, num_knots=num_knots, spline_degree=0, **kwargs
+        )[0]
+
+    assert_allclose(spline_output, whittaker_output, rtol=test_rtol, atol=test_atol)
+
+
+class SplineTester(BaseTester2D):
+    """Base testing class for spline functions."""
+
+    module = spline
+    algorithm_base = spline._Spline
+
+
+class IterativeSplineTester(SplineTester, InputWeightsMixin):
+    """Base testing class for iterative spline functions."""
+
+    checked_keys = ('weights', 'tol_history')
+
+    def test_tol_history(self):
+        """Ensures the 'tol_history' item in the parameter output is correct."""
+        max_iter = 5
+        _, params = self.class_func(self.y, max_iter=max_iter, tol=-1)
+
+        assert params['tol_history'].size == max_iter + 1
+
+
+class TestMixtureModel(IterativeSplineTester):
+    """Class for testing mixture_model baseline."""
+
+    func_name = 'mixture_model'
+
+    @pytest.mark.parametrize('use_class', (True, False))
+    @pytest.mark.parametrize('weight_bool', (True, False))
+    def test_unchanged_data(self, use_class, weight_bool):
+        """Ensures that input data is unchanged by the function."""
+        if weight_bool:
+            weights = np.ones_like(self.y)
+        else:
+            weights = None
+        super().test_unchanged_data(use_class, weights=weights)
+
+    @pytest.mark.parametrize('symmetric', (False, True))
+    def test_output(self, symmetric):
+        """Ensures that the output has the desired format."""
+        initial_y = self.y
+        try:
+            if symmetric:
+                # make data with both positive and negative peaks; roll so peaks are not overlapping
+                self.y = np.roll(self.y, -50) - np.roll(self.y, 50)
+                p = 0.5
+            else:
+                p = 0.01
+            super().test_output(p=p, symmetric=symmetric)
+        finally:
+            self.y = initial_y
+
+    @pytest.mark.parametrize('p', (-1, 2))
+    def test_outside_p_fails(self, p):
+        """Ensures p values outside of [0, 1] raise an exception."""
+        with pytest.raises(ValueError):
+            self.class_func(self.y, p=p)
+
+    @pytest.mark.parametrize('diff_order', (1, 2, 3))
+    def test_diff_orders(self, diff_order):
+        """Ensure that other difference orders work."""
+        lam = {1: 1e2, 2: 1e5, 3: 1e8}[diff_order]
+        self.class_func(self.y, lam=lam, diff_order=diff_order)
+
+
+class TestIRSQR(IterativeSplineTester):
+    """Class for testing irsqr baseline."""
+
+    func_name = 'irsqr'
+
+    @pytest.mark.parametrize('quantile', (-1, 2))
+    def test_outside_p_fails(self, quantile):
+        """Ensures quantile values outside of [0, 1] raise an exception."""
+        with pytest.raises(ValueError):
+            self.class_func(self.y, quantile=quantile)
+
+    @pytest.mark.parametrize('diff_order', (1, 2, 3))
+    def test_diff_orders(self, diff_order):
+        """Ensure that other difference orders work."""
+        lam = {1: 1e2, 2: 1e5, 3: 1e8}[diff_order]
+        self.class_func(self.y, lam=lam, diff_order=diff_order)
+
+    @pytest.mark.parametrize('has_x', (True, False))
+    @pytest.mark.parametrize('has_z', (True, False))
+    def test_no_xz(self, has_x, has_z):
+        """Ensures the output is not affected by not having x or z values."""
+        super().test_no_xz(has_x, has_z, rtol=1e-5, atol=1e-4)
+
+
+class TestPsplineAsLS(IterativeSplineTester):
+    """Class for testing pspline_asls baseline."""
+
+    func_name = 'pspline_asls'
+
+    @pytest.mark.parametrize('p', (-1, 2))
+    def test_outside_p_fails(self, p):
+        """Ensures p values outside of [0, 1] raise an exception."""
+        with pytest.raises(ValueError):
+            self.class_func(self.y, p=p)
+
+    @pytest.mark.parametrize('diff_order', (1, 3))
+    def test_diff_orders(self, diff_order):
+        """Ensure that other difference orders work."""
+        lam = {1: 1e2, 3: 1e10}[diff_order]
+        self.class_func(self.y, lam=lam, diff_order=diff_order)
+
+    @pytest.mark.parametrize('lam', (1e1, 1e5))
+    @pytest.mark.parametrize('p', (0.01, 0.1))
+    def test_whittaker_comparison(self, lam, p):
+        """Ensures the P-spline version is the same as the Whittaker version."""
+        compare_pspline_whittaker(self, 'asls', self.y, lam=lam, p=p)
+
+
+class TestPsplineAirPLS(IterativeSplineTester):
+    """Class for testing pspline_airpls baseline."""
+
+    func_name = 'pspline_airpls'
+
+    @pytest.mark.parametrize('diff_order', (1, 3))
+    def test_diff_orders(self, diff_order):
+        """Ensure that other difference orders work."""
+        lam = {1: 1e3, 3: 1e10}[diff_order]
+        self.class_func(self.y, lam=lam, diff_order=diff_order)
+
+    # ignore the RuntimeWarning that occurs from using +/- inf or nan
+    @pytest.mark.filterwarnings('ignore::RuntimeWarning')
+    def test_avoid_nonfinite_weights(self, no_noise_data_fixture2d):
+        """
+        Ensures that the function gracefully exits when errors occur.
+
+        When there are no negative residuals, which occurs when a low tol value is used with
+        a high max_iter value, the weighting function would produce values all ~0, which
+        can fail the solvers. The returned baseline should be the last iteration that was
+        successful, and thus should not contain nan or +/- inf.
+
+        Use data without noise since the lack of noise makes it easier to induce failure.
+        Set tol to -1 so that it is never reached, and set max_iter to a high value.
+        Uses np.isfinite on the dot product of the baseline since the dot product is fast,
+        would propagate the nan or inf, and will create only a single value to check
+        for finite-ness.
+
+        """
+        x, z, y = no_noise_data_fixture2d
+        with pytest.warns(utils.ParameterWarning):
+            baseline = self.class_func(y, tol=-1, max_iter=7000)[0]
+        assert np.isfinite(baseline.T.dot(baseline)).all()
+
+    @pytest.mark.parametrize('lam', (1e1, 1e5))
+    def test_whittaker_comparison(self, lam):
+        """Ensures the P-spline version is the same as the Whittaker version."""
+        compare_pspline_whittaker(self, 'airpls', self.y, lam=lam)
+
+
+class TestPsplineArPLS(IterativeSplineTester):
+    """Class for testing pspline_arpls baseline."""
+
+    func_name = 'pspline_arpls'
+
+    @pytest.mark.parametrize('diff_order', (1, 3))
+    def test_diff_orders(self, diff_order):
+        """Ensure that other difference orders work."""
+        lam = {1: 1e2, 3: 1e10}[diff_order]
+        self.class_func(self.y, lam=lam, diff_order=diff_order)
+
+    def test_avoid_overflow_warning(self, no_noise_data_fixture2d):
+        """
+        Ensures no warning is emitted for exponential overflow.
+
+        The weighting is 1 / (1 + exp(values)), so if values is too high,
+        exp(values) is inf, which should usually emit an overflow warning.
+        However, the resulting weight is 0, which is fine, so the warning is
+        not needed and should be avoided. This test ensures the overflow warning
+        is not emitted, and also ensures that the output is all finite, just in
+        case the weighting was not actually stable.
+
+        """
+        x, z, y = no_noise_data_fixture2d
+        with np.errstate(over='raise'):
+            baseline = self.class_func(y, tol=-1, max_iter=1000)[0]
+
+        assert np.isfinite(baseline.T.dot(baseline)).all()
+
+    @pytest.mark.parametrize('lam', (1e1, 1e5))
+    def test_whittaker_comparison(self, lam):
+        """Ensures the P-spline version is the same as the Whittaker version."""
+        compare_pspline_whittaker(self, 'arpls', self.y, lam=lam)
+
+
+class TestPsplineIArPLS(IterativeSplineTester):
+    """Class for testing pspline_iarpls baseline."""
+
+    func_name = 'pspline_iarpls'
+
+    @pytest.mark.parametrize('diff_order', (1, 3))
+    def test_diff_orders(self, diff_order):
+        """Ensure that other difference orders work."""
+        lam = {1: 1e2, 3: 1e10}[diff_order]
+        self.class_func(self.y, lam=lam, diff_order=diff_order)
+
+    # ignore the RuntimeWarning that occurs from using +/- inf or nan
+    @pytest.mark.filterwarnings('ignore::RuntimeWarning')
+    def test_avoid_nonfinite_weights(self, no_noise_data_fixture2d):
+        """
+        Ensures that the function gracefully exits when non-finite weights are created.
+
+        When there are no negative residuals or exp(iterations) / std is very high, both
+        of which occur when a low tol value is used with a high max_iter value, the
+        weighting function would produce non-finite values. The returned baseline should
+        be the last iteration that was successful, and thus should not contain nan or +/- inf.
+
+        Use data without noise since the lack of noise makes it easier to induce failure.
+        Set tol to -1 so that it is never reached, and set max_iter to a high value.
+        Uses np.isfinite on the dot product of the baseline since the dot product is fast,
+        would propagate the nan or inf, and will create only a single value to check
+        for finite-ness.
+
+        """
+        x, z, y = no_noise_data_fixture2d
+        with pytest.warns(utils.ParameterWarning):
+            baseline, params = self.class_func(y, tol=-1, max_iter=1000)
+
+        assert np.isfinite(baseline.T.dot(baseline)).all()
+        # ensure last tolerance calculation was non-finite as a double-check that
+        # this test is actually doing what it should be doing
+        assert not np.isfinite(params['tol_history'][-1])
+
+    @pytest.mark.parametrize('lam', (1e1, 1e5))
+    def test_whittaker_comparison(self, lam):
+        """Ensures the P-spline version is the same as the Whittaker version."""
+        compare_pspline_whittaker(self, 'iarpls', self.y, lam=lam)
+
+
+class TestPsplinePsalsa(IterativeSplineTester):
+    """Class for testing pspline_psalsa baseline."""
+
+    func_name = 'pspline_psalsa'
+
+    @pytest.mark.parametrize('p', (-1, 2))
+    def test_outside_p_fails(self, p):
+        """Ensures p values outside of [0, 1] raise an exception."""
+        with pytest.raises(ValueError):
+            self.class_func(self.y, p=p)
+
+    @pytest.mark.parametrize('diff_order', (1, 3))
+    def test_diff_orders(self, diff_order):
+        """Ensure that other difference orders work."""
+        lam = {1: 1e2, 3: 1e10}[diff_order]
+        self.class_func(self.y, lam=lam, diff_order=diff_order)
+
+    @pytest.mark.parametrize('lam', (1e1, 1e5))
+    @pytest.mark.parametrize('p', (0.01, 0.1))
+    def test_whittaker_comparison(self, lam, p):
+        """Ensures the P-spline version is the same as the Whittaker version."""
+        compare_pspline_whittaker(self, 'psalsa', self.y, lam=lam, p=p)
+
diff --git a/tests/two_d/test_whittaker.py b/tests/two_d/test_whittaker.py
index 2e1cfb6..8b76935 100644
--- a/tests/two_d/test_whittaker.py
+++ b/tests/two_d/test_whittaker.py
@@ -104,7 +104,7 @@ def test_avoid_nonfinite_weights(self, no_noise_data_fixture2d):
         with pytest.warns(ParameterWarning):
             baseline = self.class_func(y, tol=-1, max_iter=3000)[0]
 
-        assert np.isfinite(baseline.T.dot(baseline).all())
+        assert np.isfinite(baseline.T.dot(baseline)).all()
 
 
 class TestArPLS(WhittakerTester):
@@ -134,7 +134,7 @@ def test_avoid_overflow_warning(self, no_noise_data_fixture2d):
         with np.errstate(over='raise'):
             baseline = self.class_func(y, tol=-1, max_iter=1000)[0]
 
-        assert np.isfinite(baseline.T.dot(baseline).all())
+        assert np.isfinite(baseline.T.dot(baseline)).all()
 
 
 class TestDrPLS(WhittakerTester):
@@ -220,7 +220,7 @@ def test_avoid_nonfinite_weights(self, no_noise_data_fixture2d):
         with pytest.warns(ParameterWarning):
             baseline, params = self.class_func(y, tol=-1, max_iter=1000)
 
-        assert np.isfinite(baseline.T.dot(baseline).all())
+        assert np.isfinite(baseline.T.dot(baseline)).all()
         # ensure last tolerence calculation was non-finite as a double-check that
         # this test is actually doing what it should be doing
         assert not np.isfinite(params['tol_history'][-1])
@@ -265,7 +265,7 @@ def test_avoid_overflow_warning(self, no_noise_data_fixture2d):
         with np.errstate(over='raise'):
             baseline = self.class_func(y, tol=-1, max_iter=1000)[0]
 
-        assert np.isfinite(baseline.T.dot(baseline).all())
+        assert np.isfinite(baseline.T.dot(baseline)).all()
 
 
 class TestPsalsa(WhittakerTester):

From 8c74173bc99860323dbb378c678c1423a4f8467a Mon Sep 17 00:00:00 2001
From: Donnie Erb <55961724+derb12@users.noreply.github.com>
Date: Tue, 16 Jan 2024 20:46:26 -0500
Subject: [PATCH 24/56] FEAT: Added 2D versions of collab_pls and adaptive_minmax

---
 pybaselines/morphological.py          |   4 +-
 pybaselines/two_d/__init__.py         |  68 ++++++
 pybaselines/two_d/_algorithm_setup.py | 159 ++++++++++++-
 pybaselines/two_d/api.py              |   3 +-
 pybaselines/two_d/classification.py   |  13 +
pybaselines/two_d/morphological.py | 4 +- pybaselines/two_d/optimizers.py | 331 ++++++++++++++++++++++++++ pybaselines/utils.py | 4 +- tests/two_d/test_algorithm_setup.py | 64 ++++- tests/two_d/test_api.py | 5 +- tests/two_d/test_optimizers.py | 238 ++++++++++++++++++ 11 files changed, 881 insertions(+), 12 deletions(-) create mode 100644 pybaselines/two_d/__init__.py create mode 100644 pybaselines/two_d/classification.py create mode 100644 pybaselines/two_d/optimizers.py create mode 100644 tests/two_d/test_optimizers.py diff --git a/pybaselines/morphological.py b/pybaselines/morphological.py index 95449d0..17d038f 100644 --- a/pybaselines/morphological.py +++ b/pybaselines/morphological.py @@ -110,13 +110,13 @@ def mpls(self, data, half_window=None, lam=1e6, p=0.0, diff_order=2, tol=1e-3, m indices = np.flatnonzero( ((diff[1:] == 0) | (diff[:-1] == 0)) & ((diff[1:] != 0) | (diff[:-1] != 0)) ) - w = np.full(y.shape[0], p) + w = np.full(self._len, p) # find the index of min(y) in the region between flat regions for previous_segment, next_segment in zip(indices[1::2], indices[2::2]): index = np.argmin(y[previous_segment:next_segment + 1]) + previous_segment w[index] = 1 - p - # have to invert the weight ordering the matching the original input y ordering + # have to invert the weight ordering to match the original input y ordering # since it will be sorted within _setup_whittaker w = _sort_array(w, self._inverted_order) diff --git a/pybaselines/two_d/__init__.py b/pybaselines/two_d/__init__.py new file mode 100644 index 0000000..856044f --- /dev/null +++ b/pybaselines/two_d/__init__.py @@ -0,0 +1,68 @@ +# -*- coding: utf-8 -*- +""" +Baseline Correction for Two Dimensional Data. +============================================= + +:mod:`pybaselines.two_d` provides the following algorithms for baseline correcting 2D data. 
+ +* Polynomial methods (:mod:`pybaselines.two_d.polynomial`) + + * poly (Regular Polynomial) + * modpoly (Modified Polynomial) + * imodpoly (Improved Modified Polynomial) + * penalized_poly (Penalized Polynomial) + * quant_reg (Quantile Regression) + * goldindec (Goldindec Method) + +* Whittaker-smoothing-based methods (:mod:`pybaselines.two_d.whittaker`) + + * asls (Asymmetric Least Squares) + * iasls (Improved Asymmetric Least Squares) + * airpls (Adaptive Iteratively Reweighted Penalized Least Squares) + * arpls (Asymmetrically Reweighted Penalized Least Squares) + * drpls (Doubly Reweighted Penalized Least Squares) + * iarpls (Improved Asymmetrically Reweighted Penalized Least Squares) + * aspls (Adaptive Smoothness Penalized Least Squares) + * psalsa (Peaked Signal's Asymmetric Least Squares Algorithm) + +* Morphological methods (:mod:`pybaselines.two_d.morphological`) + + * mor (Morphological) + * imor (Improved Morphological) + * rolling_ball (Rolling Ball Baseline) + * mwmv (Moving Window Minimum Value) + * tophat (Top-hat Transformation) + +* Spline methods (:mod:`pybaselines.two_d.spline`) + + * mixture_model (Mixture Model) + * irsqr (Iterative Reweighted Spline Quantile Regression) + * pspline_asls (Penalized Spline Version of asls) + * pspline_iasls (Penalized Spline Version of iasls) + * pspline_airpls (Penalized Spline Version of airpls) + * pspline_arpls (Penalized Spline Version of arpls) + * pspline_drpls (Penalized Spline Version of drpls) + * pspline_iarpls (Penalized Spline Version of iarpls) + * pspline_aspls (Penalized Spline Version of aspls) + * pspline_psalsa (Penalized Spline Version of psalsa) + +* Smoothing-based methods (:mod:`pybaselines.two_d.smooth`) + + * noise_median (Noise Median method) + +* Baseline/Peak Classification methods (:mod:`pybaselines.two_d.classification`) + + * None yet + +* Optimizers (:mod:`pybaselines.two_d.optimizers`) + + * collab_pls (Collaborative Penalized Least Squares) + * adaptive_minmax (Adaptive MinMax) + + +@author: Donald Erb +Created on January 15, 2024 + +""" + +from .api import Baseline2D diff --git a/pybaselines/two_d/_algorithm_setup.py b/pybaselines/two_d/_algorithm_setup.py index 15e9c7a..3b75f03 100644 --- a/pybaselines/two_d/_algorithm_setup.py +++ b/pybaselines/two_d/_algorithm_setup.py @@ -521,7 +521,8 @@ def _setup_polynomial(self, y, weights=None, poly_order=2, calc_vander=False, # rearrange the vandermonde such that it matches the typical A c = b where b # is the flattened version of y and c are the coefficients self.vandermonde = np.polynomial.polynomial.polyvander2d( - mapped_x[:, None], mapped_z[None, :], [poly_orders[0], poly_orders[1]] + *np.meshgrid(mapped_x, mapped_z, indexing='ij'), + [poly_orders[0], poly_orders[1]] ).reshape((-1, (poly_orders[0] + 1) * (poly_orders[1] + 1))) if max_cross is not None: @@ -714,6 +715,162 @@ def _setup_smooth(self, y, half_window=0, allow_zero=True, **pad_kwargs): hw = _check_half_window(half_window, allow_zero, two_d=False) return pad_edges2d(y, hw, **pad_kwargs) + def _setup_classification(self, y, weights=None): + """ + Sets the starting parameters for doing classification algorithms. + + Parameters + ---------- + y : numpy.ndarray, shape (N,) + The y-values of the measured data, already converted to a numpy + array by :meth:`._register`. + weights : array-like, shape (N,), optional + The weighting array. If None (default), then will be an array with + size equal to N and all values set to 1. 
+
+        Returns
+        -------
+        y : numpy.ndarray, shape (N,)
+            The y-values of the measured data, converted to a numpy array.
+        weight_array : numpy.ndarray, shape (N,)
+            The weight array for the data, with boolean dtype.
+
+        """
+        weight_array = _check_optional_array(
+            self._len, weights, check_finite=self._check_finite, dtype=bool,
+            ensure_1d=False, axis=slice(None)
+        )
+        if self._sort_order is not None and weights is not None:
+            weight_array = weight_array[self._sort_order]
+        weight_array = weight_array.ravel()
+
+        return y, weight_array
+
+    def _get_function(self, method, modules):
+        """
+        Tries to retrieve the indicated function from a list of modules.
+
+        Parameters
+        ----------
+        method : str
+            The string name of the desired function. Case does not matter.
+        modules : Sequence
+            A sequence of modules in which to look for the method.
+
+        Returns
+        -------
+        func : Callable
+            The corresponding function.
+        func_module : str
+            The module that `func` belongs to.
+        class_object : pybaselines.two_d._algorithm_setup._Algorithm2D
+            The `_Algorithm2D` object which will be used for fitting.
+
+        Raises
+        ------
+        AttributeError
+            Raised if no matching function is found within the modules.
+
+        """
+        function_string = method.lower()
+        for module in modules:
+            func_module = module.__name__.split('.')[-1]
+            module_class = getattr(module, '_' + func_module.capitalize())
+            if hasattr(module_class, function_string):
+                # if self is a Baseline2D class, can just use its method
+                if hasattr(self, function_string):
+                    func = getattr(self, function_string)
+                    class_object = self
+                else:
+                    # have to reset x and z ordering so that all outputs and parameters are
+                    # correctly sorted
+                    if self._sort_order is None:
+                        x = self.x
+                        z = self.z
+                        assume_sorted = True
+                    else:
+                        assume_sorted = False
+                        if isinstance(self._sort_order, tuple):
+                            if self._sort_order[0] is Ellipsis:
+                                x = self.x
+                                z = self.z[self._inverted_order[1]]
+                            else:
+                                x = self.x[self._inverted_order[0][:, 0]]
+                                z = self.z[self._inverted_order[1][0]]
+                        else:
+                            x = self.x[self._inverted_order]
+                            z = self.z
+
+                    class_object = module_class(
+                        x, z, check_finite=self._check_finite, assume_sorted=assume_sorted,
+                        output_dtype=self._dtype
+                    )
+                    func = getattr(class_object, function_string)
+                break
+        else:  # in case no break
+            mod_names = [module.__name__ for module in modules]
+            raise AttributeError((
+                f'unknown method "{method}" or method is not within the allowed '
+                f'modules: {mod_names}'
+            ))
+
+        return func, func_module, class_object
+
+    def _setup_optimizer(self, y, method, modules, method_kwargs=None, copy_kwargs=True, **kwargs):
+        """
+        Sets the starting parameters for doing optimizer algorithms.
+
+        Parameters
+        ----------
+        y : numpy.ndarray, shape (N,)
+            The y-values of the measured data, already converted to a numpy
+            array by :meth:`._register`.
+        method : str
+            The string name of the desired function, like 'asls'. Case does not matter.
+        modules : Sequence(module, ...)
+            The modules to search for the indicated `method` function.
+        method_kwargs : dict, optional
+            A dictionary of keyword arguments to pass to the fitting function. Default
+            is None, which uses an empty dictionary.
+        copy_kwargs : bool, optional
+            If True (default), will copy the input `method_kwargs` so that the input
+            dictionary is not modified within the function.
+        **kwargs
+            Deprecated in version 0.8.0 and will be removed in version 0.10 or 1.0. Pass any
+            keyword arguments for the fitting function in the `method_kwargs` dictionary.
+
+        Returns
+        -------
+        y : numpy.ndarray, shape (N,)
+            The y-values of the measured data, converted to a numpy array.
+        baseline_func : Callable
+            The function for fitting the baseline.
+        func_module : str
+            The string name of the module that contains `baseline_func`.
+        method_kws : dict
+            A dictionary of keyword arguments to pass to `baseline_func`.
+        class_object : pybaselines.two_d._algorithm_setup._Algorithm2D
+            The `_Algorithm2D` object which will be used for fitting.
+
+        Warns
+        -----
+        DeprecationWarning
+            Passed if `kwargs` is not empty.
+
+        """
+        baseline_func, func_module, class_object = self._get_function(method, modules)
+        if method_kwargs is None:
+            method_kws = {}
+        elif copy_kwargs:
+            method_kws = method_kwargs.copy()
+        else:
+            method_kws = method_kwargs
+
+        return (
+            _sort_array2d(y, self._inverted_order), baseline_func, func_module, method_kws,
+            class_object
+        )
+
+    def _setup_misc(self, y):
+        """
+        Sets the starting parameters for doing miscellaneous algorithms.
diff --git a/pybaselines/two_d/api.py b/pybaselines/two_d/api.py
index 94e2644..d686356 100644
--- a/pybaselines/two_d/api.py
+++ b/pybaselines/two_d/api.py
@@ -7,6 +7,7 @@
 """
 
 from .morphological import _Morphological
+from .optimizers import _Optimizers
 from .polynomial import _Polynomial
 from .smooth import _Smooth
 from .spline import _Spline
@@ -14,7 +15,7 @@
 
 
 class Baseline2D(
-    _Morphological, _Polynomial, _Smooth, _Spline, _Whittaker
+    _Morphological, _Optimizers, _Polynomial, _Smooth, _Spline, _Whittaker
 ):
     """
     A class for all 2D baseline correction algorithms.
diff --git a/pybaselines/two_d/classification.py b/pybaselines/two_d/classification.py
new file mode 100644
index 0000000..ad45109
--- /dev/null
+++ b/pybaselines/two_d/classification.py
@@ -0,0 +1,13 @@
+# -*- coding: utf-8 -*-
+"""Techniques that rely on classifying peak and/or baseline segments for fitting baselines.
+
+Created on January 14, 2024
+@author: Donald Erb
+
+"""
+
+from ._algorithm_setup import _Algorithm2D
+
+
+class _Classification(_Algorithm2D):
+    """A base class for all classification algorithms."""
diff --git a/pybaselines/two_d/morphological.py b/pybaselines/two_d/morphological.py
index 764cc44..ef73c24 100644
--- a/pybaselines/two_d/morphological.py
+++ b/pybaselines/two_d/morphological.py
@@ -10,9 +10,7 @@
 from scipy.ndimage import grey_dilation, grey_erosion, grey_opening, uniform_filter
 
 from ._algorithm_setup import _Algorithm2D
-from ..utils import (
-    relative_difference
-)
+from ..utils import relative_difference
 
 
 class _Morphological(_Algorithm2D):
diff --git a/pybaselines/two_d/optimizers.py b/pybaselines/two_d/optimizers.py
new file mode 100644
index 0000000..99bcacd
--- /dev/null
+++ b/pybaselines/two_d/optimizers.py
@@ -0,0 +1,331 @@
+# -*- coding: utf-8 -*-
+"""High level functions for making better use of baseline algorithms.
+
+Functions in this module make use of other baseline algorithms in
+pybaselines to provide better results or optimize parameters.
+
+Created on January 14, 2024
+@author: Donald Erb
+
+"""
+
+from math import ceil
+
+import numpy as np
+
+from . import classification, morphological, polynomial, spline, whittaker
+from ._algorithm_setup import _Algorithm2D
+from .._validation import _check_optional_array
+from ..utils import _check_scalar, _sort_array2d
+
+
+class _Optimizers(_Algorithm2D):
+    """A base class for all optimizer algorithms."""
+
+    @_Algorithm2D._register(ensure_2d=False)
+    def collab_pls(self, data, average_dataset=True, method='asls', method_kwargs=None):
+        """
+        Collaborative Penalized Least Squares (collab-PLS).
+
+        Averages the data or the fit weights for an entire dataset to get more
+        optimal results. Uses any Whittaker-smoothing-based or weighted spline algorithm.
+
+        Parameters
+        ----------
+        data : array-like, shape (L, M, N)
+            An array with shape (L, M, N) where L is the number of entries in
+            the dataset and (M, N) is the shape of each data entry.
+        average_dataset : bool, optional
+            If True (default) will average the dataset before fitting to get the
+            weighting. If False, will fit each individual entry in the dataset and
+            then average the weights to get the weighting for the dataset.
+        method : str, optional
+            A string indicating the Whittaker-smoothing-based or weighted spline method to
+            use for fitting the baseline. Default is 'asls'.
+        method_kwargs : dict, optional
+            A dictionary of keyword arguments to pass to the selected `method` function.
+            Default is None, which will use an empty dictionary.
+
+        Returns
+        -------
+        baselines : np.ndarray, shape (L, M, N)
+            An array of all of the baselines.
+        params : dict
+            A dictionary with the following items:
+
+            * 'average_weights': numpy.ndarray, shape (M, N)
+                The weight array used to fit all of the baselines.
+            * 'average_alpha': numpy.ndarray, shape (M, N)
+                Only returned if `method` is 'aspls' or 'pspline_aspls'. The
+                `alpha` array used to fit all of the baselines for the
+                :meth:`~pybaselines.whittaker.Whittaker.aspls` or
+                :meth:`~pybaselines.spline.Spline.pspline_aspls` methods.
+
+            Additional items depend on the output of the selected method. Every
+            other key will have a list of values, with each item corresponding to a
+            fit.
+
+        Notes
+        -----
+        If `method` is 'aspls' or 'pspline_aspls', `collab_pls` will also calculate
+        the `alpha` array for the entire dataset in the same manner as the weights.
+
+        References
+        ----------
+        Chen, L., et al. Collaborative Penalized Least Squares for Background
+        Correction of Multiple Raman Spectra. Journal of Analytical Methods
+        in Chemistry, 2018, 2018.
+ + """ + dataset, baseline_func, _, method_kws, _ = self._setup_optimizer( + data, method, (whittaker, morphological, classification, spline), method_kwargs, + True + ) + data_shape = dataset.shape + if len(data_shape) != 3: + raise ValueError(( + 'the input data must have a shape of (number of measurements, number of x points,' + f' number of y points), but instead has a shape of {data_shape}' + )) + method = method.lower() + # if using aspls or pspline_aspls, also need to calculate the alpha array + # for the entire dataset + calc_alpha = method in ('aspls', 'pspline_aspls') + + # step 1: calculate weights for the entire dataset + if average_dataset: + _, fit_params = baseline_func(np.mean(dataset, axis=0), **method_kws) + method_kws['weights'] = fit_params['weights'] + if calc_alpha: + method_kws['alpha'] = fit_params['alpha'] + else: + weights = np.empty(data_shape) + if calc_alpha: + alpha = np.empty(data_shape) + for i, entry in enumerate(dataset): + _, fit_params = baseline_func(entry, **method_kws) + weights[i] = fit_params['weights'] + if calc_alpha: + alpha[i] = fit_params['alpha'] + method_kws['weights'] = np.mean(weights, axis=0) + if calc_alpha: + method_kws['alpha'] = np.mean(alpha, axis=0) + + # step 2: use the dataset weights from step 1 (stored in method_kws['weights']) + # to fit each individual data entry; set tol to infinity so that only one + # iteration is done and new weights are not calculated + method_kws['tol'] = np.inf + baselines = np.empty(data_shape) + params = {'average_weights': method_kws['weights']} + if calc_alpha: + params['average_alpha'] = method_kws['alpha'] + if method == 'fabc': + # set weights as mask so it just fits the data + method_kws['weights_as_mask'] = True + + for i, entry in enumerate(dataset): + baselines[i], param = baseline_func(entry, **method_kws) + for key, value in param.items(): + if key in params: + params[key].append(value) + else: + params[key] = [value] + + return _sort_array2d(baselines, self._sort_order), params + + @_Algorithm2D._register + def adaptive_minmax(self, data, poly_order=None, method='modpoly', weights=None, + constrained_fraction=0.01, constrained_weight=1e5, + estimation_poly_order=2, method_kwargs=None): + """ + Fits polynomials of different orders and uses the maximum values as the baseline. + + Each polynomial order fit is done both unconstrained and constrained at the + endpoints. + + Parameters + ---------- + data : array-like, shape (N,) + The y-values of the measured data, with N data points. + poly_order : int or Sequence(int, int) or None, optional + The two polynomial orders to use for fitting. If a single integer is given, + then will use the input value and one plus the input value. Default is None, + which will do a preliminary fit using a polynomial of order `estimation_poly_order` + and then select the appropriate polynomial orders according to [7]_. + method : {'modpoly', 'imodpoly'}, optional + The method to use for fitting each polynomial. Default is 'modpoly'. + weights : array-like, shape (N,), optional + The weighting array. If None (default), then will be an array with + size equal to N and all values set to 1. + constrained_fraction : float or Sequence(float, float), optional + The fraction of points at the left and right edges to use for the + constrained fit. Default is 0.01. If `constrained_fraction` is a sequence, + the first item is the fraction for the left edge and the second is the + fraction for the right edge. 
+ constrained_weight : float or Sequence(float, float), optional + The weighting to give to the endpoints. Higher values ensure that the + end points are fit, but can cause large fluctuations in the other sections + of the polynomial. Default is 1e5. If `constrained_weight` is a sequence, + the first item is the weight for the left edge and the second is the + weight for the right edge. + estimation_poly_order : int, optional + The polynomial order used for estimating the baseline-to-signal ratio + to select the appropriate polynomial orders if `poly_order` is None. + Default is 2. + method_kwargs : dict, optional + Additional keyword arguments to pass to + :meth:`~pybaselines.polynomial.Polynomial.modpoly` or + :meth:`~pybaselines.polynomial.Polynomial.imodpoly`. These include + `tol`, `max_iter`, `use_original`, `mask_initial_peaks`, and `num_std`. + + Returns + ------- + numpy.ndarray, shape (N,) + The calculated baseline. + params : dict + A dictionary with the following items: + + * 'weights': numpy.ndarray, shape (N,) + The weight array used for fitting the data. + * 'constrained_weights': numpy.ndarray, shape (N,) + The weight array used for the endpoint-constrained fits. + * 'poly_order': numpy.ndarray, shape (2,) + An array of the two polynomial orders used for the fitting. + + References + ---------- + .. [7] Cao, A., et al. A robust method for automated background subtraction + of tissue fluorescence. Journal of Raman Spectroscopy, 2007, 38, + 1199-1205. + + """ + y, baseline_func, _, method_kws, _ = self._setup_optimizer( + data, method, [polynomial], method_kwargs, False + ) + sort_weights = weights is not None + weight_array = _check_optional_array( + self._len, weights, check_finite=self._check_finite, ensure_1d=False, axis=slice(None) + ) + if poly_order is None: + poly_orders = _determine_polyorders( + y, estimation_poly_order, weight_array, baseline_func, **method_kws + ) + else: + poly_orders, scalar_poly_order = _check_scalar(poly_order, 2, True, dtype=int) + if scalar_poly_order: + poly_orders[1] += 1 # add 1 since they are initially equal if scalar input + + # use high weighting rather than Lagrange multipliers to constrain the points + # to better work with noisy data + # allow either 4 or 2 inputs for constrained weight and fraction + try: + weightings = _check_scalar(constrained_weight, 4, True)[0] + except ValueError: + weightings = _check_scalar(constrained_weight, 2, True)[0] + weightings = np.array([weightings[0], weightings[0], weightings[1], weightings[1]]) + try: + constrained_fractions = _check_scalar(constrained_fraction, 4, True)[0] + except ValueError: + constrained_fractions = _check_scalar(constrained_fraction, 2, True)[0] + constrained_fractions = np.array([ + constrained_fractions[0], constrained_fractions[0], + constrained_fractions[1], constrained_fractions[1] + ]) + if np.any(constrained_fractions < 0) or np.any(constrained_fractions > 1): + raise ValueError('constrained_fraction must be between 0 and 1') + + # have to temporarily sort weights to match x- and y-ordering so that left and right edges + # are correct + if sort_weights: + weight_array = _sort_array2d(weight_array, self._sort_order) + + constrained_weights = weight_array.copy() + constrained_weights[:ceil(self._len[0] * constrained_fractions[0])] = weightings[0] + constrained_weights[:, :ceil(self._len[1] * constrained_fractions[2])] = weightings[2] + constrained_weights[ + self._len[0] - ceil(self._len[0] * constrained_fractions[1]): + ] = weightings[1] + constrained_weights[ + :, 
self._len[1] - ceil(self._len[1] * constrained_fractions[3]): + ] = weightings[3] + # and now change back to original ordering + if sort_weights: + weight_array = _sort_array2d(weight_array, self._inverted_order) + constrained_weights = _sort_array2d(constrained_weights, self._inverted_order) + + # TODO should make parameters available; a list with an item for each fit like collab_pls + # TODO could maybe just use itertools.permutations, but would want to know the order in + # which the parameters are used + baselines = np.empty((4, *self._len)) + baselines[0] = baseline_func( + data=y, poly_order=poly_orders[0], weights=weight_array, **method_kws + )[0] + baselines[1] = baseline_func( + data=y, poly_order=poly_orders[0], weights=constrained_weights, **method_kws + )[0] + baselines[2] = baseline_func( + data=y, poly_order=poly_orders[1], weights=weight_array, **method_kws + )[0] + baselines[3] = baseline_func( + data=y, poly_order=poly_orders[1], weights=constrained_weights, **method_kws + )[0] + + # TODO should the coefficients also be made available? Would need to get them from + # each of the fits + params = { + 'weights': weight_array, 'constrained_weights': constrained_weights, + 'poly_order': poly_orders + } + + return _sort_array2d(np.maximum.reduce(baselines), self._sort_order), params + + +def _determine_polyorders(y, poly_order, weights, fit_function, **fit_kwargs): + """ + Selects the appropriate polynomial orders based on the baseline-to-signal ratio. + + Parameters + ---------- + y : numpy.ndarray + The array of y-values. + poly_order : int + The polynomial order for fitting. + weights : numpy.ndarray + The weight array for fitting. + fit_function : Callable + The function to use for the polynomial fit. + **fit_kwargs + Additional keyword arguments to pass to `fit_function`. + + Returns + ------- + orders : numpy.ndarray, shape (2,) + The two polynomial orders to use based on the baseline to signal + ratio according to the reference. + + References + ---------- + Cao, A., et al. A robust method for automated background subtraction + of tissue fluorescence. Journal of Raman Spectroscopy, 2007, 38, 1199-1205. + + """ + baseline = fit_function(y, poly_order=poly_order, weights=weights, **fit_kwargs)[0] + signal = y - baseline + baseline_to_signal = (baseline.max() - baseline.min()) / (signal.max() - signal.min()) + # Table 2 in reference # TODO in 2D does this need changed? 
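+    # the cutoffs below follow Table 2 of the reference, which was derived for
+    # 1D data; the same cutoffs are kept for the 2D case pending the TODO above.
+    # For example, a baseline-to-signal ratio of 0.5 falls within [0.2, 0.75)
+    # and therefore selects polynomial orders (2, 3).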
+
+    if baseline_to_signal < 0.2:
+        orders = (1, 2)
+    elif baseline_to_signal < 0.75:
+        orders = (2, 3)
+    elif baseline_to_signal < 8.5:
+        orders = (3, 4)
+    elif baseline_to_signal < 55:
+        orders = (4, 5)
+    elif baseline_to_signal < 240:
+        orders = (5, 6)
+    elif baseline_to_signal < 517:
+        orders = (6, 7)
+    else:
+        orders = (6, 8)  # not a typo, use 6 and 8 rather than 7 and 8
+
+    return np.array(orders)
diff --git a/pybaselines/utils.py b/pybaselines/utils.py
index 14defe7..10fd6eb 100644
--- a/pybaselines/utils.py
+++ b/pybaselines/utils.py
@@ -308,11 +308,11 @@ def pad_edges(data, pad_length, mode='extrapolate',
     return padded_data
 
 
-def pad_edges2d(data, pad_length, *args, **kwargs):
+def pad_edges2d(data, pad_length, *args, mode='edge', **kwargs):
     if not _check_scalar(pad_length, None)[1]:
         raise NotImplementedError('separate pad lengths not yet supported')
     else:
-        return pad_edges(data, pad_length, *args, **kwargs)
+        return pad_edges(data, pad_length, *args, mode=mode, **kwargs)
 
 
 def padded_convolve(data, kernel, mode='reflect', **pad_kwargs):
diff --git a/tests/two_d/test_algorithm_setup.py b/tests/two_d/test_algorithm_setup.py
index e1e7c2b..7f548cd 100644
--- a/tests/two_d/test_algorithm_setup.py
+++ b/tests/two_d/test_algorithm_setup.py
@@ -11,7 +11,7 @@
 import pytest
 from scipy.sparse import identity, kron
 
-from pybaselines.two_d import _algorithm_setup
+from pybaselines.two_d import _algorithm_setup, optimizers, polynomial, whittaker
 from pybaselines.utils import ParameterWarning, difference_matrix
 
 from ..conftest import get_data2d, get_2dspline_inputs
@@ -672,3 +672,65 @@ def test_override_x(algorithm):
     assert new_algorithm.vandermonde is None
     assert new_algorithm.whittaker_system is None
     assert new_algorithm.pspline is None
+
+
+@pytest.mark.parametrize(
+    'method_and_outputs', (
+        ('collab_pls', 'collab_pls', 'optimizers'),
+        ('COLLAB_pls', 'collab_pls', 'optimizers'),
+        ('modpoly', 'modpoly', 'polynomial'),
+        ('asls', 'asls', 'whittaker')
+    )
+)
+def test_get_function(algorithm, method_and_outputs):
+    """Ensures _get_function gets the correct method, regardless of case."""
+    method, expected_func, expected_module = method_and_outputs
+    tested_modules = [optimizers, polynomial, whittaker]
+    selected_func, module, class_object = algorithm._get_function(
+        method, tested_modules
+    )
+    assert selected_func.__name__ == expected_func
+    assert module == expected_module
+    assert isinstance(class_object, _algorithm_setup._Algorithm2D)
+
+
+def test_get_function_fails_wrong_method(algorithm):
+    """Ensures _get_function fails when no function with the input name is available."""
+    with pytest.raises(AttributeError):
+        algorithm._get_function('unknown function', [optimizers])
+
+
+def test_get_function_fails_no_module(algorithm):
+    """Ensures _get_function fails when not given any modules to search."""
+    with pytest.raises(AttributeError):
+        algorithm._get_function('collab_pls', [])
+
+
+@pytest.mark.parametrize('method_kwargs', (None, {'a': 2}))
+def test_setup_optimizer(small_data2d, algorithm, method_kwargs):
+    """Ensures output of _setup_optimizer is correct."""
+    y, fit_func, func_module, output_kwargs, class_object = algorithm._setup_optimizer(
+        small_data2d, 'asls', [whittaker], method_kwargs
+    )
+
+    assert isinstance(y, np.ndarray)
+    assert_allclose(y, small_data2d)
+    assert fit_func.__name__ == 'asls'
+    assert func_module == 'whittaker'
+    assert isinstance(output_kwargs, dict)
+    assert isinstance(class_object, _algorithm_setup._Algorithm2D)
+
+
+@pytest.mark.parametrize('copy_kwargs', (True, False)) +def test_setup_optimizer_copy_kwargs(small_data2d, algorithm, copy_kwargs): + """Ensures the copy behavior of the input keyword argument dictionary.""" + input_kwargs = {'a': 1} + y, _, _, output_kwargs, _ = algorithm._setup_optimizer( + small_data2d, 'asls', [whittaker], input_kwargs, copy_kwargs + ) + + output_kwargs['a'] = 2 + if copy_kwargs: + assert input_kwargs['a'] == 1 + else: + assert input_kwargs['a'] == 2 diff --git a/tests/two_d/test_api.py b/tests/two_d/test_api.py index 617483c..63aebc5 100644 --- a/tests/two_d/test_api.py +++ b/tests/two_d/test_api.py @@ -11,7 +11,7 @@ import pytest from pybaselines.two_d import ( - api, morphological, polynomial, smooth, spline, whittaker + api, morphological, optimizers, polynomial, smooth, spline, whittaker ) from ..conftest import get_data2d @@ -19,6 +19,7 @@ _ALL_CLASSES = ( morphological._Morphological, + optimizers._Optimizers, polynomial._Polynomial, smooth._Smooth, spline._Spline, @@ -81,7 +82,7 @@ def test_all_methods(self, method_and_class): method, baseline_class = method_and_class # collab_pls needs 2D input data if method == 'collab_pls': - fit_data = np.vstack((self.y, self.y)) + fit_data = np.array((self.y, self.y)) else: fit_data = self.y diff --git a/tests/two_d/test_optimizers.py b/tests/two_d/test_optimizers.py new file mode 100644 index 0000000..2a03da8 --- /dev/null +++ b/tests/two_d/test_optimizers.py @@ -0,0 +1,238 @@ +# -*- coding: utf-8 -*- +"""Tests for pybaselines.optimizers. + +@author: Donald Erb +Created on January 14, 2024 + +""" + +import numpy as np +from numpy.testing import assert_allclose, assert_array_equal +import pytest + +from pybaselines import utils +from pybaselines.two_d import optimizers, polynomial + +from ..conftest import BaseTester2D, InputWeightsMixin + + +class OptimizerInputWeightsMixin(InputWeightsMixin): + """Passes weights within the `method_kwargs` dictionary.""" + + def test_input_weights(self, assertion_kwargs=None, **kwargs): + """ + Ensures arrays are correctly sorted within the function. + + Returns the output for further testing. 
+ + """ + # TODO replace with np.random.default_rng when min numpy version is >= 1.17 + weights = np.random.RandomState(0).normal(0.8, 0.05, self.y.shape[-2:]) + weights = np.clip(weights, 0, 1).astype(float, copy=False) + + reverse_fitter = self.algorithm_base(self.x[::-1], self.z[::-1], assume_sorted=False) + + regular_output, regular_output_params = self.class_func( + data=self.y, method_kwargs={'weights': weights}, **self.kwargs, **kwargs + ) + reverse_output, reverse_output_params = getattr(reverse_fitter, self.func_name)( + data=self.reverse_array(self.y), method_kwargs={'weights': self.reverse_array(weights)}, + **self.kwargs, **kwargs + ) + + if assertion_kwargs is None: + assertion_kwargs = {} + if 'rtol' not in assertion_kwargs: + assertion_kwargs['rtol'] = 1e-10 + if 'atol' not in assertion_kwargs: + assertion_kwargs['atol'] = 1e-14 + + for key in self.weight_keys: + assert_allclose( + regular_output_params[key], self.reverse_array(reverse_output_params[key]), + **assertion_kwargs + ) + assert_allclose( + regular_output, self.reverse_array(reverse_output), **assertion_kwargs + ) + + return regular_output, regular_output_params, reverse_output, reverse_output_params + + +class OptimizersTester(BaseTester2D): + """Base testing class for optimizer functions.""" + + module = optimizers + algorithm_base = optimizers._Optimizers + + +class TestCollabPLS(OptimizersTester, OptimizerInputWeightsMixin): + """Class for testing collab_pls baseline.""" + + func_name = "collab_pls" + # will need to change checked_keys if default method is changed + checked_keys = ('average_weights', 'weights', 'tol_history') + three_d = True + weight_keys = ('average_weights',) + + @pytest.mark.parametrize( + 'method', + ( + 'asls', 'iasls', 'airpls', 'arpls', 'drpls', 'iarpls', 'aspls', 'psalsa', + 'mixture_model', 'irsqr', 'pspline_asls', + 'pspline_airpls', 'pspline_arpls', + 'pspline_iarpls', 'pspline_psalsa', + ) + ) + def test_all_methods(self, method): + """Ensures all available methods work.""" + self.class_func(self.y, method=method) + + def test_unknown_method_fails(self): + """Ensures function fails when an unknown function is given.""" + with pytest.raises(AttributeError): + self.class_func(self.y, method='unknown function') + + def test_single_dataset_fails(self): + """Ensures an error is raised if the input has the shape (M, N).""" + with pytest.raises(ValueError, match='the input data must'): + self.class_func(np.arange(self.y[0].size).reshape(self.y.shape[-2:])) + + @pytest.mark.parametrize('average_dataset', (True, False)) + def test_input_weights(self, average_dataset): + """Ensures the input weights are sorted correctly.""" + output = super().test_input_weights(average_dataset=average_dataset) + regular_output, regular_output_params, reverse_output, reverse_output_params = output + + assert_allclose( + regular_output_params['weights'], + self.reverse_array(np.asarray(reverse_output_params['weights'])), + rtol=1e-12, atol=1e-14 + ) + + +@pytest.mark.parametrize( + 'baseline_ptp', (0.01, 0.1, 0.3, 0.5, 1, 5, 10, 40, 100, 200, 300, 500, 600, 1000) +) +def test_determine_polyorders(baseline_ptp): + """Ensures the correct polynomials are selected based on the signal to baseline ratio.""" + x = np.linspace(0, 100, 500) + z = np.linspace(0, 100, 400) + X, Z = np.meshgrid(x, z, indexing='ij') + # set y such that max(y) - min(y) is ~ 1 so that + # ptp(baseline) / ptp(y) ~= ptp(baseline) + y = ( + utils.gaussian2d(X, Z, 1, 25, 25, 2, 2) + + utils.gaussian2d(X, Z, 0.5, 50, 50, 2, 2) + + 
utils.gaussian2d(X, Z, 1, 75, 75, 2, 2) + ) + # use a linear baseline so that it's easy to set the peak-to-peak of the baseline + true_baseline = X * baseline_ptp / (x.max() - x.min()) + + # double check to make sure the system is setup as expected + assert_allclose(np.ptp(true_baseline), baseline_ptp, 0, 1e-3) + assert_allclose(np.ptp(y), 1, 0, 1e-3) + + fitter = polynomial._Polynomial(x, z, check_finite=False, assume_sorted=True) + + fit_baseline = fitter.modpoly(y + true_baseline, poly_order=1)[0] + # sanity check to make sure internal baseline fit was correct + assert_allclose(np.ptp(fit_baseline), baseline_ptp, 0, 5e-3) + + if baseline_ptp < 0.2: + expected_orders = (1, 2) + elif baseline_ptp < 0.75: + expected_orders = (2, 3) + elif baseline_ptp < 8.5: + expected_orders = (3, 4) + elif baseline_ptp < 55: + expected_orders = (4, 5) + elif baseline_ptp < 240: + expected_orders = (5, 6) + elif baseline_ptp < 517: + expected_orders = (6, 7) + else: + expected_orders = (6, 8) + + output_orders = optimizers._determine_polyorders( + y + true_baseline, poly_order=1, weights=None, fit_function=fitter.modpoly + ) + + assert_array_equal(output_orders, expected_orders) + + +class TestAdaptiveMinMax(OptimizersTester, InputWeightsMixin): + """Class for testing adaptive_minmax baseline.""" + + func_name = 'adaptive_minmax' + checked_keys = ('weights', 'constrained_weights', 'poly_order') + weight_keys = ('weights', 'constrained_weights') + + @pytest.mark.parametrize('method', ('modpoly', 'imodpoly')) + def test_methods(self, method): + """Ensures all available methods work.""" + self.class_func(self.y, method=method) + + def test_unknown_method_fails(self): + """Ensures function fails when an unknown function is given.""" + with pytest.raises(AttributeError): + self.class_func(self.y, method='unknown') + + @pytest.mark.parametrize('poly_order', (None, 0, [0], (0, 1))) + def test_polyorder_inputs(self, poly_order): + """Tests valid inputs for poly_order.""" + self.class_func(self.y, poly_order) + + @pytest.mark.parametrize('poly_order', (0, [0], (0, 1))) + def test_polyorder_outputs(self, poly_order): + """Ensures that the correct polynomial orders were used.""" + _, params = self.class_func(self.y, poly_order) + assert_array_equal(params['poly_order'], np.array([0, 1])) + + @pytest.mark.parametrize('poly_order', ([0, 1, 2], (0, 1, 2, 3))) + def test_too_many_polyorders_fails(self, poly_order): + """Ensures an error is raised if poly_order has more than two items.""" + with pytest.raises(ValueError): + self.class_func(self.y, poly_order) + + @pytest.mark.parametrize( + 'constrained_fraction', (0.01, [0.01], (0, 0.01), [0.01, 1], [0.01, 0.01, 0.01, 0.01]) + ) + def test_constrained_fraction_inputs(self, constrained_fraction): + """Tests valid inputs for constrained_fraction.""" + self.class_func(self.y, constrained_fraction=constrained_fraction) + + @pytest.mark.parametrize( + 'constrained_fraction', ([0.01, 0.02, 0.02], (0.01, 0.01, 0.01, 0.01, 0.01)) + ) + def test_too_many_constrained_fraction(self, constrained_fraction): + """Ensures an error is raised if constrained_fraction has more than two items.""" + with pytest.raises(ValueError): + self.class_func(self.y, constrained_fraction=constrained_fraction) + + @pytest.mark.parametrize('constrained_fraction', (-0.5, [-0.01, 0.02], 1.1, [0.05, 1.1])) + def test_invalid_constrained_fraction(self, constrained_fraction): + """Ensures an error is raised if constrained_fraction is outside of [0, 1].""" + with pytest.raises(ValueError): + 
self.class_func(self.y, constrained_fraction=constrained_fraction)
+
+    @pytest.mark.parametrize('constrained_weight', (1e5, [1e5], (1e3, 1e5), [1e3, 1e3, 1e3, 1e3]))
+    def test_constrained_weight_inputs(self, constrained_weight):
+        """Tests valid inputs for constrained_weight."""
+        self.class_func(self.y, constrained_weight=constrained_weight)
+
+    @pytest.mark.parametrize('constrained_weight', ([1e4, 1e2, 1e5], (1e3, 1e3, 1e3, 1e3, 1e3)))
+    def test_too_many_constrained_weight(self, constrained_weight):
+        """Ensures an error is raised if constrained_weight has more than two items."""
+        with pytest.raises(ValueError):
+            self.class_func(self.y, constrained_weight=constrained_weight)
+
+    def test_input_weights(self):
+        """Ensures the input weights are sorted correctly."""
+        # use different weightings and constrained fractions for left and right
+        # sides so that if weights are reversed, there is a clear difference
+        weightings = np.array([1e4, 1e5, 1e4, 1e5])
+        constrained_fractions = np.array([0.01, 0.02, 0.01, 0.02])
+        super().test_input_weights(
+            constrained_weight=weightings, constrained_fraction=constrained_fractions
+        )

From 5765527953de2d933c7664633ad61ee8f6d1e633 Mon Sep 17 00:00:00 2001
From: Donnie Erb <55961724+derb12@users.noreply.github.com>
Date: Tue, 16 Jan 2024 20:47:36 -0500
Subject: [PATCH 25/56] FEAT: Add 2D version of noise_median

Still need to work out minor details for it.

---
 pybaselines/two_d/smooth.py | 67 +++++++++++++++++++++++++++++++++++--
 tests/two_d/test_smooth.py  | 38 +++++++++++++++++++++
 2 files changed, 103 insertions(+), 2 deletions(-)
 create mode 100644 tests/two_d/test_smooth.py

diff --git a/pybaselines/two_d/smooth.py b/pybaselines/two_d/smooth.py
index 1e5517a..b0656e7 100644
--- a/pybaselines/two_d/smooth.py
+++ b/pybaselines/two_d/smooth.py
@@ -6,5 +6,68 @@
 
 """
 
-class _Smooth:
-    pass
\ No newline at end of file
+from scipy.ndimage import gaussian_filter, median_filter
+
+from ._algorithm_setup import _Algorithm2D, _optimize_window
+
+class _Smooth(_Algorithm2D):
+    """A base class for all smoothing algorithms."""
+
+    @_Algorithm2D._register
+    def noise_median(self, data, half_window=None, smooth_half_window=None, sigma=None,
+                     **pad_kwargs):
+        """
+        The noise-median method for baseline identification.
+
+        Assumes the baseline can be considered as the median value within a moving
+        window, and the resulting baseline is then smoothed with a Gaussian kernel.
+
+        Parameters
+        ----------
+        data : array-like, shape (N,)
+            The y-values of the measured data, with N data points.
+        half_window : int, optional
+            The index-based size to use for the median window. The total window
+            size will range from [-half_window, ..., half_window] with size
+            2 * half_window + 1. Default is None, which will use twice the output from
+            :func:`.optimize_window`, which is an okay starting value.
+        smooth_half_window : int, optional
+            The half window to use for smoothing. Default is None, which will use
+            the same value as `half_window`.
+        sigma : float, optional
+            The standard deviation of the smoothing Gaussian kernel. Default is None,
+            which will use (2 * `smooth_half_window` + 1) / 6.
+        **pad_kwargs
+            Additional keyword arguments to pass to :func:`.pad_edges` for padding
+            the edges of the data to prevent edge effects from convolution.
+
+        Returns
+        -------
+        baseline : numpy.ndarray, shape (N,)
+            The calculated and smoothed baseline.
+        dict
+            An empty dictionary, just to match the output of all other algorithms.
+
+        References
+        ----------
+        Friedrichs, M., A model-free algorithm for the removal of baseline
+        artifacts. J. Biomolecular NMR, 1995, 5, 147-153.
+
+        """
+        if half_window is None:
+            half_window = 2 * _optimize_window(data)
+        window_size = 2 * half_window + 1
+        median = median_filter(
+            self._setup_smooth(data, half_window, **pad_kwargs),
+            [window_size, window_size], mode='nearest'
+        )
+        if smooth_half_window is None:
+            smooth_window = window_size
+        else:
+            smooth_window = 2 * smooth_half_window + 1
+        if sigma is None:
+            # the gaussian kernel will include +- 3 sigma
+            sigma = smooth_window / 6
+
+        baseline = gaussian_filter(median, sigma, truncate=smooth_window)  # TODO check truncate value
+        return baseline[half_window:-half_window, half_window:-half_window], {}
diff --git a/tests/two_d/test_smooth.py b/tests/two_d/test_smooth.py
new file mode 100644
index 0000000..47d262b
--- /dev/null
+++ b/tests/two_d/test_smooth.py
@@ -0,0 +1,38 @@
+# -*- coding: utf-8 -*-
+"""Tests for pybaselines.two_d.smooth.
+
+@author: Donald Erb
+Created on January 14, 2024
+
+"""
+
+import pytest
+
+from pybaselines.two_d import smooth
+
+from ..conftest import BaseTester2D
+
+
+class SmoothTester(BaseTester2D):
+    """Base testing class for smooth functions."""
+
+    module = smooth
+    algorithm_base = smooth._Smooth
+
+
+class TestNoiseMedian(SmoothTester):
+    """Class for testing noise median baseline."""
+
+    func_name = 'noise_median'
+    required_kwargs = {'half_window': 15}
+
+    @pytest.mark.parametrize('new_instance', (True, False))
+    @pytest.mark.parametrize('smooth_hw', (None, 0, 2))
+    def test_unchanged_data(self, new_instance, smooth_hw):
+        """Ensures that input data is unchanged by the function."""
+        super().test_unchanged_data(new_instance, smooth_half_window=smooth_hw)
+
+    @pytest.mark.parametrize('half_window', (None, 15))
+    def test_half_windows(self, half_window):
+        """Tests possible inputs for `half_window`."""
+        self.class_func(self.y, half_window=half_window)

From a793bd5539d6a92e01dae8b7116288cd8264fc6d Mon Sep 17 00:00:00 2001
From: Donnie Erb <55961724+derb12@users.noreply.github.com>
Date: Tue, 16 Jan 2024 20:51:47 -0500
Subject: [PATCH 26/56] TEST: Add comparison test to statsmodels for 2D quantile regression

Bumped required numpy to 1.17 in order to use default_rng; will update all
the other places with rng in the dev branch after merging this branch into it.
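
As a rough sketch of that migration (illustrative only, not part of this
patch), the legacy RandomState usage in the tests maps onto the Generator
API like so:

    # current: legacy interface, seeded for reproducibility
    rng = np.random.RandomState(rng_seed)
    values = rng.normal(0, 20, 1000)

    # planned: numpy >= 1.17 Generator interface; RandomState and default_rng
    # produce different streams for the same seed, so any values hard-coded
    # from seeded output will need to be regenerated
    rng = np.random.default_rng(rng_seed)
    values = rng.normal(0, 20, 1000)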
--- setup.cfg | 2 +- setup.py | 2 +- tests/data.py | 474 +++++++++++++++++++++++++++++++++ tests/test_polynomial.py | 2 +- tests/two_d/test_polynomial.py | 59 ++++ 5 files changed, 536 insertions(+), 3 deletions(-) diff --git a/setup.cfg b/setup.cfg index b9f72cd..9b476ca 100644 --- a/setup.cfg +++ b/setup.cfg @@ -47,7 +47,7 @@ packages = find: include_package_data = True python_requires = >=3.6 install_requires = - numpy>=1.14 + numpy>=1.17 scipy>=1.0 zip_safe = False diff --git a/setup.py b/setup.py index e586bf5..11cc841 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ Notes on minimum required versions for dependencies: -numpy: >= 1.14 in order to use rcond=None with numpy.linalg.lstsq +numpy: >= 1.17 in order to use numpy.random.default_rng scipy: >= 1.0 to use the blas function gbmv for banded matrix-vector dot product pentapy: >= 1.0 to use solver #2 numba: >= 0.45 in order to cache jit-ed functions with parallel=True diff --git a/tests/data.py b/tests/data.py index 4257787..503ce91 100644 --- a/tests/data.py +++ b/tests/data.py @@ -459,6 +459,480 @@ ]) } +STATSMODELS_QUANTILES_2D = { + 0.1: np.array([ + 1.82859650e+03, 1.65429084e+03, 1.47998518e+03, 1.30567952e+03, 1.13137386e+03, + 9.57068206e+02, 7.82762547e+02, 6.08456888e+02, 4.34151229e+02, 2.59845570e+02, + 8.55399116e+01, -8.87657472e+01, -2.63071406e+02, -4.37377065e+02, -6.11682724e+02, + -7.85988382e+02, -9.60294041e+02, -1.13459970e+03, -1.30890536e+03, -1.48321102e+03, + -1.65751668e+03, -1.83182234e+03, -2.00612799e+03, -2.18043365e+03, -2.35473931e+03, + -2.52904497e+03, -2.70335063e+03, -2.87765629e+03, -3.05196195e+03, -3.22626761e+03, + -3.40057326e+03, 1.66246682e+03, 1.50205814e+03, 1.34164945e+03, 1.18124077e+03, + 1.02083208e+03, 8.60423398e+02, 7.00014714e+02, 5.39606029e+02, 3.79197345e+02, + 2.18788660e+02, 5.83799758e+01, -1.02028709e+02, -2.62437393e+02, -4.22846078e+02, + -5.83254762e+02, -7.43663447e+02, -9.04072131e+02, -1.06448082e+03, -1.22488950e+03, + -1.38529818e+03, -1.54570687e+03, -1.70611555e+03, -1.86652424e+03, -2.02693292e+03, + -2.18734161e+03, -2.34775029e+03, -2.50815898e+03, -2.66856766e+03, -2.82897635e+03, + -2.98938503e+03, -3.14979371e+03, 1.49633714e+03, 1.34982543e+03, 1.20331372e+03, + 1.05680201e+03, 9.10290301e+02, 7.63778591e+02, 6.17266881e+02, 4.70755171e+02, + 3.24243460e+02, 1.77731750e+02, 3.12200399e+01, -1.15291670e+02, -2.61803380e+02, + -4.08315091e+02, -5.54826801e+02, -7.01338511e+02, -8.47850221e+02, -9.94361931e+02, + -1.14087364e+03, -1.28738535e+03, -1.43389706e+03, -1.58040877e+03, -1.72692048e+03, + -1.87343219e+03, -2.01994390e+03, -2.16645561e+03, -2.31296732e+03, -2.45947903e+03, + -2.60599074e+03, -2.75250245e+03, -2.89901416e+03, 1.33020746e+03, 1.19759273e+03, + 1.06497799e+03, 9.32363255e+02, 7.99748519e+02, 6.67133784e+02, 5.34519048e+02, + 4.01904312e+02, 2.69289576e+02, 1.36674840e+02, 4.06010412e+00, -1.28554632e+02, + -2.61169368e+02, -3.93784104e+02, -5.26398839e+02, -6.59013575e+02, -7.91628311e+02, + -9.24243047e+02, -1.05685778e+03, -1.18947252e+03, -1.32208725e+03, -1.45470199e+03, + -1.58731673e+03, -1.71993146e+03, -1.85254620e+03, -1.98516093e+03, -2.11777567e+03, + -2.25039041e+03, -2.38300514e+03, -2.51561988e+03, -2.64823461e+03, 1.16407778e+03, + 1.04536002e+03, 9.26642261e+02, 8.07924499e+02, 6.89206738e+02, 5.70488976e+02, + 4.51771215e+02, 3.33053453e+02, 2.14335691e+02, 9.56179299e+01, -2.30998317e+01, + -1.41817593e+02, -2.60535355e+02, -3.79253116e+02, -4.97970878e+02, -6.16688640e+02, + -7.35406401e+02, -8.54124163e+02, 
-9.72841924e+02, -1.09155969e+03, -1.21027745e+03, + -1.32899521e+03, -1.44771297e+03, -1.56643073e+03, -1.68514849e+03, -1.80386626e+03, + -1.92258402e+03, -2.04130178e+03, -2.16001954e+03, -2.27873730e+03, -2.39745506e+03, + 9.97948105e+02, 8.93127318e+02, 7.88306530e+02, 6.83485743e+02, 5.78664956e+02, + 4.73844169e+02, 3.69023381e+02, 2.64202594e+02, 1.59381807e+02, 5.45610197e+01, + -5.02597675e+01, -1.55080555e+02, -2.59901342e+02, -3.64722129e+02, -4.69542917e+02, + -5.74363704e+02, -6.79184491e+02, -7.84005278e+02, -8.88826065e+02, -9.93646853e+02, + -1.09846764e+03, -1.20328843e+03, -1.30810921e+03, -1.41293000e+03, -1.51775079e+03, + -1.62257158e+03, -1.72739236e+03, -1.83221315e+03, -1.93703394e+03, -2.04185473e+03, + -2.14667551e+03, 8.31818426e+02, 7.40894613e+02, 6.49970800e+02, 5.59046987e+02, + 4.68123174e+02, 3.77199361e+02, 2.86275548e+02, 1.95351735e+02, 1.04427923e+02, + 1.35041096e+01, -7.74197034e+01, -1.68343516e+02, -2.59267329e+02, -3.50191142e+02, + -4.41114955e+02, -5.32038768e+02, -6.22962581e+02, -7.13886394e+02, -8.04810207e+02, + -8.95734020e+02, -9.86657833e+02, -1.07758165e+03, -1.16850546e+03, -1.25942927e+03, + -1.35035308e+03, -1.44127690e+03, -1.53220071e+03, -1.62312452e+03, -1.71404834e+03, + -1.80497215e+03, -1.89589596e+03, 6.65688747e+02, 5.88661908e+02, 5.11635070e+02, + 4.34608231e+02, 3.57581393e+02, 2.80554554e+02, 2.03527715e+02, 1.26500877e+02, + 4.94740381e+01, -2.75528006e+01, -1.04579639e+02, -1.81606478e+02, -2.58633316e+02, + -3.35660155e+02, -4.12686994e+02, -4.89713832e+02, -5.66740671e+02, -6.43767509e+02, + -7.20794348e+02, -7.97821187e+02, -8.74848025e+02, -9.51874864e+02, -1.02890170e+03, + -1.10592854e+03, -1.18295538e+03, -1.25998222e+03, -1.33700906e+03, -1.41403590e+03, + -1.49106273e+03, -1.56808957e+03, -1.64511641e+03, 4.99559068e+02, 4.36429204e+02, + 3.73299339e+02, 3.10169475e+02, 2.47039611e+02, 1.83909747e+02, 1.20779882e+02, + 5.76500179e+01, -5.47984640e+00, -6.86097107e+01, -1.31739575e+02, -1.94869439e+02, + -2.57999304e+02, -3.21129168e+02, -3.84259032e+02, -4.47388897e+02, -5.10518761e+02, + -5.73648625e+02, -6.36778489e+02, -6.99908354e+02, -7.63038218e+02, -8.26168082e+02, + -8.89297947e+02, -9.52427811e+02, -1.01555768e+03, -1.07868754e+03, -1.14181740e+03, + -1.20494727e+03, -1.26807713e+03, -1.33120700e+03, -1.39433686e+03, 3.33429389e+02, + 2.84196499e+02, 2.34963609e+02, 1.85730719e+02, 1.36497829e+02, 8.72649391e+01, + 3.80320491e+01, -1.12008409e+01, -6.04337309e+01, -1.09666621e+02, -1.58899511e+02, + -2.08132401e+02, -2.57365291e+02, -3.06598181e+02, -3.55831071e+02, -4.05063961e+02, + -4.54296851e+02, -5.03529741e+02, -5.52762631e+02, -6.01995521e+02, -6.51228411e+02, + -7.00461301e+02, -7.49694191e+02, -7.98927081e+02, -8.48159971e+02, -8.97392861e+02, + -9.46625751e+02, -9.95858641e+02, -1.04509153e+03, -1.09432442e+03, -1.14355731e+03, + 1.67299710e+02, 1.31963794e+02, 9.66278787e+01, 6.12919631e+01, 2.59560474e+01, + -9.37986829e+00, -4.47157840e+01, -8.00516996e+01, -1.15387615e+02, -1.50723531e+02, + -1.86059447e+02, -2.21395362e+02, -2.56731278e+02, -2.92067194e+02, -3.27403109e+02, + -3.62739025e+02, -3.98074941e+02, -4.33410856e+02, -4.68746772e+02, -5.04082688e+02, + -5.39418603e+02, -5.74754519e+02, -6.10090435e+02, -6.45426350e+02, -6.80762266e+02, + -7.16098182e+02, -7.51434097e+02, -7.86770013e+02, -8.22105929e+02, -8.57441844e+02, + -8.92777760e+02, 1.17003110e+00, -2.02689103e+01, -4.17078516e+01, -6.31467930e+01, + -8.45857343e+01, -1.06024676e+02, -1.27463617e+02, -1.48902558e+02, 
-1.70341500e+02, + -1.91780441e+02, -2.13219382e+02, -2.34658324e+02, -2.56097265e+02, -2.77536207e+02, + -2.98975148e+02, -3.20414089e+02, -3.41853031e+02, -3.63291972e+02, -3.84730913e+02, + -4.06169855e+02, -4.27608796e+02, -4.49047737e+02, -4.70486679e+02, -4.91925620e+02, + -5.13364561e+02, -5.34803503e+02, -5.56242444e+02, -5.77681386e+02, -5.99120327e+02, + -6.20559268e+02, -6.41998210e+02, -1.64959648e+02, -1.72501615e+02, -1.80043582e+02, + -1.87585549e+02, -1.95127516e+02, -2.02669483e+02, -2.10211450e+02, -2.17753417e+02, + -2.25295384e+02, -2.32837351e+02, -2.40379318e+02, -2.47921285e+02, -2.55463252e+02, + -2.63005219e+02, -2.70547186e+02, -2.78089154e+02, -2.85631121e+02, -2.93173088e+02, + -3.00715055e+02, -3.08257022e+02, -3.15798989e+02, -3.23340956e+02, -3.30882923e+02, + -3.38424890e+02, -3.45966857e+02, -3.53508824e+02, -3.61050791e+02, -3.68592758e+02, + -3.76134725e+02, -3.83676692e+02, -3.91218659e+02, -3.31089327e+02, -3.24734320e+02, + -3.18379312e+02, -3.12024305e+02, -3.05669298e+02, -2.99314290e+02, -2.92959283e+02, + -2.86604276e+02, -2.80249269e+02, -2.73894261e+02, -2.67539254e+02, -2.61184247e+02, + -2.54829240e+02, -2.48474232e+02, -2.42119225e+02, -2.35764218e+02, -2.29409211e+02, + -2.23054203e+02, -2.16699196e+02, -2.10344189e+02, -2.03989181e+02, -1.97634174e+02, + -1.91279167e+02, -1.84924160e+02, -1.78569152e+02, -1.72214145e+02, -1.65859138e+02, + -1.59504131e+02, -1.53149123e+02, -1.46794116e+02, -1.40439109e+02, -4.97219006e+02, + -4.76967024e+02, -4.56715043e+02, -4.36463061e+02, -4.16211079e+02, -3.95959098e+02, + -3.75707116e+02, -3.55455135e+02, -3.35203153e+02, -3.14951172e+02, -2.94699190e+02, + -2.74447208e+02, -2.54195227e+02, -2.33943245e+02, -2.13691264e+02, -1.93439282e+02, + -1.73187300e+02, -1.52935319e+02, -1.32683337e+02, -1.12431356e+02, -9.21793741e+01, + -7.19273925e+01, -5.16754109e+01, -3.14234293e+01, -1.11714478e+01, 9.08053383e+00, + 2.93325154e+01, 4.95844970e+01, 6.98364786e+01, 9.00884602e+01, 1.10340442e+02, + -6.63348685e+02, -6.29199729e+02, -5.95050773e+02, -5.60901817e+02, -5.26752861e+02, + -4.92603905e+02, -4.58454949e+02, -4.24305993e+02, -3.90157038e+02, -3.56008082e+02, + -3.21859126e+02, -2.87710170e+02, -2.53561214e+02, -2.19412258e+02, -1.85263302e+02, + -1.51114346e+02, -1.16965390e+02, -8.28164345e+01, -4.86674786e+01, -1.45185227e+01, + 1.96304332e+01, 5.37793891e+01, 8.79283450e+01, 1.22077301e+02, 1.56226257e+02, + 1.90375213e+02, 2.24524169e+02, 2.58673125e+02, 2.92822080e+02, 3.26971036e+02, + 3.61119992e+02, -8.29478364e+02, -7.81432434e+02, -7.33386503e+02, -6.85340573e+02, + -6.37294643e+02, -5.89248713e+02, -5.41202782e+02, -4.93156852e+02, -4.45110922e+02, + -3.97064992e+02, -3.49019062e+02, -3.00973131e+02, -2.52927201e+02, -2.04881271e+02, + -1.56835341e+02, -1.08789411e+02, -6.07434803e+01, -1.26975501e+01, 3.53483801e+01, + 8.33943103e+01, 1.31440241e+02, 1.79486171e+02, 2.27532101e+02, 2.75578031e+02, + 3.23623961e+02, 3.71669892e+02, 4.19715822e+02, 4.67761752e+02, 5.15807682e+02, + 5.63853612e+02, 6.11899543e+02, -9.95608043e+02, -9.33665138e+02, -8.71722234e+02, + -8.09779329e+02, -7.47836425e+02, -6.85893520e+02, -6.23950616e+02, -5.62007711e+02, + -5.00064806e+02, -4.38121902e+02, -3.76178997e+02, -3.14236093e+02, -2.52293188e+02, + -1.90350284e+02, -1.28407379e+02, -6.64644748e+01, -4.52157026e+00, 5.74213343e+01, + 1.19364239e+02, 1.81307143e+02, 2.43250048e+02, 3.05192952e+02, 3.67135857e+02, + 4.29078761e+02, 4.91021666e+02, 5.52964571e+02, 6.14907475e+02, 6.76850380e+02, + 
7.38793284e+02, 8.00736189e+02, 8.62679093e+02, -1.16173772e+03, -1.08589784e+03, + -1.01005796e+03, -9.34218085e+02, -8.58378206e+02, -7.82538327e+02, -7.06698449e+02, + -6.30858570e+02, -5.55018691e+02, -4.79178812e+02, -4.03338933e+02, -3.27499054e+02, + -2.51659176e+02, -1.75819297e+02, -9.99794179e+01, -2.41395390e+01, 5.17003398e+01, + 1.27540219e+02, 2.03380097e+02, 2.79219976e+02, 3.55059855e+02, 4.30899734e+02, + 5.06739613e+02, 5.82579492e+02, 6.58419371e+02, 7.34259249e+02, 8.10099128e+02, + 8.85939007e+02, 9.61778886e+02, 1.03761876e+03, 1.11345864e+03, -1.32786740e+03, + -1.23813055e+03, -1.14839369e+03, -1.05865684e+03, -9.68919988e+02, -8.79183135e+02, + -7.89446282e+02, -6.99709429e+02, -6.09972575e+02, -5.20235722e+02, -4.30498869e+02, + -3.40762016e+02, -2.51025163e+02, -1.61288310e+02, -7.15514565e+01, 1.81853967e+01, + 1.07922250e+02, 1.97659103e+02, 2.87395956e+02, 3.77132809e+02, 4.66869663e+02, + 5.56606516e+02, 6.46343369e+02, 7.36080222e+02, 8.25817075e+02, 9.15553928e+02, + 1.00529078e+03, 1.09502763e+03, 1.18476449e+03, 1.27450134e+03, 1.36423819e+03, + -1.49399708e+03, -1.39036325e+03, -1.28672942e+03, -1.18309560e+03, -1.07946177e+03, + -9.75827942e+02, -8.72194115e+02, -7.68560287e+02, -6.64926460e+02, -5.61292632e+02, + -4.57658805e+02, -3.54024977e+02, -2.50391150e+02, -1.46757322e+02, -4.31234950e+01, + 6.05103325e+01, 1.64144160e+02, 2.67777987e+02, 3.71411815e+02, 4.75045642e+02, + 5.78679470e+02, 6.82313297e+02, 7.85947125e+02, 8.89580952e+02, 9.93214780e+02, + 1.09684861e+03, 1.20048243e+03, 1.30411626e+03, 1.40775009e+03, 1.51138392e+03, + 1.61501774e+03, -1.66012676e+03, -1.54259596e+03, -1.42506516e+03, -1.30753435e+03, + -1.19000355e+03, -1.07247275e+03, -9.54941948e+02, -8.37411146e+02, -7.19880344e+02, + -6.02349543e+02, -4.84818741e+02, -3.67287939e+02, -2.49757137e+02, -1.32226335e+02, + -1.46955336e+01, 1.02835268e+02, 2.20366070e+02, 3.37896872e+02, 4.55427674e+02, + 5.72958475e+02, 6.90489277e+02, 8.08020079e+02, 9.25550881e+02, 1.04308168e+03, + 1.16061248e+03, 1.27814329e+03, 1.39567409e+03, 1.51320489e+03, 1.63073569e+03, + 1.74826649e+03, 1.86579730e+03, -1.82625644e+03, -1.69482866e+03, -1.56340089e+03, + -1.43197311e+03, -1.30054533e+03, -1.16911756e+03, -1.03768978e+03, -9.06262005e+02, + -7.74834229e+02, -6.43406453e+02, -5.11978677e+02, -3.80550900e+02, -2.49123124e+02, + -1.17695348e+02, 1.37324278e+01, 1.45160204e+02, 2.76587980e+02, 4.08015756e+02, + 5.39443532e+02, 6.70871308e+02, 8.02299084e+02, 9.33726861e+02, 1.06515464e+03, + 1.19658241e+03, 1.32801019e+03, 1.45943796e+03, 1.59086574e+03, 1.72229352e+03, + 1.85372129e+03, 1.98514907e+03, 2.11657685e+03, -1.99238612e+03, -1.84706137e+03, + -1.70173662e+03, -1.55641187e+03, -1.41108711e+03, -1.26576236e+03, -1.12043761e+03, + -9.75112864e+02, -8.29788113e+02, -6.84463363e+02, -5.39138612e+02, -3.93813862e+02, + -2.48489112e+02, -1.03164361e+02, 4.21603893e+01, 1.87485140e+02, 3.32809890e+02, + 4.78134641e+02, 6.23459391e+02, 7.68784141e+02, 9.14108892e+02, 1.05943364e+03, + 1.20475839e+03, 1.35008314e+03, 1.49540789e+03, 1.64073264e+03, 1.78605739e+03, + 1.93138214e+03, 2.07670690e+03, 2.22203165e+03, 2.36735640e+03, -2.15851580e+03, + -1.99929407e+03, -1.84007235e+03, -1.68085062e+03, -1.52162890e+03, -1.36240717e+03, + -1.20318545e+03, -1.04396372e+03, -8.84741998e+02, -7.25520273e+02, -5.66298548e+02, + -4.07076823e+02, -2.47855099e+02, -8.86333740e+01, 7.05883507e+01, 2.29810075e+02, + 3.89031800e+02, 5.48253525e+02, 7.07475250e+02, 8.66696974e+02, 1.02591870e+03, + 
1.18514042e+03, 1.34436215e+03, 1.50358387e+03, 1.66280560e+03, 1.82202732e+03, + 1.98124905e+03, 2.14047077e+03, 2.29969250e+03, 2.45891422e+03, 2.61813595e+03 + ]), + 0.5: np.array([ + 2.05574728e+03, 1.88233772e+03, 1.70892816e+03, 1.53551859e+03, 1.36210903e+03, + 1.18869947e+03, 1.01528991e+03, 8.41880342e+02, 6.68470780e+02, 4.95061217e+02, + 3.21651655e+02, 1.48242092e+02, -2.51674701e+01, -1.98577033e+02, -3.71986595e+02, + -5.45396158e+02, -7.18805720e+02, -8.92215283e+02, -1.06562485e+03, -1.23903441e+03, + -1.41244397e+03, -1.58585353e+03, -1.75926310e+03, -1.93267266e+03, -2.10608222e+03, + -2.27949178e+03, -2.45290135e+03, -2.62631091e+03, -2.79972047e+03, -2.97313003e+03, + -3.14653960e+03, 1.89399572e+03, 1.73424230e+03, 1.57448889e+03, 1.41473547e+03, + 1.25498206e+03, 1.09522864e+03, 9.35475231e+02, 7.75721817e+02, 6.15968403e+02, + 4.56214988e+02, 2.96461574e+02, 1.36708160e+02, -2.30452540e+01, -1.82798668e+02, + -3.42552082e+02, -5.02305496e+02, -6.62058910e+02, -8.21812325e+02, -9.81565739e+02, + -1.14131915e+03, -1.30107257e+03, -1.46082598e+03, -1.62057940e+03, -1.78033281e+03, + -1.94008622e+03, -2.09983964e+03, -2.25959305e+03, -2.41934647e+03, -2.57909988e+03, + -2.73885329e+03, -2.89860671e+03, 1.73224415e+03, 1.58614689e+03, 1.44004962e+03, + 1.29395235e+03, 1.14785509e+03, 1.00175782e+03, 8.55660557e+02, 7.09563291e+02, + 5.63466025e+02, 4.17368759e+02, 2.71271494e+02, 1.25174228e+02, -2.09230379e+01, + -1.67020304e+02, -3.13117569e+02, -4.59214835e+02, -6.05312101e+02, -7.51409367e+02, + -8.97506632e+02, -1.04360390e+03, -1.18970116e+03, -1.33579843e+03, -1.48189570e+03, + -1.62799296e+03, -1.77409023e+03, -1.92018749e+03, -2.06628476e+03, -2.21238202e+03, + -2.35847929e+03, -2.50457656e+03, -2.65067382e+03, 1.57049259e+03, 1.43805147e+03, + 1.30561035e+03, 1.17316923e+03, 1.04072812e+03, 9.08287000e+02, 7.75845882e+02, + 6.43404765e+02, 5.10963648e+02, 3.78522530e+02, 2.46081413e+02, 1.13640296e+02, + -1.88008218e+01, -1.51241939e+02, -2.83683056e+02, -4.16124174e+02, -5.48565291e+02, + -6.81006409e+02, -8.13447526e+02, -9.45888643e+02, -1.07832976e+03, -1.21077088e+03, + -1.34321200e+03, -1.47565311e+03, -1.60809423e+03, -1.74053535e+03, -1.87297646e+03, + -2.00541758e+03, -2.13785870e+03, -2.27029982e+03, -2.40274093e+03, 1.40874102e+03, + 1.28995605e+03, 1.17117108e+03, 1.05238611e+03, 9.33601146e+02, 8.14816177e+02, + 6.96031208e+02, 5.77246239e+02, 4.58461270e+02, 3.39676301e+02, 2.20891332e+02, + 1.02106363e+02, -1.66786057e+01, -1.35463575e+02, -2.54248544e+02, -3.73033513e+02, + -4.91818482e+02, -6.10603451e+02, -7.29388419e+02, -8.48173388e+02, -9.66958357e+02, + -1.08574333e+03, -1.20452830e+03, -1.32331326e+03, -1.44209823e+03, -1.56088320e+03, + -1.67966817e+03, -1.79845314e+03, -1.91723811e+03, -2.03602308e+03, -2.15480805e+03, + 1.24698946e+03, 1.14186064e+03, 1.03673182e+03, 9.31602996e+02, 8.26474175e+02, + 7.21345354e+02, 6.16216534e+02, 5.11087713e+02, 4.05958893e+02, 3.00830072e+02, + 1.95701252e+02, 9.05724310e+01, -1.45563896e+01, -1.19685210e+02, -2.24814031e+02, + -3.29942851e+02, -4.35071672e+02, -5.40200492e+02, -6.45329313e+02, -7.50458134e+02, + -8.55586954e+02, -9.60715775e+02, -1.06584460e+03, -1.17097342e+03, -1.27610224e+03, + -1.38123106e+03, -1.48635988e+03, -1.59148870e+03, -1.69661752e+03, -1.80174634e+03, + -1.90687516e+03, 1.08523789e+03, 9.93765221e+02, 9.02292548e+02, 8.10819876e+02, + 7.19347204e+02, 6.27874532e+02, 5.36401860e+02, 4.44929187e+02, 3.53456515e+02, + 2.61983843e+02, 1.70511171e+02, 7.90384987e+01, 
-1.24341735e+01, -1.03906846e+02, + -1.95379518e+02, -2.86852190e+02, -3.78324862e+02, -4.69797534e+02, -5.61270207e+02, + -6.52742879e+02, -7.44215551e+02, -8.35688223e+02, -9.27160895e+02, -1.01863357e+03, + -1.11010624e+03, -1.20157891e+03, -1.29305158e+03, -1.38452426e+03, -1.47599693e+03, + -1.56746960e+03, -1.65894227e+03, 9.23486328e+02, 8.45669804e+02, 7.67853281e+02, + 6.90036757e+02, 6.12220233e+02, 5.34403709e+02, 4.56587185e+02, 3.78770662e+02, + 3.00954138e+02, 2.23137614e+02, 1.45321090e+02, 6.75045664e+01, -1.03119574e+01, + -8.81284812e+01, -1.65945005e+02, -2.43761529e+02, -3.21578053e+02, -3.99394576e+02, + -4.77211100e+02, -5.55027624e+02, -6.32844148e+02, -7.10660672e+02, -7.88477195e+02, + -8.66293719e+02, -9.44110243e+02, -1.02192677e+03, -1.09974329e+03, -1.17755981e+03, + -1.25537634e+03, -1.33319286e+03, -1.41100939e+03, 7.61734764e+02, 6.97574388e+02, + 6.33414013e+02, 5.69253637e+02, 5.05093262e+02, 4.40932887e+02, 3.76772511e+02, + 3.12612136e+02, 2.48451760e+02, 1.84291385e+02, 1.20131009e+02, 5.59706341e+01, + -8.18974132e+00, -7.23501167e+01, -1.36510492e+02, -2.00670868e+02, -2.64831243e+02, + -3.28991618e+02, -3.93151994e+02, -4.57312369e+02, -5.21472745e+02, -5.85633120e+02, + -6.49793495e+02, -7.13953871e+02, -7.78114246e+02, -8.42274622e+02, -9.06434997e+02, + -9.70595372e+02, -1.03475575e+03, -1.09891612e+03, -1.16307650e+03, 5.99983199e+02, + 5.49478972e+02, 4.98974745e+02, 4.48470518e+02, 3.97966291e+02, 3.47462064e+02, + 2.96957837e+02, 2.46453610e+02, 1.95949383e+02, 1.45445156e+02, 9.49409288e+01, + 4.44367018e+01, -6.06752523e+00, -5.65717522e+01, -1.07075979e+02, -1.57580206e+02, + -2.08084433e+02, -2.58588660e+02, -3.09092887e+02, -3.59597114e+02, -4.10101341e+02, + -4.60605568e+02, -5.11109795e+02, -5.61614022e+02, -6.12118249e+02, -6.62622476e+02, + -7.13126704e+02, -7.63630931e+02, -8.14135158e+02, -8.64639385e+02, -9.15143612e+02, + 4.38231634e+02, 4.01383556e+02, 3.64535477e+02, 3.27687399e+02, 2.90839320e+02, + 2.53991241e+02, 2.17143163e+02, 1.80295084e+02, 1.43447005e+02, 1.06598927e+02, + 6.97508481e+01, 3.29027695e+01, -3.94530913e+00, -4.07933878e+01, -7.76414664e+01, + -1.14489545e+02, -1.51337624e+02, -1.88185702e+02, -2.25033781e+02, -2.61881860e+02, + -2.98729938e+02, -3.35578017e+02, -3.72426095e+02, -4.09274174e+02, -4.46122253e+02, + -4.82970331e+02, -5.19818410e+02, -5.56666489e+02, -5.93514567e+02, -6.30362646e+02, + -6.67210725e+02, 2.76480070e+02, 2.53288140e+02, 2.30096209e+02, 2.06904279e+02, + 1.83712349e+02, 1.60520419e+02, 1.37328488e+02, 1.14136558e+02, 9.09446279e+01, + 6.77526977e+01, 4.45607674e+01, 2.13688372e+01, -1.82309304e+00, -2.50150233e+01, + -4.82069535e+01, -7.13988838e+01, -9.45908140e+01, -1.17782744e+02, -1.40974674e+02, + -1.64166605e+02, -1.87358535e+02, -2.10550465e+02, -2.33742395e+02, -2.56934326e+02, + -2.80126256e+02, -3.03318186e+02, -3.26510116e+02, -3.49702047e+02, -3.72893977e+02, + -3.96085907e+02, -4.19277837e+02, 1.14728505e+02, 1.05192723e+02, 9.56569416e+01, + 8.61211598e+01, 7.65853779e+01, 6.70495960e+01, 5.75138142e+01, 4.79780323e+01, + 3.84422505e+01, 2.89064686e+01, 1.93706868e+01, 9.83490491e+00, 2.99123057e-01, + -9.23665880e+00, -1.87724407e+01, -2.83082225e+01, -3.78440044e+01, -4.73797862e+01, + -5.69155681e+01, -6.64513499e+01, -7.59871318e+01, -8.55229136e+01, -9.50586955e+01, + -1.04594477e+02, -1.14130259e+02, -1.23666041e+02, -1.33201823e+02, -1.42737605e+02, + -1.52273387e+02, -1.61809168e+02, -1.71344950e+02, -4.70230592e+01, -4.29026927e+01, + -3.87823262e+01, 
-3.46619596e+01, -3.05415931e+01, -2.64212266e+01, -2.23008600e+01, + -1.81804935e+01, -1.40601270e+01, -9.93976045e+00, -5.81939391e+00, -1.69902738e+00, + 2.42133915e+00, 6.54170568e+00, 1.06620722e+01, 1.47824388e+01, 1.89028053e+01, + 2.30231718e+01, 2.71435384e+01, 3.12639049e+01, 3.53842714e+01, 3.95046379e+01, + 4.36250045e+01, 4.77453710e+01, 5.18657375e+01, 5.59861041e+01, 6.01064706e+01, + 6.42268371e+01, 6.83472037e+01, 7.24675702e+01, 7.65879367e+01, -2.08774624e+02, + -1.90998109e+02, -1.73221594e+02, -1.55445079e+02, -1.37668564e+02, -1.19892049e+02, + -1.02115534e+02, -8.43390194e+01, -6.65625044e+01, -4.87859895e+01, -3.10094746e+01, + -1.32329597e+01, 4.54355525e+00, 2.23200702e+01, 4.00965851e+01, 5.78731000e+01, + 7.56496149e+01, 9.34261299e+01, 1.11202645e+02, 1.28979160e+02, 1.46755675e+02, + 1.64532190e+02, 1.82308704e+02, 2.00085219e+02, 2.17861734e+02, 2.35638249e+02, + 2.53414764e+02, 2.71191279e+02, 2.88967794e+02, 3.06744309e+02, 3.24520824e+02, + -3.70526188e+02, -3.39093525e+02, -3.07660862e+02, -2.76228198e+02, -2.44795535e+02, + -2.13362872e+02, -1.81930209e+02, -1.50497545e+02, -1.19064882e+02, -8.76322186e+01, + -5.61995553e+01, -2.47668920e+01, 6.66577134e+00, 3.80984347e+01, 6.95310980e+01, + 1.00963761e+02, 1.32396425e+02, 1.63829088e+02, 1.95261751e+02, 2.26694415e+02, + 2.58127078e+02, 2.89559741e+02, 3.20992404e+02, 3.52425068e+02, 3.83857731e+02, + 4.15290394e+02, 4.46723058e+02, 4.78155721e+02, 5.09588384e+02, 5.41021048e+02, + 5.72453711e+02, -5.32277753e+02, -4.87188941e+02, -4.42100130e+02, -3.97011318e+02, + -3.51922506e+02, -3.06833694e+02, -2.61744883e+02, -2.16656071e+02, -1.71567259e+02, + -1.26478448e+02, -8.13896360e+01, -3.63008243e+01, 8.78798743e+00, 5.38767991e+01, + 9.89656108e+01, 1.44054423e+02, 1.89143234e+02, 2.34232046e+02, 2.79320858e+02, + 3.24409669e+02, 3.69498481e+02, 4.14587293e+02, 4.59676104e+02, 5.04764916e+02, + 5.49853728e+02, 5.94942540e+02, 6.40031351e+02, 6.85120163e+02, 7.30208975e+02, + 7.75297786e+02, 8.20386598e+02, -6.94029318e+02, -6.35284357e+02, -5.76539397e+02, + -5.17794437e+02, -4.59049477e+02, -4.00304517e+02, -3.41559557e+02, -2.82814597e+02, + -2.24069637e+02, -1.65324677e+02, -1.06579717e+02, -4.78347566e+01, 1.09102035e+01, + 6.96551636e+01, 1.28400124e+02, 1.87145084e+02, 2.45890044e+02, 3.04635004e+02, + 3.63379964e+02, 4.22124924e+02, 4.80869884e+02, 5.39614844e+02, 5.98359804e+02, + 6.57104764e+02, 7.15849725e+02, 7.74594685e+02, 8.33339645e+02, 8.92084605e+02, + 9.50829565e+02, 1.00957452e+03, 1.06831949e+03, -8.55780882e+02, -7.83379774e+02, + -7.10978665e+02, -6.38577557e+02, -5.66176448e+02, -4.93775340e+02, -4.21374231e+02, + -3.48973123e+02, -2.76572014e+02, -2.04170906e+02, -1.31769797e+02, -5.93686889e+01, + 1.30324196e+01, 8.54335281e+01, 1.57834637e+02, 2.30235745e+02, 3.02636854e+02, + 3.75037962e+02, 4.47439070e+02, 5.19840179e+02, 5.92241287e+02, 6.64642396e+02, + 7.37043504e+02, 8.09444613e+02, 8.81845721e+02, 9.54246830e+02, 1.02664794e+03, + 1.09904905e+03, 1.17145016e+03, 1.24385126e+03, 1.31625237e+03, -1.01753245e+03, + -9.31475190e+02, -8.45417933e+02, -7.59360676e+02, -6.73303419e+02, -5.87246162e+02, + -5.01188905e+02, -4.15131649e+02, -3.29074392e+02, -2.43017135e+02, -1.56959878e+02, + -7.09026211e+01, 1.51546357e+01, 1.01211893e+02, 1.87269149e+02, 2.73326406e+02, + 3.59383663e+02, 4.45440920e+02, 5.31498177e+02, 6.17555434e+02, 7.03612691e+02, + 7.89669947e+02, 8.75727204e+02, 9.61784461e+02, 1.04784172e+03, 1.13389897e+03, + 1.21995623e+03, 1.30601349e+03, 
1.39207075e+03, 1.47812800e+03, 1.56418526e+03, + -1.17928401e+03, -1.07957061e+03, -9.79857201e+02, -8.80143795e+02, -7.80430390e+02, + -6.80716985e+02, -5.81003580e+02, -4.81290174e+02, -3.81576769e+02, -2.81863364e+02, + -1.82149959e+02, -8.24365534e+01, 1.72768518e+01, 1.16990257e+02, 2.16703662e+02, + 3.16417068e+02, 4.16130473e+02, 5.15843878e+02, 6.15557283e+02, 7.15270689e+02, + 8.14984094e+02, 9.14697499e+02, 1.01441090e+03, 1.11412431e+03, 1.21383771e+03, + 1.31355112e+03, 1.41326453e+03, 1.51297793e+03, 1.61269134e+03, 1.71240474e+03, + 1.81211815e+03, -1.34103558e+03, -1.22766602e+03, -1.11429647e+03, -1.00092691e+03, + -8.87557361e+02, -7.74187808e+02, -6.60818254e+02, -5.47448700e+02, -4.34079147e+02, + -3.20709593e+02, -2.07340039e+02, -9.39704857e+01, 1.93990679e+01, 1.32768622e+02, + 2.46138175e+02, 3.59507729e+02, 4.72877282e+02, 5.86246836e+02, 6.99616390e+02, + 8.12985943e+02, 9.26355497e+02, 1.03972505e+03, 1.15309460e+03, 1.26646416e+03, + 1.37983371e+03, 1.49320327e+03, 1.60657282e+03, 1.71994237e+03, 1.83331193e+03, + 1.94668148e+03, 2.06005103e+03, -1.50278714e+03, -1.37576144e+03, -1.24873574e+03, + -1.12171003e+03, -9.94684332e+02, -8.67658630e+02, -7.40632928e+02, -6.13607226e+02, + -4.86581524e+02, -3.59555822e+02, -2.32530120e+02, -1.05504418e+02, 2.15212840e+01, + 1.48546986e+02, 2.75572688e+02, 4.02598390e+02, 5.29624092e+02, 6.56649794e+02, + 7.83675496e+02, 9.10701198e+02, 1.03772690e+03, 1.16475260e+03, 1.29177830e+03, + 1.41880401e+03, 1.54582971e+03, 1.67285541e+03, 1.79988111e+03, 1.92690681e+03, + 2.05393252e+03, 2.18095822e+03, 2.30798392e+03, -1.66453870e+03, -1.52385685e+03, + -1.38317500e+03, -1.24249315e+03, -1.10181130e+03, -9.61129453e+02, -8.20447602e+02, + -6.79765752e+02, -5.39083902e+02, -3.98402051e+02, -2.57720201e+02, -1.17038350e+02, + 2.36435001e+01, 1.64325351e+02, 3.05007201e+02, 4.45689051e+02, 5.86370902e+02, + 7.27052752e+02, 8.67734603e+02, 1.00841645e+03, 1.14909830e+03, 1.28978015e+03, + 1.43046200e+03, 1.57114385e+03, 1.71182571e+03, 1.85250756e+03, 1.99318941e+03, + 2.13387126e+03, 2.27455311e+03, 2.41523496e+03, 2.55591681e+03, -1.82629027e+03, + -1.67195227e+03, -1.51761427e+03, -1.36327627e+03, -1.20893827e+03, -1.05460028e+03, + -9.00262277e+02, -7.45924278e+02, -5.91586279e+02, -4.37248280e+02, -2.82910281e+02, + -1.28572283e+02, 2.57657162e+01, 1.80103715e+02, 3.34441714e+02, 4.88779713e+02, + 6.43117711e+02, 7.97455710e+02, 9.51793709e+02, 1.10613171e+03, 1.26046971e+03, + 1.41480771e+03, 1.56914570e+03, 1.72348370e+03, 1.87782170e+03, 2.03215970e+03, + 2.18649770e+03, 2.34083570e+03, 2.49517370e+03, 2.64951170e+03, 2.80384969e+03 + ]), + 0.9: np.array([ + 2.37200934e+03, 2.19547028e+03, 2.01893121e+03, 1.84239214e+03, 1.66585308e+03, + 1.48931401e+03, 1.31277495e+03, 1.13623588e+03, 9.59696814e+02, 7.83157748e+02, + 6.06618682e+02, 4.30079616e+02, 2.53540550e+02, 7.70014835e+01, -9.95375825e+01, + -2.76076648e+02, -4.52615714e+02, -6.29154781e+02, -8.05693847e+02, -9.82232913e+02, + -1.15877198e+03, -1.33531104e+03, -1.51185011e+03, -1.68838918e+03, -1.86492824e+03, + -2.04146731e+03, -2.21800637e+03, -2.39454544e+03, -2.57108451e+03, -2.74762357e+03, + -2.92416264e+03, 2.20595640e+03, 2.04331369e+03, 1.88067097e+03, 1.71802825e+03, + 1.55538554e+03, 1.39274282e+03, 1.23010011e+03, 1.06745739e+03, 9.04814676e+02, + 7.42171961e+02, 5.79529245e+02, 4.16886530e+02, 2.54243814e+02, 9.16010988e+01, + -7.10416168e+01, -2.33684332e+02, -3.96327048e+02, -5.58969763e+02, -7.21612479e+02, + -8.84255194e+02, -1.04689791e+03, 
-1.20954063e+03, -1.37218334e+03, -1.53482606e+03, + -1.69746877e+03, -1.86011149e+03, -2.02275420e+03, -2.18539692e+03, -2.34803963e+03, + -2.51068235e+03, -2.67332507e+03, 2.03990346e+03, 1.89115709e+03, 1.74241073e+03, + 1.59366436e+03, 1.44491800e+03, 1.29617163e+03, 1.14742527e+03, 9.98678904e+02, + 8.49932539e+02, 7.01186174e+02, 5.52439809e+02, 4.03693444e+02, 2.54947079e+02, + 1.06200714e+02, -4.25456511e+01, -1.91292016e+02, -3.40038381e+02, -4.88784746e+02, + -6.37531111e+02, -7.86277476e+02, -9.35023841e+02, -1.08377021e+03, -1.23251657e+03, + -1.38126294e+03, -1.53000930e+03, -1.67875567e+03, -1.82750203e+03, -1.97624840e+03, + -2.12499476e+03, -2.27374113e+03, -2.42248749e+03, 1.87385052e+03, 1.73900050e+03, + 1.60415049e+03, 1.46930048e+03, 1.33445046e+03, 1.19960045e+03, 1.06475043e+03, + 9.29900417e+02, 7.95050402e+02, 6.60200388e+02, 5.25350373e+02, 3.90500358e+02, + 2.55650344e+02, 1.20800329e+02, -1.40496854e+01, -1.48899700e+02, -2.83749715e+02, + -4.18599729e+02, -5.53449744e+02, -6.88299758e+02, -8.23149773e+02, -9.57999788e+02, + -1.09284980e+03, -1.22769982e+03, -1.36254983e+03, -1.49739985e+03, -1.63224986e+03, + -1.76709988e+03, -1.90194989e+03, -2.03679990e+03, -2.17164992e+03, 1.70779758e+03, + 1.58684391e+03, 1.46589025e+03, 1.34493659e+03, 1.22398292e+03, 1.10302926e+03, + 9.82075593e+02, 8.61121929e+02, 7.40168265e+02, 6.19214601e+02, 4.98260937e+02, + 3.77307273e+02, 2.56353609e+02, 1.35399944e+02, 1.44462803e+01, -1.06507384e+02, + -2.27461048e+02, -3.48414712e+02, -4.69368376e+02, -5.90322040e+02, -7.11275704e+02, + -8.32229369e+02, -9.53183033e+02, -1.07413670e+03, -1.19509036e+03, -1.31604403e+03, + -1.43699769e+03, -1.55795135e+03, -1.67890502e+03, -1.79985868e+03, -1.92081235e+03, + 1.54174464e+03, 1.43468732e+03, 1.32763001e+03, 1.22057270e+03, 1.11351538e+03, + 1.00645807e+03, 8.99400755e+02, 7.92343442e+02, 6.85286128e+02, 5.78228814e+02, + 4.71171501e+02, 3.64114187e+02, 2.57056873e+02, 1.49999560e+02, 4.29422460e+01, + -6.41150677e+01, -1.71172381e+02, -2.78229695e+02, -3.85287009e+02, -4.92344322e+02, + -5.99401636e+02, -7.06458950e+02, -8.13516263e+02, -9.20573577e+02, -1.02763089e+03, + -1.13468820e+03, -1.24174552e+03, -1.34880283e+03, -1.45586015e+03, -1.56291746e+03, + -1.66997477e+03, 1.37569170e+03, 1.28253073e+03, 1.18936977e+03, 1.09620881e+03, + 1.00304784e+03, 9.09886880e+02, 8.16725917e+02, 7.23564954e+02, 6.30403991e+02, + 5.37243028e+02, 4.44082064e+02, 3.50921101e+02, 2.57760138e+02, 1.64599175e+02, + 7.14382117e+01, -2.17227515e+01, -1.14883715e+02, -2.08044678e+02, -3.01205641e+02, + -3.94366604e+02, -4.87527567e+02, -5.80688531e+02, -6.73849494e+02, -7.67010457e+02, + -8.60171420e+02, -9.53332383e+02, -1.04649335e+03, -1.13965431e+03, -1.23281527e+03, + -1.32597624e+03, -1.41913720e+03, 1.20963876e+03, 1.13037414e+03, 1.05110953e+03, + 9.71844917e+02, 8.92580304e+02, 8.13315692e+02, 7.34051079e+02, 6.54786466e+02, + 5.75521854e+02, 4.96257241e+02, 4.16992628e+02, 3.37728015e+02, 2.58463403e+02, + 1.79198790e+02, 9.99341773e+01, 2.06695646e+01, -5.85950481e+01, -1.37859661e+02, + -2.17124273e+02, -2.96388886e+02, -3.75653499e+02, -4.54918112e+02, -5.34182724e+02, + -6.13447337e+02, -6.92711950e+02, -7.71976562e+02, -8.51241175e+02, -9.30505788e+02, + -1.00977040e+03, -1.08903501e+03, -1.16829963e+03, 1.04358581e+03, 9.78217552e+02, + 9.12849290e+02, 8.47481028e+02, 7.82112765e+02, 7.16744503e+02, 6.51376241e+02, + 5.86007979e+02, 5.20639716e+02, 4.55271454e+02, 3.89903192e+02, 3.24534930e+02, + 2.59166668e+02, 1.93798405e+02, 
1.28430143e+02, 6.30618808e+01, -2.30638144e+00, + -6.76746437e+01, -1.33042906e+02, -1.98411168e+02, -2.63779430e+02, -3.29147693e+02, + -3.94515955e+02, -4.59884217e+02, -5.25252479e+02, -5.90620742e+02, -6.55989004e+02, + -7.21357266e+02, -7.86725528e+02, -8.52093791e+02, -9.17462053e+02, 8.77532873e+02, + 8.26060962e+02, 7.74589050e+02, 7.23117138e+02, 6.71645226e+02, 6.20173315e+02, + 5.68701403e+02, 5.17229491e+02, 4.65757579e+02, 4.14285668e+02, 3.62813756e+02, + 3.11341844e+02, 2.59869932e+02, 2.08398020e+02, 1.56926109e+02, 1.05454197e+02, + 5.39822852e+01, 2.51037343e+00, -4.89615383e+01, -1.00433450e+02, -1.51905362e+02, + -2.03377274e+02, -2.54849185e+02, -3.06321097e+02, -3.57793009e+02, -4.09264921e+02, + -4.60736832e+02, -5.12208744e+02, -5.63680656e+02, -6.15152568e+02, -6.66624480e+02, + 7.11479933e+02, 6.73904371e+02, 6.36328810e+02, 5.98753249e+02, 5.61177687e+02, + 5.23602126e+02, 4.86026565e+02, 4.48451003e+02, 4.10875442e+02, 3.73299881e+02, + 3.35724320e+02, 2.98148758e+02, 2.60573197e+02, 2.22997636e+02, 1.85422074e+02, + 1.47846513e+02, 1.10270952e+02, 7.26953905e+01, 3.51198292e+01, -2.45573205e+00, + -4.00312933e+01, -7.76068546e+01, -1.15182416e+02, -1.52757977e+02, -1.90333539e+02, + -2.27909100e+02, -2.65484661e+02, -3.03060222e+02, -3.40635784e+02, -3.78211345e+02, + -4.15786906e+02, 5.45426992e+02, 5.21747781e+02, 4.98068570e+02, 4.74389359e+02, + 4.50710148e+02, 4.27030937e+02, 4.03351727e+02, 3.79672516e+02, 3.55993305e+02, + 3.32314094e+02, 3.08634883e+02, 2.84955673e+02, 2.61276462e+02, 2.37597251e+02, + 2.13918040e+02, 1.90238829e+02, 1.66559618e+02, 1.42880408e+02, 1.19201197e+02, + 9.55219860e+01, 7.18427752e+01, 4.81635644e+01, 2.44843535e+01, 8.05142715e-01, + -2.28740681e+01, -4.65532789e+01, -7.02324897e+01, -9.39117006e+01, -1.17590911e+02, + -1.41270122e+02, -1.64949333e+02, 3.79374051e+02, 3.69591190e+02, 3.59808330e+02, + 3.50025470e+02, 3.40242609e+02, 3.30459749e+02, 3.20676889e+02, 3.10894028e+02, + 3.01111168e+02, 2.91328308e+02, 2.81545447e+02, 2.71762587e+02, 2.61979726e+02, + 2.52196866e+02, 2.42414006e+02, 2.32631145e+02, 2.22848285e+02, 2.13065425e+02, + 2.03282564e+02, 1.93499704e+02, 1.83716844e+02, 1.73933983e+02, 1.64151123e+02, + 1.54368263e+02, 1.44585402e+02, 1.34802542e+02, 1.25019682e+02, 1.15236821e+02, + 1.05453961e+02, 9.56711006e+01, 8.58882402e+01, 2.13321110e+02, 2.17434600e+02, + 2.21548090e+02, 2.25661580e+02, 2.29775070e+02, 2.33888560e+02, 2.38002050e+02, + 2.42115541e+02, 2.46229031e+02, 2.50342521e+02, 2.54456011e+02, 2.58569501e+02, + 2.62682991e+02, 2.66796481e+02, 2.70909971e+02, 2.75023462e+02, 2.79136952e+02, + 2.83250442e+02, 2.87363932e+02, 2.91477422e+02, 2.95590912e+02, 2.99704402e+02, + 3.03817892e+02, 3.07931383e+02, 3.12044873e+02, 3.16158363e+02, 3.20271853e+02, + 3.24385343e+02, 3.28498833e+02, 3.32612323e+02, 3.36725813e+02, 4.72681688e+01, + 6.52780094e+01, 8.32878500e+01, 1.01297691e+02, 1.19307531e+02, 1.37317372e+02, + 1.55327212e+02, 1.73337053e+02, 1.91346894e+02, 2.09356734e+02, 2.27366575e+02, + 2.45376415e+02, 2.63386256e+02, 2.81396097e+02, 2.99405937e+02, 3.17415778e+02, + 3.35425618e+02, 3.53435459e+02, 3.71445300e+02, 3.89455140e+02, 4.07464981e+02, + 4.25474821e+02, 4.43484662e+02, 4.61494503e+02, 4.79504343e+02, 4.97514184e+02, + 5.15524024e+02, 5.33533865e+02, 5.51543705e+02, 5.69553546e+02, 5.87563387e+02, + -1.18784772e+02, -8.68785810e+01, -5.49723899e+01, -2.30661989e+01, 8.83999220e+00, + 4.07461833e+01, 7.26523743e+01, 1.04558565e+02, 1.36464756e+02, 1.68370948e+02, + 
2.00277139e+02, 2.32183330e+02, 2.64089521e+02, 2.95995712e+02, 3.27901903e+02, + 3.59808094e+02, 3.91714285e+02, 4.23620476e+02, 4.55526667e+02, 4.87432858e+02, + 5.19339049e+02, 5.51245240e+02, 5.83151431e+02, 6.15057622e+02, 6.46963814e+02, + 6.78870005e+02, 7.10776196e+02, 7.42682387e+02, 7.74588578e+02, 8.06494769e+02, + 8.38400960e+02, -2.84837713e+02, -2.39035171e+02, -1.93232630e+02, -1.47430088e+02, + -1.01627547e+02, -5.58250053e+01, -1.00224638e+01, 3.57800778e+01, 8.15826193e+01, + 1.27385161e+02, 1.73187702e+02, 2.18990244e+02, 2.64792785e+02, 3.10595327e+02, + 3.56397869e+02, 4.02200410e+02, 4.48002952e+02, 4.93805493e+02, 5.39608035e+02, + 5.85410576e+02, 6.31213118e+02, 6.77015659e+02, 7.22818201e+02, 7.68620742e+02, + 8.14423284e+02, 8.60225825e+02, 9.06028367e+02, 9.51830909e+02, 9.97633450e+02, + 1.04343599e+03, 1.08923853e+03, -4.50890654e+02, -3.91191762e+02, -3.31492870e+02, + -2.71793978e+02, -2.12095086e+02, -1.52396194e+02, -9.26973018e+01, -3.29984098e+01, + 2.67004822e+01, 8.63993742e+01, 1.46098266e+02, 2.05797158e+02, 2.65496050e+02, + 3.25194942e+02, 3.84893834e+02, 4.44592726e+02, 5.04291618e+02, 5.63990510e+02, + 6.23689402e+02, 6.83388294e+02, 7.43087186e+02, 8.02786078e+02, 8.62484970e+02, + 9.22183862e+02, 9.81882754e+02, 1.04158165e+03, 1.10128054e+03, 1.16097943e+03, + 1.22067832e+03, 1.28037721e+03, 1.34007611e+03, -6.16943595e+02, -5.43348352e+02, + -4.69753110e+02, -3.96157867e+02, -3.22562625e+02, -2.48967382e+02, -1.75372140e+02, + -1.01776897e+02, -2.81816550e+01, 4.54135875e+01, 1.19008830e+02, 1.92604072e+02, + 2.66199315e+02, 3.39794557e+02, 4.13389800e+02, 4.86985042e+02, 5.60580285e+02, + 6.34175527e+02, 7.07770770e+02, 7.81366012e+02, 8.54961255e+02, 9.28556497e+02, + 1.00215174e+03, 1.07574698e+03, 1.14934222e+03, 1.22293747e+03, 1.29653271e+03, + 1.37012795e+03, 1.44372319e+03, 1.51731844e+03, 1.59091368e+03, -7.82996536e+02, + -6.95504943e+02, -6.08013350e+02, -5.20521757e+02, -4.33030164e+02, -3.45538571e+02, + -2.58046978e+02, -1.70555385e+02, -8.30637921e+01, 4.42780085e+00, 9.19193938e+01, + 1.79410987e+02, 2.66902580e+02, 3.54394173e+02, 4.41885766e+02, 5.29377359e+02, + 6.16868952e+02, 7.04360544e+02, 7.91852137e+02, 8.79343730e+02, 9.66835323e+02, + 1.05432692e+03, 1.14181851e+03, 1.22931010e+03, 1.31680170e+03, 1.40429329e+03, + 1.49178488e+03, 1.57927647e+03, 1.66676807e+03, 1.75425966e+03, 1.84175125e+03, + -9.49049477e+02, -8.47661533e+02, -7.46273590e+02, -6.44885646e+02, -5.43497703e+02, + -4.42109760e+02, -3.40721816e+02, -2.39333873e+02, -1.37945929e+02, -3.65579858e+01, + 6.48299576e+01, 1.66217901e+02, 2.67605844e+02, 3.68993788e+02, 4.70381731e+02, + 5.71769675e+02, 6.73157618e+02, 7.74545562e+02, 8.75933505e+02, 9.77321448e+02, + 1.07870939e+03, 1.18009734e+03, 1.28148528e+03, 1.38287322e+03, 1.48426117e+03, + 1.58564911e+03, 1.68703705e+03, 1.78842500e+03, 1.88981294e+03, 1.99120088e+03, + 2.09258883e+03, -1.11510242e+03, -9.99818124e+02, -8.84533830e+02, -7.69249536e+02, + -6.53965242e+02, -5.38680948e+02, -4.23396654e+02, -3.08112360e+02, -1.92828066e+02, + -7.75437725e+01, 3.77405214e+01, 1.53024815e+02, 2.68309109e+02, 3.83593403e+02, + 4.98877697e+02, 6.14161991e+02, 7.29446285e+02, 8.44730579e+02, 9.60014873e+02, + 1.07529917e+03, 1.19058346e+03, 1.30586775e+03, 1.42115205e+03, 1.53643634e+03, + 1.65172064e+03, 1.76700493e+03, 1.88228922e+03, 1.99757352e+03, 2.11285781e+03, + 2.22814211e+03, 2.34342640e+03, -1.28115536e+03, -1.15197471e+03, -1.02279407e+03, + -8.93613425e+02, -7.64432781e+02, -6.35252137e+02, 
-5.06071492e+02, -3.76890848e+02, + -2.47710204e+02, -1.18529559e+02, 1.06510852e+01, 1.39831730e+02, 2.69012374e+02, + 3.98193018e+02, 5.27373663e+02, 6.56554307e+02, 7.85734951e+02, 9.14915596e+02, + 1.04409624e+03, 1.17327688e+03, 1.30245753e+03, 1.43163817e+03, 1.56081882e+03, + 1.68999946e+03, 1.81918011e+03, 1.94836075e+03, 2.07754140e+03, 2.20672204e+03, + 2.33590268e+03, 2.46508333e+03, 2.59426397e+03, -1.44720830e+03, -1.30413130e+03, + -1.16105431e+03, -1.01797731e+03, -8.74900320e+02, -7.31823325e+02, -5.88746330e+02, + -4.45669336e+02, -3.02592341e+02, -1.59515346e+02, -1.64383510e+01, 1.26638644e+02, + 2.69715639e+02, 4.12792634e+02, 5.55869628e+02, 6.98946623e+02, 8.42023618e+02, + 9.85100613e+02, 1.12817761e+03, 1.27125460e+03, 1.41433160e+03, 1.55740859e+03, + 1.70048559e+03, 1.84356258e+03, 1.98663958e+03, 2.12971657e+03, 2.27279357e+03, + 2.41587056e+03, 2.55894756e+03, 2.70202455e+03, 2.84510155e+03, -1.61326124e+03, + -1.45628789e+03, -1.29931455e+03, -1.14234120e+03, -9.85367859e+02, -8.28394514e+02, + -6.71421168e+02, -5.14447823e+02, -3.57474478e+02, -2.00501133e+02, -4.35277872e+01, + 1.13445558e+02, 2.70418903e+02, 4.27392249e+02, 5.84365594e+02, 7.41338939e+02, + 8.98312285e+02, 1.05528563e+03, 1.21225898e+03, 1.36923232e+03, 1.52620567e+03, + 1.68317901e+03, 1.84015236e+03, 1.99712570e+03, 2.15409905e+03, 2.31107239e+03, + 2.46804574e+03, 2.62501908e+03, 2.78199243e+03, 2.93896577e+03, 3.09593912e+03 + ]) +} + # continuous wavetlet transform using the Haar wavelet from pywavelets, after # patching with pywavelets issue #365 and pywavelets pull request #580 to make # pywavelets's cwt work with the Haar wavelet; the input array was: diff --git a/tests/test_polynomial.py b/tests/test_polynomial.py index 00f0888..addbb91 100644 --- a/tests/test_polynomial.py +++ b/tests/test_polynomial.py @@ -392,7 +392,7 @@ def test_outside_quantile_fails(self, quantile): with pytest.raises(ValueError): self.class_func(self.y, quantile=quantile) - @pytest.mark.parametrize('quantile', [0.1, 0.5, 0.9]) + @pytest.mark.parametrize('quantile', tuple(STATSMODELS_QUANTILES.keys())) def test_compare_to_statsmodels(self, quantile): """ Compares the output of quant_reg to statsmodels's quantile regression implementation. diff --git a/tests/two_d/test_polynomial.py b/tests/two_d/test_polynomial.py index 1662230..e6bff56 100644 --- a/tests/two_d/test_polynomial.py +++ b/tests/two_d/test_polynomial.py @@ -15,6 +15,7 @@ from pybaselines.two_d import polynomial from ..conftest import BasePolyTester2D, InputWeightsMixin +from ..data import STATSMODELS_QUANTILES_2D class PolynomialTester(BasePolyTester2D, InputWeightsMixin): @@ -206,6 +207,64 @@ def test_outside_quantile_fails(self, quantile): with pytest.raises(ValueError): self.class_func(self.y, quantile=quantile) + @pytest.mark.parametrize('quantile', tuple(STATSMODELS_QUANTILES_2D.keys())) + def test_compare_to_statsmodels(self, quantile): + """ + Compares the output of quant_reg to statsmodels's quantile regression implementation. + + The library statsmodels has a well-tested quantile regression implementation, + so can compare the output of polynomial.quant_reg to statsmodels to ensure + that the pybaselines implementation is correct. 
+ + The outputs from statsmodels were created using:: + + from statsmodels.regression.quantile_regression import QuantReg + # map x and z to [-1, 1] to improve numerical stability for the Vandermonde + # within statsmodels + mapped_x = np.polynomial.polyutils.mapdomain( + x, np.polynomial.polyutils.getdomain(x), np.array([-1., 1.]) + ) + mapped_z = np.polynomial.polyutils.mapdomain( + z, np.polynomial.polyutils.getdomain(z), np.array([-1., 1.]) + ) + vander = np.polynomial.polynomial.polyvander2d( + *np.meshgrid(mapped_x, mapped_z, indexing='ij'), 1 + ).reshape((-1, 4)) + fitter = QuantReg(y.ravel(), vander).fit(quantile, max_iter=1000, p_tol=1e-9).predict() + + with statsmodels version 0.13.2. + + Could also compare with the "true" quantile regression result using linear + programming such as detailed in: + + https://stats.stackexchange.com/questions/384909/formulating-quantile-regression-as- + linear-programming-problem + + but the comparison to statsmodels is good enough since it uses an iteratively + reweighted least squares calculation for the quantile regression similar to the + pybaselines implementation, and the linear programming requires a scipy version + of at least 1.0 or 1.6 to get a fast, reliable result due to the older solvers not + working as well. + + """ + x = np.linspace(-1000, 1000, 25) + z = np.linspace(-200, 301, 31) + + X, Z = np.meshgrid(x, z, indexing='ij') + y = ( + 3 + 1e-2 * X - 5e-1 * Z + 1e-2 * X * Z + ) + np.random.default_rng(0).normal(0, 200, X.shape) + + output = self.algorithm_base(x, z, check_finite=False, assume_sorted=True).quant_reg( + y, poly_order=1, quantile=quantile, tol=1e-9, eps=1e-12 + ) + + # use slightly high rtol since the number of data points is small for 2D to not bog + # down the data file; for higher number of points, rtol and atol could be reduced + assert_allclose( + output[0].ravel(), STATSMODELS_QUANTILES_2D[quantile], rtol=1e-5, atol=1e-10 + ) + class TestGoldindec(PolynomialTester): """Class for testing goldindec baseline.""" From 07e6fe72509df05a46284bd3ebf67bf733098320 Mon Sep 17 00:00:00 2001 From: Donnie Erb <55961724+derb12@users.noreply.github.com> Date: Tue, 16 Jan 2024 21:03:14 -0500 Subject: [PATCH 27/56] FEAT: Solve some 2D Whittaker baselines as banded systems Allowed setting up the 2D penalized system with banded matrices for the simpler Whittaker algorithms. Time reduction is significant, especially for larger matrices. Probably not worth the effort to convert the more complex ones from sparse to banded. --- pybaselines/two_d/_algorithm_setup.py | 11 +- pybaselines/two_d/_spline_utils.py | 28 +++-- pybaselines/two_d/_whittaker_utils.py | 169 ++++++++++++++++++++++---- pybaselines/two_d/whittaker.py | 42 ++++--- tests/two_d/test_spline_utils.py | 3 + tests/two_d/test_whittaker.py | 87 +++++++------ tests/two_d/test_whittaker_utils.py | 88 +++++++++++--- 7 files changed, 326 insertions(+), 102 deletions(-) diff --git a/pybaselines/two_d/_algorithm_setup.py b/pybaselines/two_d/_algorithm_setup.py index 3b75f03..1ca9e0c 100644 --- a/pybaselines/two_d/_algorithm_setup.py +++ b/pybaselines/two_d/_algorithm_setup.py @@ -364,7 +364,7 @@ def _override_x(self, new_x, new_sort_order=None): self.pspline = old_pspline def _setup_whittaker(self, y, lam=1, diff_order=2, weights=None, copy_weights=False, - allow_lower=True, reverse_diags=None): + use_lower=True, use_banded=False, reverse_diags=None): """ Sets the starting parameters for doing penalized least squares. 
@@ -385,9 +385,12 @@ def _setup_whittaker(self, y, lam=1, diff_order=2, weights=None, copy_weights=Fa
         copy_weights : boolean, optional
             If True, will copy the array of input weights. Only needed if the
             algorithm changes the weights in-place. Default is False.
-        allow_lower : boolean, optional
+        use_lower : boolean, optional
             If True (default), will allow using only the lower non-zero diagonals of
             the squared difference matrix. If False, will include all non-zero diagonals.
+        use_banded : bool, optional
+            If True, will set up the penalized system using banded matrices. If False,
+            will use sparse matrices.
         reverse_diags : {None, False, True}, optional
             If True, will reverse the order of the diagonals of the squared difference
             matrix. If False, will never reverse the diagonals. If None (default), will
@@ -430,10 +433,10 @@ def _setup_whittaker(self, y, lam=1, diff_order=2, weights=None, copy_weights=Fa
                 weight_array = weight_array[self._sort_order]
         weight_array = weight_array.ravel()
         if self.whittaker_system is not None:
-            self.whittaker_system.reset_diagonals(lam, diff_order)
+            self.whittaker_system.reset_diagonals(lam, diff_order, use_banded, use_lower)
         else:
             self.whittaker_system = PenalizedSystem2D(
-                self._len, lam, diff_order
+                self._len, lam, diff_order, use_banded, use_lower
             )
 
         return y.ravel(), weight_array
diff --git a/pybaselines/two_d/_spline_utils.py b/pybaselines/two_d/_spline_utils.py
index b011846..507c5d6 100644
--- a/pybaselines/two_d/_spline_utils.py
+++ b/pybaselines/two_d/_spline_utils.py
@@ -13,7 +13,7 @@
 from .._banded_utils import difference_matrix
 from .._spline_utils import _spline_basis, _spline_knots
 from .._validation import _check_array, _check_lam, _check_scalar
-
+from ._whittaker_utils import diff_penalty_matrix
 
 class PSpline2D:
     """
@@ -190,12 +190,12 @@ def reset_penalty(self, lam=1, diff_order=2):
                 'the difference order must be less than the number of basis '
                 'functions, which is the number of knots + spline degree - 1'
             ))
-        D1 = difference_matrix(self._num_bases[0], self.diff_order[0])
-        D2 = difference_matrix(self._num_bases[1], self.diff_order[1])
+        D1 = diff_penalty_matrix(self._num_bases[0], self.diff_order[0])
+        D2 = diff_penalty_matrix(self._num_bases[1], self.diff_order[1])
         # multiplying lam by the Kronecker product is the same as multiplying just D.T @ D with lam
-        P1 = sparse.kron(self.lam[0] * D1.T @ D1, sparse.identity(self._num_bases[1]))
-        P2 = sparse.kron(sparse.identity(self._num_bases[0]), self.lam[1] * D2.T @ D2)
+        P1 = sparse.kron(self.lam[0] * D1, sparse.identity(self._num_bases[1]))
+        P2 = sparse.kron(sparse.identity(self._num_bases[0]), self.lam[1] * D2)
         self.penalty = P1 + P2
 
     def solve_pspline(self, y, weights, penalty=None, rhs_extra=None):
@@ -237,17 +237,19 @@ def solve_pspline(self, y, weights, penalty=None, rhs_extra=None):
 
         """
         # do not save intermediate results since they are memory intensive for high number of knots
-        F = np.transpose(
-            (self._G.T @ weights @ self._G2).reshape(
-                (self._num_bases[0], self._num_bases[0], self._num_bases[1], self._num_bases[1])
-            ),
-            [0, 2, 1, 3]
-        ).reshape(
-            (self._num_bases[0] * self._num_bases[1], self._num_bases[0] * self._num_bases[1])
+        F = sparse.csr_matrix(
+            np.transpose(
+                (self._G.T @ weights @ self._G2).reshape(
+                    (self._num_bases[0], self._num_bases[0], self._num_bases[1], self._num_bases[1])
+                ),
+                [0, 2, 1, 3]
+            ).reshape(
+                (self._num_bases[0] * self._num_bases[1], self._num_bases[0] * self._num_bases[1])
+            )
         )
         self.coef = spsolve(
-            sparse.csr_matrix(F) + self.penalty,
+            F + self.penalty,
             (self.basis_x.T @ (weights * y) @ self.basis_z).flatten(),
             'NATURAL'
         ).reshape(self._num_bases[0], self._num_bases[1])
diff --git a/pybaselines/two_d/_whittaker_utils.py b/pybaselines/two_d/_whittaker_utils.py
index cf6d398..7c875f8 100644
--- a/pybaselines/two_d/_whittaker_utils.py
+++ b/pybaselines/two_d/_whittaker_utils.py
@@ -6,13 +6,60 @@
 
 """
 
-from scipy.sparse import identity, kron
+import numpy as np
+from scipy.linalg import solve_banded, solveh_banded
+from scipy.sparse import identity, kron, spdiags
 from scipy.sparse.linalg import spsolve
 
-from .._banded_utils import difference_matrix
+from .._banded_utils import diff_penalty_diagonals
 from .._validation import _check_lam, _check_scalar
 
 
+def diff_penalty_matrix(data_size, diff_order=2):
+    """
+    Creates the finite difference penalty matrix.
+
+    If `D` is the finite difference matrix, then the finite difference penalty
+    matrix is defined as ``D.T @ D``.
+
+    Parameters
+    ----------
+    data_size : int
+        The number of data points.
+    diff_order : int, optional
+        The integer differential order; must be >= 0. Default is 2.
+
+    Returns
+    -------
+    penalty_matrix : scipy.sparse.base.spmatrix
+        The sparse difference penalty matrix.
+
+    Raises
+    ------
+    ValueError
+        Raised if `diff_order` is greater than or equal to `data_size`.
+
+    Notes
+    -----
+    Equivalent to calling::
+
+        from pybaselines.utils import difference_matrix
+        diff_matrix = difference_matrix(data_size, diff_order)
+        penalty_matrix = diff_matrix.T @ diff_matrix
+
+    but should be faster since the bands within the penalty matrix can be computed
+    directly, without the matrix multiplication.
+
+    """
+    if data_size <= diff_order:
+        raise ValueError('data size must be greater than the difference order.')
+    penalty_bands = diff_penalty_diagonals(data_size, diff_order, lower_only=False)
+    penalty_matrix = spdiags(
+        penalty_bands, np.arange(diff_order, -diff_order - 1, -1), data_size, data_size
+    )
+    return penalty_matrix
+
+
 class PenalizedSystem2D:
     """
     An object for setting up and solving penalized least squares linear systems.
@@ -44,25 +91,48 @@ class PenalizedSystem2D:
         and applying padding, but can also be changed by calling
         :meth:`.add_penalty`. Reset by calling :meth:`.reset_diagonals`.
 
+    Notes
+    -----
+    Setting up the linear system using banded matrices is faster, but the number of bands is
+    actually quite large (`data_size[1]`) due to the Kronecker products, although only
+    ``2 * diff_order[0] + 2 * diff_order[1] + 2`` bands are actually nonzero. Despite this, it is
+    still significantly faster than using the sparse solver and does not use more memory as
+    long as it is only lower banded.
+
+    References
+    ----------
+    Eilers, P., et al. Fast and compact smoothing on large multidimensional grids. Computational
+    Statistics and Data Analysis, 2006, 50(1), 61-76.
+
     """
 
-    def __init__(self, data_size, lam=1, diff_order=2):
+    def __init__(self, data_size, lam=1, diff_order=2, use_banded=True, use_lower=True):
         """
         Initializes the banded system.
 
         Parameters
         ----------
-        data_size : int
+        data_size : Sequence[int, int]
             The number of data points for the system.
         lam : float, optional
             The penalty factor applied to the difference matrix. Larger values
             produce smoother results. Must be greater than 0. Default is 1.
         diff_order : int, optional
             The difference order of the penalty. Default is 2 (second order difference).
+        use_banded : bool, optional
+            If True (default), will do the setup for solving the system using banded
+            matrices rather than sparse matrices.
+ use_lower : bool, optional + If True (default), will allow only using the lower bands of the penalty matrix, + which allows using :func:`scipy.linalg.solveh_banded` instead of the slightly + slower :func:`scipy.linalg.solve_banded`. Only relevant if `use_banded` is True. """ self._num_bases = data_size - self.reset_diagonals(lam, diff_order) + self.diff_order = [-1, -1] + self.lam = [-1, -1] + + self.reset_diagonals(lam, diff_order, use_banded, use_lower) def add_penalty(self, penalty): """ @@ -81,7 +151,22 @@ def add_penalty(self, penalty): """ raise NotImplementedError - def reset_diagonals(self, lam=1, diff_order=2): + def _update_bands(self): + """ + Updates the number of bands and the index of the main diagonal in `self.penalty`. + + Only relevant if setup as a banded matrix. + + """ + if self.banded: + if self.lower: + self.num_bands = self.penalty.shape[0] - 1 + else: + self.num_bands = self.penalty.shape[0] // 2 + self.main_diagonal_index = 0 if self.lower else self.num_bands + self.main_diagonal = self.penalty[self.main_diagonal_index].copy() + + def reset_diagonals(self, lam=1, diff_order=2, use_banded=True, use_lower=True): """ Resets the diagonals of the system and all of the attributes. @@ -94,24 +179,44 @@ def reset_diagonals(self, lam=1, diff_order=2): smoother results. Must be greater than 0. Default is 1. diff_order : int, optional The difference order of the penalty. Default is 2 (second order difference). + use_banded : bool, optional + If True (default), will do the setup for solving the system using banded + matrices rather than sparse matrices. """ self.diff_order = _check_scalar(diff_order, 2, True)[0] self.lam = [_check_lam(val) for val in _check_scalar(lam, 2, True)[0]] - + self.lower = use_lower + self.banded = use_banded if (self.diff_order < 1).any(): raise ValueError('the difference order must be > 0') - D1 = difference_matrix(self._num_bases[0], self.diff_order[0]) - D2 = difference_matrix(self._num_bases[1], self.diff_order[1]) + penalty_rows = diff_penalty_matrix(self._num_bases[0], self.diff_order[0]) + penalty_columns = diff_penalty_matrix(self._num_bases[1], self.diff_order[1]) # multiplying lam by the Kronecker product is the same as multiplying just D.T @ D with lam - P1 = kron(self.lam[0] * D1.T @ D1, identity(self._num_bases[1])) - P2 = kron(identity(self._num_bases[0]), self.lam[1] * D2.T @ D2) - self.penalty = P1 + P2 - self.main_diagonal = self.penalty.diagonal() - - def solve(self, lhs, rhs): + P1 = kron(self.lam[0] * penalty_rows, identity(self._num_bases[1])) + P2 = kron(identity(self._num_bases[0]), self.lam[1] * penalty_columns) + penalty = P1 + P2 + if self.banded: + penalty = penalty.todia() + sparse_bands = (penalty).data + offsets = penalty.offsets + index_offset = np.max(offsets) + penalty_bands = np.zeros((index_offset * 2 + 1, sparse_bands.shape[1])) + for index, banded_index in enumerate(offsets): + penalty_bands[abs(banded_index - index_offset)] = sparse_bands[index] + self.penalty = penalty_bands + if self.lower: + self.penalty = self.penalty[self.penalty.shape[0] // 2:] + self._update_bands() + else: + self.penalty = penalty + self.main_diagonal = self.penalty.diagonal() + self.main_diagonal_index = 0 + + def solve(self, lhs, rhs, overwrite_ab=False, overwrite_b=False, + check_finite=False, l_and_u=None): """ Solves the equation ``A @ x = rhs``, given `A` in banded format as `lhs`. @@ -137,9 +242,6 @@ def solve(self, lhs, rhs): The number of lower and upper bands in `lhs` when using :func:`scipy.linalg.solve_banded`. 
Default is None, which uses (``len(lhs) // 2``, ``len(lhs) // 2``). - check_output : bool, optional - If True, will check the output for non-finite values when using - :func:`._pentapy_solver`. Default is False. Returns ------- @@ -147,17 +249,32 @@ def solve(self, lhs, rhs): The solution to the linear system, `x`. """ - output = spsolve(lhs, rhs, permc_spec='NATURAL') + if self.banded: + if self.lower: + output = solveh_banded( + lhs, rhs, overwrite_ab=overwrite_ab, + overwrite_b=overwrite_b, lower=True, check_finite=check_finite + ) + else: + if l_and_u is None: + num_bands = len(lhs) // 2 + l_and_u = (num_bands, num_bands) + output = solve_banded( + l_and_u, lhs, rhs, overwrite_ab=overwrite_ab, + overwrite_b=overwrite_b, check_finite=check_finite + ) + else: + output = spsolve(lhs, rhs, permc_spec='NATURAL') return output - def add_diagonal(self, array): + def add_diagonal(self, value): """ Adds a diagonal array to the original penalty matrix. Parameters ---------- - array : numpy.ndarray + value : numpy.ndarray The diagonal array to add to the penalty matrix. Returns @@ -166,12 +283,18 @@ def add_diagonal(self, array): The penalty matrix with the main diagonal updated. """ - self.penalty.setdiag(self.main_diagonal + array) + if self.banded: + self.penalty[self.main_diagonal_index] = self.main_diagonal + value + else: + self.penalty.setdiag(self.main_diagonal + value) return self.penalty def reset_diagonal(self): """Sets the main diagonal of the penalty matrix back to its original value.""" - self.penalty.setdiag(self.main_diagonal) + if self.banded: + self.penalty[self.main_diagonal_index] = self.main_diagonal + else: + self.penalty.setdiag(self.main_diagonal) def reverse_penalty(self): """ diff --git a/pybaselines/two_d/whittaker.py b/pybaselines/two_d/whittaker.py index 4350737..e07e6eb 100644 --- a/pybaselines/two_d/whittaker.py +++ b/pybaselines/two_d/whittaker.py @@ -83,11 +83,14 @@ def asls(self, data, lam=1e6, p=1e-2, diff_order=2, max_iter=50, tol=1e-3, weigh """ if not 0 < p < 1: raise ValueError('p must be between 0 and 1') - y, weight_array = self._setup_whittaker(data, lam, diff_order, weights) + y, weight_array = self._setup_whittaker( + data, lam, diff_order, weights, use_banded=True, use_lower=True + ) tol_history = np.empty(max_iter + 1) for i in range(max_iter + 1): baseline = self.whittaker_system.solve( - self.whittaker_system.add_diagonal(weight_array), weight_array * y + self.whittaker_system.add_diagonal(weight_array), weight_array * y, + overwrite_b=True ) new_weights = _weighting._asls(y, baseline, p) calc_difference = relative_difference(weight_array, new_weights) @@ -163,7 +166,7 @@ def iasls(self, data, lam=1e6, p=1e-2, lam_1=1e-4, max_iter=50, tol=1e-3, """ if not 0 < p < 1: raise ValueError('p must be between 0 and 1') - elif diff_order < 2: + elif (np.asarray(diff_order) < 2).any(): raise ValueError('diff_order must be 2 or greater') if weights is None: @@ -174,7 +177,7 @@ def iasls(self, data, lam=1e6, p=1e-2, lam_1=1e-4, max_iter=50, tol=1e-3, weights = _weighting._asls(data.ravel(), baseline, p).reshape(self._len) y, weight_array = self._setup_whittaker(data, lam, diff_order, weights) - penalized_system_1 = PenalizedSystem2D(self._len, lam_1, diff_order=1) + penalized_system_1 = PenalizedSystem2D(self._len, lam_1, diff_order=1, use_banded=False) tol_history = np.empty(max_iter + 1) for i in range(max_iter + 1): penalized_system_1.add_diagonal(weight_array * weight_array) @@ -243,7 +246,8 @@ def airpls(self, data, lam=1e6, diff_order=2, max_iter=50, 
tol=1e-3, weights=Non """ y, weight_array = self._setup_whittaker( - data, lam, diff_order, weights, copy_weights=True + data, lam, diff_order, weights, copy_weights=True, use_banded=True, use_lower=True + ) y_l1_norm = np.abs(y).sum() tol_history = np.empty(max_iter + 1) @@ -254,7 +258,8 @@ def airpls(self, data, lam=1e6, diff_order=2, max_iter=50, tol=1e-3, weights=Non for i in range(1, max_iter + 2): try: output = self.whittaker_system.solve( - self.whittaker_system.add_diagonal(weight_array), weight_array * y + self.whittaker_system.add_diagonal(weight_array), weight_array * y, + overwrite_b=True ) except np.linalg.LinAlgError: warnings.warn( @@ -340,11 +345,14 @@ def arpls(self, data, lam=1e3, diff_order=2, max_iter=50, tol=1e-3, weights=None penalized least squares smoothing. Analyst, 2015, 140, 250-257. """ - y, weight_array = self._setup_whittaker(data, lam, diff_order, weights) + y, weight_array = self._setup_whittaker( + data, lam, diff_order, weights, use_banded=True, use_lower=True + ) tol_history = np.empty(max_iter + 1) for i in range(max_iter + 1): baseline = self.whittaker_system.solve( - self.whittaker_system.add_diagonal(weight_array), weight_array * y + self.whittaker_system.add_diagonal(weight_array), weight_array * y, + overwrite_b=True ) new_weights = _weighting._arpls(y, baseline) calc_difference = relative_difference(weight_array, new_weights) @@ -415,11 +423,11 @@ def drpls(self, data, lam=1e5, eta=0.5, max_iter=50, tol=1e-3, weights=None, dif """ if not 0 <= eta <= 1: raise ValueError('eta must be between 0 and 1') - elif diff_order < 2: + elif (np.asarray(diff_order) < 2).any(): raise ValueError('diff_order must be 2 or greater') y, weight_array = self._setup_whittaker(data, lam, diff_order, weights) - penalized_system_1 = PenalizedSystem2D(self._len, 1, diff_order=1) + penalized_system_1 = PenalizedSystem2D(self._len, 1, diff_order=1, use_banded=False) # W + P_1 + (I - eta * W) @ P_n -> P_1 + P_n + W @ (I - eta * P_n) partial_penalty = self.whittaker_system.penalty + penalized_system_1.penalty partial_penalty_2 = -eta * self.whittaker_system.penalty @@ -503,11 +511,14 @@ def iarpls(self, data, lam=1e5, diff_order=2, max_iter=50, tol=1e-3, weights=Non 59, 10933-10943. 
""" - y, weight_array = self._setup_whittaker(data, lam, diff_order, weights) + y, weight_array = self._setup_whittaker( + data, lam, diff_order, weights, use_banded=True, use_lower=True + ) tol_history = np.empty(max_iter + 1) for i in range(1, max_iter + 2): baseline = self.whittaker_system.solve( - self.whittaker_system.add_diagonal(weight_array), weight_array * y + self.whittaker_system.add_diagonal(weight_array), weight_array * y, + overwrite_b=True ) new_weights = _weighting._iarpls(y, baseline, i) calc_difference = relative_difference(weight_array, new_weights) @@ -704,13 +715,16 @@ def psalsa(self, data, lam=1e5, p=0.5, k=None, diff_order=2, max_iter=50, tol=1e """ if not 0 < p < 1: raise ValueError('p must be between 0 and 1') - y, weight_array = self._setup_whittaker(data, lam, diff_order, weights) + y, weight_array = self._setup_whittaker( + data, lam, diff_order, weights, use_banded=True, use_lower=True + ) if k is None: k = np.std(y) / 10 tol_history = np.empty(max_iter + 1) for i in range(max_iter + 1): baseline = self.whittaker_system.solve( - self.whittaker_system.add_diagonal(weight_array), weight_array * y + self.whittaker_system.add_diagonal(weight_array), weight_array * y, + overwrite_b=True ) new_weights = _weighting._psalsa(y, baseline, p, k, self._len[0] * self._len[1]) calc_difference = relative_difference(weight_array, new_weights) diff --git a/tests/two_d/test_spline_utils.py b/tests/two_d/test_spline_utils.py index 9223e10..b9371c8 100644 --- a/tests/two_d/test_spline_utils.py +++ b/tests/two_d/test_spline_utils.py @@ -53,6 +53,9 @@ def test_solve_psplines(data_fixture2d, num_knots, spline_degree, diff_order, la weights = np.random.RandomState(0).normal(0.8, 0.05, y.size) weights = np.clip(weights, 0, 1).astype(float, copy=False) + # note: within Eiler's paper, the basis was defined as kron(basis_z, basis_x), + # but the rows and columns were switched, ie. 
it should be kron(basis_rows, basis_columns), + # so it is just a nomenclature difference basis = kron(basis_x, basis_z) CWT = basis.multiply( np.repeat(weights.flatten(), num_bases[0] * num_bases[1]).reshape(len(x) * len(z), -1) diff --git a/tests/two_d/test_whittaker.py b/tests/two_d/test_whittaker.py index 8b76935..ccc6d90 100644 --- a/tests/two_d/test_whittaker.py +++ b/tests/two_d/test_whittaker.py @@ -41,11 +41,10 @@ def test_outside_p_fails(self, p): with pytest.raises(ValueError): self.class_func(self.y, p=p) - @pytest.mark.parametrize('diff_order', (1, 3)) + @pytest.mark.parametrize('diff_order', (1, [1, 2])) def test_diff_orders(self, diff_order): """Ensure that other difference orders work.""" - lam = {1: 1e2, 3: 1e10}[diff_order] - self.class_func(self.y, lam=lam, diff_order=diff_order) + self.class_func(self.y, diff_order=diff_order) class TestIAsLS(WhittakerTester): @@ -59,16 +58,21 @@ def test_outside_p_fails(self, p): with pytest.raises(ValueError): self.class_func(self.y, p=p) - @pytest.mark.parametrize('diff_order', (2, 3)) + @pytest.mark.parametrize('diff_order', (2, [3, 2])) def test_diff_orders(self, diff_order): """Ensure that other difference orders work.""" - lam = {2: 1e6, 3: 1e10}[diff_order] - self.class_func(self.y, lam=lam, diff_order=diff_order) + self.class_func(self.y, diff_order=diff_order) def test_diff_order_one_fails(self): """Ensure that a difference order of 1 raises an exception.""" with pytest.raises(ValueError): self.class_func(self.y, lam=1e2, diff_order=1) + with pytest.raises(ValueError): + self.class_func(self.y, lam=1e2, diff_order=[1, 1]) + with pytest.raises(ValueError): + self.class_func(self.y, lam=1e2, diff_order=[1, 2]) + with pytest.raises(ValueError): + self.class_func(self.y, lam=1e2, diff_order=[2, 1]) class TestAirPLS(WhittakerTester): @@ -76,12 +80,12 @@ class TestAirPLS(WhittakerTester): func_name = 'airpls' - @pytest.mark.parametrize('diff_order', (1, 3)) + @pytest.mark.parametrize('diff_order', (1, [1, 2])) def test_diff_orders(self, diff_order): """Ensure that other difference orders work.""" - lam = {1: 1e3, 3: 1e10}[diff_order] - self.class_func(self.y, lam=lam, diff_order=diff_order) + self.class_func(self.y, diff_order=diff_order) + @pytest.mark.skip(reason='test is too slow') # ignore the RuntimeWarning that occurs from using +/- inf or nan @pytest.mark.filterwarnings('ignore::RuntimeWarning') def test_avoid_nonfinite_weights(self, no_noise_data_fixture2d): @@ -102,9 +106,11 @@ def test_avoid_nonfinite_weights(self, no_noise_data_fixture2d): """ x, z, y = no_noise_data_fixture2d with pytest.warns(ParameterWarning): - baseline = self.class_func(y, tol=-1, max_iter=3000)[0] + baseline, _ = getattr(self.algorithm_base(x, z), self.func_name)( + y, tol=-1, max_iter=3000 + ) - assert np.isfinite(baseline.T.dot(baseline)).all() + assert np.isfinite(baseline).all() class TestArPLS(WhittakerTester): @@ -112,12 +118,12 @@ class TestArPLS(WhittakerTester): func_name = 'arpls' - @pytest.mark.parametrize('diff_order', (1, 3)) + @pytest.mark.parametrize('diff_order', (1, [1, 2])) def test_diff_orders(self, diff_order): """Ensure that other difference orders work.""" - lam = {1: 1e2, 3: 1e10}[diff_order] - self.class_func(self.y, lam=lam, diff_order=diff_order) + self.class_func(self.y, diff_order=diff_order) + @pytest.mark.skip(reason='test is too slow') def test_avoid_overflow_warning(self, no_noise_data_fixture2d): """ Ensures no warning is emitted for exponential overflow. 
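These 2D tests build a fresh fitter from the fixture's x/z grid via `getattr(self.algorithm_base(x, z), self.func_name)`. A minimal usage sketch of the same public API (the data here is made up; `Baseline2D` and the `arpls` signature are taken from the hunks above):

    import numpy as np
    from pybaselines import Baseline2D

    x = np.linspace(-20, 20, 80)
    z = np.linspace(-20, 20, 60)
    y = 5 + np.random.RandomState(0).normal(0, 0.1, (len(x), len(z)))

    # the first return value is the fitted (80, 60) baseline surface and the
    # second is the parameter dictionary ('weights', 'tol_history', ...)
    baseline, params = Baseline2D(x, z).arpls(y, lam=1e3)
    assert np.isfinite(baseline).all()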
@@ -132,9 +138,11 @@ def test_avoid_overflow_warning(self, no_noise_data_fixture2d): """ x, z, y = no_noise_data_fixture2d with np.errstate(over='raise'): - baseline = self.class_func(y, tol=-1, max_iter=1000)[0] + baseline, _ = getattr(self.algorithm_base(x, z), self.func_name)( + y, tol=-1, max_iter=1000 + ) - assert np.isfinite(baseline.T.dot(baseline)).all() + assert np.isfinite(baseline).all() class TestDrPLS(WhittakerTester): @@ -148,17 +156,23 @@ def test_outside_eta_fails(self, eta): with pytest.raises(ValueError): self.class_func(self.y, eta=eta) - @pytest.mark.parametrize('diff_order', (2, 3)) + @pytest.mark.parametrize('diff_order', (2, [3, 2])) def test_diff_orders(self, diff_order): """Ensure that other difference orders work.""" - lam = {2: 1e5, 3: 1e9}[diff_order] - self.class_func(self.y, lam=lam, diff_order=diff_order) + self.class_func(self.y, diff_order=diff_order) def test_diff_order_one_fails(self): """Ensure that a difference order of 1 raises an exception.""" with pytest.raises(ValueError): self.class_func(self.y, lam=1e2, diff_order=1) + with pytest.raises(ValueError): + self.class_func(self.y, lam=1e2, diff_order=[1, 1]) + with pytest.raises(ValueError): + self.class_func(self.y, lam=1e2, diff_order=[1, 2]) + with pytest.raises(ValueError): + self.class_func(self.y, lam=1e2, diff_order=[2, 1]) + @pytest.mark.skip(reason='test is too slow') # ignore the RuntimeWarning that occurs from using +/- inf or nan @pytest.mark.filterwarnings('ignore::RuntimeWarning') def test_avoid_nonfinite_weights(self, no_noise_data_fixture2d): @@ -179,9 +193,11 @@ def test_avoid_nonfinite_weights(self, no_noise_data_fixture2d): """ x, z, y = no_noise_data_fixture2d with pytest.warns(ParameterWarning): - baseline, params = self.class_func(y, tol=-1, max_iter=1000) + baseline, params = getattr(self.algorithm_base(x, z), self.func_name)( + y, tol=-1, max_iter=1000 + ) - assert np.isfinite(baseline.T.dot(baseline).all()) + assert np.isfinite(baseline).all() # ensure last tolerence calculation was non-finite as a double-check that # this test is actually doing what it should be doing assert not np.isfinite(params['tol_history'][-1]) @@ -192,12 +208,12 @@ class TestIArPLS(WhittakerTester): func_name = 'iarpls' - @pytest.mark.parametrize('diff_order', (1, 3)) + @pytest.mark.parametrize('diff_order', (1, [1, 2])) def test_diff_orders(self, diff_order): """Ensure that other difference orders work.""" - lam = {1: 1e2, 3: 1e10}[diff_order] - self.class_func(self.y, lam=lam, diff_order=diff_order) + self.class_func(self.y, diff_order=diff_order) + @pytest.mark.skip(reason='test is too slow') # ignore the RuntimeWarning that occurs from using +/- inf or nan @pytest.mark.filterwarnings('ignore::RuntimeWarning') def test_avoid_nonfinite_weights(self, no_noise_data_fixture2d): @@ -218,9 +234,11 @@ def test_avoid_nonfinite_weights(self, no_noise_data_fixture2d): """ x, z, y = no_noise_data_fixture2d with pytest.warns(ParameterWarning): - baseline, params = self.class_func(y, tol=-1, max_iter=1000) + baseline, params = getattr(self.algorithm_base(x, z), self.func_name)( + y, tol=-1, max_iter=1000 + ) - assert np.isfinite(baseline.T.dot(baseline)).all() + assert np.isfinite(baseline).all() # ensure last tolerence calculation was non-finite as a double-check that # this test is actually doing what it should be doing assert not np.isfinite(params['tol_history'][-1]) @@ -233,11 +251,10 @@ class TestAsPLS(WhittakerTester): checked_keys = ('weights', 'alpha', 'tol_history') weight_keys = ('weights', 'alpha') - 
@pytest.mark.parametrize('diff_order', (1, 3)) + @pytest.mark.parametrize('diff_order', (1, [1, 2])) def test_diff_orders(self, diff_order): """Ensure that other difference orders work.""" - lam = {1: 1e4, 3: 1e10}[diff_order] - self.class_func(self.y, lam=lam, diff_order=diff_order) + self.class_func(self.y, diff_order=diff_order) @pytest.mark.parametrize('alpha_enum', (0, 1)) def test_wrong_alpha_shape(self, alpha_enum): @@ -249,6 +266,7 @@ def test_wrong_alpha_shape(self, alpha_enum): with pytest.raises(ValueError): self.class_func(self.y, alpha=alpha) + @pytest.mark.skip(reason='test is too slow') def test_avoid_overflow_warning(self, no_noise_data_fixture2d): """ Ensures no warning is emitted for exponential overflow. @@ -263,9 +281,11 @@ def test_avoid_overflow_warning(self, no_noise_data_fixture2d): """ x, z, y = no_noise_data_fixture2d with np.errstate(over='raise'): - baseline = self.class_func(y, tol=-1, max_iter=1000)[0] + baseline, _ = getattr(self.algorithm_base(x, z), self.func_name)( + y, tol=-1, max_iter=1000 + ) - assert np.isfinite(baseline.T.dot(baseline)).all() + assert np.isfinite(baseline).all() class TestPsalsa(WhittakerTester): @@ -279,8 +299,7 @@ def test_outside_p_fails(self, p): with pytest.raises(ValueError): self.class_func(self.y, p=p) - @pytest.mark.parametrize('diff_order', (1, 3)) + @pytest.mark.parametrize('diff_order', (1, [1, 2])) def test_diff_orders(self, diff_order): """Ensure that other difference orders work.""" - lam = {1: 1e2, 3: 1e10}[diff_order] - self.class_func(self.y, lam=lam, diff_order=diff_order) + self.class_func(self.y, diff_order=diff_order) diff --git a/tests/two_d/test_whittaker_utils.py b/tests/two_d/test_whittaker_utils.py index 26e19e1..36f18e4 100644 --- a/tests/two_d/test_whittaker_utils.py +++ b/tests/two_d/test_whittaker_utils.py @@ -20,7 +20,8 @@ @pytest.mark.parametrize('diff_order', (1, 2, 3, 4, (2, 3))) @pytest.mark.parametrize('lam', (1e-2, 1e2, (1e1, 1e2))) -def test_solve_penalized_system(small_data2d, diff_order, lam): +@pytest.mark.parametrize('use_banded', (True, False)) +def test_solve_penalized_system(small_data2d, diff_order, lam, use_banded): """ Tests the accuracy of the penalized system solver. 
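The expected penalty assembled in this test is the standard 2D Whittaker construction: a difference penalty along each dimension, lifted onto the flattened grid with Kronecker products. A self-contained sketch of that construction (the helper names here are illustrative, not the library's):

    import numpy as np
    from scipy import sparse

    def diff_penalty(n, diff_order):
        # D.T @ D, where D is the diff_order-th order difference matrix
        D = sparse.csr_matrix(np.diff(np.eye(n), diff_order, axis=0))
        return D.T @ D

    def whittaker_penalty_2d(shape, lam_x, lam_z, diff_order_x, diff_order_z):
        # each term penalizes differences along one axis of the flattened data
        P1 = sparse.kron(lam_x * diff_penalty(shape[0], diff_order_x),
                         sparse.identity(shape[1]))
        P2 = sparse.kron(sparse.identity(shape[0]),
                         lam_z * diff_penalty(shape[1], diff_order_z))
        return P1 + P2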
@@ -42,7 +43,7 @@ def test_solve_penalized_system(small_data2d, diff_order, lam): penalty = P1 + P2 penalized_system = _whittaker_utils.PenalizedSystem2D( - small_data2d.shape, lam=lam, diff_order=diff_order + small_data2d.shape, lam=lam, diff_order=diff_order, use_banded=use_banded ) # TODO replace with np.random.default_rng when min numpy version is >= 1.17 @@ -50,17 +51,20 @@ def test_solve_penalized_system(small_data2d, diff_order, lam): weights = np.clip(weights, 0, 1).astype(float, copy=False).ravel() penalty.setdiag(penalty.diagonal() + weights) - penalized_system.penalty.setdiag(penalized_system.penalty.diagonal() + weights) expected_result = spsolve(penalty, weights * small_data2d.flatten()) - output = penalized_system.solve(penalized_system.penalty, weights * small_data2d.flatten()) + output = penalized_system.solve( + penalized_system.add_diagonal(weights), weights * small_data2d.flatten() + ) assert_allclose(output.flatten(), expected_result, rtol=1e-8, atol=1e-8) @pytest.mark.parametrize('diff_order', (1, 2, 3, [1, 3])) @pytest.mark.parametrize('lam', (5, (3, 5))) -def test_penalized_system_setup(small_data2d, diff_order, lam): +@pytest.mark.parametrize('use_banded', (True, False)) +@pytest.mark.parametrize('use_lower', (True, False)) +def test_penalized_system_setup(small_data2d, diff_order, lam, use_banded, use_lower): """Ensure the PenalizedSystem2D setup is correct.""" *_, lam_x, lam_z, diff_order_x, diff_order_z = get_2dspline_inputs( lam=lam, diff_order=diff_order @@ -76,14 +80,37 @@ def test_penalized_system_setup(small_data2d, diff_order, lam): penalty = P1 + P2 penalized_system = _whittaker_utils.PenalizedSystem2D( - small_data2d.shape, lam=lam, diff_order=diff_order + small_data2d.shape, lam=lam, diff_order=diff_order, use_banded=use_banded, + use_lower=use_lower ) assert_array_equal(penalized_system._num_bases, num_bases) - assert issparse(penalized_system.penalty) - - assert_allclose(penalized_system.penalty.toarray(), penalty.toarray(), rtol=1e-12, atol=1e-12) + if use_banded: + assert isinstance(penalized_system.penalty, np.ndarray) + penalty = penalty.todia() + penalty_bands = penalty.data[::-1] + # PenalizedSystem2D uses a more efficient way to assign bands, but + # this way is more clear of what is going on + offsets = list(penalty.offsets) + filler = np.zeros(penalty_bands.shape[1]) + values = [] + for i in range(offsets[0], offsets[-1] + 1): + if i in offsets: + values.append(penalty_bands[offsets.index(i)]) + else: + values.append(filler) + full_penalty_bands = np.vstack(values) + if use_lower: + full_penalty_bands = full_penalty_bands[full_penalty_bands.shape[0] // 2:] + assert_allclose( + penalized_system.penalty, full_penalty_bands, rtol=1e-12, atol=1e-12 + ) + else: + assert issparse(penalized_system.penalty) + assert_allclose( + penalized_system.penalty.toarray(), penalty.toarray(), rtol=1e-12, atol=1e-12 + ) assert_array_equal(penalized_system.diff_order, (diff_order_x, diff_order_z)) assert_array_equal(penalized_system.lam, (lam_x, lam_z)) @@ -105,7 +132,8 @@ def test_penalized_system_negative_lam_fails(small_data2d, lam): @pytest.mark.parametrize('diff_order', (1, 2, 3, [1, 3])) @pytest.mark.parametrize('lam', (5, (3, 5))) -def test_compare_to_psplines(data_fixture2d, lam, diff_order): +@pytest.mark.parametrize('use_banded', (True, False)) +def test_compare_to_psplines(data_fixture2d, lam, diff_order, use_banded): """ Ensures 2D Whittaker and PSpline outputs are the same for specific condition. 
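The band-layout assertions in the setup test above follow SciPy's banded storage conventions, which is what the new `use_banded` and `use_lower` options target. A small self-check of the two layouts on a symmetric tridiagonal system (values arbitrary):

    import numpy as np
    from scipy.linalg import solve_banded, solveh_banded

    n = 6
    rhs = np.ones(n)

    # lower form for solveh_banded: row 0 holds the main diagonal and row i
    # holds the i-th subdiagonal, padded with trailing zeros
    ab_lower = np.zeros((2, n))
    ab_lower[0] = 2.0
    ab_lower[1, :-1] = -1.0
    x1 = solveh_banded(ab_lower, rhs, lower=True)

    # full form for solve_banded: superdiagonal, main diagonal, subdiagonal
    ab_full = np.zeros((3, n))
    ab_full[0, 1:] = -1.0
    ab_full[1] = 2.0
    ab_full[2, :-1] = -1.0
    x2 = solve_banded((1, 1), ab_full, rhs)

    assert np.allclose(x1, x2)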
@@ -129,15 +157,47 @@ def test_compare_to_psplines(data_fixture2d, lam, diff_order): assert_array_equal(pspline.basis_x.shape, (len(x), len(x))) assert_array_equal(pspline.basis_z.shape, (len(z)), len(z)) - whittaker_system = _whittaker_utils.PenalizedSystem2D(y.shape, lam=lam, diff_order=diff_order) + whittaker_system = _whittaker_utils.PenalizedSystem2D( + y.shape, lam=lam, diff_order=diff_order, use_banded=use_banded + ) # TODO replace with np.random.default_rng when min numpy version is >= 1.17 weights = np.random.RandomState(0).normal(0.8, 0.05, y.shape) weights = np.clip(weights, 0, 1).astype(float, copy=False) - whittaker_system.penalty.setdiag(whittaker_system.penalty.diagonal() + weights.ravel()) - spline_output = pspline.solve_pspline(y, weights=weights) - whittaker_output = whittaker_system.solve(whittaker_system.penalty, weights.ravel() * y.ravel()) + whittaker_output = whittaker_system.solve( + whittaker_system.add_diagonal(weights.ravel()), weights.ravel() * y.ravel() + ) assert_allclose(whittaker_output.reshape(y.shape), spline_output, rtol=1e-12, atol=1e-12) + + +@pytest.mark.parametrize('data_size', (10, 51)) +@pytest.mark.parametrize('diff_order', (1, 2, 3, 4)) +def test_diff_penalty_matrix(data_size, diff_order): + """Ensures the penalty matrix shortcut works correctly.""" + diff_matrix = difference_matrix(data_size, diff_order) + expected_matrix = diff_matrix.T @ diff_matrix + + output = _whittaker_utils.diff_penalty_matrix(data_size, diff_order) + + assert_allclose(expected_matrix.toarray(), output.toarray(), rtol=1e-12, atol=1e-12) + + +@pytest.mark.parametrize('data_size', (3, 6)) +@pytest.mark.parametrize('diff_order', (1, 2, 3, 4)) +def test_diff_penalty_matrix_too_few_data(data_size, diff_order): + """Ensures the penalty matrix shortcut works correctly.""" + diff_matrix = difference_matrix(data_size, diff_order) + expected_matrix = diff_matrix.T @ diff_matrix + + if data_size <= diff_order: + with pytest.raises(ValueError): + _whittaker_utils.diff_penalty_matrix(data_size, diff_order) + # the actual matrix should be just zeros + actual_result = np.zeros((data_size, data_size)) + assert_allclose(actual_result, expected_matrix.toarray(), rtol=1e-12, atol=1e-12) + else: + output = _whittaker_utils.diff_penalty_matrix(data_size, diff_order) + assert_allclose(output.toarray(), expected_matrix.toarray(), rtol=1e-12, atol=1e-12) From 9b41267107316fd505cff742bfaa161a00f1287b Mon Sep 17 00:00:00 2001 From: Donnie Erb <55961724+derb12@users.noreply.github.com> Date: Tue, 16 Jan 2024 21:05:58 -0500 Subject: [PATCH 28/56] TEST: Reduce size of 2D datasets for testing The small datasets reduced test time from 200 seconds to 100 seconds and still address what needs to be tested. Also skip the tests for spline non-finite weighting since they are very slow; may just take them out since they are covered by the 1D tests. --- tests/conftest.py | 16 +++++++++++----- tests/two_d/test_spline.py | 23 ++++++++++++++++------- 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index eb29f17..16f84f0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -123,7 +123,7 @@ def get_data(include_noise=True, num_points=1000): return x_data, y_data -def get_data2d(include_noise=True, num_points=(50, 60)): +def get_data2d(include_noise=True, num_points=(30, 41)): """Creates x-, z-, and y-data for testing. Parameters @@ -132,7 +132,7 @@ def get_data2d(include_noise=True, num_points=(50, 60)): If True (default), will include noise with the y-data. 
num_points : Container(int, int), optional The number of data points to use for x, and z, respectively. Default - is (50, 60), which uses different numbers so that any issues caused + is (30, 41), which uses different numbers so that any issues caused by not having a square matrix will be seen. Returns @@ -201,7 +201,7 @@ def small_data(): @pytest.fixture def small_data2d(): """A small array of data for testing.""" - return np.arange(50, dtype=float).reshape(5, 10) + return np.arange(60, dtype=float).reshape(6, 10) @pytest.fixture() @@ -224,8 +224,14 @@ def no_noise_data_fixture(): @pytest.fixture() def no_noise_data_fixture2d(): - """Test fixture that creates x-, z-, and y-data without noise for testing.""" - return get_data2d(include_noise=False) + """ + Test fixture that creates x-, z-, and y-data without noise for testing. + + Reduces the number of data points since this is used for testing that numerical + issues are avoided for large iterations in spline and Whittaker functions, which + can otherwise be time consuming. + """ + return get_data2d(include_noise=False, num_points=(20, 31)) def dummy_wrapper(func): diff --git a/tests/two_d/test_spline.py b/tests/two_d/test_spline.py index f8700c9..c7296ac 100644 --- a/tests/two_d/test_spline.py +++ b/tests/two_d/test_spline.py @@ -261,6 +261,7 @@ def test_diff_orders(self, diff_order): lam = {1: 1e3, 3: 1e10}[diff_order] self.class_func(self.y, lam=lam, diff_order=diff_order) + @pytest.mark.skip(reason='test is too slow') # ignore the RuntimeWarning that occurs from using +/- inf or nan @pytest.mark.filterwarnings('ignore::RuntimeWarning') def test_avoid_nonfinite_weights(self, no_noise_data_fixture2d): @@ -281,8 +282,10 @@ def test_avoid_nonfinite_weights(self, no_noise_data_fixture2d): """ x, z, y = no_noise_data_fixture2d with pytest.warns(utils.ParameterWarning): - baseline = self.class_func(y, tol=-1, max_iter=7000)[0] - assert np.isfinite(baseline.T.dot(baseline)).all() + baseline, _ = getattr(self.algorithm_base(x, z), self.func_name)( + y, tol=-1, max_iter=7000 + ) + assert np.isfinite(baseline).all() @pytest.mark.parametrize('lam', (1e1, 1e5)) def test_whittaker_comparison(self, lam): @@ -301,6 +304,7 @@ def test_diff_orders(self, diff_order): lam = {1: 1e2, 3: 1e10}[diff_order] self.class_func(self.y, lam=lam, diff_order=diff_order) + @pytest.mark.skip(reason='test is too slow') def test_avoid_overflow_warning(self, no_noise_data_fixture2d): """ Ensures no warning is emitted for exponential overflow. 
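The (currently skipped) overflow tests rely on `np.errstate(over='raise')` to turn silent floating-point overflow into a hard error; the guard can be reproduced in isolation:

    import numpy as np

    with np.errstate(over='raise'):
        np.exp(50.0)  # fine
        try:
            np.exp(1000.0)  # overflows float64
        except FloatingPointError:
            print('overflow caught')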
@@ -315,9 +319,11 @@ def test_avoid_overflow_warning(self, no_noise_data_fixture2d): """ x, z, y = no_noise_data_fixture2d with np.errstate(over='raise'): - baseline = self.class_func(y, tol=-1, max_iter=1000)[0] + baseline, params = getattr(self.algorithm_base(x, z), self.func_name)( + y, tol=-1, max_iter=1000 + ) - assert np.isfinite(baseline.T.dot(baseline)).all() + assert np.isfinite(baseline).all() @pytest.mark.parametrize('lam', (1e1, 1e5)) def test_whittaker_comparison(self, lam): @@ -336,6 +342,7 @@ def test_diff_orders(self, diff_order): lam = {1: 1e2, 3: 1e10}[diff_order] self.class_func(self.y, lam=lam, diff_order=diff_order) + @pytest.mark.skip(reason='test is too slow') # ignore the RuntimeWarning that occurs from using +/- inf or nan @pytest.mark.filterwarnings('ignore::RuntimeWarning') def test_avoid_nonfinite_weights(self, no_noise_data_fixture2d): @@ -356,9 +363,11 @@ def test_avoid_nonfinite_weights(self, no_noise_data_fixture2d): """ x, z, y = no_noise_data_fixture2d with pytest.warns(utils.ParameterWarning): - baseline, params = self.class_func(y, tol=-1, max_iter=1000) + baseline, params = getattr(self.algorithm_base(x, z), self.func_name)( + y, tol=-1, max_iter=1000 + ) - assert np.isfinite(baseline.T.dot(baseline)).all() + assert np.isfinite(baseline).all() # ensure last tolerence calculation was non-finite as a double-check that # this test is actually doing what it should be doing assert not np.isfinite(params['tol_history'][-1]) @@ -390,5 +399,5 @@ def test_diff_orders(self, diff_order): @pytest.mark.parametrize('p', (0.01, 0.1)) def test_whittaker_comparison(self, lam, p): """Ensures the P-spline version is the same as the Whittaker version.""" - compare_pspline_whittaker(self, 'psalsa', self.y, lam=lam, p=p) + compare_pspline_whittaker(self, 'psalsa', self.y, lam=lam, p=p, test_rtol=1e5) From 1fb5ef2903128998740c5ac610cc532a1d4f8a79 Mon Sep 17 00:00:00 2001 From: Donnie Erb <55961724+derb12@users.noreply.github.com> Date: Tue, 16 Jan 2024 21:18:16 -0500 Subject: [PATCH 29/56] MAINT: Clean up leftover code --- pybaselines/two_d/_spline_utils.py | 2 +- pybaselines/two_d/whittaker.py | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/pybaselines/two_d/_spline_utils.py b/pybaselines/two_d/_spline_utils.py index 507c5d6..ce0f4c3 100644 --- a/pybaselines/two_d/_spline_utils.py +++ b/pybaselines/two_d/_spline_utils.py @@ -10,11 +10,11 @@ from scipy import sparse from scipy.sparse.linalg import spsolve -from .._banded_utils import difference_matrix from .._spline_utils import _spline_basis, _spline_knots from .._validation import _check_array, _check_lam, _check_scalar from ._whittaker_utils import diff_penalty_matrix + class PSpline2D: """ A Penalized Spline, which penalizes the difference of the spline coefficients. 
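The whittaker.py hunk that follows also swaps `*` for `@` when multiplying the alpha matrix into the penalty. A short illustration of why `@` is the safer spelling when sparse and dense types mix (the matrices here are placeholders):

    import numpy as np
    from scipy import sparse

    A = sparse.diags([1.0, 2.0, 3.0])  # sparse: `*` means matrix product
    B = np.diag([1.0, 2.0, 3.0])       # dense: `*` means elementwise product

    # `@` is matrix multiplication for both, so the intent stays unambiguous
    C = A @ B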
diff --git a/pybaselines/two_d/whittaker.py b/pybaselines/two_d/whittaker.py index e07e6eb..1161b9d 100644 --- a/pybaselines/two_d/whittaker.py +++ b/pybaselines/two_d/whittaker.py @@ -185,8 +185,6 @@ def iasls(self, data, lam=1e6, p=1e-2, lam_1=1e-4, max_iter=50, tol=1e-3, self.whittaker_system.penalty + penalized_system_1.penalty, penalized_system_1.penalty * y ) - assert y.shape == (self._len[0] * self._len[1],) - assert baseline.shape == (self._len[0] * self._len[1],) new_weights = _weighting._asls(y, baseline, p) calc_difference = relative_difference(weight_array, new_weights) tol_history[i] = calc_difference @@ -618,7 +616,7 @@ def aspls(self, data, lam=1e5, diff_order=2, max_iter=100, tol=1e-3, alpha_matrix = diags(alpha_array.ravel(), format='csr') tol_history = np.empty(max_iter + 1) for i in range(max_iter + 1): - lhs = alpha_matrix * self.whittaker_system.penalty + lhs = alpha_matrix @ self.whittaker_system.penalty lhs.setdiag(lhs.diagonal() + weight_array) baseline = self.whittaker_system.solve( lhs, weight_array * y From 598eb9183a48f4999702e1d2b8d3db1e360710d3 Mon Sep 17 00:00:00 2001 From: Donnie Erb <55961724+derb12@users.noreply.github.com> Date: Wed, 17 Jan 2024 19:59:45 -0500 Subject: [PATCH 30/56] MAINT: Make PSpline2D subclass of PenalizedSystem2D --- pybaselines/two_d/_spline_utils.py | 67 ++++++++++++++------------- pybaselines/two_d/_whittaker_utils.py | 17 +++++-- pybaselines/two_d/spline.py | 2 +- pybaselines/two_d/whittaker.py | 16 ++++--- pybaselines/whittaker.py | 2 +- tests/two_d/test_spline_utils.py | 14 +++++- tests/two_d/test_whittaker_utils.py | 11 +++-- 7 files changed, 81 insertions(+), 48 deletions(-) diff --git a/pybaselines/two_d/_spline_utils.py b/pybaselines/two_d/_spline_utils.py index ce0f4c3..758d949 100644 --- a/pybaselines/two_d/_spline_utils.py +++ b/pybaselines/two_d/_spline_utils.py @@ -12,10 +12,10 @@ from .._spline_utils import _spline_basis, _spline_knots from .._validation import _check_array, _check_lam, _check_scalar -from ._whittaker_utils import diff_penalty_matrix +from ._whittaker_utils import PenalizedSystem2D -class PSpline2D: +class PSpline2D(PenalizedSystem2D): """ A Penalized Spline, which penalizes the difference of the spline coefficients. @@ -95,6 +95,9 @@ def __init__(self, x, z, num_knots=100, spline_degree=3, check_finite=False, lam (``num_knots + spline_degree - 1``). 
""" + self.coef = None + self._basis = None + self.x = _check_array(x, dtype=float, check_finite=check_finite, ensure_1d=True) self.z = _check_array(z, dtype=float, check_finite=check_finite, ensure_1d=True) @@ -103,24 +106,27 @@ def __init__(self, x, z, num_knots=100, spline_degree=3, check_finite=False, lam if (self.spline_degree < 0).any(): raise ValueError('spline degree must be >= 0') - elif (self.spline_degree < 0).any(): - raise ValueError('spline degree must be greater than or equal to 0') self.knots_x = _spline_knots(self.x, self.num_knots[0], self.spline_degree[0], True) self.basis_x = _spline_basis(self.x, self.knots_x, self.spline_degree[0]) self.knots_z = _spline_knots(self.z, self.num_knots[1], self.spline_degree[1], True) self.basis_z = _spline_basis(self.z, self.knots_z, self.spline_degree[1]) - self._num_bases = np.array((self.basis_x.shape[1], self.basis_z.shape[1])) + + super().__init__( + (self.basis_x.shape[1], self.basis_z.shape[1]), lam, diff_order, use_banded=False + ) + if (self.diff_order >= self._num_bases).any(): + raise ValueError(( + 'the difference order must be less than the number of basis ' + 'functions, which is the number of knots + spline degree - 1' + )) el = np.ones((self._num_bases[0], 1)) ek = np.ones((self._num_bases[1], 1)) self._G = sparse.kron(self.basis_x, el.T).multiply(sparse.kron(el.T, self.basis_x)) self._G2 = sparse.kron(self.basis_z, ek.T).multiply(sparse.kron(ek.T, self.basis_z)) - self.coef = None - self.reset_penalty(lam, diff_order) - def same_basis(self, num_knots=100, spline_degree=3): """ Sees if the current basis is equivalent to the input number of knots of spline degree. @@ -180,23 +186,7 @@ def reset_penalty(self, lam=1, diff_order=2): basis and the penalty to speed up calculations when the two are added. 
""" - self.diff_order = _check_scalar(diff_order, 2, True)[0] - self.lam = np.array([_check_lam(val) for val in _check_scalar(lam, 2, True)[0]]) - - if (self.diff_order < 1).any(): - raise ValueError('the difference order must be > 0 for penalized splines') - elif (self.diff_order >= self._num_bases).any(): - raise ValueError(( - 'the difference order must be less than the number of basis ' - 'functions, which is the number of knots + spline degree - 1' - )) - D1 = diff_penalty_matrix(self._num_bases[0], self.diff_order[0]) - D2 = diff_penalty_matrix(self._num_bases[1], self.diff_order[1]) - - # multiplying lam by the Kronecker product is the same as multiplying just D.T @ D with lam - P1 = sparse.kron(self.lam[0] * D1, sparse.identity(self._num_bases[1])) - P2 = sparse.kron(sparse.identity(self._num_bases[0]), self.lam[1] * D2) - self.penalty = P1 + P2 + self.reset_diagonals(lam, diff_order, use_banded=False) def solve_pspline(self, y, weights, penalty=None, rhs_extra=None): """ @@ -247,17 +237,32 @@ def solve_pspline(self, y, weights, penalty=None, rhs_extra=None): (self._num_bases[0] * self._num_bases[1], self._num_bases[0] * self._num_bases[1]) ) ) + if penalty is None: + penalty = self.penalty + + rhs = (self.basis_x.T @ (weights * y) @ self.basis_z).ravel() + if rhs_extra is not None: + rhs = rhs + rhs_extra - self.coef = spsolve( - F + self.penalty, - (self.basis_x.T @ (weights * y) @ self.basis_z).flatten(), - 'NATURAL' - ).reshape(self._num_bases[0], self._num_bases[1]) + self.coef = spsolve(F + penalty, rhs, permc_spec='NATURAL') - output = self.basis_x @ self.coef @ self.basis_z.T + output = self.basis_x @ self.coef.reshape(self._num_bases) @ self.basis_z.T return output + @property + def basis(self): + """ + The full spline basis matrix. + + This is a lazy implementation since the full basis is typically not needed for + computations. + + """ + if self._basis is None: + self._basis = sparse.kron(self.basis_x, self.basis_z) + return self._basis + @property def tck(self): """ diff --git a/pybaselines/two_d/_whittaker_utils.py b/pybaselines/two_d/_whittaker_utils.py index 7c875f8..0438e8a 100644 --- a/pybaselines/two_d/_whittaker_utils.py +++ b/pybaselines/two_d/_whittaker_utils.py @@ -11,7 +11,7 @@ from scipy.sparse import identity, kron, spdiags from scipy.sparse.linalg import spsolve -from .._banded_utils import diff_penalty_diagonals +from .._banded_utils import _add_diagonals, diff_penalty_diagonals from .._validation import _check_lam, _check_scalar @@ -149,7 +149,13 @@ def add_penalty(self, penalty): The updated `self.penalty`. 
""" - raise NotImplementedError + if self.banded: + self.penalty = _add_diagonals(self.penalty, penalty, lower_only=self.lower) + else: + self.penalty = self.penalty + penalty + self._update_bands() + + return self.penalty def _update_bands(self): """ @@ -165,6 +171,9 @@ def _update_bands(self): self.num_bands = self.penalty.shape[0] // 2 self.main_diagonal_index = 0 if self.lower else self.num_bands self.main_diagonal = self.penalty[self.main_diagonal_index].copy() + else: + self.main_diagonal_index = 0 + self.main_diagonal = self.penalty.diagonal() def reset_diagonals(self, lam=1, diff_order=2, use_banded=True, use_lower=True): """ @@ -212,8 +221,8 @@ def reset_diagonals(self, lam=1, diff_order=2, use_banded=True, use_lower=True): self._update_bands() else: self.penalty = penalty - self.main_diagonal = self.penalty.diagonal() - self.main_diagonal_index = 0 + + self._update_bands() def solve(self, lhs, rhs, overwrite_ab=False, overwrite_b=False, check_finite=False, l_and_u=None): diff --git a/pybaselines/two_d/spline.py b/pybaselines/two_d/spline.py index b79b342..ee3ea9f 100644 --- a/pybaselines/two_d/spline.py +++ b/pybaselines/two_d/spline.py @@ -269,7 +269,7 @@ def irsqr(self, data, lam=1e3, quantile=0.05, num_knots=25, spline_degree=3, y, weight_array = self._setup_spline( data, weights, spline_degree, num_knots, True, diff_order, lam ) - old_coef = np.zeros((self.pspline._num_bases[0], self.pspline._num_bases[1])) + old_coef = np.zeros(self.pspline._num_bases[0] * self.pspline._num_bases[1]) tol_history = np.empty(max_iter + 1) for i in range(max_iter + 1): baseline = self.pspline.solve_pspline(y, weight_array) diff --git a/pybaselines/two_d/whittaker.py b/pybaselines/two_d/whittaker.py index 1161b9d..ec7d877 100644 --- a/pybaselines/two_d/whittaker.py +++ b/pybaselines/two_d/whittaker.py @@ -166,7 +166,7 @@ def iasls(self, data, lam=1e6, p=1e-2, lam_1=1e-4, max_iter=50, tol=1e-3, """ if not 0 < p < 1: raise ValueError('p must be between 0 and 1') - elif (np.asarray(diff_order) < 2).any(): + elif np.less(diff_order, 2).any(): raise ValueError('diff_order must be 2 or greater') if weights is None: @@ -174,16 +174,20 @@ def iasls(self, data, lam=1e6, p=1e-2, lam_1=1e-4, max_iter=50, tol=1e-3, data, weights=None, poly_order=2, calc_vander=True, calc_pinv=True ) baseline = self.vandermonde @ (pseudo_inverse @ data.ravel()) - weights = _weighting._asls(data.ravel(), baseline, p).reshape(self._len) + weights = _weighting._asls(data, baseline.reshape(self._len), p) y, weight_array = self._setup_whittaker(data, lam, diff_order, weights) penalized_system_1 = PenalizedSystem2D(self._len, lam_1, diff_order=1, use_banded=False) + + # (W.T @ W + P_1) @ y -> P_1 @ y + W.T @ W @ y + self.whittaker_system.add_penalty(penalized_system_1.penalty) + p1_y = penalized_system_1.penalty @ y tol_history = np.empty(max_iter + 1) for i in range(max_iter + 1): - penalized_system_1.add_diagonal(weight_array * weight_array) + weight_squared = weight_array**2 baseline = self.whittaker_system.solve( - self.whittaker_system.penalty + penalized_system_1.penalty, - penalized_system_1.penalty * y + self.whittaker_system.add_diagonal(weight_squared), + weight_squared * y + p1_y ) new_weights = _weighting._asls(y, baseline, p) calc_difference = relative_difference(weight_array, new_weights) @@ -421,7 +425,7 @@ def drpls(self, data, lam=1e5, eta=0.5, max_iter=50, tol=1e-3, weights=None, dif """ if not 0 <= eta <= 1: raise ValueError('eta must be between 0 and 1') - elif (np.asarray(diff_order) < 2).any(): + elif 
np.less(diff_order, 2).any(): raise ValueError('diff_order must be 2 or greater') y, weight_array = self._setup_whittaker(data, lam, diff_order, weights) diff --git a/pybaselines/whittaker.py b/pybaselines/whittaker.py index 438afbb..e791ccf 100644 --- a/pybaselines/whittaker.py +++ b/pybaselines/whittaker.py @@ -184,7 +184,7 @@ def iasls(self, data, lam=1e6, p=1e-2, lam_1=1e-4, max_iter=50, tol=1e-3, d1_y = lambda_1 * d1_y tol_history = np.empty(max_iter + 1) for i in range(max_iter + 1): - weight_squared = weight_array * weight_array + weight_squared = weight_array**2 baseline = self.whittaker_system.solve( self.whittaker_system.add_diagonal(weight_squared), weight_squared * y + d1_y, overwrite_b=True diff --git a/tests/two_d/test_spline_utils.py b/tests/two_d/test_spline_utils.py index b9371c8..8c72505 100644 --- a/tests/two_d/test_spline_utils.py +++ b/tests/two_d/test_spline_utils.py @@ -78,7 +78,11 @@ def test_solve_psplines(data_fixture2d, num_knots, spline_degree, diff_order, la output = pspline.solve_pspline(y, weights=weights.reshape(y.shape)) assert_allclose(output.flatten(), expected_result, rtol=1e-8, atol=1e-8) - assert_allclose(pspline.coef.flatten(), expected_coeffs, rtol=1e-8, atol=1e-8) + assert_allclose(pspline.coef, expected_coeffs, rtol=1e-8, atol=1e-8) + + # also ensure that the pspline's basis can use the solved coefficients + basis_output = pspline.basis @ pspline.coef + assert_allclose(basis_output, expected_result, rtol=1e-8, atol=1e-8) @pytest.mark.parametrize('spline_degree', (1, 2, 3, [2, 3])) @@ -140,6 +144,14 @@ def test_pspline_setup(data_fixture2d, num_knots, spline_degree, diff_order, lam assert isinstance(pspline.x, np.ndarray) assert isinstance(pspline.z, np.ndarray) + # _basis should be None since the basis attribute has not been accessed yet + assert pspline._basis is None + + expected_basis = kron(basis_x, basis_z).toarray() + + assert_allclose(pspline.basis.toarray(), expected_basis, rtol=1e-12, atol=1e-12) + assert_allclose(pspline._basis.toarray(), expected_basis, rtol=1e-12, atol=1e-12) + def test_pspline_same_basis(data_fixture2d): """Ensures PSpline2D.same_basis works correctly.""" diff --git a/tests/two_d/test_whittaker_utils.py b/tests/two_d/test_whittaker_utils.py index 36f18e4..15cc7c3 100644 --- a/tests/two_d/test_whittaker_utils.py +++ b/tests/two_d/test_whittaker_utils.py @@ -21,7 +21,8 @@ @pytest.mark.parametrize('diff_order', (1, 2, 3, 4, (2, 3))) @pytest.mark.parametrize('lam', (1e-2, 1e2, (1e1, 1e2))) @pytest.mark.parametrize('use_banded', (True, False)) -def test_solve_penalized_system(small_data2d, diff_order, lam, use_banded): +@pytest.mark.parametrize('use_lower', (True, False)) +def test_solve_penalized_system(small_data2d, diff_order, lam, use_banded, use_lower): """ Tests the accuracy of the penalized system solver. 
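Switching the validation to ``np.less(diff_order, 2).any()`` lets one check cover scalar and per-dimension difference orders alike:

    import numpy as np

    for diff_order in (1, 2, [2, 3], [1, 3], [3, 1]):
        # True whenever any dimension's order is below 2:
        # 1 -> True, 2 -> False, [2, 3] -> False, [1, 3] -> True, [3, 1] -> True
        print(diff_order, np.less(diff_order, 2).any())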
@@ -43,7 +44,8 @@ def test_solve_penalized_system(small_data2d, diff_order, lam, use_banded): penalty = P1 + P2 penalized_system = _whittaker_utils.PenalizedSystem2D( - small_data2d.shape, lam=lam, diff_order=diff_order, use_banded=use_banded + small_data2d.shape, lam=lam, diff_order=diff_order, use_banded=use_banded, + use_lower=use_lower ) # TODO replace with np.random.default_rng when min numpy version is >= 1.17 @@ -133,7 +135,8 @@ def test_penalized_system_negative_lam_fails(small_data2d, lam): @pytest.mark.parametrize('diff_order', (1, 2, 3, [1, 3])) @pytest.mark.parametrize('lam', (5, (3, 5))) @pytest.mark.parametrize('use_banded', (True, False)) -def test_compare_to_psplines(data_fixture2d, lam, diff_order, use_banded): +@pytest.mark.parametrize('use_lower', (True, False)) +def test_compare_to_psplines(data_fixture2d, lam, diff_order, use_banded, use_lower): """ Ensures 2D Whittaker and PSpline outputs are the same for specific condition. @@ -158,7 +161,7 @@ def test_compare_to_psplines(data_fixture2d, lam, diff_order, use_banded): assert_array_equal(pspline.basis_z.shape, (len(z)), len(z)) whittaker_system = _whittaker_utils.PenalizedSystem2D( - y.shape, lam=lam, diff_order=diff_order, use_banded=use_banded + y.shape, lam=lam, diff_order=diff_order, use_banded=use_banded, use_lower=use_lower ) # TODO replace with np.random.default_rng when min numpy version is >= 1.17 From 28c29b3089afc1970f08be4202bb450f7cd42c77 Mon Sep 17 00:00:00 2001 From: Donnie Erb <55961724+derb12@users.noreply.github.com> Date: Wed, 17 Jan 2024 20:01:52 -0500 Subject: [PATCH 31/56] FEAT: Added 2D version of pspline_iasls Also added additional tests to check more difference orders for pspline algorithms. --- pybaselines/two_d/spline.py | 106 ++++++++++++++++++++++++++++++++ tests/test_spline.py | 72 ++++++++++++++++------ tests/two_d/test_spline.py | 116 +++++++++++++++++++++++++----------- 3 files changed, 240 insertions(+), 54 deletions(-) diff --git a/pybaselines/two_d/spline.py b/pybaselines/two_d/spline.py index ee3ea9f..6fde768 100644 --- a/pybaselines/two_d/spline.py +++ b/pybaselines/two_d/spline.py @@ -16,6 +16,7 @@ from .. import _weighting from ..utils import ParameterWarning, gaussian, relative_difference, _MIN_FLOAT from ._algorithm_setup import _Algorithm2D +from ._whittaker_utils import PenalizedSystem2D from .._compat import _HAS_NUMBA, jit @@ -375,6 +376,111 @@ def pspline_asls(self, data, lam=1e3, p=1e-2, num_knots=25, spline_degree=3, dif return baseline, params + @_Algorithm2D._register(sort_keys=('weights',)) + def pspline_iasls(self, data, lam=1e3, p=1e-2, lam_1=1e-4, num_knots=25, + spline_degree=3, max_iter=50, tol=1e-3, weights=None, diff_order=2): + """ + A penalized spline version of the IAsLS algorithm. + + Parameters + ---------- + data : array-like, shape (N,) + The y-values of the measured data, with N data points. Must not + contain missing data (NaN) or Inf. + lam : float, optional + The smoothing parameter. Larger values will create smoother baselines. + Default is 1e3. + p : float, optional + The penalizing weighting factor. Must be between 0 and 1. Values greater + than the baseline will be given `p` weight, and values less than the baseline + will be given `p - 1` weight. Default is 1e-2. + lam_1 : float, optional + The smoothing parameter for the first derivative of the residual. Default is 1e-4. + num_knots : int, optional + The number of knots for the spline. Default is 25. + spline_degree : int, optional + The degree of the spline.
Default is 3, which is a cubic spline. + max_iter : int, optional + The max number of fit iterations. Default is 50. + tol : float, optional + The exit criteria. Default is 1e-3. + weights : array-like, shape (N,), optional + The weighting array. If None (default), then the initial weights + will be an array with size equal to N and all values set to 1. + diff_order : int, optional + The order of the differential matrix. Must be greater than 1. Default is 2 + (second order differential matrix). Typical values are 2 or 3. + + Returns + ------- + baseline : numpy.ndarray, shape (N,) + The calculated baseline. + params : dict + A dictionary with the following items: + + * 'weights': numpy.ndarray, shape (N,) + The weight array used for fitting the data. + * 'tol_history': numpy.ndarray + An array containing the calculated tolerance values for + each iteration. The length of the array is the number of iterations + completed. If the last value in the array is greater than the input + `tol` value, then the function did not converge. + + Raises + ------ + ValueError + Raised if `p` is not between 0 and 1 or if `diff_order` is less than 2. + + See Also + -------- + pybaselines.whittaker.iasls + + References + ---------- + He, S., et al. Baseline correction for raman spectra using an improved + asymmetric least squares method, Analytical Methods, 2014, 6(12), 4402-4407. + + Eilers, P., et al. Splines, knots, and penalties. Wiley Interdisciplinary + Reviews: Computational Statistics, 2010, 2(6), 637-653. + + """ + if not 0 < p < 1: + raise ValueError('p must be between 0 and 1') + elif np.less(diff_order, 2).any(): + raise ValueError('diff_order must be 2 or greater') + + if weights is None: + _, _, pseudo_inverse = self._setup_polynomial( + data, weights=None, poly_order=2, calc_vander=True, calc_pinv=True + ) + baseline = self.vandermonde @ (pseudo_inverse @ data.ravel()) + weights = _weighting._asls(data, baseline.reshape(self._len), p) + + y, weight_array = self._setup_spline( + data, weights, spline_degree, num_knots, True, diff_order, lam + ) + + # B.T @ P_1 @ B and B.T @ P_1 @ y + penalized_system_1 = PenalizedSystem2D(self._len, lam_1, diff_order=1, use_banded=False) + p1_partial_penalty = self.pspline.basis.T @ penalized_system_1.penalty + + partial_rhs = p1_partial_penalty @ y.ravel() + self.pspline.add_penalty(p1_partial_penalty @ self.pspline.basis) + + tol_history = np.empty(max_iter + 1) + for i in range(max_iter + 1): + baseline = self.pspline.solve_pspline(y, weight_array**2, rhs_extra=partial_rhs) + new_weights = _weighting._asls(y, baseline, p) + calc_difference = relative_difference(weight_array, new_weights) + tol_history[i] = calc_difference + if calc_difference < tol: + break + weight_array = new_weights + + params = {'weights': weight_array, 'tol_history': tol_history[:i + 1]} + + return baseline, params + @_Algorithm2D._register(sort_keys=('weights',)) def pspline_airpls(self, data, lam=1e3, num_knots=25, spline_degree=3, diff_order=2, max_iter=50, tol=1e-3, weights=None): diff --git a/tests/test_spline.py b/tests/test_spline.py index 278d884..2bb01ca 100644 --- a/tests/test_spline.py +++ b/tests/test_spline.py @@ -262,9 +262,12 @@ def test_diff_orders(self, diff_order): @pytest.mark.parametrize('lam', (1e1, 1e5)) @pytest.mark.parametrize('p', (0.01, 0.1)) - def test_whittaker_comparison(self, lam, p): + @pytest.mark.parametrize('diff_order', (1, 2, 3)) + def test_whittaker_comparison(self, lam, p, diff_order): """Ensures the P-spline version is the same as the Whittaker 
version.""" - compare_pspline_whittaker(self, whittaker.asls, self.y, lam=lam, p=p) + compare_pspline_whittaker( + self, whittaker.asls, self.y, lam=lam, p=p, diff_order=diff_order + ) class TestPsplineIAsLS(IterativeSplineTester): @@ -288,11 +291,20 @@ def test_outside_p_fails(self, p): with pytest.raises(ValueError): self.class_func(self.y, p=p) + def test_diff_order_one_fails(self): + """Ensure that a difference order of 1 raises an exception.""" + with pytest.raises(ValueError): + self.class_func(self.y, diff_order=1) + @pytest.mark.parametrize('lam', (1e1, 1e5)) @pytest.mark.parametrize('p', (0.01, 0.1)) - def test_whittaker_comparison(self, lam, p): + @pytest.mark.parametrize('diff_order', (2, 3)) + @pytest.mark.parametrize('lam_1', (1e1, 1e3)) + def test_whittaker_comparison(self, lam, lam_1, p, diff_order): """Ensures the P-spline version is the same as the Whittaker version.""" - compare_pspline_whittaker(self, whittaker.iasls, self.y, lam=lam, p=p) + compare_pspline_whittaker( + self, whittaker.iasls, self.y, lam=lam, lam_1=lam_1, p=p, diff_order=diff_order + ) class TestPsplineAirPLS(IterativeSplineTester): @@ -330,9 +342,10 @@ def test_avoid_nonfinite_weights(self, no_noise_data_fixture): assert np.isfinite(baseline.dot(baseline)) @pytest.mark.parametrize('lam', (1e1, 1e5)) - def test_whittaker_comparison(self, lam): + @pytest.mark.parametrize('diff_order', (1, 2, 3)) + def test_whittaker_comparison(self, lam, diff_order): """Ensures the P-spline version is the same as the Whittaker version.""" - compare_pspline_whittaker(self, whittaker.airpls, self.y, lam=lam) + compare_pspline_whittaker(self, whittaker.airpls, self.y, lam=lam, diff_order=diff_order) class TestPsplineArPLS(IterativeSplineTester): @@ -365,9 +378,10 @@ def test_avoid_overflow_warning(self, no_noise_data_fixture): assert np.isfinite(baseline.dot(baseline)) @pytest.mark.parametrize('lam', (1e1, 1e5)) - def test_whittaker_comparison(self, lam): + @pytest.mark.parametrize('diff_order', (1, 2, 3)) + def test_whittaker_comparison(self, lam, diff_order): """Ensures the P-spline version is the same as the Whittaker version.""" - compare_pspline_whittaker(self, whittaker.arpls, self.y, lam=lam) + compare_pspline_whittaker(self, whittaker.arpls, self.y, lam=lam, diff_order=diff_order) class TestPsplineDrPLS(IterativeSplineTester): @@ -410,14 +424,17 @@ def test_avoid_nonfinite_weights(self, no_noise_data_fixture): @pytest.mark.parametrize('lam', (1e1, 1e5)) @pytest.mark.parametrize('eta', (0.2, 0.8)) - def test_whittaker_comparison(self, lam, eta): + @pytest.mark.parametrize('diff_order', (2, 3)) + def test_whittaker_comparison(self, lam, eta, diff_order): """ Ensures the P-spline version is the same as the Whittaker version. Have to use a larger tolerance since pspline_drpls uses interpolation to get the weight at the coefficients' x-values. 
""" - compare_pspline_whittaker(self, whittaker.drpls, self.y, lam=lam, eta=eta, test_rtol=2e-3) + compare_pspline_whittaker( + self, whittaker.drpls, self.y, lam=lam, eta=eta, diff_order=diff_order, test_rtol=2e-3 + ) @pytest.mark.parametrize('eta', (-1, 2)) def test_outside_eta_fails(self, eta): @@ -425,6 +442,11 @@ def test_outside_eta_fails(self, eta): with pytest.raises(ValueError): self.class_func(self.y, eta=eta) + def test_diff_order_one_fails(self): + """Ensure that a difference order of 1 raises an exception.""" + with pytest.raises(ValueError): + self.class_func(self.y, diff_order=1) + class TestPsplineIArPLS(IterativeSplineTester): """Class for testing pspline_iarpls baseline.""" @@ -465,9 +487,10 @@ def test_avoid_nonfinite_weights(self, no_noise_data_fixture): assert not np.isfinite(params['tol_history'][-1]) @pytest.mark.parametrize('lam', (1e1, 1e5)) - def test_whittaker_comparison(self, lam): + @pytest.mark.parametrize('diff_order', (1, 2, 3)) + def test_whittaker_comparison(self, lam, diff_order): """Ensures the P-spline version is the same as the Whittaker version.""" - compare_pspline_whittaker(self, whittaker.iarpls, self.y, lam=lam) + compare_pspline_whittaker(self, whittaker.iarpls, self.y, lam=lam, diff_order=diff_order) class TestPsplineAsPLS(IterativeSplineTester): @@ -508,14 +531,21 @@ def test_avoid_overflow_warning(self, no_noise_data_fixture): assert np.isfinite(baseline.dot(baseline)) @pytest.mark.parametrize('lam', (1e1, 1e5)) - def test_whittaker_comparison(self, lam): + @pytest.mark.parametrize('diff_order', (1, 2, 3)) + def test_whittaker_comparison(self, lam, diff_order): """ Ensures the P-spline version is the same as the Whittaker version. Have to use a larger tolerance since pspline_aspls uses interpolation to get the alpha values at the coefficients' x-values. 
""" - compare_pspline_whittaker(self, whittaker.aspls, self.y, lam=lam, test_rtol=2e-3) + if diff_order == 2: + rtol = 2e-3 + else: + rtol = 5e-2 + compare_pspline_whittaker( + self, whittaker.aspls, self.y, lam=lam, diff_order=diff_order, test_rtol=rtol + ) class TestPsplinePsalsa(IterativeSplineTester): @@ -537,9 +567,12 @@ def test_diff_orders(self, diff_order): @pytest.mark.parametrize('lam', (1e1, 1e5)) @pytest.mark.parametrize('p', (0.01, 0.1)) - def test_whittaker_comparison(self, lam, p): + @pytest.mark.parametrize('diff_order', (1, 2, 3)) + def test_whittaker_comparison(self, lam, p, diff_order): """Ensures the P-spline version is the same as the Whittaker version.""" - compare_pspline_whittaker(self, whittaker.psalsa, self.y, lam=lam, p=p) + compare_pspline_whittaker( + self, whittaker.psalsa, self.y, lam=lam, p=p, diff_order=diff_order + ) class TestPsplineDerpsalsa(IterativeSplineTester): @@ -561,9 +594,12 @@ def test_diff_orders(self, diff_order): @pytest.mark.parametrize('lam', (1e1, 1e5)) @pytest.mark.parametrize('p', (0.01, 0.1)) - def test_whittaker_comparison(self, lam, p): + @pytest.mark.parametrize('diff_order', (1, 2, 3)) + def test_whittaker_comparison(self, lam, p, diff_order): """Ensures the P-spline version is the same as the Whittaker version.""" - compare_pspline_whittaker(self, whittaker.derpsalsa, self.y, lam=lam, p=p) + compare_pspline_whittaker( + self, whittaker.derpsalsa, self.y, lam=lam, p=p, diff_order=diff_order + ) class TestPsplineMPLS(SplineTester, InputWeightsMixin): diff --git a/tests/two_d/test_spline.py b/tests/two_d/test_spline.py index c7296ac..ea6bc14 100644 --- a/tests/two_d/test_spline.py +++ b/tests/two_d/test_spline.py @@ -195,11 +195,10 @@ def test_outside_p_fails(self, p): with pytest.raises(ValueError): self.class_func(self.y, p=p) - @pytest.mark.parametrize('diff_order', (1, 2, 3)) + @pytest.mark.parametrize('diff_order', (1, 2, 3, [2, 3])) def test_diff_orders(self, diff_order): """Ensure that other difference orders work.""" - lam = {1: 1e2, 2: 1e5, 3: 1e8}[diff_order] - self.class_func(self.y, lam=lam, diff_order=diff_order) + self.class_func(self.y, diff_order=diff_order) class TestIRSQR(IterativeSplineTester): @@ -213,11 +212,10 @@ def test_outside_p_fails(self, quantile): with pytest.raises(ValueError): self.class_func(self.y, quantile=quantile) - @pytest.mark.parametrize('diff_order', (1, 2, 3)) + @pytest.mark.parametrize('diff_order', (1, 2, 3, [2, 3])) def test_diff_orders(self, diff_order): """Ensure that other difference orders work.""" - lam = {1: 1e2, 2: 1e5, 3: 1e8}[diff_order] - self.class_func(self.y, lam=lam, diff_order=diff_order) + self.class_func(self.y, diff_order=diff_order) @pytest.mark.parametrize('has_x', (True, False)) @pytest.mark.parametrize('has_z', (True, False)) @@ -237,17 +235,61 @@ def test_outside_p_fails(self, p): with pytest.raises(ValueError): self.class_func(self.y, p=p) - @pytest.mark.parametrize('diff_order', (1, 3)) + @pytest.mark.parametrize('diff_order', (1, 3, [2, 3])) def test_diff_orders(self, diff_order): """Ensure that other difference orders work.""" - lam = {1: 1e2, 3: 1e10}[diff_order] - self.class_func(self.y, lam=lam, diff_order=diff_order) + self.class_func(self.y, diff_order=diff_order) - @pytest.mark.parametrize('lam', (1e1, 1e5)) + @pytest.mark.parametrize('lam', (1e1, 1e5, [1e1, 1e5])) @pytest.mark.parametrize('p', (0.01, 0.1)) - def test_whittaker_comparison(self, lam, p): + @pytest.mark.parametrize('diff_order', (1, 3, [2, 3])) + def test_whittaker_comparison(self, lam, p, 
diff_order): """Ensures the P-spline version is the same as the Whittaker version.""" - compare_pspline_whittaker(self, 'asls', self.y, lam=lam, p=p) + compare_pspline_whittaker(self, 'asls', self.y, lam=lam, p=p, diff_order=diff_order) + + +class TestPsplineIAsLS(IterativeSplineTester): + """Class for testing pspline_iasls baseline.""" + + func_name = 'pspline_iasls' + + @pytest.mark.parametrize('use_instance', (True, False)) + @pytest.mark.parametrize('weight_bool', (True, False)) + def test_unchanged_data(self, use_instance, weight_bool): + """Ensures that input data is unchanged by the function.""" + if weight_bool: + weights = np.ones_like(self.y) + else: + weights = None + super().test_unchanged_data(use_instance, weights=weights) + + @pytest.mark.parametrize('p', (-1, 2)) + def test_outside_p_fails(self, p): + """Ensures p values outside of [0, 1] raise an exception.""" + with pytest.raises(ValueError): + self.class_func(self.y, p=p) + + def test_diff_order_one_fails(self): + """Ensure that a difference order of 1 raises an exception.""" + with pytest.raises(ValueError): + self.class_func(self.y, diff_order=1) + with pytest.raises(ValueError): + self.class_func(self.y, diff_order=[1, 1]) + with pytest.raises(ValueError): + self.class_func(self.y, diff_order=[1, 2]) + with pytest.raises(ValueError): + self.class_func(self.y, diff_order=[2, 1]) + + + @pytest.mark.parametrize('lam', (1e1, 1e5, [1e1, 1e5])) + @pytest.mark.parametrize('lam_1', (1e1, [1e1, 1e5])) + @pytest.mark.parametrize('p', (0.01, 0.1)) + @pytest.mark.parametrize('diff_order', (3, [2, 3])) + def test_whittaker_comparison(self, lam, lam_1, p, diff_order): + """Ensures the P-spline version is the same as the Whittaker version.""" + compare_pspline_whittaker( + self, 'iasls', self.y, lam=lam, lam_1=lam_1, p=p, diff_order=diff_order, test_rtol=1e-5 + ) class TestPsplineAirPLS(IterativeSplineTester): @@ -255,11 +297,10 @@ class TestPsplineAirPLS(IterativeSplineTester): func_name = 'pspline_airpls' - @pytest.mark.parametrize('diff_order', (1, 3)) + @pytest.mark.parametrize('diff_order', (1, 3, [2, 3])) def test_diff_orders(self, diff_order): """Ensure that other difference orders work.""" - lam = {1: 1e3, 3: 1e10}[diff_order] - self.class_func(self.y, lam=lam, diff_order=diff_order) + self.class_func(self.y, diff_order=diff_order) @pytest.mark.skip(reason='test is too slow') # ignore the RuntimeWarning that occurs from using +/- inf or nan @@ -287,10 +328,11 @@ def test_avoid_nonfinite_weights(self, no_noise_data_fixture2d): ) assert np.isfinite(baseline).all() - @pytest.mark.parametrize('lam', (1e1, 1e5)) - def test_whittaker_comparison(self, lam): + @pytest.mark.parametrize('lam', (1e1, 1e5, [1e1, 1e5])) + @pytest.mark.parametrize('diff_order', (1, 3, [2, 3])) + def test_whittaker_comparison(self, lam, diff_order): """Ensures the P-spline version is the same as the Whittaker version.""" - compare_pspline_whittaker(self, 'airpls', self.y, lam=lam) + compare_pspline_whittaker(self, 'airpls', self.y, lam=lam, diff_order=diff_order) class TestPsplineArPLS(IterativeSplineTester): @@ -298,11 +340,10 @@ class TestPsplineArPLS(IterativeSplineTester): func_name = 'pspline_arpls' - @pytest.mark.parametrize('diff_order', (1, 3)) + @pytest.mark.parametrize('diff_order', (1, 3, [2, 3])) def test_diff_orders(self, diff_order): """Ensure that other difference orders work.""" - lam = {1: 1e2, 3: 1e10}[diff_order] - self.class_func(self.y, lam=lam, diff_order=diff_order) + self.class_func(self.y, diff_order=diff_order) 
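The new parametrizations exercise per-dimension inputs: a scalar `lam` or `diff_order` applies to both dimensions, while a two-element sequence sets the row and column values separately. A hedged sketch reusing the fitter from the earlier usage example:

    # anisotropic smoothing: a heavier penalty and a higher difference order
    # along the second dimension
    baseline, params = Baseline2D(x, z).pspline_arpls(
        y, lam=[1e2, 1e5], diff_order=[2, 3]
    )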
@pytest.mark.skip(reason='test is too slow') def test_avoid_overflow_warning(self, no_noise_data_fixture2d): @@ -325,10 +366,11 @@ def test_avoid_overflow_warning(self, no_noise_data_fixture2d): assert np.isfinite(baseline).all() - @pytest.mark.parametrize('lam', (1e1, 1e5)) - def test_whittaker_comparison(self, lam): + @pytest.mark.parametrize('lam', (1e1, 1e5, [1e1, 1e5])) + @pytest.mark.parametrize('diff_order', (1, 3, [2, 3])) + def test_whittaker_comparison(self, lam, diff_order): """Ensures the P-spline version is the same as the Whittaker version.""" - compare_pspline_whittaker(self, 'arpls', self.y, lam=lam) + compare_pspline_whittaker(self, 'arpls', self.y, lam=lam, diff_order=diff_order) class TestPsplineIArPLS(IterativeSplineTester): @@ -336,11 +378,10 @@ class TestPsplineIArPLS(IterativeSplineTester): func_name = 'pspline_iarpls' - @pytest.mark.parametrize('diff_order', (1, 3)) + @pytest.mark.parametrize('diff_order', (1, 3, [2, 3])) def test_diff_orders(self, diff_order): """Ensure that other difference orders work.""" - lam = {1: 1e2, 3: 1e10}[diff_order] - self.class_func(self.y, lam=lam, diff_order=diff_order) + self.class_func(self.y, diff_order=diff_order) @pytest.mark.skip(reason='test is too slow') # ignore the RuntimeWarning that occurs from using +/- inf or nan @@ -372,10 +413,11 @@ def test_avoid_nonfinite_weights(self, no_noise_data_fixture2d): # this test is actually doing what it should be doing assert not np.isfinite(params['tol_history'][-1]) - @pytest.mark.parametrize('lam', (1e1, 1e5)) - def test_whittaker_comparison(self, lam): + @pytest.mark.parametrize('lam', (1e1, 1e5, [1e1, 1e5])) + @pytest.mark.parametrize('diff_order', (1, 3, [2, 3])) + def test_whittaker_comparison(self, lam, diff_order): """Ensures the P-spline version is the same as the Whittaker version.""" - compare_pspline_whittaker(self, 'iarpls', self.y, lam=lam) + compare_pspline_whittaker(self, 'iarpls', self.y, lam=lam, diff_order=diff_order) class TestPsplinePsalsa(IterativeSplineTester): @@ -389,15 +431,17 @@ def test_outside_p_fails(self, p): with pytest.raises(ValueError): self.class_func(self.y, p=p) - @pytest.mark.parametrize('diff_order', (1, 3)) + @pytest.mark.parametrize('diff_order', (1, 3, [2, 3])) def test_diff_orders(self, diff_order): """Ensure that other difference orders work.""" - lam = {1: 1e2, 3: 1e10}[diff_order] - self.class_func(self.y, lam=lam, diff_order=diff_order) + self.class_func(self.y, diff_order=diff_order) - @pytest.mark.parametrize('lam', (1e1, 1e5)) + @pytest.mark.parametrize('lam', (1e1, 1e5, [1e1, 1e5])) @pytest.mark.parametrize('p', (0.01, 0.1)) - def test_whittaker_comparison(self, lam, p): + @pytest.mark.parametrize('diff_order', (2, 3, [2, 3])) + def test_whittaker_comparison(self, lam, p, diff_order): """Ensures the P-spline version is the same as the Whittaker version.""" - compare_pspline_whittaker(self, 'psalsa', self.y, lam=lam, p=p, test_rtol=1e5) + compare_pspline_whittaker( + self, 'psalsa', self.y, lam=lam, p=p, diff_order=diff_order, test_rtol=1e5 + ) From 1518ca7d1e8359ccb90c30394d91c8030151fdeb Mon Sep 17 00:00:00 2001 From: Donnie Erb <55961724+derb12@users.noreply.github.com> Date: Mon, 22 Jan 2024 19:08:55 -0500 Subject: [PATCH 32/56] MAINT: Fixed handling of window values in 2d Fixed handling of array-like inputs for 2d morphological and smoothing algorithms. Also added extrapolation for 2D. 
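The validation diff below adds `_get_row_col_values` for this. A condensed sketch of the expansion rule it implements (the helper name here is illustrative): a scalar applies to all four edges, a pair maps to (rows, columns), and four values address each edge separately.

    import numpy as np

    def expand_edge_values(value):
        arr = np.atleast_1d(value)
        if arr.size == 1:
            return np.full(4, arr[0])
        if arr.size == 2:
            # (rows, columns) -> (first row, last row, first column, last column)
            return np.array([arr[0], arr[0], arr[1], arr[1]])
        if arr.size == 4:
            return arr
        raise ValueError('expected a scalar or a sequence of length 2 or 4')

    assert np.array_equal(expand_edge_values(3), [3, 3, 3, 3])
    assert np.array_equal(expand_edge_values([1, 2]), [1, 1, 2, 2])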
---
 pybaselines/_validation.py            |  34 +++++
 pybaselines/two_d/_algorithm_setup.py |  90 ++---------
 pybaselines/two_d/_spline_utils.py    |   2 +-
 pybaselines/two_d/morphological.py    |   4 +-
 pybaselines/two_d/optimizers.py       |  18 +--
 pybaselines/two_d/smooth.py           |  18 +--
 pybaselines/utils.py                  | 209 ++++++++++++++++++++++++--
 tests/test_utils.py                   | 130 +++++++++++++++-
 tests/test_validation.py              |  21 +++
 tests/two_d/test_algorithm_setup.py   |   3 +-
 tests/two_d/test_morphological.py     |   6 +
 tests/two_d/test_smooth.py            |   2 +-
 12 files changed, 420 insertions(+), 117 deletions(-)

diff --git a/pybaselines/_validation.py b/pybaselines/_validation.py
index e2211de..10dd131 100644
--- a/pybaselines/_validation.py
+++ b/pybaselines/_validation.py
@@ -478,3 +478,37 @@ def _check_optional_array(data_size, array=None, dtype=None, order=None, check_f
         output_array = output_array.copy()
 
     return output_array
+
+
+def _get_row_col_values(value, **asarray_kwargs):
+    """
+    Determines the row and column values for an input that can be scalar or up to length 4.
+
+    Parameters
+    ----------
+    value : numpy.number or Sequence[numpy.number, ...]
+        The value(s) to assign to the first row, last row, first column, and last
+        column. A scalar is used for all four, a two-item sequence is interpreted
+        as (rows, columns), and a four-item sequence is used as given.
+
+    Returns
+    -------
+    output : numpy.ndarray, shape (4,)
+        The array of length 4 with values first row, last row, first column, last column.
+
+    Raises
+    ------
+    ValueError
+        Raised if the input value was a sequence whose length was not 1, 2, or 4.
+
+    """
+    # can either be len 1, 2, or 4
+    output, scalar_input = _check_scalar(value, None, **asarray_kwargs)
+    if scalar_input:
+        output = np.full(4, output)
+    else:
+        len_input = len(output)
+        if len_input not in (2, 4):
+            raise ValueError('input must either be a single value or an array with length 2 or 4')
+        elif len_input == 2:
+            output = np.array([output[0], output[0], output[1], output[1]])
+
+    return output
diff --git a/pybaselines/two_d/_algorithm_setup.py b/pybaselines/two_d/_algorithm_setup.py
index 1ca9e0c..916b260 100644
--- a/pybaselines/two_d/_algorithm_setup.py
+++ b/pybaselines/two_d/_algorithm_setup.py
@@ -12,11 +12,10 @@
 import warnings
 
 import numpy as np
-from scipy.ndimage import grey_opening
 
 from ..utils import (
-    ParameterWarning, _determine_sorts, _inverted_sort, _sort_array2d, pad_edges2d,
-    relative_difference
+    ParameterWarning, _determine_sorts, _inverted_sort, _sort_array2d, optimize_window,
+    pad_edges2d
 )
 from ._spline_utils import PSpline2D
 from .._validation import (
@@ -685,11 +684,11 @@
         if half_window is not None:
             output_half_window = _check_half_window(half_window, two_d=True)
         else:
-            output_half_window = _optimize_window(y, **window_kwargs)
+            output_half_window = optimize_window(y, **window_kwargs)
 
         return y, output_half_window
 
-    def _setup_smooth(self, y, half_window=0, allow_zero=True, **pad_kwargs):
+    def _setup_smooth(self, y, half_window=0, allow_zero=True, hw_multiplier=2, **pad_kwargs):
         """
         Sets the starting parameters for doing smoothing-based algorithms.
 
@@ -705,6 +704,9 @@
         allow_zero : bool, optional
             If True (default), allows `half_window` to be 0; otherwise, `half_window`
             must be at least 1.
+        hw_multiplier : int, optional
+            The value by which to multiply the output of :func:`.optimize_window` if
+            `half_window` is None. Default is 2.
         **pad_kwargs
             Additional keyword arguments to pass to :func:`.pad_edges2d` for padding
             the edges of the data to prevent edge effects from smoothing.
@@ -713,10 +715,16 @@
         -------
         numpy.ndarray, shape (``M + 2 * half_window[0]``, ``N + 2 * half_window[1]``)
             The padded array of data.
+        output_hw : numpy.ndarray, shape (2,)
+            The accepted half windows for the rows and columns.
 
         """
-        hw = _check_half_window(half_window, allow_zero, two_d=False)
-        return pad_edges2d(y, hw, **pad_kwargs)
+        if half_window is not None:
+            output_hw = _check_half_window(half_window, allow_zero, two_d=True)
+        else:
+            output_hw = hw_multiplier * optimize_window(y)
+
+        return pad_edges2d(y, output_hw, **pad_kwargs), output_hw
 
     def _setup_classification(self, y, weights=None):
         """
@@ -896,71 +904,3 @@
         """
         return y
-
-
-# TODO maybe just make a way to merge the 1D and 2D versions
-def _optimize_window(data, increment=1, max_hits=3, window_tol=1e-6,
-                     max_half_window=None, min_half_window=None):
-    """
-    Optimizes the morphological half-window size.
-
-    Parameters
-    ----------
-    data : array-like, shape (N,)
-        The measured data values.
-    increment : int, optional
-        The step size for iterating half windows. Default is 1.
-    max_hits : int, optional
-        The number of consecutive half windows that must produce the same
-        morphological opening before accepting the half window as the optimum
-        value. Default is 3.
-    window_tol : float, optional
-        The tolerance value for considering two morphological openings as
-        equivalent. Default is 1e-6.
-    max_half_window : int, optional
-        The maximum allowable half-window size. If None (default), will be set
-        to (len(data) - 1) / 2.
-    min_half_window : int, optional
-        The minimum half-window size. If None (default), will be set to 1.
-
-    Returns
-    -------
-    half_window : int
-        The optimized half window size.
-
-    Notes
-    -----
-    May only provide good results for some morphological algorithms, so use with
-    caution.
-
-    References
-    ----------
-    Perez-Pueyo, R., et al. Morphology-Based Automated Baseline Removal for
-    Raman Spectra of Artistic Pigments. Applied Spectroscopy, 2010, 64, 595-600.
-
-    """
-    y = np.asarray(data)
-    if max_half_window is None:
-        max_half_window = (y.shape[0] - 1) // 2
-    if min_half_window is None:
-        min_half_window = 1
-
-    # TODO would it be better to allow padding the data?
- opening = grey_opening(y, [2 * min_half_window + 1, 2 * min_half_window + 1]) - hits = 0 - best_half_window = min_half_window - for half_window in range(min_half_window + increment, max_half_window, increment): - new_opening = grey_opening(y, [half_window * 2 + 1, half_window * 2 + 1]) - if relative_difference(opening, new_opening) < window_tol: - if hits == 0: - # keep just the first window that fits tolerance - best_half_window = half_window - increment - hits += 1 - if hits >= max_hits: - half_window = best_half_window - break - elif hits: - hits = 0 - opening = new_opening - - return max(half_window, 1) # ensure half window is at least 1 diff --git a/pybaselines/two_d/_spline_utils.py b/pybaselines/two_d/_spline_utils.py index 758d949..f1816e8 100644 --- a/pybaselines/two_d/_spline_utils.py +++ b/pybaselines/two_d/_spline_utils.py @@ -11,7 +11,7 @@ from scipy.sparse.linalg import spsolve from .._spline_utils import _spline_basis, _spline_knots -from .._validation import _check_array, _check_lam, _check_scalar +from .._validation import _check_array, _check_scalar from ._whittaker_utils import PenalizedSystem2D diff --git a/pybaselines/two_d/morphological.py b/pybaselines/two_d/morphological.py index ef73c24..e90f886 100644 --- a/pybaselines/two_d/morphological.py +++ b/pybaselines/two_d/morphological.py @@ -65,7 +65,7 @@ def mor(self, data, half_window=None, **window_kwargs): """ y, half_wind = self._setup_morphology(data, half_window, **window_kwargs) - opening = grey_opening(y, [2 * half_wind + 1, 2 * half_wind + 1]) + opening = grey_opening(y, 2 * half_wind + 1) baseline = np.minimum(opening, _avg_opening(y, half_wind, opening)) return baseline, {'half_window': half_wind} @@ -277,7 +277,7 @@ def tophat(self, data, half_window=None, **window_kwargs): """ y, half_wind = self._setup_morphology(data, half_window, **window_kwargs) - baseline = grey_opening(y, [2 * half_wind + 1, 2 * half_wind + 1]) + baseline = grey_opening(y, 2 * half_wind + 1) return baseline, {'half_window': half_wind} diff --git a/pybaselines/two_d/optimizers.py b/pybaselines/two_d/optimizers.py index 99bcacd..43b1330 100644 --- a/pybaselines/two_d/optimizers.py +++ b/pybaselines/two_d/optimizers.py @@ -15,7 +15,7 @@ from . 
import classification, morphological, polynomial, spline, whittaker from ._algorithm_setup import _Algorithm2D -from .._validation import _check_optional_array +from .._validation import _check_optional_array, _get_row_col_values from ..utils import _check_scalar, _sort_array2d @@ -217,20 +217,8 @@ def adaptive_minmax(self, data, poly_order=None, method='modpoly', weights=None, # use high weighting rather than Lagrange multipliers to constrain the points # to better work with noisy data - # allow either 4 or 2 inputs for constrained weight and fraction - try: - weightings = _check_scalar(constrained_weight, 4, True)[0] - except ValueError: - weightings = _check_scalar(constrained_weight, 2, True)[0] - weightings = np.array([weightings[0], weightings[0], weightings[1], weightings[1]]) - try: - constrained_fractions = _check_scalar(constrained_fraction, 4, True)[0] - except ValueError: - constrained_fractions = _check_scalar(constrained_fraction, 2, True)[0] - constrained_fractions = np.array([ - constrained_fractions[0], constrained_fractions[0], - constrained_fractions[1], constrained_fractions[1] - ]) + weightings = _get_row_col_values(constrained_weight) + constrained_fractions = _get_row_col_values(constrained_fraction) if np.any(constrained_fractions < 0) or np.any(constrained_fractions > 1): raise ValueError('constrained_fraction must be between 0 and 1') diff --git a/pybaselines/two_d/smooth.py b/pybaselines/two_d/smooth.py index b0656e7..8bbe445 100644 --- a/pybaselines/two_d/smooth.py +++ b/pybaselines/two_d/smooth.py @@ -6,9 +6,11 @@ """ +import numpy as np from scipy.ndimage import gaussian_filter, median_filter -from ._algorithm_setup import _Algorithm2D, _optimize_window +from ._algorithm_setup import _Algorithm2D + class _Smooth(_Algorithm2D): """A base class for all smoothing algorithms.""" @@ -38,7 +40,7 @@ def noise_median(self, data, half_window=None, smooth_half_window=None, sigma=No The standard deviation of the smoothing Gaussian kernel. Default is None, which will use (2 * `smooth_half_window` + 1) / 6. **pad_kwargs - Additional keyword arguments to pass to :func:`.pad_edges` for padding + Additional keyword arguments to pass to :func:`.pad_edges2d` for padding the edges of the data to prevent edge effects from convolution. Returns @@ -54,15 +56,11 @@ def noise_median(self, data, half_window=None, smooth_half_window=None, sigma=No artifacts. J. Biomolecular NMR, 1995, 5, 147-153. 
""" - if half_window is None: - half_window = 2 * _optimize_window(data) + y, half_window = self._setup_smooth(data, half_window, False, 2, **pad_kwargs) window_size = 2 * half_window + 1 - median = median_filter( - self._setup_smooth(data, half_window, **pad_kwargs), - [window_size, window_size], mode='nearest' - ) + median = median_filter(y, window_size, mode='nearest') if smooth_half_window is None: - smooth_window = window_size + smooth_window = np.mean(window_size) # truncate can only be a single value else: smooth_window = 2 * smooth_half_window + 1 if sigma is None: @@ -70,4 +68,4 @@ def noise_median(self, data, half_window=None, smooth_half_window=None, sigma=No sigma = smooth_window / 6 baseline = gaussian_filter(median, sigma, truncate=smooth_window) # TODO check truncate value - return baseline[half_window:-half_window, half_window:-half_window], {} + return baseline[half_window[0]:-half_window[0], half_window[1]:-half_window[1]], {} diff --git a/pybaselines/utils.py b/pybaselines/utils.py index 10fd6eb..a735b6b 100644 --- a/pybaselines/utils.py +++ b/pybaselines/utils.py @@ -16,7 +16,9 @@ from ._banded_utils import PenalizedSystem, difference_matrix as _difference_matrix from ._compat import jit from ._spline_utils import PSpline -from ._validation import _check_array, _check_scalar, _check_optional_array, _yx_arrays +from ._validation import ( + _check_array, _check_scalar, _check_optional_array, _get_row_col_values, _yx_arrays +) # the minimum positive float values such that a + _MIN_FLOAT != a @@ -308,11 +310,194 @@ def pad_edges(data, pad_length, mode='extrapolate', return padded_data -def pad_edges2d(data, pad_length, *args, mode='edge', **kwargs): - if not _check_scalar(pad_length, None)[1]: - raise NotImplementedError('separate pad lengths not yet supported') +def _extrapolate2d(y, total_padding, extrapolate_window=None): + """ + Extrapolates each edge of two dimensional data. + + Corners are calculated by averaging linear fits of the extended data. + + Parameters + ---------- + y : numpy.ndarray + _description_ + total_padding : Sequence[int, int, int, int] + The padding for the top, bottom, left, and right. The padding of top and + bottom are assumed to be equal, as are the left and right. + extrapolate_window : int or Sequence[int, int] or Sequence[int, int, int, int], optional + The number of values to use for linear fitting on the top, bottom, left, and right + edges. Default is None, which will set the extrapolate window size equal + to `total_padding`. + + Returns + ------- + output : numpy.ndarray + The data with padding + + Raises + ------ + NotImplementedError + Raised if any value in `total_padding` is zero. + ValueError + Raised if any extrapolation window is less than 1. + + Notes + ----- + Uses the Moore-Penrose pseudo-inverse to speed up the calculation of the linear fits + for each edge. Using the Vandermonde with `numpy.linalg.lstsq` would also work but is + a little slower. 
+ + """ + if np.equal(total_padding, 0).any(): + raise NotImplementedError('pad length of 0 is not supported in 2D') + elif np.less(total_padding, 0).any(): + raise ValueError('pad length must be greater or equal to 0') + + if extrapolate_window is None: + extrapolate_windows = total_padding else: - return pad_edges(data, pad_length, *args, mode=mode, **kwargs) + extrapolate_windows = _get_row_col_values(extrapolate_window).reshape((2, 2)) + + if np.less_equal(extrapolate_windows, 0).any(): + raise ValueError('extrapolate_window must be greater than 0') + # pad length for left and right or top and bottom should be equal, so ignore the repeats + total_padding = [total_padding[0][0], total_padding[1][0]] + + output = np.empty( + (y.shape[0] + total_padding[0] * 2, y.shape[1] + total_padding[1] * 2) + ) + output[total_padding[0]:-total_padding[0], total_padding[1]:-total_padding[1]] = y + + x = np.arange(y.shape[0] + 2 * total_padding[0]) + z = np.arange(y.shape[1] + 2 * total_padding[1]) + + vander_x = np.polynomial.polynomial.polyvander(x, 1) + vander_z = np.polynomial.polynomial.polyvander(z, 1) + pinv_top = np.linalg.pinv( + vander_x[total_padding[0]:-total_padding[0]][:extrapolate_windows[0][0]] + ) + pinv_bottom = np.linalg.pinv( + vander_x[total_padding[0]:-total_padding[0]][-extrapolate_windows[0][1]:] + ) + pinv_left = np.linalg.pinv( + vander_z[total_padding[1]:-total_padding[1]][:extrapolate_windows[1][0]] + ) + pinv_right = np.linalg.pinv( + vander_z[total_padding[1]:-total_padding[1]][-extrapolate_windows[1][1]:] + ) + + top = vander_x[:total_padding[0]] @ (pinv_top @ y[:extrapolate_windows[0][0]]) + bottom = vander_x[-total_padding[0]:] @ (pinv_bottom @ y[-extrapolate_windows[0][1]:]) + + output[:total_padding[0], total_padding[1]:-total_padding[1]] = top + output[-total_padding[0]:, total_padding[1]:-total_padding[1]] = bottom + + left = vander_z[:total_padding[1]] @ (pinv_left @ y[:, :extrapolate_windows[1][0]].T) + right = vander_z[-total_padding[1]:] @ (pinv_right @ y[:, -extrapolate_windows[1][1]:].T) + + output[total_padding[0]:-total_padding[0], :total_padding[1]] = left.T + output[total_padding[0]:-total_padding[0], -total_padding[1]:] = right.T + + # now fill the corners by averaging the extensions of the corners + top_left = vander_z[:total_padding[1]] @ ( + pinv_left @ output[ + :total_padding[0], total_padding[1]:-total_padding[1] + ][:, :extrapolate_windows[1][0]].T + ) + top_right = vander_z[-total_padding[1]:] @ ( + pinv_right @ output[ + :total_padding[0], total_padding[1]:-total_padding[1] + ][:, -extrapolate_windows[1][1]:].T + ) + + bottom_left = vander_z[:total_padding[1]] @ ( + pinv_left @ output[ + -total_padding[0]:, total_padding[1]:-total_padding[1] + ][:, :extrapolate_windows[1][0]].T + ) + bottom_right = vander_z[-total_padding[1]:] @ ( + pinv_right @ output[ + -total_padding[0]:, total_padding[1]:-total_padding[1] + ][:, -extrapolate_windows[1][1]:].T + ) + + left_top = vander_x[:total_padding[0]] @ ( + pinv_top @ output[ + total_padding[0]:-total_padding[0], :total_padding[1] + ][:extrapolate_windows[0][0]] + ) + left_bottom = vander_x[-total_padding[0]:] @ ( + pinv_bottom @ output[ + total_padding[0]:-total_padding[0], :total_padding[1]: + ][-extrapolate_windows[0][1]:] + ) + + right_top = vander_x[:total_padding[0]] @ ( + pinv_top @ output[ + total_padding[0]:-total_padding[0], -total_padding[1]: + ][:extrapolate_windows[0][0]] + ) + right_bottom = vander_x[-total_padding[0]:] @ ( + pinv_bottom @ output[ + total_padding[0]:-total_padding[0], 
-total_padding[1]: + ][-extrapolate_windows[0][1]:] + ) + + output[:total_padding[0], :total_padding[1]] = 0.5 * (top_left.T + left_top) + output[:total_padding[0], -total_padding[1]:] = 0.5 * (top_right.T + right_top) + output[-total_padding[0]:, :total_padding[1]] = 0.5 * (bottom_left.T + left_bottom) + output[-total_padding[0]:, -total_padding[1]:] = 0.5 * (bottom_right.T + right_bottom) + + return output + + +def pad_edges2d(data, pad_length, mode='edge', extrapolate_window=None, **pad_kwargs): + """ + Adds left, right, top, and bottom edges to the data. + + Parameters + ---------- + data : array-like, shape (M, N) + The 2D array of the data. + pad_length : int or Sequence[int, int] + The number of points to add to the top, bottom, left, and right edges. If a single + value is given, all edges have the same padding. If a sequence of two values is + given, the first value will be the padding on the top and bottom (rows), and the second + value will pad the left and right (columns). + mode : str or Callable, optional + The method for padding. Default is 'edge'. Any method other than + 'extrapolate' will use :func:`numpy.pad`. + extrapolate_window : int or Sequence[int, int] or Sequence[int, int, int, int], optional + The number of values to use for linear fitting on the top, bottom, left, and right + edges. Default is None, which will set the extrapolate window size equal + to `pad_length`. + **pad_kwargs + Any keyword arguments to pass to :func:`numpy.pad`, which will be used if `mode` + is not 'extrapolate'. + + Returns + ------- + padded_data : numpy.ndarray + The data with padding on the top, bottom, left, and right edges. + + Notes + ----- + If mode is 'extrapolate', then each edge will be extended by linear fits along each + row and column, and the corners are calculated by averaging the linear sections. + + """ + y = np.asarray(data) + if y.ndim != 2: + raise ValueError('input data must be two dimensional') + total_padding = _get_row_col_values(pad_length).reshape((2, 2)) + + if isinstance(mode, str): + mode = mode.lower() + if mode == 'extrapolate': + output = _extrapolate2d(y, total_padding, extrapolate_window) + else: + output = np.pad(data, total_padding, mode=mode, **pad_kwargs) + + return output def padded_convolve(data, kernel, mode='reflect', **pad_kwargs): @@ -578,16 +763,18 @@ def optimize_window(data, increment=1, max_hits=3, window_tol=1e-6, """ y = np.asarray(data) if max_half_window is None: - max_half_window = (y.shape[0] - 1) // 2 + max_half_window = (y.shape[-1] - 1) // 2 if min_half_window is None: min_half_window = 1 + y_dims = y.ndim # TODO would it be better to allow padding the data? 
- opening = grey_opening(y, [2 * min_half_window + 1]) + opening = grey_opening(y, [2 * min_half_window + 1] * y_dims) hits = 0 + half_window = 1 # in case min_half_window is set incorrectly best_half_window = min_half_window for half_window in range(min_half_window + increment, max_half_window, increment): - new_opening = grey_opening(y, [half_window * 2 + 1]) + new_opening = grey_opening(y, [half_window * 2 + 1] * y_dims) if relative_difference(opening, new_opening) < window_tol: if hits == 0: # keep just the first window that fits tolerance @@ -600,7 +787,11 @@ def optimize_window(data, increment=1, max_hits=3, window_tol=1e-6, hits = 0 opening = new_opening - return max(half_window, 1) # ensure half window is at least 1 + if y_dims == 2: + output = np.maximum([half_window, half_window], [1, 1]) + else: + output = max(half_window, 1) # ensure half window is at least 1 + return output def _inverted_sort(sort_order): diff --git a/tests/test_utils.py b/tests/test_utils.py index 30449a7..df663c5 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -407,7 +407,9 @@ def test_pad_edges_extrapolate_windows(): input_array[-10:] = 1. extrapolate_windows = [40, 10] pad_len = 20 - output = utils.pad_edges(input_array, pad_len, extrapolate_window=extrapolate_windows) + output = utils.pad_edges( + input_array, pad_len, mode='extrapolate', extrapolate_window=extrapolate_windows + ) assert_allclose(output[:pad_len], np.full(pad_len, 0.), 1e-14) assert_allclose(output[-pad_len:], np.full(pad_len, 1.), 1e-14) @@ -417,7 +419,9 @@ def test_pad_edges_extrapolate_windows(): def test_pad_edges_extrapolate_zero_window(extrapolate_window): """Ensures an extrapolate_window <= 0 raises an exception.""" with pytest.raises(ValueError): - utils.pad_edges(np.arange(10), 10, extrapolate_window=extrapolate_window) + utils.pad_edges( + np.arange(10), 10, mode='extrapolate', extrapolate_window=extrapolate_window + ) @pytest.mark.parametrize('pad_mode', ('reflect', 'extrapolate')) @@ -445,7 +449,7 @@ def test_pad_edges_custom_pad_func(): actual_output = utils.pad_edges(input_array, pad_length, pad_func, pad_val=pad_val) - assert_array_equal(actual_output, expected_output) + assert_allclose(actual_output, expected_output, rtol=1e-12, atol=0) def test_get_edges_custom_pad_func(): @@ -501,6 +505,111 @@ def test_get_edges(pad_mode, pad_length, list_input, data_fixture): assert_allclose(right, expected_right) +@pytest.mark.parametrize( + 'pad_mode', ('reflect', 'REFLECT', 'extrapolate', 'edge', 'constant', pad_func) +) +@pytest.mark.parametrize('pad_length', (1, 2, 20, 53)) +@pytest.mark.parametrize('list_input', (False, True)) +def test_pad_edges2d(pad_mode, pad_length, list_input, data_fixture2d): + """Tests various inputs for utils.pad_edges2d.""" + *_, data = data_fixture2d + data_shape = data.shape + if list_input: + data = data.tolist() + + if not callable(pad_mode): + np_pad_mode = pad_mode.lower() + else: + np_pad_mode = pad_mode + if np_pad_mode != 'extrapolate': + expected_output = np.pad(data, pad_length, np_pad_mode) + else: + expected_output = None + + output = utils.pad_edges2d(data, pad_length, pad_mode) + assert isinstance(output, np.ndarray) + assert output.ndim == 2 + assert output.shape[0] == data_shape[0] + 2 * pad_length + assert output.shape[1] == data_shape[1] + 2 * pad_length + + if expected_output is not None: + assert_allclose(output, expected_output) + + +@pytest.mark.parametrize('pad_length', (0, 1, 2, 20, 53)) +@pytest.mark.parametrize('extrapolate_window', (None, 1, 2, 10, 1001, (10, 20), 
(1, 1)))
+@pytest.mark.parametrize('list_input', (False, True))
+def test_pad_edges2d_extrapolate(pad_length, list_input, extrapolate_window, data_fixture2d):
+    """Ensures extrapolation works for utils.pad_edges2d."""
+    *_, data = data_fixture2d
+    data_shape = data.shape
+    if list_input:
+        data = data.tolist()
+
+    if np.less_equal(pad_length, 0).any():
+        with pytest.raises(NotImplementedError):
+            utils.pad_edges2d(data, pad_length, 'extrapolate', extrapolate_window)
+    else:
+        output = utils.pad_edges2d(data, pad_length, 'extrapolate', extrapolate_window)
+        assert isinstance(output, np.ndarray)
+        assert output.shape[0] == data_shape[0] + 2 * pad_length
+        assert output.shape[1] == data_shape[1] + 2 * pad_length
+
+
+def test_pad_edges2d_extrapolate_windows():
+    """Ensures the separate extrapolate windows are correctly interpreted."""
+    input_array = np.zeros(400).reshape(20, 20)
+    input_array[-10:] = 1.
+    extrapolate_windows = [5, 10]
+    pad_len = 5
+    output = utils.pad_edges2d(
+        input_array, pad_len, mode='extrapolate', extrapolate_window=extrapolate_windows
+    )
+
+    assert_allclose(
+        output[:pad_len, pad_len:-pad_len], np.full((pad_len, input_array.shape[1]), 0.), 1e-14
+    )
+    assert_allclose(
+        output[-pad_len:, pad_len:-pad_len], np.full((pad_len, input_array.shape[1]), 1.), 1e-14
+    )
+
+
+@pytest.mark.parametrize('extrapolate_window', (0, -2, (0, 0), (5, 0), (5, -1)))
+def test_pad_edges2d_extrapolate_zero_window(small_data2d, extrapolate_window):
+    """Ensures an extrapolate_window <= 0 raises an exception."""
+    with pytest.raises(ValueError):
+        utils.pad_edges2d(
+            small_data2d, 10, mode='extrapolate', extrapolate_window=extrapolate_window
+        )
+
+
+@pytest.mark.parametrize('pad_mode', ('reflect', 'extrapolate'))
+def test_pad_edges2d_negative_pad_length(pad_mode, data_fixture2d):
+    """Ensures a negative pad length raises an exception."""
+    with pytest.raises(ValueError):
+        utils.pad_edges2d(data_fixture2d[-1], -5, pad_mode)
+
+
+def test_pad_edges2d_custom_pad_func():
+    """Ensures pad_edges2d works with a callable padding function, same as numpy.pad."""
+    input_array = np.arange(2000).reshape(50, 40)
+    pad_val = 20
+    pad_length = 10
+
+    expected_output = np.empty(
+        (input_array.shape[0] + 2 * pad_length, input_array.shape[1] + 2 * pad_length)
+    )
+    expected_output[:pad_length] = pad_val
+    expected_output[-pad_length:] = pad_val
+    expected_output[:, :pad_length] = pad_val
+    expected_output[:, -pad_length:] = pad_val
+    expected_output[pad_length:-pad_length, pad_length:-pad_length] = input_array
+
+    actual_output = utils.pad_edges2d(input_array, pad_length, pad_func, pad_val=pad_val)
+
+    assert_allclose(actual_output, expected_output, rtol=1e-12, atol=0)
+
+
 @pytest.mark.parametrize('seed', (123, 98765))
 def test_invert_sort(seed):
     """Ensures the inverted sort works."""
@@ -665,3 +774,18 @@ def test_pspline_smooth(data_fixture, diff_order, num_knots, spline_degree):
     recreated_spline = BSpline(*tck)(x)
 
     assert_allclose(recreated_spline, output, rtol=1e-10)
+
+
+@pytest.mark.parametrize('two_d', (True, False))
+def test_optimize_window(small_data2d, two_d):
+    """Ensures optimize_window has the correct outputs for the dimensions of the input."""
+    data = small_data2d
+    if not two_d:
+        data = data.flatten()
+
+    output = utils.optimize_window(data)
+    if two_d:
+        assert output.shape == (2,)
+        assert isinstance(output, np.ndarray)
+    else:
+        assert isinstance(output, int)
diff --git a/tests/test_validation.py b/tests/test_validation.py
index 360c82e..c0712c5 100644
--- a/tests/test_validation.py
+++
b/tests/test_validation.py @@ -457,3 +457,24 @@ def test_optional_array_no_input(): assert isinstance(output, np.ndarray) assert_array_equal(output, np.ones(length)) + + +def test_get_row_col_values(): + """Ensures multiple inputs can work for _get_row_col_values.""" + assert_array_equal(_validation._get_row_col_values(1), [1, 1, 1, 1]) + assert_array_equal(_validation._get_row_col_values(np.array(1)), [1, 1, 1, 1]) + assert_array_equal(_validation._get_row_col_values(np.array([1])), [1, 1, 1, 1]) + assert_array_equal(_validation._get_row_col_values([1.1]), [1.1, 1.1, 1.1, 1.1]) + assert_array_equal(_validation._get_row_col_values([[1.1]]), [1.1, 1.1, 1.1, 1.1]) + assert_array_equal(_validation._get_row_col_values([1, 2]), [1, 1, 2, 2]) + assert_array_equal(_validation._get_row_col_values([[1], [2]]), [1, 1, 2, 2]) + assert_array_equal(_validation._get_row_col_values(np.array([1, 2, 3, 4])), [1, 2, 3, 4]) + assert_array_equal(_validation._get_row_col_values([1, 2, 3, 4]), [1, 2, 3, 4]) + assert_array_equal(_validation._get_row_col_values([[1, 2], [3, 4]]), [1, 2, 3, 4]) + + +@pytest.mark.parametrize('values', ([1, 2, 3], [1, 2, 3, 4, 5])) +def test_get_row_col_values_fails(values): + """Ensures _get_row_col_values raises an exception with incorrectly sized inputs..""" + with pytest.raises(ValueError): + _validation._get_row_col_values(values) diff --git a/tests/two_d/test_algorithm_setup.py b/tests/two_d/test_algorithm_setup.py index 7f548cd..4cea347 100644 --- a/tests/two_d/test_algorithm_setup.py +++ b/tests/two_d/test_algorithm_setup.py @@ -214,10 +214,11 @@ def test_setup_polynomial_vandermonde(small_data2d, algorithm, vander_enum, incl def test_setup_smooth_shape(small_data2d, algorithm): """Ensures output y is correctly padded.""" pad_length = 4 - y = algorithm._setup_smooth(small_data2d, pad_length, mode='edge') + y, hw = algorithm._setup_smooth(small_data2d, pad_length, mode='edge') assert_array_equal( y.shape, (small_data2d.shape[0] + 2 * pad_length, small_data2d.shape[1] + 2 * pad_length) ) + assert_array_equal(hw, [pad_length, pad_length]) @pytest.mark.parametrize('num_knots', (10, 30, (20, 30))) diff --git a/tests/two_d/test_morphological.py b/tests/two_d/test_morphological.py index 33a9ba7..a54fba4 100644 --- a/tests/two_d/test_morphological.py +++ b/tests/two_d/test_morphological.py @@ -6,6 +6,7 @@ """ +import numpy as np import pytest from pybaselines.two_d import morphological @@ -20,6 +21,11 @@ class MorphologicalTester(BaseTester2D): algorithm_base = morphological._Morphological checked_keys = ('half_window',) + @pytest.mark.parametrize('half_window', (None, 10, [10, 12], np.array([12, 10]))) + def test_half_window(self, half_window): + """Ensures that different inputs for half_window work.""" + self.class_func(self.y, half_window=half_window) + class IterativeMorphologicalTester(MorphologicalTester): """Base testing class for iterative morphological functions.""" diff --git a/tests/two_d/test_smooth.py b/tests/two_d/test_smooth.py index 47d262b..d23a099 100644 --- a/tests/two_d/test_smooth.py +++ b/tests/two_d/test_smooth.py @@ -32,7 +32,7 @@ def test_unchanged_data(self, new_instance, smooth_hw): """Ensures that input data is unchanged by the function.""" super().test_unchanged_data(new_instance, smooth_half_window=smooth_hw) - @pytest.mark.parametrize('half_window', (None, 15)) + @pytest.mark.parametrize('half_window', (None, 15, [15, 15])) def test_half_windows(self, half_window): """Tests possible inputs for `half_window`.""" self.class_func(self.y, 
half_window=half_window) From d83685304c640b1efda9c615aa3ddda840bdaecb Mon Sep 17 00:00:00 2001 From: Donnie Erb <55961724+derb12@users.noreply.github.com> Date: Thu, 25 Jan 2024 20:41:16 -0500 Subject: [PATCH 33/56] DOCS: Fix method references in docs to point to Baseline or Baseline2D Also switched from napoleon to numpydoc for documenting docstrings. Fixed reference numbers throughout. Re-enabled the autosection label and just ignore the repeated header warnings from the changelog. Fixed method role to point to Baseline or Baseline2D. Deleted two_d.classification since I probably will not make any before the next release. --- docs/algorithms/classification.rst | 16 +- docs/algorithms/index.rst | 2 +- docs/algorithms/misc.rst | 4 +- docs/algorithms/morphological.rst | 24 +-- docs/algorithms/optimizers.rst | 6 +- docs/algorithms/polynomial.rst | 174 +++--------------- docs/algorithms/smooth.rst | 14 +- docs/algorithms/spline.rst | 26 +-- docs/algorithms/whittaker.rst | 22 +-- docs/conf.py | 31 +++- docs/installation.rst | 20 +- .../classification/plot_classifier_masks.py | 2 +- .../plot_fastchrom_threshold.py | 2 +- .../general/plot_algorithm_convergence.py | 2 +- examples/general/plot_noisy_data.py | 4 +- examples/misc/plot_beads_preprocessing.py | 2 +- .../morphological/plot_half_window_effects.py | 2 +- examples/spline/plot_lam_vs_num_knots.py | 2 +- examples/spline/plot_pspline_whittaker.py | 4 +- examples/whittaker/plot_lam_effects.py | 2 +- examples/whittaker/plot_lam_vs_data_size.py | 2 +- examples/whittaker/plot_whittaker_solvers.py | 2 +- pybaselines/__init__.py | 1 + pybaselines/_algorithm_setup.py | 21 ++- pybaselines/_banded_utils.py | 7 +- pybaselines/_spline_utils.py | 2 +- pybaselines/classification.py | 4 +- pybaselines/misc.py | 14 +- pybaselines/morphological.py | 4 +- pybaselines/optimizers.py | 9 +- pybaselines/spline.py | 8 +- pybaselines/two_d/__init__.py | 8 +- pybaselines/two_d/_algorithm_setup.py | 21 ++- pybaselines/two_d/_spline_utils.py | 2 +- pybaselines/two_d/_whittaker_utils.py | 7 +- pybaselines/two_d/classification.py | 13 -- pybaselines/two_d/optimizers.py | 11 +- pybaselines/two_d/polynomial.py | 62 +++---- pybaselines/two_d/spline.py | 2 +- pybaselines/two_d/whittaker.py | 2 +- pybaselines/whittaker.py | 8 +- 41 files changed, 222 insertions(+), 349 deletions(-) delete mode 100644 pybaselines/two_d/classification.py diff --git a/docs/algorithms/classification.rst b/docs/algorithms/classification.rst index 2de976f..da53dc7 100644 --- a/docs/algorithms/classification.rst +++ b/docs/algorithms/classification.rst @@ -65,7 +65,7 @@ Algorithms dietrich (Dietrich's Classification Method) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.dietrich` calculates the power spectrum of the data as the squared derivative +:meth:`~.Baseline.dietrich` calculates the power spectrum of the data as the squared derivative of the data. Then baseline points are identified by iteratively removing points where the mean of the power spectrum is less a multiple of the standard deviation of the power spectrum. The baseline is created by first interpolating through all baseline @@ -197,7 +197,7 @@ points, and then iteratively fitting a polynomial to the interpolated baseline. 
golotvin (Golotvin's Classification Method) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.golotvin` divides the data into sections and takes the minimum standard +:meth:`~.Baseline.golotvin` divides the data into sections and takes the minimum standard deviation of all the sections as the noise's standard deviation for the entire data. Then classifies any point where the rolling max minus min is less than a multiple of the noise's standard deviation as belonging to the baseline. @@ -224,7 +224,7 @@ the noise's standard deviation as belonging to the baseline. std_distribution (Standard Deviation Distribution) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.std_distribution` identifies baseline segments by analyzing the rolling +:meth:`~.Baseline.std_distribution` identifies baseline segments by analyzing the rolling standard deviation distribution. The rolling standard deviations are split into two distributions, with the smaller distribution assigned to noise. Baseline points are then identified as any point where the rolled standard deviation is less than a multiple @@ -253,8 +253,8 @@ of the median of the noise's standard deviation distribution. fastchrom (FastChrom's Baseline Method) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.fastchrom` identifies baseline segments by analyzing the rolling standard -deviation distribution, similar to :meth:`.std_distribution`. Baseline points are +:meth:`~.Baseline.fastchrom` identifies baseline segments by analyzing the rolling standard +deviation distribution, similar to :meth:`~.Baseline.std_distribution`. Baseline points are identified as any point where the rolling standard deviation is less than the specified threshold, and peak regions are iteratively interpolated until the baseline is below the data. @@ -279,7 +279,7 @@ threshold, and peak regions are iteratively interpolated until the baseline is b cwt_br (Continuous Wavelet Transform Baseline Recognition) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.cwt_br` identifies baseline segments by performing a continous wavelet +:meth:`~.Baseline.cwt_br` identifies baseline segments by performing a continous wavelet transform (CWT) on the input data at various scales, and picks the scale with the first local minimum in the Shannon entropy. The threshold for baseline points is obtained by fitting a Gaussian to the histogram of the CWT at the optimal scale, and the final baseline is fit @@ -315,8 +315,8 @@ other points have a weight of 0. fabc (Fully Automatic Baseline Correction) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.fabc` identifies baseline segments by thresholding the squared first derivative -of the data, similar to :meth:`.dietrich`. However, fabc approximates the first derivative +:meth:`~.Baseline.fabc` identifies baseline segments by thresholding the squared first derivative +of the data, similar to :meth:`~.Baseline.dietrich`. However, fabc approximates the first derivative using a continous wavelet transform with the Haar wavelet, which is more robust to noise than the numerical derivative in Dietrich's method. 
The baseline is then fit using Whittaker smoothing with all baseline points having a weight of 1 and all other points diff --git a/docs/algorithms/index.rst b/docs/algorithms/index.rst index 3594d1a..c4416d2 100644 --- a/docs/algorithms/index.rst +++ b/docs/algorithms/index.rst @@ -5,7 +5,7 @@ Algorithms The currently available baseline correction algorithms in pybaselines are split into polynomial, whittaker, morphological, smooth, spline, classification, optimizers, and miscellaneous (misc). Note that this is more for grouping code and not meant as -a hard-classification of the algorithms. +a hard-classification of the algorithms or the general field of baseline correction. This section of the documentation is to help provide some context for each algorithm. In addition, most algorithms will have a figure that shows how well the algorithm fits diff --git a/docs/algorithms/misc.rst b/docs/algorithms/misc.rst index 3dcd6ff..c5ec012 100644 --- a/docs/algorithms/misc.rst +++ b/docs/algorithms/misc.rst @@ -11,7 +11,7 @@ Algorithms interp_pts (Interpolation between points) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.interp_pts` interpolates between input points using line segments +:meth:`~.Baseline.interp_pts` interpolates between input points using line segments or splines of different orders. The function is mainly intended for usage with user interfaces and is not encouraged otherwise. @@ -70,7 +70,7 @@ since it solely depends on the user-defined anchor points. beads (Baseline Estimation And Denoising with Sparsity) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.beads` decomposes the input data into baseline and pure, noise-free signal by +:meth:`~.Baseline.beads` decomposes the input data into baseline and pure, noise-free signal by modeling the baseline as a low pass filter and by considering the signal and its derivatives as sparse. diff --git a/docs/algorithms/morphological.rst b/docs/algorithms/morphological.rst index f634614..52202da 100644 --- a/docs/algorithms/morphological.rst +++ b/docs/algorithms/morphological.rst @@ -25,12 +25,12 @@ Algorithms mpls (Morphological Penalized Least Squares) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.mpls` uses both morphological operations and Whittaker-smoothing +:meth:`~.Baseline.mpls` uses both morphological operations and Whittaker-smoothing to create the baseline. First, a morphological opening is performed on the data. Then, the index of the minimum data value between each flat region of the opened data is selected as a baseline anchor point and given a weighting of :math:`1 - p`, while all other points are given a weight of :math:`p`. The data -and weights are then used to calculate the baseline, similar to the :meth:`.asls` +and weights are then used to calculate the baseline, similar to the :meth:`~.Baseline.asls` method. .. plot:: @@ -156,7 +156,7 @@ method. mor (Morphological) ~~~~~~~~~~~~~~~~~~~ -:meth:`.mor` performs a morphological opening on the data and then selects +:meth:`~.Baseline.mor` performs a morphological opening on the data and then selects the element-wise minimum between the opening and the average of a morphological erosion and dilation of the opening to create the baseline. @@ -183,7 +183,7 @@ erosion and dilation of the opening to create the baseline. 
imor (Improved Morphological) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.imor` is an attempt to improve the mor method, and iteratively selects the element-wise +:meth:`~.Baseline.imor` is an attempt to improve the mor method, and iteratively selects the element-wise minimum between the original data and the average of a morphological erosion and dilation of the opening of either the data (first iteration) or previous iteration's baseline to create the baseline. @@ -202,7 +202,7 @@ create the baseline. mormol (Morphological and Mollified Baseline) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.mormol` iteratively convolves the erosion of the data with a mollifying (smoothing) +:meth:`~.Baseline.mormol` iteratively convolves the erosion of the data with a mollifying (smoothing) kernel, to produce a smooth baseline. .. plot:: @@ -225,7 +225,7 @@ kernel, to produce a smooth baseline. amormol (Averaging Morphological and Mollified Baseline) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.amormol` iteratively convolves a mollifying (smoothing) kernel with the +:meth:`~.Baseline.amormol` iteratively convolves a mollifying (smoothing) kernel with the element-wise minimum of the data and the average of the morphological closing and opening of either the data (first iteration) or previous iteration's baseline. @@ -243,7 +243,7 @@ and opening of either the data (first iteration) or previous iteration's baselin rolling_ball (Rolling Ball) ~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.rolling_ball` performs a morphological opening on the data and +:meth:`~.Baseline.rolling_ball` performs a morphological opening on the data and then smooths the result with a moving average, giving a baseline that resembles rolling a ball across the data. @@ -265,7 +265,7 @@ resembles rolling a ball across the data. mwmv (Moving Window Minimum Value) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.mwmv` performs a morphological erosion on the data and +:meth:`~.Baseline.mwmv` performs a morphological erosion on the data and then smooths the result with a moving average. .. plot:: @@ -286,7 +286,7 @@ then smooths the result with a moving average. tophat (Top-hat Transformation) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.tophat` performs a morphological opening on the data. +:meth:`~.Baseline.tophat` performs a morphological opening on the data. .. note:: The baseline from the tophat method is not smooth. Smoothing is left to the @@ -311,13 +311,13 @@ tophat (Top-hat Transformation) mpspline (Morphology-Based Penalized Spline) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.mpspline` uses both morphological operations and penalized splines +:meth:`~.Baseline.mpspline` uses both morphological operations and penalized splines to create the baseline. First, the data is smoothed by fitting a penalized spline to the closing of the data with a window of 3. Then baseline points are identified where the smoothed data is equal to the element-wise minimum between the opening of the smoothed data and the average of a morphological erosion and dilation of the opening. The baseline points are given a weighting of :math:`1 - p`, while all -other points are given a weight of :math:`p`, similar to the :meth:`.mpls` method. +other points are given a weight of :math:`p`, similar to the :meth:`~.Baseline.mpls` method. Finally, a penalized spline is fit to the smoothed data with the assigned weighting. .. 
plot:: @@ -349,7 +349,7 @@ Finally, a penalized spline is fit to the smoothed data with the assigned weight jbcd (Joint Baseline Correction and Denoising) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.jbcd` uses regularized least-squares fitting combined with morphological operations +:meth:`~.Baseline.jbcd` uses regularized least-squares fitting combined with morphological operations to simultaneously obtain the baseline and denoised signal. Minimized function: diff --git a/docs/algorithms/optimizers.rst b/docs/algorithms/optimizers.rst index c67182c..77a0b46 100644 --- a/docs/algorithms/optimizers.rst +++ b/docs/algorithms/optimizers.rst @@ -11,7 +11,7 @@ Algorithms optimize_extended_range ~~~~~~~~~~~~~~~~~~~~~~~ -The :meth:`.optimize_extended_range` function is based on the `Extended Range +The :meth:`~.Baseline.optimize_extended_range` function is based on the `Extended Range Penalized Least Squares (erPLS) method `_, but extends its usage to all Whittaker-smoothing-based, polynomial, and spline algorithms. @@ -201,7 +201,7 @@ added linear regions is selected as the optimal parameter. collab_pls (Collaborative Penalized Least Squares) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.collab_pls` is intended for fitting multiple datasets of related data, +:meth:`~.Baseline.collab_pls` is intended for fitting multiple datasets of related data, and can use any Whittaker-smoothing-based or spline method. The general idea is that using multiple sets of data should be better able to estimate the overall baseline rather than individually fitting each set of data. @@ -258,7 +258,7 @@ since it requires multiple sets of data for each baseline type. adaptive_minmax (Adaptive MinMax) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.adaptive_minmax` uses two different polynomial orders and two different +:meth:`~.Baseline.adaptive_minmax` uses two different polynomial orders and two different weighting schemes to create a total of four fits. The polynomial order(s) can be specified by the user, or else they will be estimated by the signal-to-noise ratio of the data. The first weighting scheme is either all points weighted diff --git a/docs/algorithms/polynomial.rst b/docs/algorithms/polynomial.rst index b242952..e57e706 100644 --- a/docs/algorithms/polynomial.rst +++ b/docs/algorithms/polynomial.rst @@ -43,55 +43,10 @@ include any peak regions, the masked data can be fit, and then the resulting polynomial coefficients (must set ``return_coef`` to True) can be used to create a polynomial that spans the entirety of the original dataset. -.. 
code-block:: python - - import numpy as np - import matplotlib.pyplot as plt - from pybaselines.utils import gaussian - from pybaselines.polynomial import poly - - x = np.linspace(1, 1000, 500) - signal = ( - gaussian(x, 6, 180, 5) - + gaussian(x, 8, 350, 10) - + gaussian(x, 15, 400, 8) - + gaussian(x, 13, 700, 12) - + gaussian(x, 9, 800, 10) - ) - real_baseline = 5 + 15 * np.exp(-x / 400) - noise = np.random.default_rng(1).normal(0, 0.2, x.size) - y = signal + real_baseline + noise - - # bitwise "or" (|) and "and" (&) operators for indexing numpy array - non_peaks = ( - (x < 150) | ((x > 210) & (x < 310)) - | ((x > 440) & (x < 650)) | (x > 840) - ) - x_masked = x[non_peaks] - y_masked = y[non_peaks] - - # fit only the masked x and y - _, params = poly(y_masked, x_masked, poly_order=3, return_coef=True) - # recreate the polynomial using numpy and the full x-data - baseline = np.polynomial.Polynomial(params['coef'])(x) - - fig, ax = plt.subplots(tight_layout={'pad': 0.2}) - data_handle = ax.plot(y) - baseline_handle = ax.plot(baseline, '--') - masked_y = y.copy() - masked_y[~non_peaks] = np.nan - masked_handle = ax.plot(masked_y) - ax.set_yticks([]) - ax.set_xticks([]) - ax.legend( - (data_handle[0], masked_handle[0], baseline_handle[0]), - ('data', 'non-peak regions', 'fit baseline'), frameon=False - ) - plt.show() - - .. plot:: :align: center + :context: reset + :include-source: True import numpy as np import matplotlib.pyplot as plt @@ -123,6 +78,9 @@ a polynomial that spans the entirety of the original dataset. # recreate the polynomial using numpy and the full x-data baseline = np.polynomial.Polynomial(params['coef'])(x) + # Alternatively, just use numpy: + # baseline = np.polynomial.Polynomial.fit(x_masked, y_masked, 3)(x) + fig, ax = plt.subplots(tight_layout={'pad': 0.2}) data_handle = ax.plot(y) baseline_handle = ax.plot(baseline, '--') @@ -141,80 +99,19 @@ a polynomial that spans the entirety of the original dataset. The second way is to keep the original data, and input a custom weight array into the fitting function with values equal to 0 in peak regions and 1 in baseline regions. -.. code-block:: python - - import numpy as np - import matplotlib.pyplot as plt - from pybaselines.utils import gaussian - from pybaselines.polynomial import poly - - x = np.linspace(1, 1000, 500) - signal = ( - gaussian(x, 6, 180, 5) - + gaussian(x, 8, 350, 10) - + gaussian(x, 15, 400, 8) - + gaussian(x, 13, 700, 12) - + gaussian(x, 9, 800, 10) - ) - real_baseline = 5 + 15 * np.exp(-x / 400) - noise = np.random.default_rng(1).normal(0, 0.2, x.size) - y = signal + real_baseline + noise - - # bitwise "or" (|) and "and" (&) operators for indexing numpy array - non_peaks = ( - (x < 150) | ((x > 210) & (x < 310)) - | ((x > 440) & (x < 650)) | (x > 840) - ) - weights = np.zeros(y.shape[0]) - weights[non_peaks] = 1 - # directly create baseline by inputting weights - baseline = poly(y, x, poly_order=3, weights=weights)[0] - - fig, ax = plt.subplots(tight_layout={'pad': 0.2}) - data_handle = ax.plot(y) - baseline_handle = ax.plot(baseline, '--') - masked_y = y.copy() - masked_y[~non_peaks] = np.nan - masked_handle = ax.plot(masked_y) - ax.set_yticks([]) - ax.set_xticks([]) - ax.legend( - (data_handle[0], masked_handle[0], baseline_handle[0]), - ('data', 'non-peak regions', 'fit baseline'), frameon=False - ) - plt.show() - - .. 
plot:: :align: center + :context: close-figs + :include-source: True - import numpy as np - import matplotlib.pyplot as plt - from pybaselines.utils import gaussian - from pybaselines.polynomial import poly - - x = np.linspace(1, 1000, 500) - signal = ( - gaussian(x, 6, 180, 5) - + gaussian(x, 8, 350, 10) - + gaussian(x, 15, 400, 8) - + gaussian(x, 13, 700, 12) - + gaussian(x, 9, 800, 10) - ) - real_baseline = 5 + 15 * np.exp(-x / 400) - noise = np.random.default_rng(1).normal(0, 0.2, x.size) - y = signal + real_baseline + noise - - # bitwise "or" (|) and "and" (&) operators for indexing numpy array - non_peaks = ( - (x < 150) | ((x > 210) & (x < 310)) - | ((x > 440) & (x < 650)) | (x > 840) - ) - weights = np.zeros(y.shape[0]) + weights = np.zeros(len(y)) weights[non_peaks] = 1 - + # directly create baseline by inputting weights baseline = poly(y, x, poly_order=3, weights=weights)[0] + # Alternatively, just use numpy: + # baseline = np.polynomial.Polynomial.fit(x, y, 3, w=weights)(x) + fig, ax = plt.subplots(tight_layout={'pad': 0.2}) data_handle = ax.plot(y) baseline_handle = ax.plot(baseline, '--') @@ -234,7 +131,7 @@ As seen above, both ways produce the same resulting baseline, but the second way (setting weights) is much easier and faster since the baseline is directly calculated. The only algorithm in pybaselines that requires using selective masking is -:meth:`.poly`, which is normal polynomial least-squares fitting as described +:meth:`~.Baseline.poly`, which is normal polynomial least-squares fitting as described above. However, all other polynomial techniques allow inputting custom weights in order to get better fits or to reduce the number of iterations. @@ -254,25 +151,8 @@ The figure below illustrates the iterative thresholding. .. plot:: :align: center - - import numpy as np - import matplotlib.pyplot as plt - from pybaselines.utils import gaussian - - x = np.linspace(1, 1000, 500) - signal = ( - gaussian(x, 6, 180, 5) - + gaussian(x, 8, 350, 10) - + gaussian(x, 6, 550, 5) - + gaussian(x, 9, 800, 10) - + gaussian(x, 9, 100, 12) - + gaussian(x, 15, 400, 8) - + gaussian(x, 13, 700, 12) - + gaussian(x, 9, 880, 8) - ) - real_baseline = 5 + 15 * np.exp(-x / 400) - noise = np.random.default_rng(1).normal(0, 0.2, x.size) - y = signal + real_baseline + noise + :context: close-figs + :include-source: False fig, axes = plt.subplots( 2, 2, gridspec_kw={'hspace': 0, 'wspace': 0}, @@ -296,15 +176,15 @@ The figure below illustrates the iterative thresholding. plt.show() -The algorithms in pybaselines that use thresholding are :meth:`.modpoly`, -:meth:`.imodpoly`, and :meth:`.loess` (if ``use_threshold`` is True). +The algorithms in pybaselines that use thresholding are :meth:`~.Baseline.modpoly`, +:meth:`~.Baseline.imodpoly`, and :meth:`~.Baseline.loess` (if ``use_threshold`` is True). Penalyzing Outliers ~~~~~~~~~~~~~~~~~~~ The algorithms in pybaselines that penalyze outliers are -:meth:`.penalized_poly`, which incorporate the penalty directly into the -minimized cost function, and :meth:`.loess` (if ``use_threshold`` is False), +:meth:`~.Baseline.penalized_poly`, which incorporate the penalty directly into the +minimized cost function, and :meth:`~.Baseline.loess` (if ``use_threshold`` is False), which incorporates penalties by applying lower weights to outliers. Refer to the particular algorithms below for more details. @@ -315,7 +195,7 @@ Algorithms poly (Regular Polynomial) ~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.poly` is simple least-squares polynomial fitting. 
Use selective +:meth:`~.Baseline.poly` is simple least-squares polynomial fitting. Use selective masking, as described above, in order to use it for baseline fitting. Note that the plots below are just the least-squared polynomial fitting @@ -441,7 +321,7 @@ of the data since masking is time-consuming. modpoly (Modified Polynomial) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.modpoly` uses thresholding, as explained above, to iteratively fit a polynomial +:meth:`~.Baseline.modpoly` uses thresholding, as explained above, to iteratively fit a polynomial baseline to data. `modpoly` is also sometimes called "ModPolyFit" in literature, and both `modpoly` and `imodpoly` are sometimes referred to as "IPF" or "Iterative Polynomial Fit". @@ -463,7 +343,7 @@ baseline to data. `modpoly` is also sometimes called "ModPolyFit" in literature, imodpoly (Improved Modified Polynomial) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.imodpoly` is an attempt to improve the modpoly algorithm for noisy data, +:meth:`~.Baseline.imodpoly` is an attempt to improve the modpoly algorithm for noisy data, by including the standard deviation of the residual (data - baseline) when performing the thresholding. The number of standard deviations included in the thresholding can be adjusted by setting ``num_std``. `imodpoly` is also sometimes called "IModPolyFit" in literature, @@ -492,7 +372,7 @@ and both `modpoly` and `imodpoly` are sometimes referred to as "IPF" or "Iterati penalized_poly (Penalized Polynomial) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.penalized_poly` (sometimes referred to as "backcor" in literature) fits a +:meth:`~.Baseline.penalized_poly` (sometimes referred to as "backcor" in literature) fits a polynomial baseline to data using non-quadratic cost functions. Compared to the quadratic cost function used in typical least-squares as discussed above, non-quadratic cost funtions allow outliers above a user-defined threshold to have less effect on the fit. pentalized_poly @@ -608,7 +488,7 @@ The plots below show the symmetric and asymmetric forms of the cost functions. loess (Locally Estimated Scatterplot Smoothing) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.loess` (sometimes referred to as "rbe" or "robust baseline estimate" in literature) +:meth:`~.Baseline.loess` (sometimes referred to as "rbe" or "robust baseline estimate" in literature) is similar to `traditional loess/lowess `_ but adapted for fitting the baseline. The baseline at each point is estimated by using polynomial regression on the k-nearest neighbors of the point, and the effect of outliers @@ -651,7 +531,7 @@ is reduced by iterative reweighting. quant_reg (Quantile Regression) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.quant_reg` fits a polynomial to the baseline using quantile regression. +:meth:`~.Baseline.quant_reg` fits a polynomial to the baseline using quantile regression. .. plot:: :align: center @@ -674,8 +554,8 @@ quant_reg (Quantile Regression) goldindec (Goldindec Method) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.goldindec` fits a polynomial baseline to data using non-quadratic cost functions, -similar to :meth:`.penalized_poly`, except that it only allows asymmetric cost functions. +:meth:`~.Baseline.goldindec` fits a polynomial baseline to data using non-quadratic cost functions, +similar to :meth:`~.Baseline.penalized_poly`, except that it only allows asymmetric cost functions. The optimal threshold value between quadratic and non-quadratic loss is iteratively optimized based on the input `peak_ratio` value. 
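To make the polynomial methods above concrete, here is a brief usage sketch of
the Baseline API referenced throughout these docs (`x` and `y` are assumed to be
the abscissa and the measured data, and all parameter values are illustrative):

    from pybaselines import Baseline

    baseline_fitter = Baseline(x_data=x)
    # thresholding-based fitting
    baseline, params = baseline_fitter.modpoly(y, poly_order=3)
    # non-quadratic cost function with an asymmetric loss
    baseline, params = baseline_fitter.goldindec(y, poly_order=3, peak_ratio=0.3)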
diff --git a/docs/algorithms/smooth.rst b/docs/algorithms/smooth.rst index d42e2f0..0cff0b7 100644 --- a/docs/algorithms/smooth.rst +++ b/docs/algorithms/smooth.rst @@ -5,10 +5,6 @@ Smoothing Baselines The contents of :mod:`pybaselines.smooth` contain algorithms that use smoothing to eliminate peaks and leave only the baseline. -.. note:: - The module pybaselines.smooth was named pybaselines.window until version 0.6.0. - - .. note:: The window size used for smoothing-based algorithms is index-based, rather than based on the units of the data, so proper conversions must be done @@ -21,7 +17,7 @@ Algorithms noise_median (Noise Median method) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.noise_median` estimates the baseline as the median value within +:meth:`~.Baseline.noise_median` estimates the baseline as the median value within a moving window. The resulting baseline is then smoothed by convolving with a Gaussian kernel. Note that this method does not perform well for tightly-grouped peaks. @@ -149,7 +145,7 @@ kernel. Note that this method does not perform well for tightly-grouped peaks. snip (Statistics-sensitive Non-linear Iterative Peak-clipping) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.snip` iteratively takes the element-wise minimimum of each value +:meth:`~.Baseline.snip` iteratively takes the element-wise minimimum of each value and the average of the values at the left and right edge of a window centered at the value. The size of the half-window is incrementally increased from 1 to the specified maximum size, which should be set to approximately half of the @@ -195,7 +191,7 @@ data. The baselines when using decreasing window size and smoothing is shown bel swima (Small-Window Moving Average) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.swima` iteratively takes the element-wise minimum of either the +:meth:`~.Baseline.swima` iteratively takes the element-wise minimum of either the data (first iteration) or the previous iteration's baseline and the data/previous baseline smoothed with a moving average. The window used for the moving average smoothing is incrementally increased to smooth peaks until convergence is reached. @@ -219,7 +215,7 @@ incrementally increased to smooth peaks until convergence is reached. ipsa (Iterative Polynomial Smoothing Algorithm) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.ipsa` iteratively smooths the input data using a second-order +:meth:`~.Baseline.ipsa` iteratively smooths the input data using a second-order Savitzky–Golay filter until the exit criteria is reached. .. plot:: @@ -240,7 +236,7 @@ Savitzky–Golay filter until the exit criteria is reached. ria (Range Independent Algorithm) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.ria` first extrapolates a linear baseline from the left and/or +:meth:`~.Baseline.ria` first extrapolates a linear baseline from the left and/or right edges of the data and adds Gaussian peaks to these baselines, similar to the :ref:`optimize_extended_range ` function, and records their initial areas. The data is then iteratively smoothed using a diff --git a/docs/algorithms/spline.rst b/docs/algorithms/spline.rst index a9ed8b1..560f88f 100644 --- a/docs/algorithms/spline.rst +++ b/docs/algorithms/spline.rst @@ -66,7 +66,7 @@ Algorithms mixture_model (Mixture Model) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.mixture_model` considers the data as a mixture model composed of +:meth:`~.Baseline.mixture_model` considers the data as a mixture model composed of a baseline with noise and peaks. 
The weighting for the penalized spline fitting the baseline is iteratively determined by fitting the residual with a normal distribution centered at 0 (representing the noise), and a uniform distribution @@ -203,7 +203,7 @@ residual belonging to the noise's normal distribution. irsqr (Iterative Reweighted Spline Quantile Regression) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.irsqr` uses penalized splines and iterative reweighted least squares +:meth:`~.Baseline.irsqr` uses penalized splines and iterative reweighted least squares to perform quantile regression on the data. .. plot:: :align: center @@ -227,7 +227,7 @@ to perform quantile regression on the data. corner_cutting (Corner-Cutting Method) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.corner_cutting` iteratively removes corner points and then creates +:meth:`~.Baseline.corner_cutting` iteratively removes corner points and then creates a quadratic Bezier spline from the remaining points. Continuity between the individual Bezier curves is maintained by adding control points halfway between all but the first and last non-corner points. @@ -253,7 +253,7 @@ between all but the first and last non-corner points. pspline_asls (Penalized Spline Asymmetric Least Squares) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.pspline_asls` is a penalized spline version of :meth:`.asls`. +:meth:`~.Baseline.pspline_asls` is a penalized spline version of :meth:`~.Baseline.asls`. Minimized function: @@ -301,7 +301,7 @@ Weighting: pspline_iasls (Penalized Spline Improved Asymmetric Least Squares) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.pspline_iasls` is a penalized spline version of :meth:`.iasls`. +:meth:`~.Baseline.pspline_iasls` is a penalized spline version of :meth:`~.Baseline.iasls`. Minimized function: @@ -354,7 +354,7 @@ Weighting: pspline_airpls (Penalized Spline Adaptive Iteratively Reweighted Penalized Least Squares) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.pspline_airpls` is a penalized spline version of :meth:`.airpls`. +:meth:`~.Baseline.pspline_airpls` is a penalized spline version of :meth:`~.Baseline.airpls`. Minimized function: @@ -401,7 +401,7 @@ values in the residual vector :math:`\mathbf r`, ie. :math:`\sum\limits_{y_i - z pspline_arpls (Penalized Spline Asymmetrically Reweighted Penalized Least Squares) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.pspline_arpls` is a penalized spline version of :meth:`.arpls`. +:meth:`~.Baseline.pspline_arpls` is a penalized spline version of :meth:`~.Baseline.arpls`. Minimized function: @@ -445,7 +445,7 @@ values in the residual vector :math:`\mathbf r`. pspline_drpls (Penalized Spline Doubly Reweighted Penalized Least Squares) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.pspline_drpls` is a penalized spline version of :meth:`.drpls`. +:meth:`~.Baseline.pspline_drpls` is a penalized spline version of :meth:`~.Baseline.drpls`. Minimized function: @@ -501,7 +501,7 @@ respectively, of the negative values in the residual vector :math:`\mathbf r`. pspline_iarpls (Penalized Spline Improved Asymmetrically Reweighted Penalized Least Squares) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.pspline_iarpls` is a penalized spline version of :meth:`.iarpls`. +:meth:`~.Baseline.pspline_iarpls` is a penalized spline version of :meth:`~.Baseline.iarpls`. Minimized function: @@ -549,7 +549,7 @@ the residual vector :math:`\mathbf r`.
pspline_aspls (Penalized Spline Adaptive Smoothness Penalized Least Squares) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.pspline_aspls` is a penalized spline version of :meth:`.aspls`. +:meth:`~.Baseline.pspline_aspls` is a penalized spline version of :meth:`~.Baseline.aspls`. Minimized function: @@ -614,7 +614,7 @@ of the asPLS paper closer than the factor of 2 and fits noisy data much better). pspline_psalsa (Penalized Spline Peaked Signal's Asymmetric Least Squares Algorithm) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.pspline_psalsa` is a penalized spline version of :meth:`.psalsa`. +:meth:`~.Baseline.pspline_psalsa` is a penalized spline version of :meth:`~.Baseline.psalsa`. Minimized function: @@ -661,7 +661,7 @@ be considered a peak. pspline_derpsalsa (Penalized Spline Derivative Peak-Screening Asymmetric Least Squares Algorithm) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.pspline_derpsalsa` is a penalized spline version of :meth:`.derpsalsa`. +:meth:`~.Baseline.pspline_derpsalsa` is a penalized spline version of :meth:`~.Baseline.derpsalsa`. Minimized function: @@ -723,7 +723,7 @@ respectively, of the smoothed data, :math:`y_{sm}`, and :math:`rms()` is the roo pspline_mpls (Penalized Spline Morphological Penalized Least Squares) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.pspline_mpls` is a penalized spline version of :meth:`.mpls`. +:meth:`~.Baseline.pspline_mpls` is a penalized spline version of :meth:`~.Baseline.mpls`. Minimized function: diff --git a/docs/algorithms/whittaker.rst b/docs/algorithms/whittaker.rst index cd3e821..2ef9096 100644 --- a/docs/algorithms/whittaker.rst +++ b/docs/algorithms/whittaker.rst @@ -11,8 +11,8 @@ Introduction Whittaker-smoothing-based (WSB) algorithms are usually referred to in literature as weighted least squares, penalized least squares, or asymmetric least squares, but are referred to as WSB in pybaselines to distinguish them from polynomial -techniques that also take advantage of weighted least squares (like :meth:`.loess`) -and penalized least squares (like :meth:`.penalized_poly`). +techniques that also take advantage of weighted least squares (like :meth:`~.Baseline.loess`) +and penalized least squares (like :meth:`~.Baseline.penalized_poly`). The general idea behind WSB algorithms is to make the baseline match the measured data as well as it can while also penalizing the roughness of the baseline. The @@ -80,7 +80,7 @@ Algorithms asls (Asymmetric Least Squares) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The :meth:`.asls` (sometimes called "ALS" in literature) function is the +The :meth:`~.Baseline.asls` (sometimes called "ALS" in literature) function is the original implementation of Whittaker smoothing for baseline fitting. Minimized function: @@ -230,7 +230,7 @@ Weighting: iasls (Improved Asymmetric Least Squares) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.iasls` is an attempt to improve the asls algorithm by considering +:meth:`~.Baseline.iasls` is an attempt to improve the asls algorithm by considering both the roughness of the baseline and the first derivative of the residual (data - baseline). @@ -285,7 +285,7 @@ Weighting: airpls (Adaptive Iteratively Reweighted Penalized Least Squares) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.airpls` uses an exponential weighting of the negative residuals to +:meth:`~.Baseline.airpls` uses an exponential weighting of the negative residuals to attempt to provide a better fit than the asls method.
Minimized function: @@ -326,7 +326,7 @@ values in the residual vector :math:`\mathbf r`, ie. :math:`\sum\limits_{y_i - z arpls (Asymmetrically Reweighted Penalized Least Squares) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.arpls` uses a single weighting function that is designed to account +:meth:`~.Baseline.arpls` uses a single weighting function that is designed to account for noisy data. Minimized function: @@ -369,7 +369,7 @@ deviation, respectively, of the negative values in the residual vector :math:`\m drpls (Doubly Reweighted Penalized Least Squares) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.drpls` uses a single weighting function that is designed to account +:meth:`~.Baseline.drpls` uses a single weighting function that is designed to account for noisy data, similar to arpls. Further, it takes into account both the first and second derivatives of the baseline and uses a parameter :math:`\eta` to adjust the fit in peak versus non-peak regions. @@ -426,7 +426,7 @@ respectively, of the negative values in the residual vector :math:`\mathbf r`. iarpls (Improved Asymmetrically Reweighted Penalized Least Squares) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.iarpls` is an attempt to improve the arpls method, which has a tendency +:meth:`~.Baseline.iarpls` is an attempt to improve the arpls method, which has a tendency to overestimate the baseline when fitting small peaks in noisy data, by using an adjusted weighting formula. @@ -471,7 +471,7 @@ the residual vector :math:`\mathbf r`. aspls (Adaptive Smoothness Penalized Least Squares) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.aspls`, similar to the iarpls method, is an attempt to improve the arpls method, +:meth:`~.Baseline.aspls`, similar to the iarpls method, is an attempt to improve the arpls method, which it does by using an adjusted weighting function and an additional parameter :math:`\alpha`. Minimized function: @@ -527,7 +527,7 @@ of the asPLS paper closer than the factor of 2 and fits noisy data much better). psalsa (Peaked Signal's Asymmetric Least Squares Algorithm) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.psalsa` is an attempt at improving the asls method to better fit noisy data +:meth:`~.Baseline.psalsa` is an attempt at improving the asls method to better fit noisy data by using an exponentially decaying weighting for positive residuals. Minimized function: @@ -573,7 +573,7 @@ be considered a peak. derpsalsa (Derivative Peak-Screening Asymmetric Least Squares Algorithm) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.derpsalsa` is an attempt at improving the asls method to better fit noisy data +:meth:`~.Baseline.derpsalsa` is an attempt at improving the asls method to better fit noisy data by using an exponentially decaying weighting for positive residuals. Further, it calculates additional weights based on the first and second derivatives of the data.
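To make the Whittaker-smoothing-based weighting schemes above concrete, a short sketch comparing two of the methods is given below; it reuses ``x``, ``y``, and ``fitter`` from the earlier polynomial sketch, and the ``lam`` and ``p`` values are illustrative only:

.. code-block:: python

    # asls: fixed asymmetric weighting controlled by p
    baseline_asls, params_asls = fitter.asls(y, lam=1e6, p=0.01)
    # arpls: weights updated from the statistics of the negative residuals
    baseline_arpls, params_arpls = fitter.arpls(y, lam=1e5)
    # both methods return the weights from the final iteration in params
    final_weights = params_asls['weights']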
diff --git a/docs/conf.py b/docs/conf.py index 3803137..6fad0af 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -36,7 +36,8 @@ #'sphinx.ext.autosummary', 'autoapi.extension', 'sphinx.ext.intersphinx', - 'sphinx.ext.napoleon', + #'sphinx.ext.napoleon', + 'numpydoc', 'sphinx.ext.todo', 'sphinx.ext.mathjax', 'sphinx.ext.viewcode', @@ -136,15 +137,28 @@ #'special-members', # show things like __str__ #'imported-members', # document things imported within each module ] -autoapi_member_order = 'groupwise' # groups into classes, functions, etc. -autoapi_python_class_content = 'class' # include class docstring from class and/or __init__ -#autoapi_keep_files = True # keep the files after generation -#autoapi_add_toctree_entry = False # need to manually add to toctree if False -#autoapi_generate_api_docs = False # will not generate new docs when False +autoapi_member_order = 'groupwise' # groups into classes, functions, etc. +autoapi_python_class_content = 'class' # include class docstring from class and/or __init__ +autoapi_keep_files = False # keep the files after generation +autoapi_add_toctree_entry = True # need to manually add to toctree if False +autoapi_generate_api_docs = True # will not generate new docs when False # ignore an import warning from sphinx-autoapi due to double import of utils -suppress_warnings = ['autoapi.python_import_resolution'] +suppress_warnings = ['autoapi.python_import_resolution', 'autosectionlabel'] +# -- Settings for matplotlib plot_directive extension ---------------------------- + +plot_include_source = False + +plot_formats = ['png'] + +# -- Settings for numpydoc extension ---------------------------- + +# uses the matplotlib plot_directive extension when "import matplotlib" is in a docstring +numpydoc_use_plots = True + +# creates cross references for types in docstrings +numpydoc_xref_param_type = False # -- Settings for sphinx-gallery extension ---------------------------- @@ -196,7 +210,7 @@ html_theme = 'nature' else: html_theme = 'sphinx_rtd_theme' - del sphinx_rtd_theme + # Theme options are theme-specific and customize the look and feel of a # theme further. For a list of options available for each theme, see the @@ -213,7 +227,6 @@ #'_static' ] - # -- Options for HTMLHelp output --------------------------------------- # Output file base name for HTML help builder. 
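As a side note on the ``numpydoc_use_plots`` setting enabled above: any docstring example that imports matplotlib is rendered through matplotlib's plot directive during the docs build. A hypothetical docstring showing the pattern (the function itself is illustrative, not from pybaselines):

.. code-block:: python

    def some_method(data):
        """Hypothetical method whose docstring example produces a plot.

        Examples
        --------
        >>> import matplotlib.pyplot as plt
        >>> import numpy as np
        >>> plt.plot(np.arange(10))

        """
        return data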
diff --git a/docs/installation.rst b/docs/installation.rst index 0d4573e..9a67766 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -25,13 +25,13 @@ pybaselines has the following optional dependencies: * `numba `_ (>= 0.45): speeds up calculations used by the following functions: - * :meth:`.loess` - * :meth:`.dietrich` - * :meth:`.golotvin` - * :meth:`.std_distribution` - * :meth:`.fastchrom` - * :meth:`.beads` - * :meth:`.mpspline` + * :meth:`~Baseline.loess` + * :meth:`~Baseline.dietrich` + * :meth:`~Baseline.golotvin` + * :meth:`~Baseline.std_distribution` + * :meth:`~Baseline.fastchrom` + * :meth:`~Baseline.beads` + * :meth:`~Baseline.mpspline` * all functions in :mod:`pybaselines.spline` * `pentapy `_ (>= 1.0): @@ -39,9 +39,9 @@ pybaselines has the following optional dependencies: used by the following functions (when ``diff_order=2``): * all functions in :mod:`pybaselines.whittaker` - * :meth:`.mpls` - * :meth:`.jbcd` - * :meth:`.fabc` + * :meth:`~Baseline.mpls` + * :meth:`~Baseline.jbcd` + * :meth:`~Baseline.fabc` Stable Release diff --git a/examples/classification/plot_classifier_masks.py b/examples/classification/plot_classifier_masks.py index f0d8a18..852bf5d 100644 --- a/examples/classification/plot_classifier_masks.py +++ b/examples/classification/plot_classifier_masks.py @@ -3,7 +3,7 @@ Classification masks -------------------- -The baseline algorithms in the :mod:`.classification` module estimate the baseline +The baseline algorithms in the :mod:`~pybaselines.classification` module estimate the baseline by classifying each point as belonging to either the baseline or the peaks. When first using a function, the correct parameters may not be known. To make the effects of input parameters on the classification process more easily understood, all functions diff --git a/examples/classification/plot_fastchrom_threshold.py b/examples/classification/plot_fastchrom_threshold.py index ef321e4..a1fc853 100644 --- a/examples/classification/plot_fastchrom_threshold.py +++ b/examples/classification/plot_fastchrom_threshold.py @@ -3,7 +3,7 @@ fastchrom threshold ------------------- -:meth:`.fastchrom` classifies baseline points based on their rolling standard +:meth:`~.Baseline.fastchrom` classifies baseline points based on their rolling standard deviation value. The default threshold for fastchrom is set to the fifteenth percentile of the rolling standard deviation distribution. This default is rather conservative in assigning diff --git a/examples/general/plot_algorithm_convergence.py b/examples/general/plot_algorithm_convergence.py index 0fcd575..b4557a8 100644 --- a/examples/general/plot_algorithm_convergence.py +++ b/examples/general/plot_algorithm_convergence.py @@ -9,7 +9,7 @@ the measured tolerance value at each iteration. The `tol_history` parameter can be helpful for determining appropriate `max_iter` or `tol` values. -In this example, the convergence of the :meth:`.asls` and :meth:`.aspls` functions +In this example, the convergence of the :meth:`~.Baseline.asls` and :meth:`~.Baseline.aspls` functions will be compared. asls is a relatively simple calculation that sets its weighting each iteration based on whether the current baseline is above or below the input data at each point. 
aspls has a much more intricate weighting based on the logistic distribution diff --git a/examples/general/plot_noisy_data.py index d2012de..7557e35 100644 --- a/examples/general/plot_noisy_data.py +++ b/examples/general/plot_noisy_data.py @@ -8,8 +8,8 @@ This example will show how to reduce this issue by simply smoothing the data before performing baseline correction. -Two algorithms will be compared: :meth:`.modpoly`, which is not suited for noisy -data, and :meth:`.imodpoly`, which is a modification of the modpoly algorithm +Two algorithms will be compared: :meth:`~.Baseline.modpoly`, which is not suited for noisy +data, and :meth:`~.Baseline.imodpoly`, which is a modification of the modpoly algorithm created specifically to address noise. """ diff --git a/examples/misc/plot_beads_preprocessing.py index 7c63200..4f3343b 100644 --- a/examples/misc/plot_beads_preprocessing.py +++ b/examples/misc/plot_beads_preprocessing.py @@ -3,7 +3,7 @@ Preprocessing for beads ----------------------- -The Baseline Estimation And Denoising with Sparsity (:meth:`.beads`) algorithm is a +The Baseline Estimation And Denoising with Sparsity (:meth:`~.Baseline.beads`) algorithm is a robust method for both performing baseline subtraction and removing noise. One of the main drawbacks of the original algorithm is that it requires that both ends of the data be at zero. This example will explore the consequences of this as diff --git a/examples/morphological/plot_half_window_effects.py index 4702153..e301ace 100644 --- a/examples/morphological/plot_half_window_effects.py +++ b/examples/morphological/plot_half_window_effects.py @@ -6,7 +6,7 @@ This example shows the influence of the `half_window` parameter that is used when fitting any morphological algorithm. -For this example, the :meth:`.mor` algorithm will be used, which is a relatively +For this example, the :meth:`~.Baseline.mor` algorithm will be used, which is a relatively robust baseline algorithm. """ diff --git a/examples/spline/plot_lam_vs_num_knots.py index fe89965..525ce57 100644 --- a/examples/spline/plot_lam_vs_num_knots.py +++ b/examples/spline/plot_lam_vs_num_knots.py @@ -5,7 +5,7 @@ This example will examine the effects of `lam` for fitting a penalized spline baseline while varying both the number of knots for the spline, `num_knots`, and the number of -data points. The function :meth:`.mixture_model` is used for all calculations. +data points. The function :meth:`~.Baseline.mixture_model` is used for all calculations. Note that the exact optimal `lam` values reported in this example are not of significant use since they depend on many other factors such as the baseline curvature, noise, peaks, diff --git a/examples/spline/plot_pspline_whittaker.py index e1dd6b9..3c78c0d 100644 --- a/examples/spline/plot_pspline_whittaker.py +++ b/examples/spline/plot_pspline_whittaker.py @@ -8,8 +8,8 @@ for doing so was that P-splines offer additional user flexibility when choosing parameters for fitting and more easily work for unequally spaced data. This example will examine the relationship of `lam` versus the number of data points when fitting -a baseline with the :meth:`.arpls` function and its P-spline version, -:meth:`.pspline_arpls`.
+a baseline with the :meth:`~.Baseline.arpls` function and its P-spline version, +:meth:`~.Baseline.pspline_arpls`. Note that the exact optimal `lam` values reported in this example are not of significant use since they depend on many other factors such as the baseline curvature, noise, peaks, diff --git a/examples/whittaker/plot_lam_effects.py index e19f92f..6a13d1b 100644 --- a/examples/whittaker/plot_lam_effects.py +++ b/examples/whittaker/plot_lam_effects.py @@ -8,7 +8,7 @@ exact `lam` values used in this example are unimportant, just the changes in their scale. -For this example, the :meth:`.arpls` algorithm will be used, which performs +For this example, the :meth:`~.Baseline.arpls` algorithm will be used, which performs well in the presence of noise. """ diff --git a/examples/whittaker/plot_lam_vs_data_size.py index 1ac2668..3d2eaba 100644 --- a/examples/whittaker/plot_lam_vs_data_size.py +++ b/examples/whittaker/plot_lam_vs_data_size.py @@ -9,7 +9,7 @@ Whittaker-smoothing-based algorithm is dependent on the number of data points. Thus, this can cause issues when adapting an algorithm to a new set of data since the published optimal `lam` value is not universal. This example shows an analysis of this dependence -for all available functions in the :mod:`.whittaker` module. +for all available functions in the :mod:`pybaselines.whittaker` module. Note that the exact optimal `lam` values reported in this example are not of significant use since they depend on many other factors such as the baseline curvature, noise, peaks, diff --git a/examples/whittaker/plot_whittaker_solvers.py index a732f06..65fa06a 100644 --- a/examples/whittaker/plot_whittaker_solvers.py +++ b/examples/whittaker/plot_whittaker_solvers.py @@ -7,7 +7,7 @@ the banded structure of the linear system to reduce the computation time. This example shows the difference in computation times of the asymmetric least squares -(:meth:`.asls`) algorithm when using the banded solver from Scipy (solveh_banded) +(:meth:`~.Baseline.asls`) algorithm when using the banded solver from Scipy (solveh_banded) and the banded solver from the optional dependency `pentapy `_. In addition, the time it takes when solving the system using sparse matrices rather than the banded matrices diff --git a/pybaselines/__init__.py index c8448ac..95035a3 100644 --- a/pybaselines/__init__.py +++ b/pybaselines/__init__.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- """ +======================================================================================= pybaselines - A library of algorithms for the baseline correction of experimental data. ======================================================================================= diff --git a/pybaselines/_algorithm_setup.py index 5d7ee17..7efe658 100644 --- a/pybaselines/_algorithm_setup.py +++ b/pybaselines/_algorithm_setup.py @@ -44,10 +44,11 @@ class _Algorithm: that no polynomial fitting has been performed. pspline : PSpline or None The PSpline object for setting up and solving penalized spline algorithms. Is None - if no penalized spline setup has been performed (typically done in :meth:`._setup_spline`). + if no penalized spline setup has been performed (typically done in + :meth:`~_Algorithm._setup_spline`). vandermonde : numpy.ndarray or None The Vandermonde matrix for solving polynomial equations.
Is None if no polynomial - setup has been performed (typically done in :meth:`._setup_polynomial`). + setup has been performed (typically done in :meth:`~_Algorithm._setup_polynomial`). whittaker_system : PenalizedSystem or None The PenalizedSystem object for setting up and solving Whittaker-smoothing-based algorithms. Is None if no Whittaker setup has been performed (typically done in @@ -337,7 +338,7 @@ def _setup_whittaker(self, y, lam=1, diff_order=2, weights=None, copy_weights=Fa ---------- y : numpy.ndarray, shape (N,) The y-values of the measured data, already converted to a numpy - array by :meth:`._register`. + array by :meth:`~_Algorithm._register`. lam : float, optional The smoothing parameter, lambda. Typical values are between 10 and 1e8, but it strongly depends on the penalized least square method @@ -411,7 +412,7 @@ def _setup_polynomial(self, y, weights=None, poly_order=2, calc_vander=False, ---------- y : numpy.ndarray, shape (N,) The y-values of the measured data, already converted to a numpy - array by :meth:`._register`. + array by :meth:`~_Algorithm._register`. weights : array-like, shape (N,), optional The weighting array. If None (default), then will be an array with size equal to N and all values set to 1. @@ -487,7 +488,7 @@ def _setup_spline(self, y, weights=None, spline_degree=3, num_knots=10, ---------- y : numpy.ndarray, shape (N,) The y-values of the measured data, already converted to a numpy - array by :meth:`._register`. + array by :meth:`~_Algorithm._register`. weights : array-like, shape (N,), optional The weighting array. If None (default), then will be an array with size equal to N and all values set to 1. @@ -570,7 +571,7 @@ def _setup_morphology(self, y, half_window=None, **window_kwargs): ---------- y : numpy.ndarray, shape (N,) The y-values of the measured data, already converted to a numpy - array by :meth:`._register`. + array by :meth:`~_Algorithm._register`. half_window : int, optional The half-window used for the morphology functions. If a value is input, then that value will be used. Default is None, which will optimize the @@ -625,7 +626,7 @@ def _setup_smooth(self, y, half_window=0, allow_zero=True, **pad_kwargs): ---------- y : numpy.ndarray, shape (N,) The y-values of the measured data, already converted to a numpy - array by :meth:`._register`. + array by :meth:`~_Algorithm._register`. half_window : int, optional The half-window used for the smoothing functions. Used to pad the left and right edges of the data to reduce edge @@ -654,7 +655,7 @@ def _setup_classification(self, y, weights=None): ---------- y : numpy.ndarray, shape (N,) The y-values of the measured data, already converted to a numpy - array by :meth:`._register`. + array by :meth:`~_Algorithm._register`. weights : array-like, shape (N,), optional The weighting array. If None (default), then will be an array with size equal to N and all values set to 1. @@ -741,7 +742,7 @@ def _setup_optimizer(self, y, method, modules, method_kwargs=None, copy_kwargs=T ---------- y : numpy.ndarray, shape (N,) The y-values of the measured data, already converted to a numpy - array by :meth:`._register`. + array by :meth:`~_Algorithm._register`. method : str The string name of the desired function, like 'asls'. Case does not matter. modules : Sequence(module, ...) @@ -793,7 +794,7 @@ def _setup_misc(self, y): ---------- y : numpy.ndarray, shape (N,) The y-values of the measured data, already converted to a numpy - array by :meth:`._register`. + array by :meth:`~_Algorithm._register`. 
Returns ------- diff --git a/pybaselines/_banded_utils.py index d34f7df..74520fe 100644 --- a/pybaselines/_banded_utils.py +++ b/pybaselines/_banded_utils.py @@ -549,11 +549,12 @@ class PenalizedSystem: Maintained so that repeated computations with different `lam` values can be quickly set up. `original_diagonals` can be either the full or lower bands of the penalty, and may be reversed, depending on the setup. Reset by calling - :meth:`.reset_diagonals`. + :meth:`~PenalizedSystem.reset_diagonals`. penalty : numpy.ndarray The current penalty. Originally is `original_diagonals` after multiplying by `lam` - and applying padding, but can also be changed by calling :meth:`.add_penalty`. - Reset by calling :meth:`.reset_diagonals`. + and applying padding, but can also be changed by calling + :meth:`~PenalizedSystem.add_penalty`. + Reset by calling :meth:`~PenalizedSystem.reset_diagonals`. pentapy_solver : int or str The integer or string designating which solver to use if using pentapy. See :func:`pentapy.solve` for available options, although `1` or `2` are the diff --git a/pybaselines/_spline_utils.py index 32a2efb..bfe3785 100644 --- a/pybaselines/_spline_utils.py +++ b/pybaselines/_spline_utils.py @@ -646,7 +646,7 @@ class PSpline(PenalizedSystem): in `x`, and `M` is the number of basis functions (equal to ``K - spline_degree - 1`` or equivalently ``num_knots + spline_degree - 1``). coef : None or numpy.ndarray, shape (M,) - The spline coefficients. Is None if :meth:`.solve_pspline` has not been called + The spline coefficients. Is None if :meth:`~PSpline.solve_pspline` has not been called at least once. knots : numpy.ndarray, shape (K,) The knots for the spline. Has a shape of `K`, which is equal to diff --git a/pybaselines/classification.py index 9bcc4b7..f516fd5 100644 --- a/pybaselines/classification.py +++ b/pybaselines/classification.py @@ -764,7 +764,7 @@ def fabc(self, data, lam=1e6, scale=None, num_std=3.0, diff_order=2, min_length= Notes ----- - The classification of baseline points is similar to :meth:`.dietrich`, except that + The classification of baseline points is similar to :meth:`~Baseline.dietrich`, except that this method approximates the first derivative using a continuous wavelet transform with the Haar wavelet, which is more robust than the numerical derivative in Dietrich's method. @@ -1809,7 +1809,7 @@ def fabc(data, lam=1e6, scale=None, num_std=3.0, diff_order=2, min_length=2, wei Notes ----- - The classification of baseline points is similar to :meth:`.dietrich`, except that + The classification of baseline points is similar to :meth:`~Baseline.dietrich`, except that this method approximates the first derivative using a continuous wavelet transform with the Haar wavelet, which is more robust than the numerical derivative in Dietrich's method. diff --git a/pybaselines/misc.py index f8faeb9..8c23d28 100644 --- a/pybaselines/misc.py +++ b/pybaselines/misc.py @@ -1163,7 +1163,7 @@ def beads(data, freq_cutoff=0.005, lam_0=1.0, lam_1=1.0, lam_2=1.0, asymmetry=6. Decomposes the input data into baseline and pure, noise-free signal by modeling the baseline as a low pass filter and by considering the signal and its derivatives - as sparse [1]_. + as sparse [4]_. Parameters ---------- @@ -1207,14 +1207,14 @@ def beads(data, freq_cutoff=0.005, lam_0=1.0, lam_1=1.0, lam_2=1.0, asymmetry=6. derivatives are close to zero. Default is 1e-6.
fit_parabola : bool, optional If True (default), will fit a parabola to the data and subtract it before - performing the beads fit as suggested in [2]_. This ensures the endpoints of + performing the beads fit as suggested in [5]_. This ensures the endpoints of the fit data are close to 0, which is required by beads. If the data is already close to 0 on both endpoints, set `fit_parabola` to False. smooth_half_window : int, optional The half-window to use for smoothing the derivatives of the data with a moving average and full window size of `2 * smooth_half_window + 1`. Smoothing can improve the convergence of the calculation, and make the calculation less sensitive - to small changes in `lam_1` and `lam_2`, as noted in the pybeads package [3]_. + to small changes in `lam_1` and `lam_2`, as noted in the pybeads package [6]_. Default is None, which will not perform any smoothing. x_data : array-like, optional The x-values. Not used by this function, but input is allowed for consistency @@ -1243,7 +1243,7 @@ def beads(data, freq_cutoff=0.005, lam_0=1.0, lam_1=1.0, lam_2=1.0, asymmetry=6. When finding the best parameters for fitting, it is usually best to find the optimal `freq_cutoff` for the noise in the data before adjusting any other parameters since - it has the largest effect [2]_. + it has the largest effect [5]_. Raises ------ @@ -1252,10 +1252,10 @@ def beads(data, freq_cutoff=0.005, lam_0=1.0, lam_1=1.0, lam_2=1.0, asymmetry=6. References ---------- - .. [1] Ning, X., et al. Chromatogram baseline estimation and denoising using sparsity + .. [4] Ning, X., et al. Chromatogram baseline estimation and denoising using sparsity (BEADS). Chemometrics and Intelligent Laboratory Systems, 2014, 139, 156-167. - .. [2] Navarro-Huerta, J.A., et al. Assisted baseline subtraction in complex chromatograms + .. [5] Navarro-Huerta, J.A., et al. Assisted baseline subtraction in complex chromatograms using the BEADS algorithm. Journal of Chromatography A, 2017, 1507, 1-10. - .. [3] https://github.com/skotaro/pybeads. + .. [6] https://github.com/skotaro/pybeads. """ diff --git a/pybaselines/morphological.py b/pybaselines/morphological.py index 17d038f..c09a943 100644 --- a/pybaselines/morphological.py +++ b/pybaselines/morphological.py @@ -830,7 +830,7 @@ def jbcd(self, data, half_window=None, alpha=0.1, beta=1e1, gamma=1., beta_mult= robust_opening : bool, optional If True (default), the opening used to represent the initial baseline is the element-wise minimum between the morphological opening and the average of the - morphological erosion and dilation of the opening, similar to :meth:`.mor`. If + morphological erosion and dilation of the opening, similar to :meth:`~Baseline.mor`. If False, the opening is just the morphological opening, as used in the reference. The robust opening typically represents the baseline better. **window_kwargs @@ -1626,7 +1626,7 @@ def jbcd(data, half_window=None, alpha=0.1, beta=1e1, gamma=1., beta_mult=1.1, g robust_opening : bool, optional If True (default), the opening used to represent the initial baseline is the element-wise minimum between the morphological opening and the average of the - morphological erosion and dilation of the opening, similar to :meth:`.mor`. If + morphological erosion and dilation of the opening, similar to :meth:`~Baseline.mor`. If False, the opening is just the morphological opening, as used in the reference. The robust opening typically represents the baseline better. 
x_data : array-like, optional diff --git a/pybaselines/optimizers.py b/pybaselines/optimizers.py index c7d51c4..a11f30f 100644 --- a/pybaselines/optimizers.py +++ b/pybaselines/optimizers.py @@ -400,8 +400,7 @@ def adaptive_minmax(self, data, poly_order=None, method='modpoly', weights=None, Default is 2. method_kwargs : dict, optional Additional keyword arguments to pass to - :meth:`~pybaselines.polynomial.Polynomial.modpoly` or - :meth:`~pybaselines.polynomial.Polynomial.imodpoly`. These include + :meth:`~Baseline.modpoly` or :meth:`~Baseline.imodpoly`. These include `tol`, `max_iter`, `use_original`, `mask_initial_peaks`, and `num_std`. Returns @@ -677,7 +676,7 @@ def collab_pls(data, average_dataset=True, method='asls', method_kwargs=None, x_ * 'average_alpha': numpy.ndarray, shape (N,) Only returned if `method` is 'aspls' or 'pspline_aspls'. The `alpha` array used to fit all of the baselines for the - :meth:`.aspls` or :meth:`.pspline_aspls` methods. + :meth:`~Baseline.aspls` or :meth:`~Baseline.pspline_aspls` methods. Additional items depend on the output of the selected method. Every other key will have a list of values, with each item corresponding to a @@ -897,8 +896,8 @@ def adaptive_minmax(data, x_data=None, poly_order=None, method='modpoly', to select the appropriate polynomial orders if `poly_order` is None. Default is 2. method_kwargs : dict, optional - Additional keyword arguments to pass to :meth:`.modpoly` or - :meth:`.imodpoly`. These include `tol`, `max_iter`, `use_original`, + Additional keyword arguments to pass to :meth:`~Baseline.modpoly` or + :meth:`~Baseline.imodpoly`. These include `tol`, `max_iter`, `use_original`, `mask_initial_peaks`, and `num_std`. Returns diff --git a/pybaselines/spline.py b/pybaselines/spline.py index 2359419..104c8f3 100644 --- a/pybaselines/spline.py +++ b/pybaselines/spline.py @@ -1084,7 +1084,7 @@ def pspline_psalsa(self, data, lam=1e3, p=0.5, k=None, num_knots=100, spline_deg values greater than the data. Should be approximately the height at which a value could be considered a peak. Default is None, which sets `k` to one-tenth of the standard deviation of the input data. A large k value - will produce similar results to :meth:`.asls`. + will produce similar results to :meth:`~Baseline.asls`. num_knots : int, optional The number of knots for the spline. Default is 100. spline_degree : int, optional @@ -1180,7 +1180,7 @@ def pspline_derpsalsa(self, data, lam=1e2, p=1e-2, k=None, num_knots=100, spline values greater than the data. Should be approximately the height at which a value could be considered a peak. Default is None, which sets `k` to one-tenth of the standard deviation of the input data. A large k value - will produce similar results to :meth:`.asls`. + will produce similar results to :meth:`~Baseline.asls`. num_knots : int, optional The number of knots for the spline. Default is 100. spline_degree : int, optional @@ -2384,7 +2384,7 @@ def pspline_psalsa(data, lam=1e3, p=0.5, k=None, num_knots=100, spline_degree=3, values greater than the data. Should be approximately the height at which a value could be considered a peak. Default is None, which sets `k` to one-tenth of the standard deviation of the input data. A large k value - will produce similar results to :meth:`.asls`. + will produce similar results to :meth:`~Baseline.asls`. num_knots : int, optional The number of knots for the spline. Default is 100. 
spline_degree : int, optional @@ -2463,7 +2463,7 @@ def pspline_derpsalsa(data, lam=1e2, p=1e-2, k=None, num_knots=100, spline_degre values greater than the data. Should be approximately the height at which a value could be considered a peak. Default is None, which sets `k` to one-tenth of the standard deviation of the input data. A large k value - will produce similar results to :meth:`.asls`. + will produce similar results to :meth:`~Baseline.asls`. num_knots : int, optional The number of knots for the spline. Default is 100. spline_degree : int, optional diff --git a/pybaselines/two_d/__init__.py b/pybaselines/two_d/__init__.py index 856044f..fe19827 100644 --- a/pybaselines/two_d/__init__.py +++ b/pybaselines/two_d/__init__.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- """ +============================================= Baseline Correction for Two Dimensional Data. ============================================= @@ -30,7 +31,6 @@ * mor (Morphological) * imor (Improved Morphological) * rolling_ball (Rolling Ball Baseline) - * mwmv (Moving Window Minimum Value) * tophat (Top-hat Transformation) * Spline methods (:mod:`pybaselines.two_d.spline`) @@ -41,19 +41,13 @@ * pspline_iasls (Penalized Spline Version of iasls) * pspline_airpls (Penalized Spline Version of airpls) * pspline_arpls (Penalized Spline Version of arpls) - * pspline_drpls (Penalized Spline Version of drpls) * pspline_iarpls (Penalized Spline Version of iarpls) - * pspline_aspls (Penalized Spline Version of aspls) * pspline_psalsa (Penalized Spline Version of psalsa) * Smoothing-based methods (:mod:`pybaselines.two_d.smooth`) * noise_median (Noise Median method) -* Baseline/Peak Classification methods (:mod:`pybaselines.two_d.classification`) - - * None yet - * Optimizers (:mod:`pybaselines.two_d.optimizers`) * collab_pls (Collaborative Penalized Least Squares) diff --git a/pybaselines/two_d/_algorithm_setup.py b/pybaselines/two_d/_algorithm_setup.py index 916b260..ee65509 100644 --- a/pybaselines/two_d/_algorithm_setup.py +++ b/pybaselines/two_d/_algorithm_setup.py @@ -39,10 +39,11 @@ class _Algorithm2D: that no polynomial fitting has been performed. pspline : PSpline2D or None The PSpline2D object for setting up and solving penalized spline algorithms. Is None - if no penalized spline setup has been performed (typically done in :meth:`._setup_spline`). + if no penalized spline setup has been performed (typically done in + :meth:`~_Algorithm2D._setup_spline`). vandermonde : numpy.ndarray or None The Vandermonde matrix for solving polynomial equations. Is None if no polynomial - setup has been performed (typically done in :meth:`._setup_polynomial`). + setup has been performed (typically done in :meth:`~_Algorithm2D._setup_polynomial`). whittaker_system : PenalizedSystem2D or None The PenalizedSystem2D object for setting up and solving Whittaker-smoothing-based algorithms. Is None if no Whittaker setup has been performed (typically done in @@ -371,7 +372,7 @@ def _setup_whittaker(self, y, lam=1, diff_order=2, weights=None, copy_weights=Fa ---------- y : numpy.ndarray, shape (N,) The y-values of the measured data, already converted to a numpy - array by :meth:`._register`. + array by :meth:`~_Algorithm2D._register`. lam : float, optional The smoothing parameter, lambda. 
Typical values are between 10 and 1e8, but it strongly depends on the penalized least square method @@ -449,7 +450,7 @@ def _setup_polynomial(self, y, weights=None, poly_order=2, calc_vander=False, ---------- y : numpy.ndarray, shape (N,) The y-values of the measured data, already converted to a numpy - array by :meth:`._register`. + array by :meth:`~_Algorithm2D._register`. weights : array-like, shape (N,), optional The weighting array. If None (default), then will be an array with size equal to N and all values set to 1. @@ -561,7 +562,7 @@ def _setup_spline(self, y, weights=None, spline_degree=3, num_knots=10, ---------- y : numpy.ndarray, shape (N,) The y-values of the measured data, already converted to a numpy - array by :meth:`._register`. + array by :meth:`~_Algorithm2D._register`. weights : array-like, shape (N,), optional The weighting array. If None (default), then will be an array with size equal to N and all values set to 1. @@ -641,7 +642,7 @@ def _setup_morphology(self, y, half_window=None, **window_kwargs): ---------- y : numpy.ndarray, shape (N,) The y-values of the measured data, already converted to a numpy - array by :meth:`._register`. + array by :meth:`~_Algorithm2D._register`. half_window : int, optional The half-window used for the morphology functions. If a value is input, then that value will be used. Default is None, which will optimize the @@ -696,7 +697,7 @@ def _setup_smooth(self, y, half_window=0, allow_zero=True, hw_multiplier=2, **pa ---------- y : numpy.ndarray, shape (N,) The y-values of the measured data, already converted to a numpy - array by :meth:`._register`. + array by :meth:`~_Algorithm2D._register`. half_window : int, optional The half-window used for the smoothing functions. Used to pad the left and right edges of the data to reduce edge @@ -734,7 +735,7 @@ def _setup_classification(self, y, weights=None): ---------- y : numpy.ndarray, shape (N,) The y-values of the measured data, already converted to a numpy - array by :meth:`._register`. + array by :meth:`~_Algorithm2D._register`. weights : array-like, shape (N,), optional The weighting array. If None (default), then will be an array with size equal to N and all values set to 1. @@ -835,7 +836,7 @@ def _setup_optimizer(self, y, method, modules, method_kwargs=None, copy_kwargs=T ---------- y : numpy.ndarray, shape (N,) The y-values of the measured data, already converted to a numpy - array by :meth:`._register`. + array by :meth:`~_Algorithm2D._register`. method : str The string name of the desired function, like 'asls'. Case does not matter. modules : Sequence(module, ...) @@ -890,7 +891,7 @@ def _setup_misc(self, y): ---------- y : numpy.ndarray, shape (N,) The y-values of the measured data, already converted to a numpy - array by :meth:`._register`. + array by :meth:`~_Algorithm2D._register`. Returns ------- diff --git a/pybaselines/two_d/_spline_utils.py b/pybaselines/two_d/_spline_utils.py index f1816e8..459f88b 100644 --- a/pybaselines/two_d/_spline_utils.py +++ b/pybaselines/two_d/_spline_utils.py @@ -36,7 +36,7 @@ class PSpline2D(PenalizedSystem2D): in `z`, and `Q` is the number of basis functions (equal to ``K - spline_degree - 1`` or equivalently ``num_knots[1] + spline_degree[1] - 1``). coef : None or numpy.ndarray, shape (M,) - The spline coefficients. Is None if :meth:`.solve_pspline` has not been called + The spline coefficients. Is None if :meth:`~PSpline2D.solve_pspline` has not been called at least once. knots_x : numpy.ndarray, shape (K,) The knots for the spline. 
Has a shape of `K`, which is equal to diff --git a/pybaselines/two_d/_whittaker_utils.py index 0438e8a..64c03a1 100644 --- a/pybaselines/two_d/_whittaker_utils.py +++ b/pybaselines/two_d/_whittaker_utils.py @@ -85,11 +85,12 @@ class PenalizedSystem2D: Maintained so that repeated computations with different `lam` values can be quickly set up. `original_diagonals` can be either the full or lower bands of the penalty, and may be reversed, depending on the setup. Reset by calling - :meth:`.reset_diagonals`. + :meth:`~PenalizedSystem2D.reset_diagonals`. penalty : scipy.sparse.base.spmatrix The current penalty. Originally is `original_diagonals` after multiplying by `lam` - and applying padding, but can also be changed by calling :meth:`.add_penalty`. - Reset by calling :meth:`.reset_diagonals`. + and applying padding, but can also be changed by calling + :meth:`~PenalizedSystem2D.add_penalty`. Reset by calling + :meth:`~PenalizedSystem2D.reset_diagonals`. Notes ----- diff --git a/pybaselines/two_d/classification.py deleted file mode 100644 index ad45109..0000000 --- a/pybaselines/two_d/classification.py +++ /dev/null @@ -1,13 +0,0 @@ -# -*- coding: utf-8 -*- -"""Techniques that rely on classifying peak and/or baseline segments for fitting baselines. - -Created on January 14, 2024 -@author: Donald Erb - -""" - -from ._algorithm_setup import _Algorithm2D - - -class _Classification(_Algorithm2D): - """A base class for all classification algorithms.""" diff --git a/pybaselines/two_d/optimizers.py index 43b1330..9d7aa5e 100644 --- a/pybaselines/two_d/optimizers.py +++ b/pybaselines/two_d/optimizers.py @@ -13,7 +13,7 @@ import numpy as np -from . import classification, morphological, polynomial, spline, whittaker +from . import morphological, polynomial, spline, whittaker from ._algorithm_setup import _Algorithm2D from .._validation import _check_optional_array, _get_row_col_values from ..utils import _check_scalar, _sort_array2d @@ -78,7 +78,7 @@ def collab_pls(self, data, average_dataset=True, method='asls', method_kwargs=No """ dataset, baseline_func, _, method_kws, _ = self._setup_optimizer( - data, method, (whittaker, morphological, classification, spline), method_kwargs, + data, method, (whittaker, morphological, spline), method_kwargs, True ) data_shape = dataset.shape @@ -151,7 +151,7 @@ def adaptive_minmax(self, data, poly_order=None, method='modpoly', weights=None, The two polynomial orders to use for fitting. If a single integer is given, then will use the input value and one plus the input value. Default is None, which will do a preliminary fit using a polynomial of order `estimation_poly_order` - and then select the appropriate polynomial orders according to [7]_. + and then select the appropriate polynomial orders according to [32]_. method : {'modpoly', 'imodpoly'}, optional The method to use for fitting each polynomial. Default is 'modpoly'. weights : array-like, shape (N,), optional @@ -174,8 +174,7 @@ def adaptive_minmax(self, data, poly_order=None, method='modpoly', weights=None, Default is 2. method_kwargs : dict, optional Additional keyword arguments to pass to - :meth:`~pybaselines.polynomial.Polynomial.modpoly` or - :meth:`~pybaselines.polynomial.Polynomial.imodpoly`. These include + :meth:`~Baseline.modpoly` or :meth:`~Baseline.imodpoly`. These include `tol`, `max_iter`, `use_original`, `mask_initial_peaks`, and `num_std`.
Returns @@ -194,7 +193,7 @@ def adaptive_minmax(self, data, poly_order=None, method='modpoly', weights=None, References ---------- - .. [7] Cao, A., et al. A robust method for automated background subtraction + .. [32] Cao, A., et al. A robust method for automated background subtraction of tissue fluorescence. Journal of Raman Spectroscopy, 2007, 38, 1199-1205. diff --git a/pybaselines/two_d/polynomial.py b/pybaselines/two_d/polynomial.py index c999fa7..5b9319b 100644 --- a/pybaselines/two_d/polynomial.py +++ b/pybaselines/two_d/polynomial.py @@ -171,11 +171,11 @@ def modpoly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, size equal to N and all values set to 1. use_original : bool, optional If False (default), will compare the baseline of each iteration with - the y-values of that iteration [8]_ when choosing minimum values. If True, - will compare the baseline with the original y-values given by `data` [9]_. + the y-values of that iteration [33]_ when choosing minimum values. If True, + will compare the baseline with the original y-values given by `data` [34]_. mask_initial_peaks : bool, optional If True, will mask any data where the initial baseline fit + the standard - deviation of the residual is less than measured data [10]_. Default is False. + deviation of the residual is less than measured data [35]_. Default is False. return_coef : bool, optional If True, will convert the polynomial coefficients for the fit baseline to a form that fits the input x_data and return them in the params dictionary. @@ -206,17 +206,17 @@ def modpoly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, Notes ----- - Algorithm originally developed in [9]_ and then slightly modified in [8]_. + Algorithm originally developed in [34]_ and then slightly modified in [33]_. References ---------- - .. [8] Gan, F., et al. Baseline correction by improved iterative polynomial + .. [33] Gan, F., et al. Baseline correction by improved iterative polynomial fitting with automatic threshold. Chemometrics and Intelligent Laboratory Systems, 2006, 82, 59-65. - .. [9] Lieber, C., et al. Automated method for subtraction of fluorescence + .. [34] Lieber, C., et al. Automated method for subtraction of fluorescence from biological raman spectra. Applied Spectroscopy, 2003, 57(11), 1363-1367. - .. [10] Zhao, J., et al. Automated Autofluorescence Background Subtraction + .. [35] Zhao, J., et al. Automated Autofluorescence Background Subtraction Algorithm for Biomedical Raman Spectroscopy, Applied Spectroscopy, 2007, 61(11), 1225-1232. @@ -280,11 +280,11 @@ def imodpoly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, size equal to N and all values set to 1. use_original : bool, optional If False (default), will compare the baseline of each iteration with - the y-values of that iteration [11]_ when choosing minimum values. If True, - will compare the baseline with the original y-values given by `data` [12]_. + the y-values of that iteration [36]_ when choosing minimum values. If True, + will compare the baseline with the original y-values given by `data` [37]_. mask_initial_peaks : bool, optional If True (default), will mask any data where the initial baseline fit + - the standard deviation of the residual is less than measured data [13]_. + the standard deviation of the residual is less than measured data [38]_. 
return_coef : bool, optional If True, will convert the polynomial coefficients for the fit baseline to a form that fits the input x_data and return them in the params dictionary. @@ -323,17 +323,17 @@ def imodpoly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, Notes ----- - Algorithm originally developed in [13]_. + Algorithm originally developed in [38]_. References ---------- - .. [11] Gan, F., et al. Baseline correction by improved iterative polynomial + .. [36] Gan, F., et al. Baseline correction by improved iterative polynomial fitting with automatic threshold. Chemometrics and Intelligent Laboratory Systems, 2006, 82, 59-65. - .. [12] Lieber, C., et al. Automated method for subtraction of fluorescence + .. [37] Lieber, C., et al. Automated method for subtraction of fluorescence from biological raman spectra. Applied Spectroscopy, 2003, 57(11), 1363-1367. - .. [13] Zhao, J., et al. Automated Autofluorescence Background Subtraction + .. [38] Zhao, J., et al. Automated Autofluorescence Background Subtraction Algorithm for Biomedical Raman Spectroscopy, Applied Spectroscopy, 2007, 61(11), 1225-1232. @@ -412,12 +412,12 @@ def penalized_poly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=Non 'symmetric' for symmetric loss. Default is 'asymmetric_truncated_quadratic'. Available methods, and their associated reference, are: - * 'asymmetric_truncated_quadratic'[14]_ - * 'symmetric_truncated_quadratic'[14]_ - * 'asymmetric_huber'[14]_ - * 'symmetric_huber'[14]_ - * 'asymmetric_indec'[15]_ - * 'symmetric_indec'[15]_ + * 'asymmetric_truncated_quadratic'[39]_ + * 'symmetric_truncated_quadratic'[39]_ + * 'asymmetric_huber'[39]_ + * 'symmetric_huber'[39]_ + * 'asymmetric_indec'[40]_ + * 'symmetric_indec'[40]_ threshold : float, optional The threshold value for the loss method, where the function goes from @@ -469,10 +469,10 @@ def penalized_poly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=Non References ---------- - .. [14] Mazet, V., et al. Background removal from spectra by designing and + .. [39] Mazet, V., et al. Background removal from spectra by designing and minimising a non-quadratic cost function. Chemometrics and Intelligent Laboratory Systems, 2005, 76(2), 121-133. - .. [15] Liu, J., et al. Goldindec: A Novel Algorithm for Raman Spectrum Baseline + .. [40] Liu, J., et al. Goldindec: A Novel Algorithm for Raman Spectrum Baseline Correction. Applied Spectroscopy, 2015, 69(7), 834-842. """ @@ -582,17 +582,17 @@ def quant_reg(self, data, poly_order=2, quantile=0.05, tol=1e-6, max_iter=250, Notes ----- - Application of quantile regression for baseline fitting ss described in [23]_. + Application of quantile regression for baseline fitting is described in [41]_. Performs quantile regression using iteratively reweighted least squares (IRLS) - as described in [24]_. + as described in [42]_. References ---------- - .. [23] Komsta, Ł. Comparison of Several Methods of Chromatographic + .. [41] Komsta, Ł. Comparison of Several Methods of Chromatographic Baseline Removal with a New Approach Based on Quantile Regression. Chromatographia, 2011, 73, 721-731. - .. [24] Schnabel, S., et al. Simultaneous estimation of quantile curves using + .. [42] Schnabel, S., et al. Simultaneous estimation of quantile curves using quantile sheets. AStA Advances in Statistical Analysis, 2013, 97, 77-87. """ @@ -660,9 +660,9 @@ def goldindec(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, ('a' or 'asymmetric') is optional (e.g. 'indec' and 'a_indec' are the same).
Default is 'asymmetric_indec'. Available methods, and their associated reference, are: - * 'asymmetric_indec'[25]_ - * 'asymmetric_truncated_quadratic'[26]_ - * 'asymmetric_huber'[26]_ + * 'asymmetric_indec'[43]_ + * 'asymmetric_truncated_quadratic'[44]_ + * 'asymmetric_huber'[44]_ peak_ratio : float, optional A value between 0 and 1 that designates how many points in the data belong @@ -723,9 +723,9 @@ def goldindec(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, References ---------- - .. [25] Liu, J., et al. Goldindec: A Novel Algorithm for Raman Spectrum Baseline + .. [43] Liu, J., et al. Goldindec: A Novel Algorithm for Raman Spectrum Baseline Correction. Applied Spectroscopy, 2015, 69(7), 834-842. - .. [26] Mazet, V., et al. Background removal from spectra by designing and + .. [44] Mazet, V., et al. Background removal from spectra by designing and minimising a non-quadratic cost function. Chemometrics and Intelligent Laboratory Systems, 2005, 76(2), 121-133. diff --git a/pybaselines/two_d/spline.py b/pybaselines/two_d/spline.py index 6fde768..249e6a7 100644 --- a/pybaselines/two_d/spline.py +++ b/pybaselines/two_d/spline.py @@ -772,7 +772,7 @@ def pspline_psalsa(self, data, lam=1e3, p=0.5, k=None, num_knots=25, spline_degr values greater than the data. Should be approximately the height at which a value could be considered a peak. Default is None, which sets `k` to one-tenth of the standard deviation of the input data. A large k value - will produce similar results to :meth:`.asls`. + will produce similar results to :meth:`~Baseline2D.asls`. num_knots : int, optional The number of knots for the spline. Default is 25. spline_degree : int, optional diff --git a/pybaselines/two_d/whittaker.py b/pybaselines/two_d/whittaker.py index ec7d877..a5fc65f 100644 --- a/pybaselines/two_d/whittaker.py +++ b/pybaselines/two_d/whittaker.py @@ -669,7 +669,7 @@ def psalsa(self, data, lam=1e5, p=0.5, k=None, diff_order=2, max_iter=50, tol=1e values greater than the data. Should be approximately the height at which a value could be considered a peak. Default is None, which sets `k` to one-tenth of the standard deviation of the input data. A large k value - will produce similar results to :meth:`.asls`. + will produce similar results to :meth:`~Baseline2D.asls`. diff_order : int, optional The order of the differential matrix. Must be greater than 0. Default is 2 (second order differential matrix). Typical values are 2 or 1. diff --git a/pybaselines/whittaker.py b/pybaselines/whittaker.py index e791ccf..d047ff9 100644 --- a/pybaselines/whittaker.py +++ b/pybaselines/whittaker.py @@ -664,7 +664,7 @@ def psalsa(self, data, lam=1e5, p=0.5, k=None, diff_order=2, max_iter=50, tol=1e values greater than the data. Should be approximately the height at which a value could be considered a peak. Default is None, which sets `k` to one-tenth of the standard deviation of the input data. A large k value - will produce similar results to :meth:`.asls`. + will produce similar results to :meth:`~Baseline.asls`. diff_order : int, optional The order of the differential matrix. Must be greater than 0. Default is 2 (second order differential matrix). Typical values are 2 or 1. @@ -755,7 +755,7 @@ def derpsalsa(self, data, lam=1e6, p=0.01, k=None, diff_order=2, max_iter=50, to values greater than the data. Should be approximately the height at which a value could be considered a peak. Default is None, which sets `k` to one-tenth of the standard deviation of the input data. 
A large k value - will produce similar results to :meth:`.asls`. + will produce similar results to :meth:`~Baseline.asls`. diff_order : int, optional The order of the differential matrix. Must be greater than 0. Default is 2 (second order differential matrix). Typical values are 2 or 1. @@ -1282,7 +1282,7 @@ def psalsa(data, lam=1e5, p=0.5, k=None, diff_order=2, max_iter=50, tol=1e-3, values greater than the data. Should be approximately the height at which a value could be considered a peak. Default is None, which sets `k` to one-tenth of the standard deviation of the input data. A large k value - will produce similar results to :meth:`.asls`. + will produce similar results to :meth:`~Baseline.asls`. diff_order : int, optional The order of the differential matrix. Must be greater than 0. Default is 2 (second order differential matrix). Typical values are 2 or 1. @@ -1356,7 +1356,7 @@ def derpsalsa(data, lam=1e6, p=0.01, k=None, diff_order=2, max_iter=50, tol=1e-3 values greater than the data. Should be approximately the height at which a value could be considered a peak. Default is None, which sets `k` to one-tenth of the standard deviation of the input data. A large k value - will produce similar results to :meth:`.asls`. + will produce similar results to :meth:`~Baseline.asls`. diff_order : int, optional The order of the differential matrix. Must be greater than 0. Default is 2 (second order differential matrix). Typical values are 2 or 1. From ca2f74ff161b2b91684b602dda2f581eb39b8b8a Mon Sep 17 00:00:00 2001 From: Donnie Erb <55961724+derb12@users.noreply.github.com> Date: Mon, 29 Jan 2024 19:13:21 -0500 Subject: [PATCH 34/56] MAINT: Allow skipping sorting inputs and outputs for 2D Same as the 1D version: optimizers no longer perform two unnecessary sorts. --- pybaselines/two_d/_algorithm_setup.py | 38 +++++-- pybaselines/two_d/optimizers.py | 8 +- tests/conftest.py | 2 +- tests/two_d/test_algorithm_setup.py | 158 ++++++++++++++++++++++---- tests/two_d/test_optimizers.py | 16 +-- 5 files changed, 171 insertions(+), 51 deletions(-) diff --git a/pybaselines/two_d/_algorithm_setup.py b/pybaselines/two_d/_algorithm_setup.py index ee65509..4503391 100644 --- a/pybaselines/two_d/_algorithm_setup.py +++ b/pybaselines/two_d/_algorithm_setup.py @@ -141,7 +141,7 @@ def __init__(self, x_data=None, z_data=None, check_finite=True, assume_sorted=Fa self._dtype = output_dtype def _return_results(self, baseline, params, dtype, sort_keys=(), ensure_2d=False, - reshape_baseline=False, reshape_keys=()): + reshape_baseline=False, reshape_keys=(), skip_sorting=False): """ Re-orders the input baseline and parameters based on the x ordering. @@ -158,6 +158,20 @@ def _return_results(self, baseline, params, dtype, sort_keys=(), ensure_2d=False sort_keys : Iterable, optional An iterable of keys corresponding to the values in `params` that need re-ordering. Default is (). + ensure_2d : bool, optional + If True, will raise an error if the shape of `array` is not a two dimensional + array with shape (M, N) or a three dimensional array with shape (M, N, 1), (M, 1, N), + or (1, M, N). Default is False. + reshape_baseline : bool, optional + If True, will reshape the output baseline back into the shape of the input data. If + False (default), will not modify the output baseline shape. + reshape_keys : tuple, optional + The keys within the output parameter dictionary that will need reshaped to match the + shape of the data. For example, used to convert weights for polynomials from 1D back + into the original shape. Default is ().
+ skip_sorting : bool, optional + If True, will skip sorting the output baseline. The keys in `sort_keys` will + still be sorted. Default is False. Returns ------- @@ -183,14 +197,15 @@ def _return_results(self, baseline, params, dtype, sort_keys=(), ensure_2d=False # assumes params all all two dimensional arrays params[key] = params[key][self._inverted_order] - baseline = _sort_array2d(baseline, sort_order=self._inverted_order) + if not skip_sorting: + baseline = _sort_array2d(baseline, sort_order=self._inverted_order) baseline = baseline.astype(dtype, copy=False) return baseline, params @classmethod def _register(cls, func=None, *, sort_keys=(), dtype=None, order=None, ensure_2d=True, - reshape_baseline=False, reshape_keys=()): + reshape_baseline=False, reshape_keys=(), skip_sorting=False): """ Wraps a baseline function to validate inputs and correct outputs. @@ -221,6 +236,9 @@ def _register(cls, func=None, *, sort_keys=(), dtype=None, order=None, ensure_2d The keys within the output parameter dictionary that will need reshaped to match the shape of the data. For example, used to convert weights for polynomials from 1D back into the original shape. Default is (). + skip_sorting : bool, optional + If True, will skip sorting the output baseline. The keys in `sort_keys` will + still be sorted. Default is False. Returns ------- @@ -233,7 +251,8 @@ def _register(cls, func=None, *, sort_keys=(), dtype=None, order=None, ensure_2d if func is None: return partial( cls._register, sort_keys=sort_keys, dtype=dtype, order=order, ensure_2d=ensure_2d, - reshape_baseline=reshape_baseline, reshape_keys=reshape_keys + reshape_baseline=reshape_baseline, reshape_keys=reshape_keys, + skip_sorting=skip_sorting ) @wraps(func) @@ -283,7 +302,8 @@ def inner(self, data=None, *args, **kwargs): self._len[1] = y.shape[-1] self.z = np.linspace(-1, 1, self._len[1]) - y = _sort_array2d(y, sort_order=self._sort_order) + if not skip_sorting: + y = _sort_array2d(y, sort_order=self._sort_order) if self._dtype is None: output_dtype = y.dtype else: @@ -297,7 +317,8 @@ def inner(self, data=None, *args, **kwargs): return self._return_results( baseline, params, dtype=output_dtype, sort_keys=sort_keys, ensure_2d=ensure_2d, - reshape_baseline=reshape_baseline, reshape_keys=reshape_keys + reshape_baseline=reshape_baseline, reshape_keys=reshape_keys, + skip_sorting=skip_sorting ) return inner @@ -878,10 +899,7 @@ def _setup_optimizer(self, y, method, modules, method_kwargs=None, copy_kwargs=T else: method_kws = method_kwargs - return ( - _sort_array2d(y, self._inverted_order), baseline_func, func_module, method_kws, - class_object - ) + return y, baseline_func, func_module, method_kws, class_object def _setup_misc(self, y): """ diff --git a/pybaselines/two_d/optimizers.py b/pybaselines/two_d/optimizers.py index 9d7aa5e..4af2973 100644 --- a/pybaselines/two_d/optimizers.py +++ b/pybaselines/two_d/optimizers.py @@ -22,7 +22,7 @@ class _Optimizers(_Algorithm2D): """A base class for all optimizer algorithms.""" - @_Algorithm2D._register(ensure_2d=False) + @_Algorithm2D._register(ensure_2d=False, skip_sorting=True) def collab_pls(self, data, average_dataset=True, method='asls', method_kwargs=None): """ Collaborative Penalized Least Squares (collab-PLS). 
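To see why optimizers can skip the wrapper's sort, note that the registered method an optimizer delegates to already sorts its input and un-sorts its output, so another pass in the outer wrapper would only shuffle data that is already in the correct order. A minimal sketch of that redundancy, using a made-up _sort2d helper in place of _sort_array2d and a reversal ordering in the same broadcastable form the class builds:

import numpy as np

def _sort2d(arr, sort_order):
    # hypothetical stand-in for _sort_array2d: reorders the last two axes
    return arr[sort_order]

rng = np.random.default_rng(0)
y = rng.random((4, 5))
# a reversal ordering in the (rows[:, None], cols[None, :]) broadcastable form
sort_order = (np.arange(4)[::-1][:, None], np.arange(5)[::-1][None, :])

once = _sort2d(y, sort_order)      # sort already performed by the inner method
twice = _sort2d(once, sort_order)  # a second, redundant sort in the wrapper
assert np.array_equal(twice, y)    # the two passes cancel, so one is wasted work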
@@ -131,9 +131,9 @@ def collab_pls(self, data, average_dataset=True, method='asls', method_kwargs=No else: params[key] = [value] - return _sort_array2d(baselines, self._sort_order), params + return baselines, params - @_Algorithm2D._register + @_Algorithm2D._register(skip_sorting=True) def adaptive_minmax(self, data, poly_order=None, method='modpoly', weights=None, constrained_fraction=0.01, constrained_weight=1e5, estimation_poly_order=2, method_kwargs=None): @@ -264,7 +264,7 @@ def adaptive_minmax(self, data, poly_order=None, method='modpoly', weights=None, 'poly_order': poly_orders } - return _sort_array2d(np.maximum.reduce(baselines), self._sort_order), params + return np.maximum.reduce(baselines), params def _determine_polyorders(y, poly_order, weights, fit_function, **fit_kwargs): diff --git a/tests/conftest.py b/tests/conftest.py index 16f84f0..efec38a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -716,7 +716,7 @@ def test_xz_ordering(self, assertion_kwargs=None, **kwargs): def reverse_array(self, array): """Reverses the input along the last two dimensions.""" - return array[..., ::-1, ::-1] + return np.asarray(array)[..., ::-1, ::-1] class BasePolyTester2D(BaseTester2D): diff --git a/tests/two_d/test_algorithm_setup.py b/tests/two_d/test_algorithm_setup.py index 4cea347..5262862 100644 --- a/tests/two_d/test_algorithm_setup.py +++ b/tests/two_d/test_algorithm_setup.py @@ -548,7 +548,7 @@ def test_algorithm_return_results(assume_sorted, output_dtype, change_order, res ensure_2d=not three_d ) - assert_allclose(output, expected_baseline, 1e-16, 1e-16) + assert_allclose(output, expected_baseline, 1e-14, 1e-14) assert output.dtype == output_dtype for key, value in expected_params.items(): assert_array_equal(value, output_params[key]) @@ -557,8 +557,9 @@ def test_algorithm_return_results(assume_sorted, output_dtype, change_order, res @pytest.mark.parametrize('assume_sorted', (True, False)) @pytest.mark.parametrize('output_dtype', (None, int, float, np.float64)) @pytest.mark.parametrize('change_order', (True, False)) +@pytest.mark.parametrize('skip_sorting', (True, False)) @pytest.mark.parametrize('list_input', (True, False)) -def test_algorithm_register(assume_sorted, output_dtype, change_order, list_input): +def test_algorithm_register(assume_sorted, output_dtype, change_order, skip_sorting, list_input): """ Ensures the _register wrapper method returns the correctly sorted and shaped outputs. 
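The adaptive_minmax return value above combines the candidate baselines with numpy.maximum.reduce, that is, an element-wise maximum across the stacked fits. A small self-contained illustration with made-up numbers:

import numpy as np

# two candidate baselines for the same (2, 3) data, stacked along axis 0
candidates = np.array([
    [[1.0, 2.0, 3.0],
     [0.0, 1.0, 2.0]],
    [[2.0, 1.0, 1.0],
     [3.0, 0.0, 0.0]],
])
combined = np.maximum.reduce(candidates)  # element-wise maximum over axis 0
# combined -> [[2., 2., 3.],
#              [3., 1., 2.]]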
@@ -576,16 +577,22 @@ class SubClass(_algorithm_setup._Algorithm2D): @_algorithm_setup._Algorithm2D._register(sort_keys=('a', 'd'), reshape_keys=('c', 'd')) def func(self, data, *args, **kwargs): """For checking sorting and reshaping output parameters.""" - expected_input = y.copy() - if change_order and not assume_sorted: - expected_input = np.asarray(expected_input)[::-1, ::-1] + expected_x, expected_z, expected_y = get_data2d() + if change_order and assume_sorted: + expected_y = expected_y[::-1, ::-1] + expected_x = expected_x[::-1] + expected_z = expected_z[::-1] assert isinstance(data, np.ndarray) - assert_allclose(data, expected_input, 1e-16, 1e-16) + assert_allclose(data, expected_y, 1e-14, 1e-14) + assert isinstance(self.x, np.ndarray) + assert_allclose(self.x, expected_x, 1e-14, 1e-14) + assert isinstance(self.z, np.ndarray) + assert_allclose(self.z, expected_z, 1e-14, 1e-14) params = { 'a': np.arange(data.size).reshape(data.shape), - 'b': np.arange(len(x)), + 'b': np.arange(len(self.x)), 'c': np.arange(data.size), 'd': np.arange(data.size) } @@ -594,27 +601,67 @@ def func(self, data, *args, **kwargs): @_algorithm_setup._Algorithm2D._register(reshape_baseline=True) def func2(self, data, *args, **kwargs): """For checking reshaping output baseline.""" - expected_input = y.copy() - if change_order and not assume_sorted: - expected_input = np.asarray(expected_input)[::-1, ::-1] + expected_x, expected_z, expected_y = get_data2d() + if change_order and assume_sorted: + expected_y = expected_y[::-1, ::-1] + expected_x = expected_x[::-1] + expected_z = expected_z[::-1] assert isinstance(data, np.ndarray) - assert_allclose(data, expected_input, 1e-16, 1e-16) + assert_allclose(data, expected_y, 1e-14, 1e-14) + assert isinstance(self.x, np.ndarray) + assert_allclose(self.x, expected_x, 1e-14, 1e-14) + assert isinstance(self.z, np.ndarray) + assert_allclose(self.z, expected_z, 1e-14, 1e-14) return 1 * data.flatten(), {} @_algorithm_setup._Algorithm2D._register def func3(self, data, *args, **kwargs): """For checking empty decorator.""" - expected_input = y.copy() - if change_order and not assume_sorted: - expected_input = np.asarray(expected_input)[::-1, ::-1] + expected_x, expected_z, expected_y = get_data2d() + if change_order and assume_sorted: + expected_y = expected_y[::-1, ::-1] + expected_x = expected_x[::-1] + expected_z = expected_z[::-1] assert isinstance(data, np.ndarray) - assert_allclose(data, expected_input, 1e-16, 1e-16) + assert_allclose(data, expected_y, 1e-14, 1e-14) + assert isinstance(self.x, np.ndarray) + assert_allclose(self.x, expected_x, 1e-14, 1e-14) + assert isinstance(self.z, np.ndarray) + assert_allclose(self.z, expected_z, 1e-14, 1e-14) return 1 * data, {} + @_algorithm_setup._Algorithm2D._register( + sort_keys=('a', 'd'), reshape_keys=('c', 'd'), skip_sorting=skip_sorting + ) + def func4(self, data, *args, **kwargs): + """For checking skip_sorting key.""" + expected_x, expected_z, expected_y = get_data2d() + if change_order and (assume_sorted or skip_sorting): + expected_y = expected_y[::-1, ::-1] + if change_order and assume_sorted: + expected_x = expected_x[::-1] + expected_z = expected_z[::-1] + + assert isinstance(data, np.ndarray) + assert_allclose(data, expected_y, 1e-14, 1e-14) + assert isinstance(self.x, np.ndarray) + assert_allclose(self.x, expected_x, 1e-14, 1e-14) + assert isinstance(self.z, np.ndarray) + assert_allclose(self.z, expected_z, 1e-14, 1e-14) + + params = { + 'a': np.arange(data.size).reshape(data.shape), + 'b': np.arange(len(self.x)), + 
'c': np.arange(data.size), + 'd': np.arange(data.size) + } + + return 1 * data, params + if change_order: x = x[::-1] z = z[::-1] @@ -626,6 +673,10 @@ def func3(self, data, *args, **kwargs): 'd': np.arange(y.size).reshape(y.shape), } expected_baseline = (1 * y).astype(output_dtype) + if output_dtype is None: + expected_dtype = y.dtype + else: + expected_dtype = expected_baseline.dtype if list_input: x = x.tolist() z = z.tolist() @@ -644,21 +695,28 @@ def func3(self, data, *args, **kwargs): # baseline should always match y-order on the output; only sorted within the # function - assert_allclose(output, expected_baseline, 1e-16, 1e-16) + assert_allclose(output, expected_baseline, 1e-14, 1e-14) assert isinstance(output, np.ndarray) - assert output.dtype == output_dtype + assert output.dtype == expected_dtype for key, value in expected_params.items(): assert_array_equal(value, output_params[key], err_msg=f'{key} failed') output2, _ = algorithm.func2(y) - assert_allclose(output2, expected_baseline, 1e-16, 1e-16) + assert_allclose(output2, expected_baseline, 1e-14, 1e-14) assert isinstance(output2, np.ndarray) - assert output2.dtype == output_dtype + assert output2.dtype == expected_dtype output3, _ = algorithm.func3(y) - assert_allclose(output3, expected_baseline, 1e-16, 1e-16) + assert_allclose(output3, expected_baseline, 1e-14, 1e-14) assert isinstance(output3, np.ndarray) - assert output3.dtype == output_dtype + assert output3.dtype == expected_dtype + + output4, output_params4 = algorithm.func4(y) + assert_allclose(output4, expected_baseline, 1e-14, 1e-14) + assert isinstance(output4, np.ndarray) + assert output4.dtype == expected_dtype + for key, value in expected_params.items(): + assert_array_equal(value, output_params4[key], err_msg=f'{key} failed') def test_override_x(algorithm): @@ -707,6 +765,64 @@ def test_get_function_fails_no_module(algorithm): algorithm._get_function('collab_pls', []) +def test_get_function_sorting_x(): + """Ensures the sort order is correct for the output class object when x is reversed.""" + num_points = 10 + x = np.arange(num_points) + ordering = np.arange(num_points) + algorithm = _algorithm_setup._Algorithm2D(x[::-1], assume_sorted=False) + func, func_module, class_object = algorithm._get_function('asls', [whittaker]) + + assert_array_equal(class_object.x, x) + assert_array_equal(class_object._sort_order, ordering[::-1]) + assert_array_equal(class_object._inverted_order, ordering[::-1]) + assert_array_equal(class_object._sort_order, algorithm._sort_order) + assert_array_equal(class_object._inverted_order, algorithm._inverted_order) + + +def test_get_function_sorting_z(): + """Ensures the sort order is correct for the output class object when z is reversed.""" + num_points = 10 + z = np.arange(num_points) + ordering = np.arange(num_points) + algorithm = _algorithm_setup._Algorithm2D(None, z[::-1], assume_sorted=False) + func, func_module, class_object = algorithm._get_function('asls', [whittaker]) + + assert_array_equal(class_object.z, z) + assert class_object._sort_order[0] is Ellipsis + assert class_object._inverted_order[0] is Ellipsis + assert algorithm._sort_order[0] is Ellipsis + assert algorithm._inverted_order[0] is Ellipsis + assert_array_equal(class_object._sort_order[1], ordering[::-1]) + assert_array_equal(class_object._inverted_order[1], ordering[::-1]) + assert_array_equal(class_object._sort_order[1], algorithm._sort_order[1]) + assert_array_equal(class_object._inverted_order[1], algorithm._inverted_order[1]) + + +def 
test_get_function_sorting_xz(): + """Ensures the sort order is correct for the output class object when x and z are reversed.""" + num_x_points = 10 + num_z_points = 11 + x = np.arange(num_x_points) + x_ordering = np.arange(num_x_points) + z = np.arange(num_z_points) + z_ordering = np.arange(num_z_points) + + algorithm = _algorithm_setup._Algorithm2D(x[::-1], z[::-1], assume_sorted=False) + func, func_module, class_object = algorithm._get_function('asls', [whittaker]) + + assert_array_equal(class_object.x, x) + assert_array_equal(class_object.z, z) + assert_array_equal(class_object._sort_order[0], x_ordering[::-1][:, None]) + assert_array_equal(class_object._sort_order[1], z_ordering[::-1][None, :]) + assert_array_equal(class_object._inverted_order[0], x_ordering[::-1][:, None]) + assert_array_equal(class_object._inverted_order[1], z_ordering[::-1][None, :]) + assert_array_equal(class_object._sort_order[0], algorithm._sort_order[0]) + assert_array_equal(class_object._sort_order[1], algorithm._sort_order[1]) + assert_array_equal(class_object._inverted_order[0], algorithm._inverted_order[0]) + assert_array_equal(class_object._inverted_order[1], algorithm._inverted_order[1]) + + @pytest.mark.parametrize('method_kwargs', (None, {'a': 2})) def test_setup_optimizer(small_data2d, algorithm, method_kwargs): """Ensures output of _setup_optimizer is correct.""" diff --git a/tests/two_d/test_optimizers.py b/tests/two_d/test_optimizers.py index 2a03da8..733c1d6 100644 --- a/tests/two_d/test_optimizers.py +++ b/tests/two_d/test_optimizers.py @@ -56,8 +56,6 @@ def test_input_weights(self, assertion_kwargs=None, **kwargs): regular_output, self.reverse_array(reverse_output), **assertion_kwargs ) - return regular_output, regular_output_params, reverse_output, reverse_output_params - class OptimizersTester(BaseTester2D): """Base testing class for optimizer functions.""" @@ -73,7 +71,7 @@ class TestCollabPLS(OptimizersTester, OptimizerInputWeightsMixin): # will need to change checked_keys if default method is changed checked_keys = ('average_weights', 'weights', 'tol_history') three_d = True - weight_keys = ('average_weights',) + weight_keys = ('average_weights', 'weights') @pytest.mark.parametrize( 'method', @@ -98,18 +96,6 @@ def test_single_dataset_fails(self): with pytest.raises(ValueError, match='the input data must'): self.class_func(np.arange(self.y[0].size).reshape(self.y.shape[-2:])) - @pytest.mark.parametrize('average_dataset', (True, False)) - def test_input_weights(self, average_dataset): - """Ensures the input weights are sorted correctly.""" - output = super().test_input_weights(average_dataset=average_dataset) - regular_output, regular_output_params, reverse_output, reverse_output_params = output - - assert_allclose( - regular_output_params['weights'], - self.reverse_array(np.asarray(reverse_output_params['weights'])), - rtol=1e-12, atol=1e-14 - ) - @pytest.mark.parametrize( 'baseline_ptp', (0.01, 0.1, 0.3, 0.5, 1, 5, 10, 40, 100, 200, 300, 500, 600, 1000) From 4c264fd907323e6a9d7e23aded2098204c76f7ce Mon Sep 17 00:00:00 2001 From: Donnie Erb <55961724+derb12@users.noreply.github.com> Date: Thu, 1 Feb 2024 17:17:55 -0500 Subject: [PATCH 35/56] MAINT: Handle array-like smooth half window for 2d rolling_ball Also fixed the 1d docstring for rolling_ball which mentioned using array-like half windows, which was removed several versions ago. 
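A usage sketch of what this commit enables, assuming the Baseline2D interface documented elsewhere in this series (the data is synthetic and all window values are arbitrary):

import numpy as np
from pybaselines import Baseline2D

x = np.linspace(0, 10, 60)
z = np.linspace(0, 8, 50)
# synthetic surface: a Gaussian peak on a constant offset
y = 1 + np.exp(-((x[:, None] - 5)**2 + (z[None, :] - 4)**2) / 2)

fitter = Baseline2D(x, z)
# a scalar smooth half window applies the same window to rows and columns
baseline_a, _ = fitter.rolling_ball(y, half_window=8, smooth_half_window=5)
# an array-like value sets the row and column windows independently
baseline_b, _ = fitter.rolling_ball(y, half_window=8, smooth_half_window=[5, 3])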
--- pybaselines/morphological.py | 22 ++++------------------ pybaselines/two_d/morphological.py | 16 ++++++---------- tests/two_d/test_morphological.py | 8 +++++++- 3 files changed, 17 insertions(+), 29 deletions(-) diff --git a/pybaselines/morphological.py b/pybaselines/morphological.py index c09a943..baea0a9 100644 --- a/pybaselines/morphological.py +++ b/pybaselines/morphological.py @@ -486,15 +486,8 @@ def rolling_ball(self, data, half_window=None, smooth_half_window=None, dict A dictionary with the following items: - * 'half_window': int or numpy.ndarray(int) - The half window or array of half windows used for the - morphological calculations. - - Notes - ----- - To use a changing window size for either the morphological or smoothing - operations, the half windows must be arrays. Otherwise, the size of the - rolling ball is assumed to be constant. + * 'half_window': int + The half window used for the morphological calculations. References ---------- @@ -1336,15 +1329,8 @@ def rolling_ball(data, half_window=None, smooth_half_window=None, pad_kwargs=Non dict A dictionary with the following items: - * 'half_window': int or numpy.ndarray(int) - The half window or array of half windows used for the - morphological calculations. - - Notes - ----- - To use a changing window size for either the morphological or smoothing - operations, the half windows must be arrays. Otherwise, the size of the - rolling ball is assumed to be constant. + * 'half_window': int + The half window used for the morphological calculations. References ---------- diff --git a/pybaselines/two_d/morphological.py b/pybaselines/two_d/morphological.py index e90f886..4301490 100644 --- a/pybaselines/two_d/morphological.py +++ b/pybaselines/two_d/morphological.py @@ -11,6 +11,7 @@ from ._algorithm_setup import _Algorithm2D from ..utils import relative_difference +from .._validation import _check_half_window class _Morphological(_Algorithm2D): @@ -191,15 +192,8 @@ def rolling_ball(self, data, half_window=None, smooth_half_window=None, dict A dictionary with the following items: - * 'half_window': int or numpy.ndarray(int) - The half window or array of half windows used for the - morphological calculations. - - Notes - ----- - To use a changing window size for either the morphological or smoothing - operations, the half windows must be arrays. Otherwise, the size of the - rolling ball is assumed to be constant. + * 'half_window': np.ndarray[int, int] + The half windows used for the morphological calculations. 
References ---------- @@ -213,7 +207,9 @@ def rolling_ball(self, data, half_window=None, smooth_half_window=None, """ y, half_wind = self._setup_morphology(data, half_window, **window_kwargs) if smooth_half_window is None: - smooth_half_window = half_wind # TODO need to do some verification on smooth_half_window if not None + smooth_half_window = half_wind + else: + smooth_half_window = _check_half_window(smooth_half_window, allow_zero=True, two_d=True) rough_baseline = grey_opening(y, 2 * half_wind + 1) baseline = uniform_filter( diff --git a/tests/two_d/test_morphological.py b/tests/two_d/test_morphological.py index a54fba4..0191dd2 100644 --- a/tests/two_d/test_morphological.py +++ b/tests/two_d/test_morphological.py @@ -66,13 +66,19 @@ def test_unchanged_data(self, new_instance, half_window, smooth_half_window): new_instance, half_window=half_window, smooth_half_window=smooth_half_window ) - @pytest.mark.parametrize('smooth_half_window', (None, 0, 10)) + @pytest.mark.parametrize('smooth_half_window', (None, 0, 10, [0, 0], [10, 10])) def test_smooth_half_windows(self, smooth_half_window): """Ensures smooth-half-window is correctly processed.""" output = self.class_func(self.y, smooth_half_window=smooth_half_window) assert output[0].shape == self.y.shape + @pytest.mark.parametrize('smooth_half_window', (-1, [5, -1], [-1, 5], [-2, -3])) + def test_negative_smooth_half_window_fails(self, smooth_half_window): + """Ensures a negative smooth-half-window raises an exception.""" + with pytest.raises(ValueError): + self.class_func(self.y, smooth_half_window=smooth_half_window) + class TestTophat(MorphologicalTester): """Class for testing tophat baseline.""" From 3adb73e11f44a8e4753a2b1db050f380254504ef Mon Sep 17 00:00:00 2001 From: Donnie Erb <55961724+derb12@users.noreply.github.com> Date: Thu, 1 Feb 2024 17:56:22 -0500 Subject: [PATCH 36/56] MAINT: Updated all 2D docstrings to show the correct typing --- pybaselines/_algorithm_setup.py | 2 +- pybaselines/optimizers.py | 3 +- pybaselines/spline.py | 6 - pybaselines/two_d/_algorithm_setup.py | 129 +++++++-------- pybaselines/two_d/_spline_utils.py | 41 +++-- pybaselines/two_d/_whittaker_utils.py | 53 +++--- pybaselines/two_d/api.py | 12 +- pybaselines/two_d/morphological.py | 83 +++++----- pybaselines/two_d/optimizers.py | 39 +++-- pybaselines/two_d/polynomial.py | 165 ++++++++----------- pybaselines/two_d/smooth.py | 19 +-- pybaselines/two_d/spline.py | 120 +++++++------- pybaselines/two_d/whittaker.py | 221 ++++++++++++++------------ pybaselines/utils.py | 8 +- pybaselines/whittaker.py | 4 +- 15 files changed, 432 insertions(+), 473 deletions(-) diff --git a/pybaselines/_algorithm_setup.py b/pybaselines/_algorithm_setup.py index 7efe658..88105c3 100644 --- a/pybaselines/_algorithm_setup.py +++ b/pybaselines/_algorithm_setup.py @@ -153,7 +153,7 @@ def _return_results(self, baseline, params, dtype, sort_keys=(), skip_sorting=Fa The baseline output by the baseline function. params : dict The parameter dictionary output by the baseline function. - dtype : [type] + dtype : type or numpy.dtype, optional The desired output dtype for the baseline. 
sort_keys : Iterable, optional An iterable of keys corresponding to the values in `params` that need diff --git a/pybaselines/optimizers.py b/pybaselines/optimizers.py index a11f30f..a7a535e 100644 --- a/pybaselines/optimizers.py +++ b/pybaselines/optimizers.py @@ -58,8 +58,7 @@ def collab_pls(self, data, average_dataset=True, method='asls', method_kwargs=No * 'average_alpha': numpy.ndarray, shape (N,) Only returned if `method` is 'aspls' or 'pspline_aspls'. The `alpha` array used to fit all of the baselines for the - :meth:`~pybaselines.whittaker.Whittaker.aspls` or - :meth:`~pybaselines.spline.Spline.pspline_aspls` methods. + :meth:`~Baseline.aspls` or :meth:`~Baseline.pspline_aspls` methods. Additional items depend on the output of the selected method. Every other key will have a list of values, with each item corresponding to a diff --git a/pybaselines/spline.py b/pybaselines/spline.py index 104c8f3..52f7f5a 100644 --- a/pybaselines/spline.py +++ b/pybaselines/spline.py @@ -402,9 +402,6 @@ def pspline_asls(self, data, lam=1e3, p=1e-2, num_knots=100, spline_degree=3, di weights : array-like, shape (N,), optional The weighting array. If None (default), then the initial weights will be an array with size equal to N and all values set to 1. - x_data : array-like, shape (N,), optional - The x-values of the measured data. Default is None, which will create an - array from -1 to 1 with N points. Returns ------- @@ -894,9 +891,6 @@ def pspline_iarpls(self, data, lam=1e3, num_knots=100, spline_degree=3, diff_ord weights : array-like, shape (N,), optional The weighting array. If None (default), then the initial weights will be an array with size equal to N and all values set to 1. - x_data : array-like, shape (N,), optional - The x-values of the measured data. Default is None, which will create an - array from -1 to 1 with N points. Returns ------- diff --git a/pybaselines/two_d/_algorithm_setup.py b/pybaselines/two_d/_algorithm_setup.py index 4503391..5f536a5 100644 --- a/pybaselines/two_d/_algorithm_setup.py +++ b/pybaselines/two_d/_algorithm_setup.py @@ -34,7 +34,7 @@ class _Algorithm2D: Attributes ---------- - poly_order : int + poly_order : Sequence[int, int] The last polynomial order used for a polynomial algorithm. Initially is -1, denoting that no polynomial fitting has been performed. pspline : PSpline2D or None @@ -50,14 +50,14 @@ class _Algorithm2D: :meth:`_setup_whittaker`). x : numpy.ndarray or None The x-values for the object. If initialized with None, then `x` is initialized the - first function call to have the same size as the input `data.shape[-1]` and has min + first function call to have the same size as the input `data.shape[-2]` and has min and max values of -1 and 1, respectively. x_domain : numpy.ndarray The minimum and maximum values of `x`. If `x_data` is None during initialization, then set to numpy.ndarray([-1, 1]). z : numpy.ndarray or None The z-values for the object. If initialized with None, then `z` is initialized the - first function call to have the same size as the input `data.shape[-2]` and has min + first function call to have the same size as the input `data.shape[-1]` and has min and max values of -1 and 1, respectively. z_domain : numpy.ndarray The minimum and maximum values of `z`. 
If `z_data` is None during initialization, then @@ -72,11 +72,11 @@ def __init__(self, x_data=None, z_data=None, check_finite=True, assume_sorted=Fa Parameters ---------- - x_data : array-like, shape (N,), optional + x_data : array-like, shape (M,), optional The x-values of the measured data. Default is None, which will create an array from -1 to 1 during the first function call with length equal to the input data length. - z_data : array-like, shape (M,), optional + z_data : array-like, shape (N,), optional The z-values of the measured data. Default is None, which will create an array from -1 to 1 during the first function call with length equal to the input data length. @@ -153,7 +153,7 @@ def _return_results(self, baseline, params, dtype, sort_keys=(), ensure_2d=False The baseline output by the baseline function. params : dict The parameter dictionary output by the baseline function. - dtype : [type] + dtype : type or numpy.dtype, optional The desired output dtype for the baseline. sort_keys : Iterable, optional An iterable of keys corresponding to the values in `params` that need @@ -385,24 +385,24 @@ def _override_x(self, new_x, new_sort_order=None): self.pspline = old_pspline def _setup_whittaker(self, y, lam=1, diff_order=2, weights=None, copy_weights=False, - use_lower=True, use_banded=False, reverse_diags=None): + use_lower=True, use_banded=False): """ Sets the starting parameters for doing penalized least squares. Parameters ---------- - y : numpy.ndarray, shape (N,) + y : numpy.ndarray, shape (M, N) The y-values of the measured data, already converted to a numpy array by :meth:`~_Algorithm2D._register`. - lam : float, optional + lam : float or Sequence[float, float], optional The smoothing parameter, lambda. Typical values are between 10 and 1e8, but it strongly depends on the penalized least square method and the differential order. Default is 1. - diff_order : int, optional + diff_order : int or Sequence[int, int], optional The integer differential order; must be greater than 0. Default is 2. - weights : array-like, shape (N,), optional + weights : array-like, shape (M, N), optional The weighting array. If None (default), then will be an array with - shape (N,) and all values set to 1. + shape (M, N) and all values set to 1. copy_weights : boolean, optional If True, will copy the array of input weights. Only needed if the algorithm changes the weights in-place. Default is False. @@ -412,17 +412,13 @@ def _setup_whittaker(self, y, lam=1, diff_order=2, weights=None, copy_weights=Fa use_banded : bool, optional If True, will setup the penalized system using banded matrices. If False, will use sparse matrices. - reverse_diags : {None, False, True}, optional - If True, will reverse the order of the diagonals of the squared difference - matrix. If False, will never reverse the diagonals. If None (default), will - only reverse the diagonals if using pentapy's solver. Returns ------- - y : numpy.ndarray, shape (N,) - The y-values of the measured data, converted to a numpy array. - weight_array : numpy.ndarray, shape (N,), optional - The weighting array. + y : numpy.ndarray, shape (``M * N``) + The y-values of the measured data after flattening. + weight_array : numpy.ndarray, shape (``M * N``) + The weight array after flattening.
Raises ------ @@ -469,14 +465,14 @@ def _setup_polynomial(self, y, weights=None, poly_order=2, calc_vander=False, Parameters ---------- - y : numpy.ndarray, shape (N,) + y : numpy.ndarray, shape (M, N) The y-values of the measured data, already converted to a numpy array by :meth:`~_Algorithm2D._register`. - weights : array-like, shape (N,), optional + weights : array-like, shape (M, N), optional The weighting array. If None (default), then will be an array with - size equal to N and all values set to 1. - poly_order : int or Container[int, int], optional - The polynomial orders for x and z. Default is 2. + shape equal to (M, N) and all values set to 1. + poly_order : int or Sequence[int, int], optional + The polynomial orders for the rows and columns. Default is 2. calc_vander : bool, optional If True, will calculate and the Vandermonde matrix. Default is False. calc_pinv : bool, optional @@ -492,10 +488,10 @@ def _setup_polynomial(self, y, weights=None, poly_order=2, calc_vander=False, Returns ------- - y : numpy.ndarray, shape (N,) - The y-values of the measured data, converted to a numpy array. - weight_array : numpy.ndarray, shape (N,) - The weight array for fitting a polynomial to the data. + y : numpy.ndarray, shape (``M * N``) + The y-values of the measured data after flattening. + weight_array : numpy.ndarray, shape (``M * N``) + The weight array for fitting a polynomial to the data after flattening. pseudo_inverse : numpy.ndarray Only returned if `calc_pinv` is True. The pseudo-inverse of the Vandermonde matrix, calculated with singular value decomposition (SVD). @@ -581,23 +577,23 @@ def _setup_spline(self, y, weights=None, spline_degree=3, num_knots=10, Parameters ---------- - y : numpy.ndarray, shape (N,) + y : numpy.ndarray, shape (M, N) The y-values of the measured data, already converted to a numpy array by :meth:`~_Algorithm2D._register`. - weights : array-like, shape (N,), optional + weights : array-like, shape (M, N), optional The weighting array. If None (default), then will be an array with - size equal to N and all values set to 1. - spline_degree : int, optional + shape equal to (M, N) and all values set to 1. + spline_degree : int or Sequence[int, int], optional The degree of the spline. Default is 3, which is a cubic spline. - num_knots : int, optional + num_knots : int or Sequence[int, int], optional The number of interior knots for the splines. Default is 10. penalized : bool, optional Whether the basis matrix should be for a penalized spline or a regular B-spline. Default is True, which creates the basis for a penalized spline. - diff_order : int, optional + diff_order : int or Sequence[int, int], optional The integer differential order for the spline penalty; must be greater than 0. Default is 3. Only used if `penalized` is True. - lam : float, optional + lam : float or Sequence[float, float], optional The smoothing parameter, lambda. Typical values are between 10 and 1e8, but it strongly depends on the number of knots and the difference order. Default is 1. @@ -615,9 +611,9 @@ def _setup_spline(self, y, weights=None, spline_degree=3, num_knots=10, Returns ------- - y : numpy.ndarray, shape (N,) - The y-values of the measured data, converted to a numpy array. - weight_array : numpy.ndarray, shape (N,) + y : numpy.ndarray, shape (M, N) + The y-values of the measured data. + weight_array : numpy.ndarray, shape (M, N) The weight array for fitting the spline to the data. 
Warns @@ -661,10 +657,10 @@ def _setup_morphology(self, y, half_window=None, **window_kwargs): Parameters ---------- - y : numpy.ndarray, shape (N,) + y : numpy.ndarray, shape (M, N) The y-values of the measured data, already converted to a numpy array by :meth:`~_Algorithm2D._register`. - half_window : int, optional + half_window : int or Sequence[int, int], optional The half-window used for the morphology functions. If a value is input, then that value will be used. Default is None, which will optimize the half-window size using pybaselines.morphological.optimize_window. @@ -689,10 +685,10 @@ def _setup_morphology(self, y, half_window=None, **window_kwargs): Returns ------- - y : numpy.ndarray, shape (N,) - The y-values of the measured data, converted to a numpy array. - output_half_window : int - The accepted half window size. + y : numpy.ndarray, shape (M, N) + The y-values of the measured data. + output_half_window : np.ndarray[int, int] + The accepted half windows. Notes ----- @@ -716,10 +712,10 @@ def _setup_smooth(self, y, half_window=0, allow_zero=True, hw_multiplier=2, **pa Parameters ---------- - y : numpy.ndarray, shape (N,) + y : numpy.ndarray, shape (M, N) The y-values of the measured data, already converted to a numpy array by :meth:`~_Algorithm2D._register`. - half_window : int, optional + half_window : int or Sequence[int, int], optional The half-window used for the smoothing functions. Used to pad the left and right edges of the data to reduce edge effects. Default is 0, which provides no padding. @@ -735,10 +731,10 @@ def _setup_smooth(self, y, half_window=0, allow_zero=True, hw_multiplier=2, **pa Returns ------- - numpy.ndarray, shape (``N + 2 * half_window``,) + numpy.ndarray, shape (``M + 2 * half_window[0]``, ``N + 2 * half_window[1]``) The padded array of data. - output_hw : int - The accepted half window size. + output_hw : np.ndarray[int, int] + The accepted half windows. """ if half_window is not None: @@ -754,18 +750,18 @@ def _setup_classification(self, y, weights=None): Parameters ---------- - y : numpy.ndarray, shape (N,) + y : numpy.ndarray, shape (M, N) The y-values of the measured data, already converted to a numpy array by :meth:`~_Algorithm2D._register`. - weights : array-like, shape (N,), optional + weights : array-like, shape (M, N), optional The weighting array. If None (default), then will be an array with - size equal to N and all values set to 1. + shape equal to (M, N) and all values set to 1. Returns ------- - y : numpy.ndarray, shape (N,) - The y-values of the measured data, converted to a numpy array. - weight_array : numpy.ndarray, shape (N,) + y : numpy.ndarray, shape (M, N) + The y-values of the measured data. + weight_array : numpy.ndarray, shape (M, N) The weight array for the data, with boolean dtype. """ @@ -775,7 +771,7 @@ def _setup_classification(self, y, weights=None): ) if self._sort_order is not None and weights is not None: weight_array = weight_array[self._sort_order] - weight_array = weight_array.ravel() + weight_array = weight_array return y, weight_array @@ -849,18 +845,18 @@ def _get_function(self, method, modules): return func, func_module, class_object - def _setup_optimizer(self, y, method, modules, method_kwargs=None, copy_kwargs=True, **kwargs): + def _setup_optimizer(self, y, method, modules, method_kwargs=None, copy_kwargs=True): """ Sets the starting parameters for doing optimizer algorithms.
Parameters ---------- - y : numpy.ndarray, shape (N,) + y : numpy.ndarray The y-values of the measured data, already converted to a numpy array by :meth:`~_Algorithm2D._register`. method : str The string name of the desired function, like 'asls'. Case does not matter. - modules : Sequence(module, ...) + modules : Sequence[module, ...] The modules to search for the indicated `method` function. method_kwargs : dict, optional A dictionary of keyword arguments to pass to the fitting function. Default @@ -868,14 +864,11 @@ def _setup_optimizer(self, y, method, modules, method_kwargs=None, copy_kwargs=T copy_kwargs : bool, optional If True (default), will copy the input `method_kwargs` so that the input dictionary is not modified within the function. - **kwargs - Deprecated in version 0.8.0 and will be removed in version 0.10 or 1.0. Pass any - keyword arguments for the fitting function in the `method_kwargs` dictionary. Returns ------- - y : numpy.ndarray, shape (N,) - The y-values of the measured data, converted to a numpy array. + y : numpy.ndarray + The y-values of the measured data. baseline_func : Callable The function for fitting the baseline. func_module : str @@ -907,14 +900,14 @@ def _setup_misc(self, y): Parameters ---------- - y : numpy.ndarray, shape (N,) + y : numpy.ndarray, shape (M, N) The y-values of the measured data, already converted to a numpy array by :meth:`~_Algorithm2D._register`. Returns ------- - y : numpy.ndarray, shape (N,) - The y-values of the measured data, converted to a numpy array. + y : numpy.ndarray, shape (M, N) + The y-values of the measured data. Notes ----- diff --git a/pybaselines/two_d/_spline_utils.py b/pybaselines/two_d/_spline_utils.py index 459f88b..ae2413f 100644 --- a/pybaselines/two_d/_spline_utils.py +++ b/pybaselines/two_d/_spline_utils.py @@ -73,18 +73,18 @@ def __init__(self, x, z, num_knots=100, spline_degree=3, check_finite=False, lam The x-values for the spline. z : array-like, shape (M,) The z-values for the spline. - num_knots : int or Sequence(int, int), optional + num_knots : int or Sequence[int, int], optional The number of internal knots for the spline, including the endpoints. Default is 100. - spline_degree : int or Sequence(int, int), optional + spline_degree : int or Sequence[int, int], optional The degree of the spline. Default is 3, which is a cubic spline. check_finite : bool, optional If True, will raise an error if any values in `x` are not finite. Default is False, which skips the check. - lam : float or Sequence(float, float), optional + lam : float or Sequence[float, float], optional The penalty factor applied to the difference matrix. Larger values produce smoother results. Must be greater than 0. Default is 1. - diff_order : int or Sequence(int, int), optional + diff_order : int or Sequence[int, int], optional The difference order of the penalty. Default is 2 (second order difference). 
Raises @@ -122,10 +122,10 @@ def __init__(self, x, z, num_knots=100, spline_degree=3, check_finite=False, lam 'functions, which is the number of knots + spline degree - 1' )) - el = np.ones((self._num_bases[0], 1)) - ek = np.ones((self._num_bases[1], 1)) - self._G = sparse.kron(self.basis_x, el.T).multiply(sparse.kron(el.T, self.basis_x)) - self._G2 = sparse.kron(self.basis_z, ek.T).multiply(sparse.kron(ek.T, self.basis_z)) + el = np.ones((1, self._num_bases[0])) + ek = np.ones((1, self._num_bases[1])) + self._G = sparse.kron(self.basis_x, el).multiply(sparse.kron(el, self.basis_x)) + self._G2 = sparse.kron(self.basis_z, ek).multiply(sparse.kron(ek, self.basis_z)) def same_basis(self, num_knots=100, spline_degree=3): """ @@ -133,9 +133,9 @@ def same_basis(self, num_knots=100, spline_degree=3): Parameters ---------- - num_knots : int, optional + num_knots : int or Sequence[int, int], optional The number of knots for the new spline. Default is 100. - spline_degree : int, optional + spline_degree : int or Sequence[int, int], optional The degree of the new spline. Default is 3. Returns @@ -163,10 +163,10 @@ def reset_penalty(self, lam=1, diff_order=2): Parameters ---------- - lam : float, optional + lam : float or Sequence[float, float], optional The penalty factor applied to the difference matrix. Larger values produce smoother results. Must be greater than 0. Default is 1. - diff_order : int, optional + diff_order : int or Sequence[int, int], optional The difference order of the penalty. Default is 2 (second order difference). allow_lower : bool, optional If True (default), will allow only using the lower bands of the penalty matrix, @@ -178,12 +178,7 @@ def reset_penalty(self, lam=1, diff_order=2): Notes ----- - `allow_pentapy` is always set to False since the time needed to go from a lower to full - banded matrix and shifting the rows removes any speedup from using pentapy's solver. It - also reduces the complexity of setting up the equations. - - Adds padding to the penalty diagonals to accomodate the different shapes of the spline - basis and the penalty to speed up calculations when the two are added. + `use_banded` is always set to False since the banded structure in 2D is not small. """ self.reset_diagonals(lam, diff_order, use_banded=False) @@ -200,21 +195,21 @@ def solve_pspline(self, y, weights, penalty=None, rhs_extra=None): Parameters ---------- - y : numpy.ndarray, shape (N,) + y : numpy.ndarray, shape (M, N) The y-values for fitting the spline. - weights : numpy.ndarray, shape (N,) + weights : numpy.ndarray, shape (M, N) The weights for each y-value. - penalty : numpy.ndarray, shape (D, N) + penalty : numpy.ndarray, shape (``M * N``, ``M * N``) The finite difference penalty matrix, in LAPACK's lower banded format (see :func:`scipy.linalg.solveh_banded`) if `lower_only` is True or the full banded format (see :func:`scipy.linalg.solve_banded`) if `lower_only` is False. - rhs_extra : float or numpy.ndarray, shape (N,), optional + rhs_extra : float or numpy.ndarray, shape (``M * N``,), optional If supplied, `rhs_extra` will be added to the right hand side (``B.T @ W @ y``) of the equation before solving. Default is None, which adds nothing. Returns ------- - numpy.ndarray, shape (N,) + numpy.ndarray, shape (M, N) The spline, corresponding to ``B @ c``, where `c` are the solved spline coefficients and `B` is the spline basis. 
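The ``B @ c`` evaluation in the solve_pspline docstring above is, in the 2D case, a tensor-product operation, which is why the basis terms are assembled from Kronecker products of the row and column bases. A small numerical check of the underlying identity, using random stand-in bases (the shapes here are chosen arbitrarily):

import numpy as np

rng = np.random.default_rng(0)
M, N, p, q = 6, 5, 4, 3      # data shape (M, N); p and q basis functions
Bx = rng.random((M, p))      # stand-in for the row (x) spline basis
Bz = rng.random((N, q))      # stand-in for the column (z) spline basis
C = rng.random((p, q))       # spline coefficient matrix

full = (np.kron(Bx, Bz) @ C.ravel()).reshape(M, N)
fast = Bx @ C @ Bz.T         # same surface without forming the Kronecker product
assert np.allclose(full, fast)

Evaluating the right-hand form avoids ever materializing the (M * N, p * q) Kronecker matrix.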
diff --git a/pybaselines/two_d/_whittaker_utils.py b/pybaselines/two_d/_whittaker_utils.py index 64c03a1..3a42e5f 100644 --- a/pybaselines/two_d/_whittaker_utils.py +++ b/pybaselines/two_d/_whittaker_utils.py @@ -66,7 +66,10 @@ class PenalizedSystem2D: Attributes ---------- - diff_order : int + banded : bool + If True, the penalty is an array of the bands within the sparse matrix. If False, + the penalty is a sparse matrix. + diff_order : numpy.array([int, int]) The difference order of the penalty. lower : bool If True, the penalty uses only the lower bands of the symmetric banded penalty. Will @@ -115,11 +118,14 @@ def __init__(self, data_size, lam=1, diff_order=2, use_banded=True, use_lower=Tr ---------- data_size : Sequence[int, int] The number of data points for the system. - lam : float, optional - The penalty factor applied to the difference matrix. Larger values produce - smoother results. Must be greater than 0. Default is 1. - diff_order : int, optional - The difference order of the penalty. Default is 2 (second order difference). + lam : float or Sequence[float, float], optional + The penalty factor applied to the difference matrix for the rows and columns, + respectively. If a single value is given, both will use the same value. Larger + values produce smoother results. Must be greater than 0. Default is 1. + diff_order : int or Sequence[int, int], optional + The difference order of the penalty for the rows and columns, respectively. If + a single value is given, both will use the same value. + Default is 2 (second order difference). use_banded : bool, optional If True (default), will do the setup for solving the system using banded matrices rather than sparse matrices. @@ -184,11 +190,14 @@ def reset_diagonals(self, lam=1, diff_order=2, use_banded=True, use_lower=True): Parameters ---------- - lam : float, optional - The penalty factor applied to the difference matrix. Larger values produce - smoother results. Must be greater than 0. Default is 1. - diff_order : int, optional - The difference order of the penalty. Default is 2 (second order difference). + lam : float or Sequence[float, float], optional + The penalty factor applied to the difference matrix for the rows and columns, + respectively. If a single value is given, both will use the same value. Larger + values produce smoother results. Must be greater than 0. Default is 1. + diff_order : int or Sequence[int, int], optional + The difference order of the penalty for the rows and columns, respectively. If + a single value is given, both will use the same value. + Default is 2 (second order difference). use_banded : bool, optional If True (default), will do the setup for solving the system using banded matrices rather than sparse matrices. @@ -219,7 +228,6 @@ def reset_diagonals(self, lam=1, diff_order=2, use_banded=True, use_lower=True): self.penalty = penalty_bands if self.lower: self.penalty = self.penalty[self.penalty.shape[0] // 2:] - self._update_bands() else: self.penalty = penalty @@ -284,7 +292,7 @@ def add_diagonal(self, value): Parameters ---------- - value : numpy.ndarray + value : float or numpy.ndarray The diagonal array to add to the penalty matrix. Returns @@ -305,22 +313,3 @@ def reset_diagonal(self): self.penalty[self.main_diagonal_index] = self.main_diagonal else: self.penalty.setdiag(self.main_diagonal) - - def reverse_penalty(self): - """ - Reverses the penalty and original diagonals for the system.
- - Raises - ------ - ValueError - Raised if `self.lower` is True, since reversing the half diagonals does - not make physical sense. - - """ - raise NotImplementedError - - if self.lower: - raise ValueError('cannot reverse diagonals when self.lower is True') - self.penalty = self.penalty[::-1] - self.original_diagonals = self.original_diagonals[::-1] - self.reversed = not self.reversed diff --git a/pybaselines/two_d/api.py b/pybaselines/two_d/api.py index d686356..c59374c 100644 --- a/pybaselines/two_d/api.py +++ b/pybaselines/two_d/api.py @@ -20,16 +20,16 @@ class Baseline2D( """ A class for all 2D baseline correction algorithms. - Contains all available baseline correction algorithms in pybaselines as methods to + Contains all available 2D baseline correction algorithms in pybaselines as methods to allow a single interface for easier usage. Parameters ---------- - x_data : array-like, shape (N,), optional + x_data : array-like, shape (M,), optional The x-values of the measured data. Default is None, which will create an array from -1 to 1 during the first function call with length equal to the input data length. - z_data : array-like, shape (M,), optional + z_data : array-like, shape (N,), optional The z-values of the measured data. Default is None, which will create an array from -1 to 1 during the first function call with length equal to the input data length. @@ -43,7 +43,7 @@ class Baseline2D( Attributes ---------- - poly_order : int + poly_order : Sequence[int, int] The last polynomial order used for a polynomial algorithm. Initially is -1, denoting that no polynomial fitting has been performed. pspline : pybaselines.two_d._spline_utils.PSpline2D or None @@ -57,14 +57,14 @@ class Baseline2D( algorithms. Is None if no Whittaker setup has been performed. x : numpy.ndarray or None The x-values for the object. If initialized with None, then `x` is initialized the - first function call to have the same size as the input `data.shape[-1]` and has min + first function call to have the same size as the input `data.shape[-2]` and has min and max values of -1 and 1, respectively. x_domain : numpy.ndarray The minimum and maximum values of `x`. If `x_data` is None during initialization, then set to numpy.ndarray([-1, 1]). z : numpy.ndarray or None The z-values for the object. If initialized with None, then `z` is initialized the - first function call to have the same size as the input `data.shape[-2]` and has min + first function call to have the same size as the input `data.shape[-1]` and has min and max values of -1 and 1, respectively. z_domain : numpy.ndarray The minimum and maximum values of `z`. If `z_data` is None during initialization, then diff --git a/pybaselines/two_d/morphological.py b/pybaselines/two_d/morphological.py index 4301490..7deed79 100644 --- a/pybaselines/two_d/morphological.py +++ b/pybaselines/two_d/morphological.py @@ -24,12 +24,13 @@ def mor(self, data, half_window=None, **window_kwargs): Parameters ---------- - data : array-like, shape (N,) - The y-values of the measured data, with N data points. - half_window : int, optional - The half-window used for the morphology functions. If a value is input, - then that value will be used. Default is None, which will optimize the - half-window size using :func:`.optimize_window` and `window_kwargs`. + data : array-like, shape (M, N) + The y-values of the measured data. + half_window : int or Sequence[int, int], optional + The half-window used for the rows and columns, respectively, for the morphology + functions. 
If a single value is given, rows and columns will use the same value. + Default is None, which will optimize the half-window size using + :func:`.optimize_window` and `window_kwargs`. **window_kwargs Values for setting the half window used for the morphology operations. Items include: @@ -51,13 +52,13 @@ def mor(self, data, half_window=None, **window_kwargs): Returns ------- - baseline : numpy.ndarray, shape (N,) + baseline : numpy.ndarray, shape (M, N) The calculated baseline. dict A dictionary with the following items: - * 'half_window': int - The half window used for the morphological calculations. + * 'half_window': np.ndarray[int, int] + The half windows used for the morphological calculations. References ---------- @@ -78,12 +79,13 @@ def imor(self, data, half_window=None, tol=1e-3, max_iter=200, **window_kwargs): Parameters ---------- - data : array-like, shape (N,) - The y-values of the measured data, with N data points. - half_window : int, optional - The half-window used for the morphology functions. If a value is input, - then that value will be used. Default is None, which will optimize the - half-window size using :func:`.optimize_window` and `window_kwargs`. + data : array-like, shape (M, N) + The y-values of the measured data. + half_window : int or Sequence[int, int], optional + The half-window used for the rows and columns, respectively, for the morphology + functions. If a single value is given, rows and columns will use the same value. + Default is None, which will optimize the half-window size using + :func:`.optimize_window` and `window_kwargs`. tol : float, optional The exit criteria. Default is 1e-3. max_iter : int, optional @@ -109,13 +111,13 @@ def imor(self, data, half_window=None, tol=1e-3, max_iter=200, **window_kwargs): Returns ------- - baseline : numpy.ndarray, shape (N,) + baseline : numpy.ndarray, shape (M, N) The calculated baseline. - params : dict + dict A dictionary with the following items: - * 'half_window': int - The half window used for the morphological calculations. + * 'half_window': np.ndarray[int, int] + The half windows used for the morphological calculations. * 'tol_history': numpy.ndarray An array containing the calculated tolerance values for each iteration. The length of the array is the number of iterations @@ -153,12 +155,13 @@ def rolling_ball(self, data, half_window=None, smooth_half_window=None, Parameters ---------- - data : array-like, shape (N,) - The y-values of the measured data, with N data points. - half_window : int, optional - The half-window used for the morphology functions. If a value is input, - then that value will be used. Default is None, which will optimize the - half-window size using :func:`.optimize_window` and `window_kwargs`. + data : array-like, shape (M, N) + The y-values of the measured data. + half_window : int or Sequence[int, int], optional + The half-window used for the rows and columns, respectively, for the morphology + functions. If a single value is given, rows and columns will use the same value. + Default is None, which will optimize the half-window size using + :func:`.optimize_window` and `window_kwargs`. smooth_half_window : int, optional The half-window to use for smoothing the data after performing the morphological operation. Default is None, which will use the same @@ -187,7 +190,7 @@ def rolling_ball(self, data, half_window=None, smooth_half_window=None, Returns ------- - baseline : numpy.ndarray, shape (N,) + baseline : numpy.ndarray, shape (M, N) The calculated baseline. 
dict A dictionary with the following items: @@ -225,12 +228,13 @@ def tophat(self, data, half_window=None, **window_kwargs): Parameters ---------- - data : array-like, shape (N,) - The y-values of the measured data, with N data points. - half_window : int, optional - The half-window used for the morphological opening. If a value is input, - then that value will be used. Default is None, which will optimize the - half-window size using :func:`.optimize_window` and `window_kwargs`. + data : array-like, shape (M, N) + The y-values of the measured data. + half_window : int or Sequence[int, int], optional + The half-window used for the rows and columns, respectively, for the morphology + functions. If a single value is given, rows and columns will use the same value. + Default is None, which will optimize the half-window size using + :func:`.optimize_window` and `window_kwargs`. **window_kwargs Values for setting the half window used for the morphology operations. Items include: @@ -252,13 +256,13 @@ def tophat(self, data, half_window=None, **window_kwargs): Returns ------- - baseline : numpy.ndarray, shape (N,) + baseline : numpy.ndarray, shape (M, N) The calculated baseline. dict A dictionary with the following items: - * 'half_window': int - The half window used for the morphological calculations. + * 'half_window': np.ndarray[int, int] + The half windows used for the morphological calculations. Notes ----- @@ -284,17 +288,17 @@ def _avg_opening(y, half_window, opening=None): Parameters ---------- - y : numpy.ndarray, shape (N,) + y : numpy.ndarray, shape (M, N) The array of the measured data. - half_window : int, optional - The half window size to use for the operations. + half_window : numpy.ndarray([int, int]), optional + The half window size for the rows and columns, respectively, to use for the operations. opening : numpy.ndarray, optional The output of scipy.ndimage.grey_opening(y, window_size). Default is None, which will compute the value. Returns ------- - numpy.ndarray, shape (N,) + numpy.ndarray, shape (M, N) The average of the dilation and erosion of the opening. References @@ -303,6 +307,7 @@ def _avg_opening(y, half_window, opening=None): Raman Spectra of Artistic Pigments. Applied Spectroscopy, 2010, 64 595-600. """ + # TODO should find a way to merge this with its 1D counterpart window_size = 2 * half_window + 1 if opening is None: opening = grey_opening(y, window_size) diff --git a/pybaselines/two_d/optimizers.py b/pybaselines/two_d/optimizers.py index 4af2973..3404ca2 100644 --- a/pybaselines/two_d/optimizers.py +++ b/pybaselines/two_d/optimizers.py @@ -32,9 +32,9 @@ def collab_pls(self, data, average_dataset=True, method='asls', method_kwargs=No Parameters ---------- - data : array-like, shape (M, N) - An array with shape (M, N) where M is the number of entries in - the dataset and N is the number of data points in each entry. + data : array-like, shape (L, M, N) + An array with shape (L, M, N) where L is the number of entries in + the dataset and (M, N) is the shape of each data entry. average_dataset : bool, optional If True (default) will average the dataset before fitting to get the weighting. If False, will fit each individual entry in the dataset and @@ -48,18 +48,17 @@ def collab_pls(self, data, average_dataset=True, method='asls', method_kwargs=No Returns ------- - baselines : np.ndarray, shape (M, N) + baselines : np.ndarray, shape (L, M, N) An array of all of the baselines. 
params : dict
 A dictionary with the following items:
-    * 'average_weights': numpy.ndarray, shape (N,)
+    * 'average_weights': numpy.ndarray, shape (M, N)
         The weight array used to fit all of the baselines.
-    * 'average_alpha': numpy.ndarray, shape (N,)
-        Only returned if `method` is 'aspls' or 'pspline_aspls'. The
+    * 'average_alpha': numpy.ndarray, shape (M, N)
+        Only returned if `method` is 'aspls'. The
         `alpha` array used to fit all of the baselines for the
-        :meth:`~pybaselines.whittaker.Whittaker.aspls` or
-        :meth:`~pybaselines.spline.Spline.pspline_aspls` methods.
+        :meth:`~Baseline2D.aspls` method.
 Additional items depend on the output of the selected method. Every
 other key will have a list of values, with each item corresponding to a
@@ -67,7 +66,7 @@ def collab_pls(self, data, average_dataset=True, method='asls', method_kwargs=No
 Notes
 -----
-    If `method` is 'aspls' or 'pspline_aspls', `collab_pls` will also calculate
+    If `method` is 'aspls', `collab_pls` will also calculate
 the `alpha` array for the entire dataset in the same manner as the weights.
 References
@@ -145,24 +144,24 @@ def adaptive_minmax(self, data, poly_order=None, method='modpoly', weights=None,
 Parameters
 ----------
-    data : array-like, shape (N,)
-        The y-values of the measured data, with N data points.
-    poly_order : int or Sequence(int, int) or None, optional
+    data : array-like, shape (M, N)
+        The y-values of the measured data.
+    poly_order : int or Sequence[int, int] or None, optional
 The two polynomial orders to use for fitting. If a single integer is given,
 then will use the input value and one plus the input value. Default is None,
 which will do a preliminary fit using a polynomial of order `estimation_poly_order`
 and then select the appropriate polynomial orders according to [32]_.
 method : {'modpoly', 'imodpoly'}, optional
 The method to use for fitting each polynomial. Default is 'modpoly'.
-    weights : array-like, shape (N,), optional
+    weights : array-like, shape (M, N), optional
 The weighting array. If None (default), then will be an array with
-        size equal to N and all values set to 1.
-    constrained_fraction : float or Sequence(float, float), optional
+        shape equal to (M, N) and all values set to 1.
+    constrained_fraction : float or Sequence[float, float], optional
 The fraction of points at the left and right edges to use for the
 constrained fit. Default is 0.01. If `constrained_fraction` is a sequence,
 the first item is the fraction for the left edge and the second is the
 fraction for the right edge.
-    constrained_weight : float or Sequence(float, float), optional
+    constrained_weight : float or Sequence[float, float], optional
 The weighting to give to the endpoints. Higher values ensure that the
 end points are fit, but can cause large fluctuations in the other sections
 of the polynomial. Default is 1e5. If `constrained_weight` is a sequence,
@@ -179,14 +178,14 @@ def adaptive_minmax(self, data, poly_order=None, method='modpoly', weights=None,
 Returns
 -------
-    numpy.ndarray, shape (N,)
+    numpy.ndarray, shape (M, N)
 The calculated baseline.
 params : dict
 A dictionary with the following items:
-        * 'weights': numpy.ndarray, shape (N,)
+        * 'weights': numpy.ndarray, shape (M, N)
 The weight array used for fitting the data.
-        * 'constrained_weights': numpy.ndarray, shape (N,)
+        * 'constrained_weights': numpy.ndarray, shape (M, N)
 The weight array used for the endpoint-constrained fits.
 * 'poly_order': numpy.ndarray, shape (2,)
 An array of the two polynomial orders used for the fitting.
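As a concrete illustration of the stacked (L, M, N) input documented above, the following sketch fits a small set of 2D measurements with `collab_pls`. It assumes `Baseline2D` from `pybaselines.two_d.api` can be constructed with no arguments (its x- and z-values are optional) and uses only the `collab_pls` signature shown in the diff; treat it as a sketch, not canonical usage.

import numpy as np

from pybaselines.two_d.api import Baseline2D

# three fake (M, N) measurements stacked into one (L, M, N) dataset
rng = np.random.default_rng(0)
dataset = 10 + 0.1 * rng.normal(size=(3, 50, 60))

fitter = Baseline2D()  # x_data and z_data are assumed optional here
baselines, params = fitter.collab_pls(dataset, average_dataset=True, method='asls')
print(baselines.shape)  # (3, 50, 60): one baseline per dataset entry
print(params['average_weights'].shape)  # (50, 60): shared weights for all entries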
diff --git a/pybaselines/two_d/polynomial.py b/pybaselines/two_d/polynomial.py index 5b9319b..b2f5145 100644 --- a/pybaselines/two_d/polynomial.py +++ b/pybaselines/two_d/polynomial.py @@ -36,50 +36,13 @@ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -The function loess was adapted from code from https://gist.github.com/agramfort/850437 -(accessed March 25, 2021), which was licensed under the BSD-3-clause below. - -# Authors: Alexandre Gramfort -# -# License: BSD (3-clause) -Copyright (c) 2015, Alexandre Gramfort -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - """ import numpy as np from .. import _weighting from ._algorithm_setup import _Algorithm2D -from ..utils import ( - _MIN_FLOAT, _convert_coef2d, relative_difference -) +from ..utils import _MIN_FLOAT, _convert_coef2d, relative_difference class _Polynomial(_Algorithm2D): @@ -94,16 +57,17 @@ def poly(self, data, poly_order=2, weights=None, return_coef=False, max_cross=No Parameters ---------- - data : array-like, shape (N,) - The y-values of the measured data, with N data points. - poly_order : int or Container[int, int], optional - The polynomial orders for x and z. Default is 2. - weights : array-like, shape (N,), optional + data : array-like, shape (M, N) + The y-values of the measured data. + poly_order : int or Sequence[int, int], optional + The polynomial orders for x and z. If a single value, will use that for both x and + z. Default is 2. + weights : array-like, shape (M, N), optional The weighting array. If None (default), then will be an array with - size equal to N and all values set to 1. + shape equal to (M, N) and all values set to 1. return_coef : bool, optional If True, will convert the polynomial coefficients for the fit baseline to - a form that fits the input x_data and return them in the params dictionary. + a form that fits the x and z values and return them in the params dictionary. Default is False, since the conversion takes time. max_cross: int, optional The maximum degree for the cross terms. 
For example, if `max_cross` is 1, then @@ -112,14 +76,14 @@ def poly(self, data, poly_order=2, weights=None, return_coef=False, max_cross=No Returns ------- - baseline : numpy.ndarray, shape (N,) + baseline : numpy.ndarray, shape (M, N) The calculated baseline. params : dict A dictionary with the following items: - * 'weights': numpy.ndarray, shape (N,) + * 'weights': numpy.ndarray, shape (M, N) The weight array used for fitting the data. - * 'coef': numpy.ndarray, shape (poly_order,) + * 'coef': numpy.ndarray, shape (``poly_order[0] + 1``, ``poly_order[1] + 1``) Only if `return_coef` is True. The array of polynomial parameters for the baseline, in increasing order. Can be used to create a polynomial using :func:`numpy.polynomial.polynomial.polyval2d`. @@ -155,20 +119,21 @@ def modpoly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, Parameters ---------- - data : array-like, shape (N,) - The y-values of the measured data, with N data points. + data : array-like, shape (M, N) + The y-values of the measured data. x_data : array-like, shape (N,), optional The x-values of the measured data. Default is None, which will create an array from -1 to 1 with N points. - poly_order : int or Container[int, int], optional - The polynomial orders for x and z. Default is 2. + poly_order : int or Sequence[int, int], optional + The polynomial orders for x and z. If a single value, will use that for both x and + z. Default is 2. tol : float, optional The exit criteria. Default is 1e-3. max_iter : int, optional The maximum number of iterations. Default is 250. - weights : array-like, shape (N,), optional + weights : array-like, shape (M, N), optional The weighting array. If None (default), then will be an array with - size equal to N and all values set to 1. + shape equal to (M, N) and all values set to 1. use_original : bool, optional If False (default), will compare the baseline of each iteration with the y-values of that iteration [33]_ when choosing minimum values. If True, @@ -178,7 +143,7 @@ def modpoly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, deviation of the residual is less than measured data [35]_. Default is False. return_coef : bool, optional If True, will convert the polynomial coefficients for the fit baseline to - a form that fits the input x_data and return them in the params dictionary. + a form that fits the x and z values and return them in the params dictionary. Default is False, since the conversion takes time. max_cross: int, optional The maximum degree for the cross terms. For example, if `max_cross` is 1, then @@ -187,19 +152,19 @@ def modpoly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, Returns ------- - baseline : numpy.ndarray, shape (N,) + baseline : numpy.ndarray, shape (M, N) The calculated baseline. params : dict A dictionary with the following items: - * 'weights': numpy.ndarray, shape (N,) + * 'weights': numpy.ndarray, shape (M, N) The weight array used for fitting the data. * 'tol_history': numpy.ndarray An array containing the calculated tolerance values for each iteration. The length of the array is the number of iterations completed. If the last value in the array is greater than the input `tol` value, then the function did not converge. - * 'coef': numpy.ndarray, shape (poly_order + 1,) + * 'coef': numpy.ndarray, shape (``poly_order[0] + 1``, ``poly_order[1] + 1``) Only if `return_coef` is True. The array of polynomial parameters for the baseline, in increasing order. 
Can be used to create a polynomial using :func:`numpy.polynomial.polynomial.polyval2d`. @@ -267,17 +232,18 @@ def imodpoly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, Parameters ---------- - data : array-like, shape (N,) - The y-values of the measured data, with N data points. - poly_order : int or Container[int, int], optional - The polynomial orders for x and z. Default is 2. + data : array-like, shape (M, N) + The y-values of the measured data. + poly_order : int or Sequence[int, int], optional + The polynomial orders for x and z. If a single value, will use that for both x and + z. Default is 2. tol : float, optional The exit criteria. Default is 1e-3. max_iter : int, optional The maximum number of iterations. Default is 250. - weights : array-like, shape (N,), optional + weights : array-like, shape (M, N), optional The weighting array. If None (default), then will be an array with - size equal to N and all values set to 1. + shape equal to (M, N) and all values set to 1. use_original : bool, optional If False (default), will compare the baseline of each iteration with the y-values of that iteration [36]_ when choosing minimum values. If True, @@ -287,7 +253,7 @@ def imodpoly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, the standard deviation of the residual is less than measured data [38]_. return_coef : bool, optional If True, will convert the polynomial coefficients for the fit baseline to - a form that fits the input x_data and return them in the params dictionary. + a form that fits the x and z values and return them in the params dictionary. Default is False, since the conversion takes time. num_std : float, optional The number of standard deviations to include when thresholding. Default @@ -299,19 +265,19 @@ def imodpoly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, Returns ------- - baseline : numpy.ndarray, shape (N,) + baseline : numpy.ndarray, shape (M, N) The calculated baseline. params : dict A dictionary with the following items: - * 'weights': numpy.ndarray, shape (N,) + * 'weights': numpy.ndarray, shape (M, N) The weight array used for fitting the data. * 'tol_history': numpy.ndarray An array containing the calculated tolerance values for each iteration. The length of the array is the number of iterations completed. If the last value in the array is greater than the input `tol` value, then the function did not converge. - * 'coef': numpy.ndarray, shape (poly_order + 1,) + * 'coef': numpy.ndarray, shape (``poly_order[0] + 1``, ``poly_order[1] + 1``) Only if `return_coef` is True. The array of polynomial parameters for the baseline, in increasing order. Can be used to create a polynomial using :func:`numpy.polynomial.polynomial.polyval2d`. @@ -395,17 +361,18 @@ def penalized_poly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=Non Parameters ---------- - data : array-like, shape (N,) - The y-values of the measured data, with N data points. - poly_order : int or Container[int, int], optional - The polynomial orders for x and z. Default is 2. + data : array-like, shape (M, N) + The y-values of the measured data. + poly_order : int or Sequence[int, int], optional + The polynomial orders for x and z. If a single value, will use that for both x and + z. Default is 2. tol : float, optional The exit criteria. Default is 1e-3. max_iter : int, optional The maximum number of iterations. Default is 250. - weights : array-like, shape (N,), optional + weights : array-like, shape (M, N), optional The weighting array. 
If None (default), then will be an array with - size equal to N and all values set to 1. + shape equal to (M, N) and all values set to 1. cost_function : str, optional The non-quadratic cost function to minimize. Must indicate symmetry of the method by appending 'a' or 'asymmetric' for asymmetric loss, and 's' or @@ -432,7 +399,7 @@ def penalized_poly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=Non 0.99. Typically should not need to change this value. return_coef : bool, optional If True, will convert the polynomial coefficients for the fit baseline to - a form that fits the input x_data and return them in the params dictionary. + a form that fits the x and z values and return them in the params dictionary. Default is False, since the conversion takes time. max_cross: int, optional The maximum degree for the cross terms. For example, if `max_cross` is 1, then @@ -441,19 +408,19 @@ def penalized_poly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=Non Returns ------- - baseline : numpy.ndarray, shape (N,) + baseline : numpy.ndarray, shape (M, N) The calculated baseline. params : dict A dictionary with the following items: - * 'weights': numpy.ndarray, shape (N,) + * 'weights': numpy.ndarray, shape (M, N) The weight array used for fitting the data. * 'tol_history': numpy.ndarray An array containing the calculated tolerance values for each iteration. The length of the array is the number of iterations completed. If the last value in the array is greater than the input `tol` value, then the function did not converge. - * 'coef': numpy.ndarray, shape (poly_order + 1,) + * 'coef': numpy.ndarray, shape (``poly_order[0] + 1``, ``poly_order[1] + 1``) Only if `return_coef` is True. The array of polynomial parameters for the baseline, in increasing order. Can be used to create a polynomial using :func:`numpy.polynomial.polynomial.polyval2d`. @@ -527,10 +494,11 @@ def quant_reg(self, data, poly_order=2, quantile=0.05, tol=1e-6, max_iter=250, Parameters ---------- - data : array-like, shape (N,) - The y-values of the measured data, with N data points. - poly_order : int or Container[int, int], optional - The polynomial orders for x and z. Default is 2. + data : array-like, shape (M, N) + The y-values of the measured data. + poly_order : int or Sequence[int, int], optional + The polynomial orders for x and z. If a single value, will use that for both x and + z. Default is 2. quantile : float, optional The quantile at which to fit the baseline. Default is 0.05. tol : float, optional @@ -540,16 +508,16 @@ def quant_reg(self, data, poly_order=2, quantile=0.05, tol=1e-6, max_iter=250, The maximum number of iterations. Default is 250. For extreme quantiles (`quantile` < 0.01 or `quantile` > 0.99), may need to use a higher value to ensure convergence. - weights : array-like, shape (N,), optional + weights : array-like, shape (M, N), optional The weighting array. If None (default), then will be an array with - size equal to N and all values set to 1. + shape equal to (M, N) and all values set to 1. eps : float, optional A small value added to the square of the residual to prevent dividing by 0. Default is None, which uses the square of the maximum-absolute-value of the fit each iteration multiplied by 1e-6. return_coef : bool, optional If True, will convert the polynomial coefficients for the fit baseline to - a form that fits the input `x_data` and return them in the params dictionary. + a form that fits the x and z values and return them in the params dictionary. 
Default is False, since the conversion takes time. max_cross: int, optional The maximum degree for the cross terms. For example, if `max_cross` is 1, then @@ -558,19 +526,19 @@ def quant_reg(self, data, poly_order=2, quantile=0.05, tol=1e-6, max_iter=250, Returns ------- - baseline : numpy.ndarray, shape (N,) + baseline : numpy.ndarray, shape (M, N) The calculated baseline. params : dict A dictionary with the following items: - * 'weights': numpy.ndarray, shape (N,) + * 'weights': numpy.ndarray, shape (M, N) The weight array used for fitting the data. * 'tol_history': numpy.ndarray An array containing the calculated tolerance values for each iteration. The length of the array is the number of iterations completed. If the last value in the array is greater than the input `tol` value, then the function did not converge. - * 'coef': numpy.ndarray, shape (poly_order + 1,) + * 'coef': numpy.ndarray, shape (``poly_order[0] + 1``, ``poly_order[1] + 1``) Only if `return_coef` is True. The array of polynomial parameters for the baseline, in increasing order. Can be used to create a polynomial using :func:`numpy.polynomial.polynomial.polyval2d`. @@ -643,17 +611,18 @@ def goldindec(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, Parameters ---------- - data : array-like, shape (N,) - The y-values of the measured data, with N data points. - poly_order : int or Container[int, int], optional - The polynomial orders for x and z. Default is 2. + data : array-like, shape (M, N) + The y-values of the measured data. + poly_order : int or Sequence[int, int], optional + The polynomial orders for x and z. If a single value, will use that for both x and + z. Default is 2. tol : float, optional The exit criteria for the fitting with a given threshold value. Default is 1e-3. max_iter : int, optional The maximum number of iterations for fitting a threshold value. Default is 250. - weights : array-like, shape (N,), optional + weights : array-like, shape (M, N), optional The weighting array. If None (default), then will be an array with - size equal to N and all values set to 1. + shape equal to (M, N) and all values set to 1. cost_function : str, optional The non-quadratic cost function to minimize. Unlike :func:`.penalized_poly`, this function only works with asymmetric cost functions, so the symmetry prefix @@ -681,7 +650,7 @@ def goldindec(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, Default is 100. return_coef : bool, optional If True, will convert the polynomial coefficients for the fit baseline to - a form that fits the input x_data and return them in the params dictionary. + a form that fits the x and z values and return them in the params dictionary. Default is False, since the conversion takes time. max_cross: int, optional The maximum degree for the cross terms. For example, if `max_cross` is 1, then @@ -690,12 +659,12 @@ def goldindec(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, Returns ------- - baseline : numpy.ndarray, shape (N,) + baseline : numpy.ndarray, shape (M, N) The calculated baseline. params : dict A dictionary with the following items: - * 'weights': numpy.ndarray, shape (N,) + * 'weights': numpy.ndarray, shape (M, N) The weight array used for fitting the data. * 'tol_history': numpy.ndarray, shape (J, K) An array containing the calculated tolerance values for each iteration @@ -710,7 +679,7 @@ def goldindec(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, * 'threshold' : float The optimal threshold value. 
Could be used in :func:`.penalized_poly` for fitting other similar data. - * 'coef': numpy.ndarray, shape (poly_order + 1,) + * 'coef': numpy.ndarray, shape (``poly_order[0] + 1``, ``poly_order[1] + 1``) Only if `return_coef` is True. The array of polynomial parameters for the baseline, in increasing order. Can be used to create a polynomial using :func:`numpy.polynomial.polynomial.polyval2d`. diff --git a/pybaselines/two_d/smooth.py b/pybaselines/two_d/smooth.py index 8bbe445..190a59f 100644 --- a/pybaselines/two_d/smooth.py +++ b/pybaselines/two_d/smooth.py @@ -26,16 +26,17 @@ def noise_median(self, data, half_window=None, smooth_half_window=None, sigma=No Parameters ---------- - data : array-like, shape (N,) - The y-values of the measured data, with N data points. - half_window : int, optional - The index-based size to use for the median window. The total window - size will range from [-half_window, ..., half_window] with size - 2 * half_window + 1. Default is None, which will use twice the output from - :func:`.optimize_window`, which is an okay starting value. + data : array-like, shape (M, N) + The y-values of the measured data. + half_window : int or Sequence[int, int], optional + The index-based size to use for the median window on the rows and columns, + respectively. The total window size in each dimension will range from + [-half_window, ..., half_window] with size 2 * half_window + 1. Default is + None, which will use twice the output from :func:`.optimize_window`, + which is an okay starting value. smooth_half_window : int, optional The half window to use for smoothing. Default is None, which will use - the same value as `half_window`. + the average of the values in `half_window`. sigma : float, optional The standard deviation of the smoothing Gaussian kernel. Default is None, which will use (2 * `smooth_half_window` + 1) / 6. @@ -45,7 +46,7 @@ def noise_median(self, data, half_window=None, smooth_half_window=None, sigma=No Returns ------- - baseline : numpy.ndarray, shape (N,) + baseline : numpy.ndarray, shape (M, N) The calculated and smoothed baseline. dict An empty dictionary, just to match the output of all other algorithms. diff --git a/pybaselines/two_d/spline.py b/pybaselines/two_d/spline.py index 249e6a7..599b2c9 100644 --- a/pybaselines/two_d/spline.py +++ b/pybaselines/two_d/spline.py @@ -34,31 +34,34 @@ def mixture_model(self, data, lam=1e3, p=1e-2, num_knots=25, spline_degree=3, di Parameters ---------- - data : array-like, shape (N,) - The y-values of the measured data, with N data points. Must not - contain missing data (NaN) or Inf. - lam : float, optional - The smoothing parameter. Larger values will create smoother baselines. - Default is 1e5. + data : array-like, shape (M, N) + The y-values of the measured data. Must not contain missing data (NaN) or Inf. + lam : float or Sequence[float, float], optional + The smoothing parameter for the rows and columns, respectively. If a single + value is given, both will use the same value. Larger values will create smoother + baselines. Default is 1e3. p : float, optional The penalizing weighting factor. Must be between 0 and 1. Values greater than the baseline will be given `p` weight, and values less than the baseline will be given `p - 1` weight. Used to set the initial weights before performing expectation-maximization. Default is 1e-2. - num_knots : int, optional - The number of knots for the spline. Default is 100. - spline_degree : int, optional - The degree of the spline. Default is 3, which is a cubic spline. 
- diff_order : int, optional - The order of the differential matrix. Must be greater than 0. Default is 3 - (third order differential matrix). Typical values are 2 or 3. + num_knots : int or Sequence[int, int], optional + The number of knots for the splines along the rows and columns, respectively. If a + single value is given, both will use the same value. Default is 25. + spline_degree : int or Sequence[int, int], optional + The degree of the splines along the rows and columns, respectively. If a single + value is given, both will use the same value. Default is 3, which is a cubic spline. + diff_order : int or Sequence[int, int], optional + The order of the differential matrix for the rows and columns, respectively. If + a single value is given, both will use the same value. Must be greater than 0. + Default is 3 (third order differential matrix). Typical values are 2 or 3. max_iter : int, optional The max number of fit iterations. Default is 50. tol : float, optional The exit criteria. Default is 1e-3. - weights : array-like, shape (N,), optional + weights : array-like, shape (M, N), optional The weighting array. If None (default), then the initial weights - will be an array with size equal to N and all values set to 1, and then + will be an array with shape equal to (M, N) and all values set to 1, and then two iterations of reweighted least-squares are performed to provide starting weights for the expectation-maximization of the mixture model. symmetric : bool, optional @@ -69,16 +72,16 @@ def mixture_model(self, data, lam=1e3, p=1e-2, num_knots=25, spline_degree=3, di to True when peaks are both positive and negative. num_bins : int, optional The number of bins to use when transforming the residuals into a probability - density distribution. Default is None, which uses ``ceil(sqrt(N))``. + density distribution. Default is None, which uses ``ceil(sqrt(M * N))``. Returns ------- - baseline : numpy.ndarray, shape (N,) + baseline : numpy.ndarray, shape (M, N) The calculated baseline. params : dict A dictionary with the following items: - * 'weights': numpy.ndarray, shape (N,) + * 'weights': numpy.ndarray, shape (M, N) The weight array used for fitting the data. * 'tol_history': numpy.ndarray An array containing the calculated tolerance values for @@ -104,7 +107,7 @@ def mixture_model(self, data, lam=1e3, p=1e-2, num_knots=25, spline_degree=3, di data, weights, spline_degree, num_knots, True, diff_order, lam ) # scale y between -1 and 1 so that the residual fit is more numerically stable - y_domain = np.polynomial.polyutils.getdomain(y.flatten()) + y_domain = np.polynomial.polyutils.getdomain(y.ravel()) y = np.polynomial.polyutils.mapdomain(y, y_domain, np.array([-1., 1.])) if weights is not None: @@ -131,7 +134,7 @@ def mixture_model(self, data, lam=1e3, p=1e-2, num_knots=25, spline_degree=3, di # directly estimates sigma from that, and then calculates the percentages, maybe # that would be faster/more stable? 
if num_bins is None: - num_bins = ceil(np.sqrt(y.size)) + num_bins = ceil(np.sqrt(self._len[0] * self._len[1])) # uniform probability density distribution for positive residuals, constant # from 0 to max(residual), and 0 for residuals < 0 @@ -180,7 +183,7 @@ def mixture_model(self, data, lam=1e3, p=1e-2, num_knots=25, spline_degree=3, di # need to clip since a bad initial start can erroneously set the sum of the fractions # of each distribution to > 1 np.clip(posterior_prob, 0, 1, out=posterior_prob) - new_weights = posterior_prob[bin_mapping].reshape(y.shape) # TODO replace with self._shape + new_weights = posterior_prob[bin_mapping].reshape(self._len) calc_difference = relative_difference(weight_array, new_weights) tol_history[i] = calc_difference @@ -210,28 +213,31 @@ def irsqr(self, data, lam=1e3, quantile=0.05, num_knots=25, spline_degree=3, Parameters ---------- - data : array-like, shape (N,) - The y-values of the measured data, with N data points. Must not - contain missing data (NaN) or Inf. - lam : float, optional - The smoothing parameter. Larger values will create smoother baselines. - Default is 1e3. + data : array-like, shape (M, N) + The y-values of the measured data. Must not contain missing data (NaN) or Inf. + lam : float or Sequence[float, float], optional + The smoothing parameter for the rows and columns, respectively. If a single + value is given, both will use the same value. Larger values will create smoother + baselines. Default is 1e3. quantile : float, optional The quantile at which to fit the baseline. Default is 0.05. - num_knots : int, optional - The number of knots for the spline. Default is 25. - spline_degree : int, optional - The degree of the spline. Default is 3, which is a cubic spline. - diff_order : int, optional - The order of the differential matrix. Must be greater than 0. Default is 3 - (third order differential matrix). Typical values are 3, 2, or 1. + num_knots : int or Sequence[int, int], optional + The number of knots for the splines along the rows and columns, respectively. If a + single value is given, both will use the same value. Default is 25. + spline_degree : int or Sequence[int, int], optional + The degree of the splines along the rows and columns, respectively. If a single + value is given, both will use the same value. Default is 3, which is a cubic spline. + diff_order : int or Sequence[int, int], optional + The order of the differential matrix for the rows and columns, respectively. If + a single value is given, both will use the same value. Must be greater than 0. + Default is 3 (third order differential matrix). Typical values are 2 or 3. max_iter : int, optional The max number of fit iterations. Default is 100. tol : float, optional The exit criteria. Default is 1e-6. - weights : array-like, shape (N,), optional + weights : array-like, shape (M, N), optional The weighting array. If None (default), then the initial weights - will be an array with size equal to N and all values set to 1. + will be an array with shape equal to (M, N) and all values set to 1. eps : float, optional A small value added to the square of the residual to prevent dividing by 0. Default is None, which uses the square of the maximum-absolute-value of the @@ -239,12 +245,12 @@ def irsqr(self, data, lam=1e3, quantile=0.05, num_knots=25, spline_degree=3, Returns ------- - baseline : numpy.ndarray, shape (N,) + baseline : numpy.ndarray, shape (M, N) The calculated baseline. 
params : dict
 A dictionary with the following items:
-        * 'weights': numpy.ndarray, shape (N,)
+        * 'weights': numpy.ndarray, shape (M, N)
 The weight array used for fitting the data.
 * 'tol_history': numpy.ndarray
 An array containing the calculated tolerance values for
@@ -293,42 +299,42 @@ def pspline_asls(self, data, lam=1e3, p=1e-2, num_knots=25, spline_degree=3, dif
 Parameters
 ----------
-    data : array-like, shape (N,)
-        The y-values of the measured data, with N data points. Must not
-        contain missing data (NaN) or Inf.
-    lam : float, optional
-        The smoothing parameter. Larger values will create smoother baselines.
-        Default is 1e3.
+    data : array-like, shape (M, N)
+        The y-values of the measured data. Must not contain missing data (NaN) or Inf.
+    lam : float or Sequence[float, float], optional
+        The smoothing parameter for the rows and columns, respectively. If a single
+        value is given, both will use the same value. Larger values will create smoother
+        baselines. Default is 1e3.
 p : float, optional
 The penalizing weighting factor. Must be between 0 and 1. Values greater
 than the baseline will be given `p` weight, and values less than the baseline
 will be given `p - 1` weight. Default is 1e-2.
-    num_knots : int, optional
-        The number of knots for the spline. Default is 25.
-    spline_degree : int, optional
-        The degree of the spline. Default is 3, which is a cubic spline.
-    diff_order : int, optional
-        The order of the differential matrix. Must be greater than 0. Default is 2
-        (second order differential matrix). Typical values are 2 or 1.
+    num_knots : int or Sequence[int, int], optional
+        The number of knots for the splines along the rows and columns, respectively. If a
+        single value is given, both will use the same value. Default is 25.
+    spline_degree : int or Sequence[int, int], optional
+        The degree of the splines along the rows and columns, respectively. If a single
+        value is given, both will use the same value. Default is 3, which is a cubic spline.
+    diff_order : int or Sequence[int, int], optional
+        The order of the differential matrix for the rows and columns, respectively. If
+        a single value is given, both will use the same value. Must be greater than 0.
+        Default is 2 (second order differential matrix). Typical values are 1 or 2.
 max_iter : int, optional
 The max number of fit iterations. Default is 50.
 tol : float, optional
 The exit criteria. Default is 1e-3.
-    weights : array-like, shape (N,), optional
+    weights : array-like, shape (M, N), optional
 The weighting array. If None (default), then the initial weights
-        will be an array with size equal to N and all values set to 1.
+        will be an array with shape equal to (M, N) and all values set to 1.
-    x_data : array-like, shape (N,), optional
-        The x-values of the measured data. Default is None, which will create an
-        array from -1 to 1 with N points.
 Returns
 -------
-    baseline : numpy.ndarray, shape (N,)
+    baseline : numpy.ndarray, shape (M, N)
 The calculated baseline.
 params : dict
 A dictionary with the following items:
-        * 'weights': numpy.ndarray, shape (N,)
+        * 'weights': numpy.ndarray, shape (M, N)
 The weight array used for fitting the data.
* 'tol_history': numpy.ndarray An array containing the calculated tolerance values for diff --git a/pybaselines/two_d/whittaker.py b/pybaselines/two_d/whittaker.py index a5fc65f..f6fb0e0 100644 --- a/pybaselines/two_d/whittaker.py +++ b/pybaselines/two_d/whittaker.py @@ -32,35 +32,36 @@ def asls(self, data, lam=1e6, p=1e-2, diff_order=2, max_iter=50, tol=1e-3, weigh Parameters ---------- - data : array-like, shape (N,) - The y-values of the measured data, with N data points. Must not - contain missing data (NaN) or Inf. - lam : float, optional - The smoothing parameter. Larger values will create smoother baselines. - Default is 1e6. + data : array-like, shape (M, N) + The y-values of the measured data. Must not contain missing data (NaN) or Inf. + lam : float or Sequence[float, float], optional + The smoothing parameter for the rows and columns, respectively. If a single + value is given, both will use the same value. Larger values will create smoother + baselines. Default is 1e6. p : float, optional The penalizing weighting factor. Must be between 0 and 1. Values greater than the baseline will be given `p` weight, and values less than the baseline will be given `p - 1` weight. Default is 1e-2. - diff_order : int, optional - The order of the differential matrix. Must be greater than 0. Default is 2 - (second order differential matrix). Typical values are 2 or 1. + diff_order : int or Sequence[int, int], optional + The order of the differential matrix for the rows and columns, respectively. If + a single value is given, both will use the same value. Must be greater than 0. + Default is 2 (second order differential matrix). Typical values are 2 or 1. max_iter : int, optional The max number of fit iterations. Default is 50. tol : float, optional The exit criteria. Default is 1e-3. - weights : array-like, shape (N,), optional + weights : array-like, shape (M, N), optional The weighting array. If None (default), then the initial weights - will be an array with size equal to N and all values set to 1. + will be an array with shape equal to (M, N) and all values set to 1. Returns ------- - baseline : numpy.ndarray, shape (N,) + baseline : numpy.ndarray, shape (M, N) The calculated baseline. params : dict A dictionary with the following items: - * 'weights': numpy.ndarray, shape (N,) + * 'weights': numpy.ndarray, shape (M, N) The weight array used for fitting the data. * 'tol_history': numpy.ndarray An array containing the calculated tolerance values for @@ -115,37 +116,39 @@ def iasls(self, data, lam=1e6, p=1e-2, lam_1=1e-4, max_iter=50, tol=1e-3, Parameters ---------- - data : array-like, shape (N,) - The y-values of the measured data, with `N` data points. Must not - contain missing data (NaN) or Inf. - lam : float, optional - The smoothing parameter. Larger values will create smoother baselines. - Default is 1e6. + data : array-like, shape (M, N) + The y-values of the measured data. Must not contain missing data (NaN) or Inf. + lam : float or Sequence[float, float], optional + The smoothing parameter for the rows and columns, respectively. If a single + value is given, both will use the same value. Larger values will create smoother + baselines. Default is 1e6. p : float, optional The penalizing weighting factor. Must be between 0 and 1. Values greater than the baseline will be given `p` weight, and values less than the baseline will be given `p - 1` weight. Default is 1e-2. - lam_1 : float, optional - The smoothing parameter for the first derivative of the residual. Default is 1e-4. 
+ lam_1 : float or Sequence[float, float], optional + The smoothing parameter for the rows and columns, respectively, of the first + derivative of the residual. Default is 1e-4. max_iter : int, optional The max number of fit iterations. Default is 50. tol : float, optional The exit criteria. Default is 1e-3. - weights : array-like, shape (N,), optional + weights : array-like, shape (M, N), optional The weighting array. If None (default), then the initial weights will be set by fitting the data with a second order polynomial. - diff_order : int, optional - The order of the differential matrix. Must be greater than 1. Default is 2 - (second order differential matrix). Typical values are 2 or 3. + diff_order : int or Sequence[int, int], optional + The order of the differential matrix for the rows and columns, respectively. If + a single value is given, both will use the same value. Must be greater than 1. + Default is 2 (second order differential matrix). Typical values are 2 or 3. Returns ------- - baseline : numpy.ndarray, shape (N,) + baseline : numpy.ndarray, shape (M, N) The calculated baseline. params : dict A dictionary with the following items: - * 'weights': numpy.ndarray, shape (N,) + * 'weights': numpy.ndarray, shape (M, N) The weight array used for fitting the data. * 'tol_history': numpy.ndarray An array containing the calculated tolerance values for @@ -209,31 +212,32 @@ def airpls(self, data, lam=1e6, diff_order=2, max_iter=50, tol=1e-3, weights=Non Parameters ---------- - data : array-like - The y-values of the measured data, with N data points. Must not - contain missing data (NaN) or Inf. - lam : float, optional - The smoothing parameter. Larger values will create smoother baselines. - Default is 1e6. - diff_order : int, optional - The order of the differential matrix. Must be greater than 0. Default is 2 - (second order differential matrix). Typical values are 2 or 1. + data : array-like, shape (M, N) + The y-values of the measured data. Must not contain missing data (NaN) or Inf. + lam : float or Sequence[float, float], optional + The smoothing parameter for the rows and columns, respectively. If a single + value is given, both will use the same value. Larger values will create smoother + baselines. Default is 1e6. + diff_order : int or Sequence[int, int], optional + The order of the differential matrix for the rows and columns, respectively. If + a single value is given, both will use the same value. Must be greater than 0. + Default is 2 (second order differential matrix). Typical values are 2 or 1. max_iter : int, optional The max number of fit iterations. Default is 50. tol : float, optional The exit criteria. Default is 1e-3. - weights : array-like, shape (N,), optional + weights : array-like, shape (M, N), optional The weighting array. If None (default), then the initial weights - will be an array with size equal to N and all values set to 1. + will be an array with shape equal to (M, N) and all values set to 1. Returns ------- - baseline : numpy.ndarray, shape (N,) + baseline : numpy.ndarray, shape (M, N) The calculated baseline. params : dict A dictionary with the following items: - * 'weights': numpy.ndarray, shape (N,) + * 'weights': numpy.ndarray, shape (M, N) The weight array used for fitting the data. 
* 'tol_history': numpy.ndarray
 An array containing the calculated tolerance values for
@@ -309,31 +313,32 @@ def arpls(self, data, lam=1e3, diff_order=2, max_iter=50, tol=1e-3, weights=None
 Parameters
 ----------
-    data : array-like, shape (N,)
-        The y-values of the measured data, with N data points. Must not
-        contain missing data (NaN) or Inf.
-    lam : float, optional
-        The smoothing parameter. Larger values will create smoother baselines.
-        Default is 1e5.
-    diff_order : int, optional
-        The order of the differential matrix. Must be greater than 0. Default is 2
-        (second order differential matrix). Typical values are 2 or 1.
+    data : array-like, shape (M, N)
+        The y-values of the measured data. Must not contain missing data (NaN) or Inf.
+    lam : float or Sequence[float, float], optional
+        The smoothing parameter for the rows and columns, respectively. If a single
+        value is given, both will use the same value. Larger values will create smoother
+        baselines. Default is 1e3.
+    diff_order : int or Sequence[int, int], optional
+        The order of the differential matrix for the rows and columns, respectively. If
+        a single value is given, both will use the same value. Must be greater than 0.
+        Default is 2 (second order differential matrix). Typical values are 2 or 1.
 max_iter : int, optional
 The max number of fit iterations. Default is 50.
 tol : float, optional
 The exit criteria. Default is 1e-3.
-    weights : array-like, shape (N,), optional
+    weights : array-like, shape (M, N), optional
 The weighting array. If None (default), then the initial weights
-        will be an array with size equal to N and all values set to 1.
+        will be an array with shape equal to (M, N) and all values set to 1.
 Returns
 -------
-    baseline : numpy.ndarray, shape (N,)
+    baseline : numpy.ndarray, shape (M, N)
 The calculated baseline.
 params : dict
 A dictionary with the following items:
-        * 'weights': numpy.ndarray, shape (N,)
+        * 'weights': numpy.ndarray, shape (M, N)
 The weight array used for fitting the data.
 * 'tol_history': numpy.ndarray
 An array containing the calculated tolerance values for
@@ -376,12 +381,12 @@ def drpls(self, data, lam=1e5, eta=0.5, max_iter=50, tol=1e-3, weights=None, dif
 Parameters
 ----------
-    data : array-like, shape (N,)
-        The y-values of the measured data, with N data points. Must not
-        contain missing data (NaN) or Inf.
-    lam : float, optional
-        The smoothing parameter. Larger values will create smoother baselines.
-        Default is 1e5.
+    data : array-like, shape (M, N)
+        The y-values of the measured data. Must not contain missing data (NaN) or Inf.
+    lam : float or Sequence[float, float], optional
+        The smoothing parameter for the rows and columns, respectively. If a single
+        value is given, both will use the same value. Larger values will create smoother
+        baselines. Default is 1e5.
 eta : float
 A term for controlling the value of lam; should be between 0 and 1. Low
 values will produce smoother baselines, while higher values will
@@ -393,18 +398,19 @@
-    weights : array-like, shape (N,), optional
+    weights : array-like, shape (M, N), optional
 The weighting array. If None (default), then the initial weights
-        will be an array with size equal to N and all values set to 1.
+        will be an array with shape equal to (M, N) and all values set to 1.
-    diff_order : int, optional
-        The order of the differential matrix. Must be greater than 1. Default is 2
-        (second order differential matrix). Typical values are 2 or 3.
+    diff_order : int or Sequence[int, int], optional
+        The order of the differential matrix for the rows and columns, respectively.
If + a single value is given, both will use the same value. Must be greater than 1. + Default is 2 (second order differential matrix). Typical values are 2 or 3. Returns ------- - baseline : numpy.ndarray, shape (N,) + baseline : numpy.ndarray, shape (M, N) The calculated baseline. params : dict A dictionary with the following items: - * 'weights': numpy.ndarray, shape (N,) + * 'weights': numpy.ndarray, shape (M, N) The weight array used for fitting the data. * 'tol_history': numpy.ndarray An array containing the calculated tolerance values for @@ -474,31 +480,32 @@ def iarpls(self, data, lam=1e5, diff_order=2, max_iter=50, tol=1e-3, weights=Non Parameters ---------- - data : array-like, shape (N,) - The y-values of the measured data, with N data points. Must not - contain missing data (NaN) or Inf. - lam : float, optional - The smoothing parameter. Larger values will create smoother baselines. - Default is 1e5. - diff_order : int, optional - The order of the differential matrix. Must be greater than 0. Default is 2 - (second order differential matrix). Typical values are 2 or 1. + data : array-like, shape (M, N) + The y-values of the measured data. Must not contain missing data (NaN) or Inf. + lam : float or Sequence[float, float], optional + The smoothing parameter for the rows and columns, respectively. If a single + value is given, both will use the same value. Larger values will create smoother + baselines. Default is 1e5. + diff_order : int or Sequence[int, int], optional + The order of the differential matrix for the rows and columns, respectively. If + a single value is given, both will use the same value. Must be greater than 0. + Default is 2 (second order differential matrix). Typical values are 2 or 1. max_iter : int, optional The max number of fit iterations. Default is 50. tol : float, optional The exit criteria. Default is 1e-3. - weights : array-like, shape (N,), optional + weights : array-like, shape (M, N), optional The weighting array. If None (default), then the initial weights - will be an array with size equal to N and all values set to 1. + will be an array with shape equal to (M, N) and all values set to 1. Returns ------- - baseline : numpy.ndarray, shape (N,) + baseline : numpy.ndarray, shape (M, N) The calculated baseline. params : dict A dictionary with the following items: - * 'weights': numpy.ndarray, shape (N,) + * 'weights': numpy.ndarray, shape (M, N) The weight array used for fitting the data. * 'tol_history': numpy.ndarray An array containing the calculated tolerance values for @@ -556,37 +563,38 @@ def aspls(self, data, lam=1e5, diff_order=2, max_iter=100, tol=1e-3, Parameters ---------- - data : array-like, shape (N,) - The y-values of the measured data, with N data points. Must not - contain missing data (NaN) or Inf. - lam : float, optional - The smoothing parameter. Larger values will create smoother baselines. - Default is 1e5. - diff_order : int, optional - The order of the differential matrix. Must be greater than 0. Default is 2 - (second order differential matrix). Typical values are 2 or 1. + data : array-like, shape (M, N) + The y-values of the measured data. Must not contain missing data (NaN) or Inf. + lam : float or Sequence[float, float], optional + The smoothing parameter for the rows and columns, respectively. If a single + value is given, both will use the same value. Larger values will create smoother + baselines. Default is 1e5. 
+    diff_order : int or Sequence[int, int], optional
+        The order of the differential matrix for the rows and columns, respectively. If
+        a single value is given, both will use the same value. Must be greater than 0.
+        Default is 2 (second order differential matrix). Typical values are 2 or 1.
 max_iter : int, optional
-        The max number of fit iterations. Default is 50.
+        The max number of fit iterations. Default is 100.
 tol : float, optional
 The exit criteria. Default is 1e-3.
-    weights : array-like, shape (N,), optional
+    weights : array-like, shape (M, N), optional
 The weighting array. If None (default), then the initial weights
-        will be an array with size equal to N and all values set to 1.
-    alpha : array-like, shape (N,), optional
+        will be an array with shape equal to (M, N) and all values set to 1.
+    alpha : array-like, shape (M, N), optional
 An array of values that control the local value of `lam` to better
 fit peak and non-peak regions. If None (default), then the initial values
-        will be an array with size equal to N and all values set to 1.
+        will be an array with shape equal to (M, N) and all values set to 1.
 Returns
 -------
-    baseline : numpy.ndarray, shape (N,)
+    baseline : numpy.ndarray, shape (M, N)
 The calculated baseline.
 params : dict
 A dictionary with the following items:
-        * 'weights': numpy.ndarray, shape (N,)
+        * 'weights': numpy.ndarray, shape (M, N)
 The weight array used for fitting the data.
-        * 'alpha': numpy.ndarray, shape (N,)
+        * 'alpha': numpy.ndarray, shape (M, N)
 The array of alpha values used for fitting the data in the final iteration.
 * 'tol_history': numpy.ndarray
 An array containing the calculated tolerance values for
@@ -654,12 +662,12 @@ def psalsa(self, data, lam=1e5, p=0.5, k=None, diff_order=2, max_iter=50, tol=1e
 Parameters
 ----------
-    data : array-like, shape (N,)
-        The y-values of the measured data, with N data points. Must not
-        contain missing data (NaN) or Inf.
-    lam : float, optional
-        The smoothing parameter. Larger values will create smoother baselines.
-        Default is 1e6.
+    data : array-like, shape (M, N)
+        The y-values of the measured data. Must not contain missing data (NaN) or Inf.
+    lam : float or Sequence[float, float], optional
+        The smoothing parameter for the rows and columns, respectively. If a single
+        value is given, both will use the same value. Larger values will create smoother
+        baselines. Default is 1e5.
 p : float, optional
 The penalizing weighting factor. Must be between 0 and 1. Values greater
 than the baseline will be given `p` weight, and values less than the baseline
@@ -670,25 +678,26 @@ def psalsa(self, data, lam=1e5, p=0.5, k=None, diff_order=2, max_iter=50, tol=1e
 a value could be considered a peak. Default is None, which sets `k` to
 one-tenth of the standard deviation of the input data. A large k value
 will produce similar results to :meth:`~Baseline2D.asls`.
-    diff_order : int, optional
-        The order of the differential matrix. Must be greater than 0. Default is 2
-        (second order differential matrix). Typical values are 2 or 1.
+    diff_order : int or Sequence[int, int], optional
+        The order of the differential matrix for the rows and columns, respectively. If
+        a single value is given, both will use the same value. Must be greater than 0.
+        Default is 2 (second order differential matrix). Typical values are 2 or 1.
 max_iter : int, optional
 The max number of fit iterations. Default is 50.
 tol : float, optional
 The exit criteria. Default is 1e-3.
-    weights : array-like, shape (N,), optional
+    weights : array-like, shape (M, N), optional
 The weighting array.
If None (default), then the initial weights - will be an array with size equal to N and all values set to 1. + will be an array with shape equal to (M, N) and all values set to 1. Returns ------- - baseline : numpy.ndarray, shape (N,) + baseline : numpy.ndarray, shape (M, N) The calculated baseline. params : dict A dictionary with the following items: - * 'weights': numpy.ndarray, shape (N,) + * 'weights': numpy.ndarray, shape (M, N) The weight array used for fitting the data. * 'tol_history': numpy.ndarray An array containing the calculated tolerance values for diff --git a/pybaselines/utils.py b/pybaselines/utils.py index a735b6b..60cf53f 100644 --- a/pybaselines/utils.py +++ b/pybaselines/utils.py @@ -571,7 +571,7 @@ def _poly_transform_matrix(num_coefficients, original_domain): ---------- num_coefficients : int The number of polynomial coefficients, ie. the polynomial degree + 1. - original_domain : Container[float, float] + original_domain : Sequence[float, float] The domain, [min(x), max(x)], of the original data used for fitting. Returns @@ -615,7 +615,7 @@ def _convert_coef(coef, original_domain): coef : numpy.ndarray, shape (a,) The array of coefficients for the polynomial. Should increase in order, for example (c0, c1, c2) from `y = c0 + c1 * x + c2 * x**2`. - original_domain : Container[float, float] + original_domain : Sequence[float, float] The domain, [min(x), max(x)], of the original data used for fitting. Returns @@ -653,9 +653,9 @@ def _convert_coef2d(coef, poly_degree_x, poly_degree_z, original_x_domain, origi The polynomial degree for the x-values poly_degree_z : int The polynomial degree for the z-values - original_x_domain : Container[float, float] + original_x_domain : Sequence[float, float] The domain, [min(x), max(x)], of the original x-values used for fitting. - original_z_domain : Container[float, float] + original_z_domain : Sequence[float, float] The domain, [min(z), max(z)], of the original z-values used for fitting. Returns diff --git a/pybaselines/whittaker.py b/pybaselines/whittaker.py index d047ff9..284013a 100644 --- a/pybaselines/whittaker.py +++ b/pybaselines/whittaker.py @@ -654,7 +654,7 @@ def psalsa(self, data, lam=1e5, p=0.5, k=None, diff_order=2, max_iter=50, tol=1e contain missing data (NaN) or Inf. lam : float, optional The smoothing parameter. Larger values will create smoother baselines. - Default is 1e6. + Default is 1e5. p : float, optional The penalizing weighting factor. Must be between 0 and 1. Values greater than the baseline will be given `p` weight, and values less than the baseline @@ -1272,7 +1272,7 @@ def psalsa(data, lam=1e5, p=0.5, k=None, diff_order=2, max_iter=50, tol=1e-3, contain missing data (NaN) or Inf. lam : float, optional The smoothing parameter. Larger values will create smoother baselines. - Default is 1e6. + Default is 1e5. p : float, optional The penalizing weighting factor. Must be between 0 and 1. Values greater than the baseline will be given `p` weight, and values less than the baseline From 147f4ab5f95148a3e02409e0985029e9e4be1f0f Mon Sep 17 00:00:00 2001 From: Donnie Erb <55961724+derb12@users.noreply.github.com> Date: Sat, 3 Feb 2024 12:04:57 -0500 Subject: [PATCH 37/56] MAINT: Specify rows and columns for 2D pspline Makes it clearer than using x and z. 
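To make the row/column structure concrete, here is a small standalone sketch of the penalty that the renamed variables in `_whittaker_utils.py` below assemble: a row difference penalty and a column difference penalty, each expanded to the full flattened grid with a Kronecker product. The `diff_penalty` helper is a stand-in written for this sketch (pybaselines has its own difference-matrix utilities), so treat the snippet as illustrative only.

import numpy as np
from scipy import sparse

def diff_penalty(size, order):
    # D.T @ D for a forward-difference matrix D of the given order;
    # a hypothetical stand-in for pybaselines' own penalty helpers
    diff_mat = sparse.csc_matrix(np.diff(np.eye(size), n=order, axis=0))
    return diff_mat.T @ diff_mat

num_rows, num_cols = 8, 6
lam_rows, lam_cols = 1e3, 1e2
# same structure as P_rows + P_columns in reset_diagonals below
penalty = (
    sparse.kron(lam_rows * diff_penalty(num_rows, 2), sparse.identity(num_cols))
    + sparse.kron(sparse.identity(num_rows), lam_cols * diff_penalty(num_cols, 2))
)
print(penalty.shape)  # (48, 48): one row per point of the flattened 8x6 grid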
---
 pybaselines/two_d/_spline_utils.py | 46 ++++++++++-----------
 pybaselines/two_d/_whittaker_utils.py | 6 +--
 tests/two_d/test_algorithm_setup.py | 20 ++++-----
 tests/two_d/test_spline_utils.py | 58 +++++++++++++--------------
 tests/two_d/test_whittaker_utils.py | 4 +-
 5 files changed, 67 insertions(+), 67 deletions(-)

diff --git a/pybaselines/two_d/_spline_utils.py b/pybaselines/two_d/_spline_utils.py
index ae2413f..30f2cc7 100644
--- a/pybaselines/two_d/_spline_utils.py
+++ b/pybaselines/two_d/_spline_utils.py
@@ -27,22 +27,22 @@ class PSpline2D(PenalizedSystem2D):
 Attributes
 ----------
-    basis_x : scipy.sparse.csr.csr_matrix, shape (N, P)
-        The spline basis for x. Has a shape of (`N,` `P`), where `N` is the number of points
-        in `x`, and `P` is the number of basis functions (equal to ``K - spline_degree - 1``
+    basis_r : scipy.sparse.csr.csr_matrix, shape (N, P)
+        The spline basis for the rows. Has a shape of (`N`, `P`), where `N` is the number of
+        points in `x`, and `P` is the number of basis functions (equal to ``K - spline_degree - 1``
 or equivalently ``num_knots[0] + spline_degree[0] - 1``).
-    basis_z : scipy.sparse.csr.csr_matrix, shape (M, Q)
-        The spline basis for z. Has a shape of (`M,` `Q`), where `M` is the number of points
-        in `z`, and `Q` is the number of basis functions (equal to ``K - spline_degree - 1``
+    basis_c : scipy.sparse.csr.csr_matrix, shape (M, Q)
+        The spline basis for the columns. Has a shape of (`M`, `Q`), where `M` is the number of
+        points in `z`, and `Q` is the number of basis functions (equal to ``K - spline_degree - 1``
 or equivalently ``num_knots[1] + spline_degree[1] - 1``).
 coef : None or numpy.ndarray, shape (M,)
 The spline coefficients. Is None if :meth:`~PSpline2D.solve_pspline` has not been
 called at least once.
-    knots_x : numpy.ndarray, shape (K,)
-        The knots for the spline. Has a shape of `K`, which is equal to
+    knots_r : numpy.ndarray, shape (K,)
+        The knots for the spline along the rows. Has a shape of `K`, which is equal to
 ``num_knots[0] + 2 * spline_degree[0]``.
-    knots_z : numpy.ndarray, shape (L,)
-        The knots for the spline. Has a shape of `L`, which is equal to
+    knots_c : numpy.ndarray, shape (L,)
+        The knots for the spline along the columns. Has a shape of `L`, which is equal to
-        ``num_knots[1] + 2 * spline_degree[2]``.
+        ``num_knots[1] + 2 * spline_degree[1]``.
 num_knots : numpy.ndarray([int, int])
 The number of internal knots (including the endpoints) for x and z.
The total number of
@@ -107,14 +107,14 @@ def __init__(self, x, z, num_knots=100, spline_degree=3, check_finite=False, lam
         if (self.spline_degree < 0).any():
             raise ValueError('spline degree must be >= 0')
 
-        self.knots_x = _spline_knots(self.x, self.num_knots[0], self.spline_degree[0], True)
-        self.basis_x = _spline_basis(self.x, self.knots_x, self.spline_degree[0])
+        self.knots_r = _spline_knots(self.x, self.num_knots[0], self.spline_degree[0], True)
+        self.basis_r = _spline_basis(self.x, self.knots_r, self.spline_degree[0])
 
-        self.knots_z = _spline_knots(self.z, self.num_knots[1], self.spline_degree[1], True)
-        self.basis_z = _spline_basis(self.z, self.knots_z, self.spline_degree[1])
+        self.knots_c = _spline_knots(self.z, self.num_knots[1], self.spline_degree[1], True)
+        self.basis_c = _spline_basis(self.z, self.knots_c, self.spline_degree[1])
 
         super().__init__(
-            (self.basis_x.shape[1], self.basis_z.shape[1]), lam, diff_order, use_banded=False
+            (self.basis_r.shape[1], self.basis_c.shape[1]), lam, diff_order, use_banded=False
         )
         if (self.diff_order >= self._num_bases).any():
             raise ValueError((
@@ -124,8 +124,8 @@ def __init__(self, x, z, num_knots=100, spline_degree=3, check_finite=False, lam
 
         el = np.ones((1, self._num_bases[0]))
         ek = np.ones((1, self._num_bases[1]))
-        self._G = sparse.kron(self.basis_x, el).multiply(sparse.kron(el, self.basis_x))
-        self._G2 = sparse.kron(self.basis_z, ek).multiply(sparse.kron(ek, self.basis_z))
+        self._G_r = sparse.kron(self.basis_r, el).multiply(sparse.kron(el, self.basis_r))
+        self._G_c = sparse.kron(self.basis_c, ek).multiply(sparse.kron(ek, self.basis_c))
 
     def same_basis(self, num_knots=100, spline_degree=3):
         """
@@ -224,7 +224,7 @@ def solve_pspline(self, y, weights, penalty=None, rhs_extra=None):
         # do not save intermediate results since they are memory intensive for high number of knots
         F = sparse.csr_matrix(
             np.transpose(
-                (self._G.T @ weights @ self._G2).reshape(
+                (self._G_r.T @ weights @ self._G_c).reshape(
                     (self._num_bases[0], self._num_bases[0], self._num_bases[1], self._num_bases[1])
                 ),
                 [0, 2, 1, 3]
@@ -235,13 +235,13 @@ def solve_pspline(self, y, weights, penalty=None, rhs_extra=None):
         if penalty is None:
             penalty = self.penalty
 
-        rhs = (self.basis_x.T @ (weights * y) @ self.basis_z).ravel()
+        rhs = (self.basis_r.T @ (weights * y) @ self.basis_c).ravel()
         if rhs_extra is not None:
             rhs = rhs + rhs_extra
 
         self.coef = spsolve(F + penalty, rhs, permc_spec='NATURAL')
-        output = self.basis_x @ self.coef.reshape(self._num_bases) @ self.basis_z.T
+        output = self.basis_r @ self.coef.reshape(self._num_bases) @ self.basis_c.T
 
         return output
 
@@ -255,7 +255,7 @@ def basis(self):
 
         """
         if self._basis is None:
-            self._basis = sparse.kron(self.basis_x, self.basis_z)
+            self._basis = sparse.kron(self.basis_r, self.basis_c)
         return self._basis
 
     @property
@@ -264,7 +264,7 @@ def tck(self):
         The knots, spline coefficients, and spline degree to reconstruct the spline.
 
         Convenience function for potentially reconstructing the last solved spline with outside
-        modules, although not such if Scipy has a 2D equiavlent to its `BSpline`.
+        modules, although not sure if Scipy has a 2D equivalent to its `BSpline`.
Raises ------ @@ -276,5 +276,5 @@ def tck(self): if self.coef is None: raise ValueError('No spline coefficients, need to call "solve_pspline" first.') return ( - self.knots_x, self.knots_z, self.coef, self.spline_degree[0], self.spline_degree[1] + self.knots_r, self.knots_c, self.coef, self.spline_degree[0], self.spline_degree[1] ) diff --git a/pybaselines/two_d/_whittaker_utils.py b/pybaselines/two_d/_whittaker_utils.py index 3a42e5f..8da0d57 100644 --- a/pybaselines/two_d/_whittaker_utils.py +++ b/pybaselines/two_d/_whittaker_utils.py @@ -214,9 +214,9 @@ def reset_diagonals(self, lam=1, diff_order=2, use_banded=True, use_lower=True): penalty_columns = diff_penalty_matrix(self._num_bases[1], self.diff_order[1]) # multiplying lam by the Kronecker product is the same as multiplying just D.T @ D with lam - P1 = kron(self.lam[0] * penalty_rows, identity(self._num_bases[1])) - P2 = kron(identity(self._num_bases[0]), self.lam[1] * penalty_columns) - penalty = P1 + P2 + P_rows = kron(self.lam[0] * penalty_rows, identity(self._num_bases[1])) + P_columns = kron(identity(self._num_bases[0]), self.lam[1] * penalty_columns) + penalty = P_rows + P_columns if self.banded: penalty = penalty.todia() sparse_bands = (penalty).data diff --git a/tests/two_d/test_algorithm_setup.py b/tests/two_d/test_algorithm_setup.py index 5262862..80072e3 100644 --- a/tests/two_d/test_algorithm_setup.py +++ b/tests/two_d/test_algorithm_setup.py @@ -234,10 +234,10 @@ def test_setup_spline_spline_basis(data_fixture2d, num_knots, spline_degree): ) if isinstance(num_knots, int): - num_knots_x = num_knots - num_knots_z = num_knots + num_knots_r = num_knots + num_knots_c = num_knots else: - num_knots_x, num_knots_z = num_knots + num_knots_r, num_knots_c = num_knots if isinstance(spline_degree, int): spline_degree_x = spline_degree spline_degree_z = spline_degree @@ -245,12 +245,12 @@ def test_setup_spline_spline_basis(data_fixture2d, num_knots, spline_degree): spline_degree_x, spline_degree_z = spline_degree assert_array_equal( - fitter.pspline.basis_x.shape, - (len(x), num_knots_x + spline_degree_x - 1) + fitter.pspline.basis_r.shape, + (len(x), num_knots_r + spline_degree_x - 1) ) assert_array_equal( - fitter.pspline.basis_z.shape, - (len(z), num_knots_z + spline_degree_z - 1) + fitter.pspline.basis_c.shape, + (len(z), num_knots_c + spline_degree_z - 1) ) @@ -269,14 +269,14 @@ def test_setup_spline_diff_matrix(data_fixture2d, lam, diff_order, spline_degree ) ( - num_knots_x, num_knots_z, spline_degree_x, spline_degree_z, + num_knots_r, num_knots_c, spline_degree_x, spline_degree_z, lam_x, lam_z, diff_order_x, diff_order_z ) = get_2dspline_inputs( num_knots=num_knots, spline_degree=spline_degree, lam=lam, diff_order=diff_order ) - num_bases_x = num_knots_x + spline_degree_x - 1 - num_bases_z = num_knots_z + spline_degree_z - 1 + num_bases_x = num_knots_r + spline_degree_x - 1 + num_bases_z = num_knots_c + spline_degree_z - 1 D1 = difference_matrix(num_bases_x, diff_order_x) D2 = difference_matrix(num_bases_z, diff_order_z) diff --git a/tests/two_d/test_spline_utils.py b/tests/two_d/test_spline_utils.py index 8c72505..f7a6b27 100644 --- a/tests/two_d/test_spline_utils.py +++ b/tests/two_d/test_spline_utils.py @@ -38,17 +38,17 @@ def test_solve_psplines(data_fixture2d, num_knots, spline_degree, diff_order, la """ x, z, y = data_fixture2d ( - num_knots_x, num_knots_z, spline_degree_x, spline_degree_z, + num_knots_r, num_knots_c, spline_degree_x, spline_degree_z, lam_x, lam_z, diff_order_x, diff_order_z ) = 
get_2dspline_inputs(num_knots, spline_degree, lam, diff_order) - knots_x = _spline_utils._spline_knots(x, num_knots_x, spline_degree_x, True) - basis_x = _spline_utils._spline_basis(x, knots_x, spline_degree_x) + knots_r = _spline_utils._spline_knots(x, num_knots_r, spline_degree_x, True) + basis_r = _spline_utils._spline_basis(x, knots_r, spline_degree_x) - knots_z = _spline_utils._spline_knots(z, num_knots_z, spline_degree_z, True) - basis_z = _spline_utils._spline_basis(z, knots_z, spline_degree_z) + knots_c = _spline_utils._spline_knots(z, num_knots_c, spline_degree_z, True) + basis_c = _spline_utils._spline_basis(z, knots_c, spline_degree_z) - num_bases = (basis_x.shape[1], basis_z.shape[1]) + num_bases = (basis_r.shape[1], basis_c.shape[1]) # TODO replace with np.random.default_rng when min numpy version is >= 1.17 weights = np.random.RandomState(0).normal(0.8, 0.05, y.size) weights = np.clip(weights, 0, 1).astype(float, copy=False) @@ -56,7 +56,7 @@ def test_solve_psplines(data_fixture2d, num_knots, spline_degree, diff_order, la # note: within Eiler's paper, the basis was defined as kron(basis_z, basis_x), # but the rows and columns were switched, ie. it should be kron(basis_rows, basis_columns), # so it is just a nomenclature difference - basis = kron(basis_x, basis_z) + basis = kron(basis_r, basis_c) CWT = basis.multiply( np.repeat(weights.flatten(), num_bases[0] * num_bases[1]).reshape(len(x) * len(z), -1) ).T @@ -93,17 +93,17 @@ def test_pspline_setup(data_fixture2d, num_knots, spline_degree, diff_order, lam """Ensure the PSpline2D setup is correct.""" x, z, y = data_fixture2d ( - num_knots_x, num_knots_z, spline_degree_x, spline_degree_z, + num_knots_r, num_knots_c, spline_degree_x, spline_degree_z, lam_x, lam_z, diff_order_x, diff_order_z ) = get_2dspline_inputs(num_knots, spline_degree, lam, diff_order) - knots_x = _spline_utils._spline_knots(x, num_knots_x, spline_degree_x, True) - basis_x = _spline_utils._spline_basis(x, knots_x, spline_degree_x) + knots_r = _spline_utils._spline_knots(x, num_knots_r, spline_degree_x, True) + basis_r = _spline_utils._spline_basis(x, knots_r, spline_degree_x) - knots_z = _spline_utils._spline_knots(z, num_knots_z, spline_degree_z, True) - basis_z = _spline_utils._spline_basis(z, knots_z, spline_degree_z) + knots_c = _spline_utils._spline_knots(z, num_knots_c, spline_degree_z, True) + basis_c = _spline_utils._spline_basis(z, knots_c, spline_degree_z) - num_bases = (basis_x.shape[1], basis_z.shape[1]) + num_bases = (basis_r.shape[1], basis_c.shape[1]) D1 = difference_matrix(num_bases[0], diff_order_x) D2 = difference_matrix(num_bases[1], diff_order_z) @@ -117,37 +117,37 @@ def test_pspline_setup(data_fixture2d, num_knots, spline_degree, diff_order, lam lam=lam, diff_order=diff_order, check_finite=False ) - assert pspline.basis_x.shape == (len(x), len(knots_x) - spline_degree_x - 1) - assert pspline.basis_z.shape == (len(z), len(knots_z) - spline_degree_z - 1) + assert pspline.basis_r.shape == (len(x), len(knots_r) - spline_degree_x - 1) + assert pspline.basis_c.shape == (len(z), len(knots_c) - spline_degree_z - 1) assert_array_equal(pspline._num_bases, num_bases) - assert issparse(pspline.basis_x) - assert issparse(pspline.basis_z) + assert issparse(pspline.basis_r) + assert issparse(pspline.basis_c) - assert_allclose(pspline.basis_x.toarray(), basis_x.toarray(), rtol=1e-12, atol=1e-12) - assert_allclose(pspline.basis_z.toarray(), basis_z.toarray(), rtol=1e-12, atol=1e-12) + assert_allclose(pspline.basis_r.toarray(), basis_r.toarray(), 
rtol=1e-12, atol=1e-12)
+    assert_allclose(pspline.basis_c.toarray(), basis_c.toarray(), rtol=1e-12, atol=1e-12)
 
     assert_allclose(pspline.penalty.toarray(), penalty.toarray(), rtol=1e-12, atol=1e-12)
 
     assert_array_equal(pspline.diff_order, (diff_order_x, diff_order_z))
-    assert_array_equal(pspline.num_knots, (num_knots_x, num_knots_z))
+    assert_array_equal(pspline.num_knots, (num_knots_r, num_knots_c))
     assert_array_equal(pspline.spline_degree, (spline_degree_x, spline_degree_z))
     assert_array_equal(pspline.lam, (lam_x, lam_z))
 
     assert pspline.coef is None  # None since the solve method has not been called
-    assert pspline.basis_x.shape == (len(x), num_knots_x + spline_degree_x - 1)
-    assert pspline.basis_z.shape == (len(z), num_knots_z + spline_degree_z - 1)
+    assert pspline.basis_r.shape == (len(x), num_knots_r + spline_degree_x - 1)
+    assert pspline.basis_c.shape == (len(z), num_knots_c + spline_degree_z - 1)
     assert_array_equal(
         pspline._num_bases,
-        (num_knots_x + spline_degree_x - 1, num_knots_z + spline_degree_z - 1)
+        (num_knots_r + spline_degree_x - 1, num_knots_c + spline_degree_z - 1)
     )
-    assert pspline.knots_x.shape == (num_knots_x + 2 * spline_degree_x,)
-    assert pspline.knots_z.shape == (num_knots_z + 2 * spline_degree_z,)
+    assert pspline.knots_r.shape == (num_knots_r + 2 * spline_degree_x,)
+    assert pspline.knots_c.shape == (num_knots_c + 2 * spline_degree_z,)
 
     assert isinstance(pspline.x, np.ndarray)
     assert isinstance(pspline.z, np.ndarray)
 
     # _basis should be None since the basis attribute has not been accessed yet
     assert pspline._basis is None
-    expected_basis = kron(basis_x, basis_z).toarray()
+    expected_basis = kron(basis_r, basis_c).toarray()
 
     assert_allclose(pspline.basis.toarray(), expected_basis, rtol=1e-12, atol=1e-12)
     assert_allclose(pspline._basis.toarray(), expected_basis, rtol=1e-12, atol=1e-12)
 
@@ -229,10 +229,10 @@ def test_pspline_tck(data_fixture2d, num_knots, spline_degree, diff_order, lam):
 
     # ensure tck is the knots, coefficients, and spline degree
     assert len(pspline.tck) == 5
-    knots_x, knots_z, coeffs, degree_x, degree_z = pspline.tck
+    knots_r, knots_c, coeffs, degree_x, degree_z = pspline.tck
 
-    assert_allclose(knots_x, pspline.knots_x, rtol=1e-12)
-    assert_allclose(knots_z, pspline.knots_z, rtol=1e-12)
+    assert_allclose(knots_r, pspline.knots_r, rtol=1e-12)
+    assert_allclose(knots_c, pspline.knots_c, rtol=1e-12)
     assert_allclose(coeffs, pspline.coef, rtol=1e-12)
     if isinstance(spline_degree, int):
         assert degree_x == spline_degree
diff --git a/tests/two_d/test_whittaker_utils.py b/tests/two_d/test_whittaker_utils.py
index 15cc7c3..9957f51 100644
--- a/tests/two_d/test_whittaker_utils.py
+++ b/tests/two_d/test_whittaker_utils.py
@@ -157,8 +157,8 @@ def test_compare_to_psplines(data_fixture2d, lam, diff_order, use_banded, use_lo
     )
 
     # sanity check to ensure it was set up correctly
-    assert_array_equal(pspline.basis_x.shape, (len(x), len(x)))
-    assert_array_equal(pspline.basis_z.shape, (len(z)), len(z))
+    assert_array_equal(pspline.basis_r.shape, (len(x), len(x)))
+    assert_array_equal(pspline.basis_c.shape, (len(z), len(z)))
 
     whittaker_system = _whittaker_utils.PenalizedSystem2D(
         y.shape, lam=lam, diff_order=diff_order, use_banded=use_banded, use_lower=use_lower

From 72855bcd422ce0c4e7304f4237d3e5a598a10faa Mon Sep 17 00:00:00 2001
From: Donnie Erb <55961724+derb12@users.noreply.github.com>
Date: Sat, 3 Feb 2024 15:15:12 -0500
Subject: [PATCH 38/56] MAINT: Remove the 2d version of goldindec

The exit criterion is based on a value from the publication, which does not make 
sense to apply for 2d data. --- pybaselines/two_d/polynomial.py | 199 -------------------------------- tests/two_d/test_polynomial.py | 165 -------------------------- 2 files changed, 364 deletions(-) diff --git a/pybaselines/two_d/polynomial.py b/pybaselines/two_d/polynomial.py index b2f5145..2303426 100644 --- a/pybaselines/two_d/polynomial.py +++ b/pybaselines/two_d/polynomial.py @@ -597,205 +597,6 @@ def quant_reg(self, data, poly_order=2, quantile=0.05, tol=1e-6, max_iter=250, return baseline, params - @_Algorithm2D._register( - sort_keys=('weights',), reshape_baseline=True, reshape_keys=('weights',) - ) - def goldindec(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, - cost_function='asymmetric_indec', peak_ratio=0.5, alpha_factor=0.99, - tol_2=1e-3, tol_3=1e-6, max_iter_2=100, return_coef=False, max_cross=None): - """ - Fits a polynomial baseline using a non-quadratic cost function. - - The non-quadratic cost functions penalize residuals with larger values, - giving a more robust fit compared to normal least-squares. - - Parameters - ---------- - data : array-like, shape (M, N) - The y-values of the measured data. - poly_order : int or Sequence[int, int], optional - The polynomial orders for x and z. If a single value, will use that for both x and - z. Default is 2. - tol : float, optional - The exit criteria for the fitting with a given threshold value. Default is 1e-3. - max_iter : int, optional - The maximum number of iterations for fitting a threshold value. Default is 250. - weights : array-like, shape (M, N), optional - The weighting array. If None (default), then will be an array with - shape equal to (M, N) and all values set to 1. - cost_function : str, optional - The non-quadratic cost function to minimize. Unlike :func:`.penalized_poly`, - this function only works with asymmetric cost functions, so the symmetry prefix - ('a' or 'asymmetric') is optional (eg. 'indec' and 'a_indec' are the same). Default - is 'asymmetric_indec'. Available methods, and their associated reference, are: - - * 'asymmetric_indec'[43]_ - * 'asymmetric_truncated_quadratic'[44]_ - * 'asymmetric_huber'[44]_ - - peak_ratio : float, optional - A value between 0 and 1 that designates how many points in the data belong - to peaks. Values are valid within ~10% of the actual peak ratio. Default is 0.5. - alpha_factor : float, optional - A value between 0 and 1 that controls the value of the penalty. Default is - 0.99. Typically should not need to change this value. - tol_2 : float, optional - The exit criteria for the difference between the optimal up-down ratio (number of - points above 0 in the residual compared to number of points below 0) and the up-down - ratio for a given threshold value. Default is 1e-3. - tol_3 : float, optional - The exit criteria for the relative change in the threshold value. Default is 1e-6. - max_iter_2 : float, optional - The number of iterations for iterating between different threshold values. - Default is 100. - return_coef : bool, optional - If True, will convert the polynomial coefficients for the fit baseline to - a form that fits the x and z values and return them in the params dictionary. - Default is False, since the conversion takes time. - max_cross: int, optional - The maximum degree for the cross terms. For example, if `max_cross` is 1, then - `x z**2`, `x**2 z`, and `x**2 z**2` would all be set to 0. Default is None, which - does not limit the cross terms. - - Returns - ------- - baseline : numpy.ndarray, shape (M, N) - The calculated baseline. 
- params : dict - A dictionary with the following items: - - * 'weights': numpy.ndarray, shape (M, N) - The weight array used for fitting the data. - * 'tol_history': numpy.ndarray, shape (J, K) - An array containing the calculated tolerance values for each iteration - of both threshold values and fit values. Index 0 are the tolerence values - for the difference in up-down ratios, index 1 are the tolerance values for - the relative change in the threshold, and indices >= 2 are the tolerance values - for each fit. All values that were not used in fitting have values of 0. Shape J - is 2 plus the number of iterations for the threshold to converge (related to - `max_iter_2`, `tol_2`, `tol_3`), and shape K is the maximum of the number of - iterations for the threshold and the maximum number of iterations for all of - the fits of the various threshold values (related to `max_iter` and `tol`). - * 'threshold' : float - The optimal threshold value. Could be used in :func:`.penalized_poly` - for fitting other similar data. - * 'coef': numpy.ndarray, shape (``poly_order[0] + 1``, ``poly_order[1] + 1``) - Only if `return_coef` is True. The array of polynomial parameters - for the baseline, in increasing order. Can be used to create a - polynomial using :func:`numpy.polynomial.polynomial.polyval2d`. - - Raises - ------ - ValueError - Raised if `alpha_factor` or `peak_ratio` are not between 0 and 1, or if the - specified cost function is symmetric. - - References - ---------- - .. [43] Liu, J., et al. Goldindec: A Novel Algorithm for Raman Spectrum Baseline - Correction. Applied Spectroscopy, 2015, 69(7), 834-842. - .. [44] Mazet, V., et al. Background removal from spectra by designing and - minimising a non-quadratic cost function. Chemometrics and Intelligent - Laboratory Systems, 2005, 76(2), 121-133. 
- - """ - if not 0 < alpha_factor <= 1: - raise ValueError('alpha_factor must be between 0 and 1') - elif not 0 < peak_ratio < 1: - raise ValueError('peak_ratio must be between 0 and 1') - try: - symmetric_loss, method = _identify_loss_method(cost_function) - except ValueError: # do not require a prefix since cost must be asymmetric - symmetric_loss, method = _identify_loss_method('a_' + cost_function) - if symmetric_loss: - # symmetric cost functions don't work due to how the up-down ratio vs - # peak_ratio function was created in the reference; in theory, could simulate - # spectra with both positive and negative peaks following the reference - # and build another empirical function, but would likely need to also - # add other parameters detailing the percent of positive vs negative peaks, - # etc., so it's not worth the effort - raise ValueError('goldindec only works for asymmetric cost functions') - - loss_function = { - 'huber': _huber_loss, - 'truncated_quadratic': _truncated_quadratic_loss, - 'indec': _indec_loss - }[method] - y, weight_array, pseudo_inverse = self._setup_polynomial( - data, weights, poly_order, calc_vander=True, calc_pinv=True, max_cross=max_cross - ) - up_down_ratio_goal = ( - 0.7679 + 11.2358 * peak_ratio - 39.7064 * peak_ratio**2 + 92.3583 * peak_ratio**3 - ) - # TODO reference states threshold must be <= 2 for half-quadratic minimization to - # be valid for indec cost function, and normalized y so that threshold is always <= 2; - # however, it seems to work fine without normalization; just be aware in case errors - # occur, may have to normalize y in both this function and penalized_poly - sqrt_w = np.sqrt(weight_array) - y_fit = sqrt_w * y - - coef = pseudo_inverse @ y_fit - initial_baseline = self.vandermonde @ coef - - a = 0 - # reference used b=1, but normalized y before fitting; instead, set b as max of - # initial residual - b = abs((y - initial_baseline).max()) - threshold = a + 0.618 * (b - a) - loss_kwargs = { - 'threshold': threshold, 'alpha_factor': alpha_factor, - 'symmetric': symmetric_loss - } - # have to use zeros rather than empty for tol_history since each inner fit may - # have a different number of iterations - tol_history = np.zeros((max_iter_2 + 2, max(max_iter, max_iter_2))) - j_max = 0 - for i in range(max_iter_2): - baseline = initial_baseline - for j in range(max_iter): - baseline_old = baseline - coef = pseudo_inverse @ ( - y_fit + loss_function(y_fit - sqrt_w * baseline, **loss_kwargs) - ) - baseline = self.vandermonde @ coef - calc_difference = relative_difference(baseline_old, baseline) - tol_history[i + 2, j] = calc_difference - if calc_difference < tol: - break - if j > j_max: - j_max = j - - up_count = (y > baseline).sum() - up_down_ratio = up_count / max(1, self._len[0] * self._len[1] - up_count) - calc_difference = up_down_ratio - up_down_ratio_goal - tol_history[0, i] = calc_difference - if calc_difference > tol_2: - a = threshold - elif calc_difference < -tol_2: - b = threshold - else: - break - threshold = a + 0.618 * (b - a) - # this exit criteria was not stated in the reference, but the change in threshold - # becomes zero fairly quickly, so need to also exit rather than needlessly - # continuing to calculate with the same threshold value - calc_difference = relative_difference(loss_kwargs['threshold'], threshold) - tol_history[1, i] = calc_difference - if calc_difference < tol_3: - break - loss_kwargs['threshold'] = threshold - - params = { - 'weights': weight_array, 'tol_history': tol_history[:i + 3, :max(i, j_max) + 1], 
- 'threshold': loss_kwargs['threshold'] - } - if return_coef: - params['coef'] = _convert_coef2d( - coef, self.poly_order[0], self.poly_order[1], self.x_domain, self.z_domain - ) - - return baseline, params - # adapted from (https://www.mathworks.com/matlabcentral/fileexchange/27429-background-correction); # see license above diff --git a/tests/two_d/test_polynomial.py b/tests/two_d/test_polynomial.py index e6bff56..ef84ad4 100644 --- a/tests/two_d/test_polynomial.py +++ b/tests/two_d/test_polynomial.py @@ -264,168 +264,3 @@ def test_compare_to_statsmodels(self, quantile): assert_allclose( output[0].ravel(), STATSMODELS_QUANTILES_2D[quantile], rtol=1e-5, atol=1e-10 ) - - -class TestGoldindec(PolynomialTester): - """Class for testing goldindec baseline.""" - - func_name = 'goldindec' - checked_keys = ('weights', 'tol_history', 'threshold') - - @pytest.mark.parametrize('new_instance', (True, False)) - @pytest.mark.parametrize( - 'cost_function', - ( - 'asymmetric_truncated_quadratic', - 'a_truncated_quadratic', - 'asymmetric_huber', - 'asymmetric_indec', - 'indec', - 'huber', - 'truncated_quadratic' - ) - ) - def test_unchanged_data(self, new_instance, cost_function): - """Ensures that input data is unchanged by the function.""" - super().test_unchanged_data(new_instance, cost_function=cost_function) - - @pytest.mark.parametrize('cost_function', ('p_huber', '')) - def test_unknown_cost_function_prefix_fails(self, cost_function): - """Ensures cost function with no prefix or a wrong prefix fails.""" - with pytest.raises(KeyError): - self.class_func(self.y, cost_function=cost_function) - - @pytest.mark.parametrize('cost_function', ('s_huber', 's_indec', 'symmetric_indec')) - def test_symmetric_cost_function_fails(self, cost_function): - """Ensures a symmetric cost function fails.""" - with pytest.raises(ValueError): - self.class_func(self.y, cost_function=cost_function) - - def test_unknown_cost_function_fails(self): - """Ensures than an unknown cost function fails.""" - with pytest.raises(KeyError): - self.class_func(self.y, cost_function='a_hub') - - @pytest.mark.parametrize('weight_enum', (0, 1, 2, 3)) - def test_weighting(self, weight_enum): - """ - Tests that weighting is correctly applied by comparing to other algorithms. - - Weights were not included in the original goldindec method, so need to ensure - that their usage in pybaselines is correct. - - For uniform weights, the reference baseline is simply the unweighted calculation, - since they should be equal. For non-uniform weights, compare to the output of - penalized_poly, whose weighting is correctly tested, using the output optimal - threshold. 
- - """ - if weight_enum == 0: - # all weights = 1 - weights = None - uniform_weights = True - elif weight_enum == 1: - # same as all weights = 1, but would cause issues if weights were - # incorrectly multiplied - weights = np.full_like(self.y, 2) - uniform_weights = True - elif weight_enum == 2: - # binary mask, only fitting the first half of the data - weights = np.ones_like(self.y) - weights[self.x < 0.5 * (np.max(self.x) + np.min(self.x))] = 0 - uniform_weights = False - else: - # weight array where the two endpoints have weighting >> 1 - weights = np.ones_like(self.y) - fraction = max(1, ceil(self.y.shape[0] * 0.1)) - weights[:fraction] = 100 - weights[-fraction:] = 100 - uniform_weights = False - - poly_order = 2 - fit_baseline, params = self.class_func(self.y, poly_order=poly_order, weights=weights) - if uniform_weights: - reference_baseline = self.class_func(self.y, poly_order=poly_order)[0] - else: - reference_baseline = polynomial._Polynomial(self.x, self.z).penalized_poly( - self.y, poly_order=poly_order, weights=weights, - threshold=params['threshold'], cost_function='a_indec' - )[0] - - assert_allclose(fit_baseline, reference_baseline) - - @pytest.mark.parametrize('exit_enum', (0, 1, 2, 3)) - def test_tol_history(self, exit_enum): - """ - Ensures the 'tol_history' item in the parameter output is correct. - - Since the shape of 'tol_history' is dictated by the number of iterations - completed for fitting each threshold value and for iterating between - threshold values, need to ensure each exit criteria works independently. - - """ - if exit_enum == 0: - # inner fitting does more iterations - max_iter = 15 - tol = -1 - max_iter_2 = 10 - tol_2 = 0 - tol_3 = -1 - - expected_shape_0 = max_iter_2 + 2 - expected_shape_1 = max_iter - - if exit_enum == 1: - # outer fitting does more iterations - max_iter = 15 - tol = 1e6 - max_iter_2 = 10 - tol_2 = 0 - tol_3 = -1 - - expected_shape_0 = max_iter_2 + 2 - expected_shape_1 = max_iter_2 - - if exit_enum == 2: - # only one iteration completed; exits due to tol_2 - max_iter = 15 - tol = 1e6 - max_iter_2 = 10 - tol_2 = 1e6 - tol_3 = -1 - - expected_shape_0 = 3 - expected_shape_1 = 1 - - if exit_enum == 3: - # only one iteration completed; exits due to tol_3 - max_iter = 15 - tol = 1e6 - max_iter_2 = 10 - tol_2 = 0 - tol_3 = 1e6 - - expected_shape_0 = 3 - expected_shape_1 = 1 - - _, params = self.class_func( - self.y, max_iter=max_iter, tol=tol, max_iter_2=max_iter_2, - tol_2=tol_2, tol_3=tol_3 - ) - - assert params['tol_history'].shape[0] == expected_shape_0 - assert params['tol_history'].shape[1] == expected_shape_1 - - @pytest.mark.parametrize('alpha_factor', (-0.1, 0, 1.01)) - def test_wrong_alpha_factor_fails(self, alpha_factor): - """Ensures an alpha factor outside of (0, 1] fails.""" - with pytest.raises(ValueError): - self.class_func(self.y, alpha_factor=alpha_factor) - - @pytest.mark.parametrize('peak_ratio', (-0.1, 0, 1, 1.01)) - def test_wrong_peak_ratio_fails(self, peak_ratio): - """Ensures a peak ratio outside of (0, 1) fails.""" - with pytest.raises(ValueError): - self.class_func(self.y, peak_ratio=peak_ratio) - - From b288ddb66ba4ef7ffad63d73c3d5d1b27156d9d7 Mon Sep 17 00:00:00 2001 From: Donnie Erb <55961724+derb12@users.noreply.github.com> Date: Sun, 4 Feb 2024 15:58:13 -0500 Subject: [PATCH 39/56] MAINT: Update CI and drop python 3.6 and 3.7 --- .github/workflows/python-test.yml | 31 +++++++++++++------------------ setup.cfg | 10 ++++------ 2 files changed, 17 insertions(+), 24 deletions(-) diff --git 
a/.github/workflows/python-test.yml b/.github/workflows/python-test.yml index 6b7ffcd..5857446 100644 --- a/.github/workflows/python-test.yml +++ b/.github/workflows/python-test.yml @@ -26,25 +26,25 @@ on: jobs: test: - runs-on: ubuntu-20.04 + runs-on: ubuntu-latest strategy: fail-fast: false matrix: # Use strings since yaml considers 3.10 equal to 3.1 - python-version: ['3.6', '3.7', '3.8', '3.9', '3.10', '3.11', '3.12'] + python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install required dependencies run: | python -m pip install --upgrade pip - python -m pip install "numpy>=1.14" "scipy>=1.0" pytest + python -m pip install "numpy>=1.18" "scipy>=1.5" pytest # Only lint a single version; pick a recent, stable version - name: Install linting dependencies @@ -67,8 +67,8 @@ jobs: - name: Install optional dependencies id: install-optional # uncomment below in case this step ever needs skipped again - if: matrix.python-version != '3.12' - run: python -m pip install "pentapy>=1.0" "numba>=0.45" + #if: matrix.python-version != '3.13' + run: python -m pip install "pentapy>=1.0" "numba>=0.49" - name: Test with optional dependencies # uncomment below in case this step ever needs skipped again @@ -78,35 +78,30 @@ jobs: test-min-dependencies: - runs-on: ubuntu-20.04 + runs-on: ubuntu-latest strategy: fail-fast: false matrix: - python-version: ['3.6'] + python-version: ['3.8'] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install minimum dependencies - # Use numpy 1.14.5 rather than 1.14.0 since the optional - # dependency pentapy requires numpy>=1.14.5; no relevant difference - # between 1.14.0 and 1.14.5. run: | python -m pip install --upgrade pip - python -m pip install numpy==1.14.5 scipy==1.0 pytest + python -m pip install numpy==1.18 scipy==1.5 pytest - name: Test with minimum required dependencies run: pytest . - name: Install minimum optional dependencies - # Have to pin llvmlite to 0.30.0 since it otherwise gets a more recent - # version that is imcompatible with numba v0.45 - run: python -m pip install pentapy==1.0 numba==0.45 llvmlite==0.30.0 + run: python -m pip install pentapy==1.0 numba==0.49 - name: Test with minimum optional dependencies run: pytest . 
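A quick way to sanity-check a local environment against the minimum versions pinned above
(an illustrative sketch, not part of the repository; the naive parsing below ignores
pre-release versions, for which the `packaging` library would be more robust):

    from importlib.metadata import PackageNotFoundError, version

    # minimum versions exercised by the CI job above
    minimums = {'numpy': (1, 18), 'scipy': (1, 5), 'pentapy': (1, 0), 'numba': (0, 49)}
    for name, minimum in minimums.items():
        try:
            installed = tuple(int(part) for part in version(name).split('.')[:2])
        except PackageNotFoundError:
            print(f'{name}: not installed (pentapy and numba are optional)')
            continue
        status = 'OK' if installed >= minimum else f'needs >= {minimum}'
        print(f'{name} {installed}: {status}')
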
diff --git a/setup.cfg b/setup.cfg index 9b476ca..0cfcf05 100644 --- a/setup.cfg +++ b/setup.cfg @@ -17,8 +17,6 @@ classifiers = License :: OSI Approved :: BSD License Operating System :: OS Independent Programming Language :: Python :: 3 - Programming Language :: Python :: 3.6 - Programming Language :: Python :: 3.7 Programming Language :: Python :: 3.8 Programming Language :: Python :: 3.9 Programming Language :: Python :: 3.10 @@ -45,16 +43,16 @@ project_urls = [options] packages = find: include_package_data = True -python_requires = >=3.6 +python_requires = >=3.8 install_requires = - numpy>=1.17 - scipy>=1.0 + numpy>=1.18 + scipy>=1.5 zip_safe = False [options.extras_require] full = pentapy>=1.0 - numba>=0.45 + numba>=0.49 [options.packages.find] include = pybaselines, pybaselines.* From 3baaa9753347d3428b3023ca533fb2161954a966 Mon Sep 17 00:00:00 2001 From: Donnie Erb <55961724+derb12@users.noreply.github.com> Date: Sun, 4 Feb 2024 16:01:07 -0500 Subject: [PATCH 40/56] DOCS: Add algorithm section for 2D --- docs/_templates/autoapi/python/package.rst | 34 +-- docs/algorithms/index.rst | 20 +- docs/algorithms/polynomial.rst | 2 +- docs/algorithms/whittaker.rst | 26 +-- docs/algorithms_2d/index.rst | 26 +++ docs/algorithms_2d/morphological_2d.rst | 123 +++++++++++ docs/algorithms_2d/optimizers_2d.rst | 77 +++++++ docs/algorithms_2d/polynomial_2d.rst | 159 ++++++++++++++ docs/algorithms_2d/smooth_2d.rst | 76 +++++++ docs/algorithms_2d/spline_2d.rst | 182 ++++++++++++++++ docs/algorithms_2d/whittaker_2d.rst | 228 +++++++++++++++++++++ docs/index.rst | 1 + pybaselines/spline.py | 22 +- pybaselines/two_d/__init__.py | 1 - pybaselines/two_d/spline.py | 12 +- pybaselines/utils.py | 26 ++- tests/test_utils.py | 35 +++- 17 files changed, 992 insertions(+), 58 deletions(-) create mode 100644 docs/algorithms_2d/index.rst create mode 100644 docs/algorithms_2d/morphological_2d.rst create mode 100644 docs/algorithms_2d/optimizers_2d.rst create mode 100644 docs/algorithms_2d/polynomial_2d.rst create mode 100644 docs/algorithms_2d/smooth_2d.rst create mode 100644 docs/algorithms_2d/spline_2d.rst create mode 100644 docs/algorithms_2d/whittaker_2d.rst diff --git a/docs/_templates/autoapi/python/package.rst b/docs/_templates/autoapi/python/package.rst index 592ccd0..29155bc 100644 --- a/docs/_templates/autoapi/python/package.rst +++ b/docs/_templates/autoapi/python/package.rst @@ -14,22 +14,6 @@ {% endif %} -{% block subpackages %} -{% set visible_subpackages = obj.subpackages|selectattr("display")|list %} -{% if visible_subpackages %} -Subpackages ------------ -.. toctree:: - :titlesonly: - :maxdepth: 3 - -{% for subpackage in visible_subpackages %} - {{ subpackage.short_name }}/index.rst -{% endfor %} - - -{% endif %} -{% endblock %} {% block submodules %} {% set visible_submodules = obj.submodules|selectattr("display")|list %} {% if visible_submodules %} @@ -47,6 +31,24 @@ Submodules {% endif %} {% endblock %} + +{% block subpackages %} +{% set visible_subpackages = obj.subpackages|selectattr("display")|list %} +{% if visible_subpackages %} +Subpackages +----------- +.. 
toctree:: + :titlesonly: + :maxdepth: 3 + +{% for subpackage in visible_subpackages %} + {{ subpackage.short_name }}/index.rst +{% endfor %} + + +{% endif %} +{% endblock %} + {% block content %} {% if obj.all is not none %} {% set visible_children = obj.children|selectattr("short_name", "in", obj.all)|list %} diff --git a/docs/algorithms/index.rst b/docs/algorithms/index.rst index c4416d2..2b044df 100644 --- a/docs/algorithms/index.rst +++ b/docs/algorithms/index.rst @@ -2,15 +2,23 @@ Algorithms ========== -The currently available baseline correction algorithms in pybaselines are split into -polynomial, whittaker, morphological, smooth, spline, classification, optimizers, -and miscellaneous (misc). Note that this is more for grouping code and not meant as -a hard-classification of the algorithms or the general field of baseline correction. +The currently available baseline correction algorithms in pybaselines can broadly be categorized +as polynomial, whittaker, morphological, smooth, spline, classification, optimizers, +and miscellaneous (misc) methods. Note that this is simply for grouping code and helping to +explain the internals of this library and **NOT** meant as a hard-classification of the +field of baseline correction (Please stop blindly copying this section in papers. There are +numerous types of baseline correction algorithms that are not included within pybaselines, which +is why baseline correction in general is such an absolutely fascinating field! Besides, miscellaneous +is obviously not an actual type of baseline correction...) This section of the documentation is to help provide some context for each algorithm. In addition, most algorithms will have a figure that shows how well the algorithm fits -various baselines to help choose the correct algorithm for a particular baseline. Refer -to the :doc:`API section <../api/index>` of the documentation for the full parameter and +various datasets to help choose the correct algorithm for a particular baseline. These datasets +include noisy data, data with both positive and negative peaks, data with overlapping peaks, +and concave data, and they serve as a way to quickly filter out algorithms that would not +work for a particular dataset. + +Refer to the :doc:`API section <../api/index>` of the documentation for the full parameter and reference listing for any algorithm. diff --git a/docs/algorithms/polynomial.rst b/docs/algorithms/polynomial.rst index e57e706..75d2961 100644 --- a/docs/algorithms/polynomial.rst +++ b/docs/algorithms/polynomial.rst @@ -35,7 +35,7 @@ thresholding, or 3) penalyzing outliers. Selective Masking ~~~~~~~~~~~~~~~~~ -Selective masking is the oldest and most basic of the techniques. There +Selective masking is the simplest of the techniques. There are two ways to use selective masking in pybaselines. First, the input dataset can be trimmed/masked (easy to do with numpy) to not diff --git a/docs/algorithms/whittaker.rst b/docs/algorithms/whittaker.rst index 2ef9096..f1ed4d6 100644 --- a/docs/algorithms/whittaker.rst +++ b/docs/algorithms/whittaker.rst @@ -8,13 +8,15 @@ algorithms for fitting the baseline. 
Introduction ------------ -Whittaker-smoothing-based (WSB) algorithms are usually referred to in literature +Whittaker-smoothing-based algorithms are usually referred to in literature as weighted least squares, penalized least squares, or asymmetric least squares, -but are referred to as WSB in pybaselines to distinguish them from polynomial +but are referred to as Whittaker-smoothing-based in pybaselines to distinguish them from polynomial techniques that also take advantage of weighted least squares (like :meth:`~.Baseline.loess`) and penalized least squares (like :meth:`~.Baseline.penalized_poly`). -The general idea behind WSB algorithms is to make the baseline match the measured +A great introduction to Whittaker smoothing is Paul Eilers's +`A Perfect Smoother paper `_. The general idea behind Whittaker +smoothing algorithms is to make the baseline match the measured data as well as it can while also penalizing the roughness of the baseline. The resulting general function that is minimized to determine the baseline is then @@ -57,21 +59,21 @@ and :math:`D_2` (second order difference matrix) is: 0 & 0 & 1 & -2 & 1 \\ \end{bmatrix} -Most WSB techniques recommend using the second order difference matrix, although -some techniques use both the first and second order difference matrices. +Most Whittaker-smoothing-based techniques recommend using the second order difference matrix, +although some techniques use both the first and second order difference matrices. The baseline is iteratively calculated using the linear system above by solving for the baseline, :math:`z`, updating the weights, solving for the baseline using the new weights, and repeating until some exit criteria. -The difference between WSB algorithms is the selection of weights and/or the -function that is minimized. +The difference between Whittaker-smoothing-based algorithms is the selection of weights +and/or the function that is minimized. .. note:: - The :math:`\lambda` (``lam``) value required to fit a particular baseline for all WSB - methods will increase as the number of data points increases, with the relationship - being roughly :math:`\log(\lambda) \propto \log(\text{number of data points})`. For example, - a ``lam`` value of :math:`10^3` that fits a dataset with 100 points may have to be :math:`10^7` - to fit the same data with 1000 points, and :math:`10^{11}` for 10000 points. + The :math:`\lambda` (``lam``) value required to fit a particular baseline for all + Whittaker-smoothing-based methods will increase as the number of data points increases, with + the relationship being roughly :math:`\log(\lambda) \propto \log(\text{number of data points})`. + For example, a ``lam`` value of :math:`10^3` that fits a dataset with 100 points may have to + be :math:`10^7` to fit the same data with 1000 points, and :math:`10^{11}` for 10000 points. Algorithms diff --git a/docs/algorithms_2d/index.rst b/docs/algorithms_2d/index.rst new file mode 100644 index 0000000..8720c68 --- /dev/null +++ b/docs/algorithms_2d/index.rst @@ -0,0 +1,26 @@ +============= +2D Algorithms +============= + +pybaselines extends a subset of the 1D baseline correction algorithms to work with +2D data. Note that this is only intended for data in which there is some global baseline; +otherwise, it is more appropriate and usually significantly faster to simply use the 1D +algorithms on each individual row and/or column in the data. 
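+
+As a hedged sketch of that advice (hypothetical shapes, ``lam`` values, and random
+stand-in data), the snippet below fits the same matrix globally with ``Baseline2D`` and
+row-by-row with the 1D ``Baseline`` class:
+
+.. code-block:: python
+
+    import numpy as np
+    from pybaselines import Baseline, Baseline2D
+
+    x = np.linspace(-20, 20, 80)
+    z = np.linspace(-20, 20, 60)
+    y = np.random.default_rng(0).normal(size=(len(x), len(z)))  # stand-in data
+
+    # global 2D baseline: a single fit of the entire matrix
+    baseline_2d, params_2d = Baseline2D(x, z).asls(y, lam=(1e5, 1e4))
+
+    # independent 1D baselines: usually faster and more appropriate when the
+    # rows do not share a global baseline
+    fitter = Baseline(z)
+    baseline_per_row = np.array([fitter.asls(row, lam=1e5)[0] for row in y])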
+ +This section of the documentation is to help provide some context for how the algorithms +were extended to work with two dimensional data. It will not be as comprehensive as the +:doc:`1D Algorithms section <../algorithms/index>`, so to help understand any algorithm, +it is suggested to start there. Refer to the :doc:`API section <../api/index>` of the +documentation for the full parameter and reference listing for any algorithm. + + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + polynomial_2d + whittaker_2d + morphological_2d + spline_2d + smooth_2d + optimizers_2d diff --git a/docs/algorithms_2d/morphological_2d.rst b/docs/algorithms_2d/morphological_2d.rst new file mode 100644 index 0000000..c150a33 --- /dev/null +++ b/docs/algorithms_2d/morphological_2d.rst @@ -0,0 +1,123 @@ +======================= +Morphological Baselines +======================= + +.. note:: + All morphological algorithms use a ``half_window`` parameter to define the size + of the window used for the morphological operators. ``half_window`` is index-based, + rather than based on the units of the data, so proper conversions must be done + by the user to get the desired window size. + + +Algorithms +---------- + +mor (Morphological) +~~~~~~~~~~~~~~~~~~~ + +:meth:`~.Baseline2D.mor`: +:ref:`explanation for the algorithm `. + +.. plot:: + :align: center + :context: reset + + import numpy as np + import matplotlib.pyplot as plt + from pybaselines.utils import gaussian2d + from pybaselines import Baseline2D + + + def create_data(): + x = np.linspace(-20, 20, 80) + z = np.linspace(-20, 20, 80) + X, Z = np.meshgrid(x, z, indexing='ij') + signal = ( + gaussian2d(X, Z, 12, -9, -9) + + gaussian2d(X, Z, 11, 3, 3) + + gaussian2d(X, Z, 13, 11, 11) + + gaussian2d(X, Z, 8, 5, -11, 1.5, 1) + + gaussian2d(X, Z, 16, -8, 8) + ) + baseline = 0.1 + 0.08 * X - 0.05 * Z + 0.005 * (Z + 20)**2 + noise = np.random.default_rng(0).normal(scale=0.1, size=signal.shape) + y = signal + baseline + noise + + return x, z, y, baseline + + + def create_plots(y, fit_baseline): + X, Z = np.meshgrid( + np.arange(y.shape[0]), np.arange(y.shape[1]), indexing='ij' + ) + + # 4 total plots: 2 countours and 2 projections + row_names = ('Raw Data', 'Baseline Corrected') + for i, dataset in enumerate((y, y - fit_baseline)): + fig = plt.figure(layout='constrained', figsize=plt.figaspect(0.5)) + fig.suptitle(row_names[i]) + ax = fig.add_subplot(1 ,2, 2) + ax.contourf(X, Z, dataset, cmap='coolwarm') + ax.set_xticks([]) + ax.set_yticks([]) + ax_2 = fig.add_subplot(1, 2, 1, projection='3d') + ax_2.plot_surface(X, Z, dataset, cmap='coolwarm') + ax_2.set_xticks([]) + ax_2.set_yticks([]) + ax_2.set_zticks([]) + if i == 0: + pass#ax.set_title('Contours') + #ax_2.set_title('3D Projections') + + + x, z, y, real_baseline = create_data() + baseline_fitter = Baseline2D(x, z, check_finite=False) + + baseline, params = baseline_fitter.mor(y, half_window=(6, 4)) + create_plots(y, baseline) + + +imor (Improved Morphological) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:meth:`~.Baseline2D.imor`: +:ref:`explanation for the algorithm `. + +.. plot:: + :align: center + :context: close-figs + + # to see contents of create_data function, look at the top-most algorithm's code + baseline, params = baseline_fitter.imor(y, half_window=(4, 2), tol=5e-3) + create_plots(y, baseline) + + +rolling_ball (Rolling Ball) +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:meth:`~.Baseline2D.rolling_ball`: +:ref:`explanation for the algorithm `. + +.. 
plot::
+    :align: center
+    :context: close-figs
+
+    # to see contents of create_data function, look at the top-most algorithm's code
+    baseline, params = baseline_fitter.rolling_ball(y, half_window=(8, 5), smooth_half_window=3)
+    create_plots(y, baseline)
+
+
+tophat (Top-hat Transformation)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+:meth:`~.Baseline2D.tophat`:
+:ref:`explanation for the algorithm `.
+
+
+.. plot::
+    :align: center
+    :context: close-figs
+
+    # to see contents of create_data function, look at the top-most algorithm's code
+    baseline, params = baseline_fitter.tophat(y, half_window=(8, 5))
+    create_plots(y, baseline)
diff --git a/docs/algorithms_2d/optimizers_2d.rst b/docs/algorithms_2d/optimizers_2d.rst
new file mode 100644
index 0000000..84be92f
--- /dev/null
+++ b/docs/algorithms_2d/optimizers_2d.rst
@@ -0,0 +1,77 @@
+===================
+Optimizer Baselines
+===================
+
+Algorithms
+----------
+
+collab_pls (Collaborative Penalized Least Squares)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+:meth:`~.Baseline2D.collab_pls`:
+:ref:`explanation for the algorithm `.
+There is no figure showing a fit for this method since it requires multiple sets of data.
+
+adaptive_minmax (Adaptive MinMax)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+:meth:`~.Baseline2D.adaptive_minmax`:
+:ref:`explanation for the algorithm `.
+
+.. plot::
+    :align: center
+    :context: reset
+
+    import numpy as np
+    import matplotlib.pyplot as plt
+    from pybaselines.utils import gaussian2d
+    from pybaselines import Baseline2D
+
+
+    def create_data():
+        x = np.linspace(-20, 20, 80)
+        z = np.linspace(-20, 20, 80)
+        X, Z = np.meshgrid(x, z, indexing='ij')
+        signal = (
+            gaussian2d(X, Z, 12, -9, -9)
+            + gaussian2d(X, Z, 11, 3, 3)
+            + gaussian2d(X, Z, 13, 11, 11)
+            + gaussian2d(X, Z, 8, 5, -11, 1.5, 1)
+            + gaussian2d(X, Z, 16, -8, 8)
+        )
+        baseline = 0.1 + 0.08 * X - 0.05 * Z + 0.005 * (Z + 20)**2
+        noise = np.random.default_rng(0).normal(scale=0.1, size=signal.shape)
+        y = signal + baseline + noise
+
+        return x, z, y, baseline
+
+
+    def create_plots(y, fit_baseline):
+        X, Z = np.meshgrid(
+            np.arange(y.shape[0]), np.arange(y.shape[1]), indexing='ij'
+        )
+
+        # 4 total plots: 2 countours and 2 projections
+        row_names = ('Raw Data', 'Baseline Corrected')
+        for i, dataset in enumerate((y, y - fit_baseline)):
+            fig = plt.figure(layout='constrained', figsize=plt.figaspect(0.5))
+            fig.suptitle(row_names[i])
+            ax = fig.add_subplot(1 ,2, 2)
+            ax.contourf(X, Z, dataset, cmap='coolwarm')
+            ax.set_xticks([])
+            ax.set_yticks([])
+            ax_2 = fig.add_subplot(1, 2, 1, projection='3d')
+            ax_2.plot_surface(X, Z, dataset, cmap='coolwarm')
+            ax_2.set_xticks([])
+            ax_2.set_yticks([])
+            ax_2.set_zticks([])
+            if i == 0:
+                pass#ax.set_title('Contours')
+                #ax_2.set_title('3D Projections')
+
+
+    x, z, y, real_baseline = create_data()
+    baseline_fitter = Baseline2D(x, z, check_finite=False)
+
+    baseline, params = baseline_fitter.adaptive_minmax(y, poly_order=(2, 3))
+    create_plots(y, baseline)
diff --git a/docs/algorithms_2d/polynomial_2d.rst b/docs/algorithms_2d/polynomial_2d.rst
new file mode 100644
index 0000000..1c60bff
--- /dev/null
+++ b/docs/algorithms_2d/polynomial_2d.rst
@@ -0,0 +1,159 @@
+====================
+Polynomial Baselines
+====================
+
+Introduction
+------------
+
+In 2D, a polynomial can be expressed as
+
+.. 
math::
+
+    p(x, z) = \sum\limits_{i = 0}^{d_r} \sum\limits_{j = 0}^{d_c} {\beta_{i, j} x^i z^j}
+
+where :math:`\beta` is the matrix of coefficients for the polynomial and :math:`d_r`
+and :math:`d_c` are the polynomial degrees for the rows (:math:`x`) and
+columns (:math:`z`), respectively.
+
+For regular polynomial fitting, the polynomial coefficients that best fit the data
+are obtained by minimizing the least-squares:
+
+.. math:: \sum\limits_{i}^M \sum\limits_{j}^N w_{ij}^2 (y_{ij} - p(x_i, z_j))^2
+
+where :math:`y_{ij}`, :math:`x_i`, and :math:`z_j` are the measured data, :math:`p(x_i, z_j)` is
+the polynomial estimate at :math:`x_i` and :math:`z_j`, and :math:`w_{ij}` is the weighting.
+
+
+However, since only the baseline of the data is desired, the least-squares
+approach must be modified. For polynomial-based algorithms, this is done
+by 1) only fitting the data in regions where there is only baseline, 2)
+modifying the y-values being fit each iteration, or 3) penalizing outliers.
+
+.. note::
+    For two dimensional data, polynomial algorithms take a single ``poly_order``
+    parameter that can either be a single number, in which case both the rows and columns
+    will use the same polynomial degree, ie. :math:`d_r = d_c`, or a sequence
+    of two numbers (:math:`d_r`, :math:`d_c`) to use different polynomials along
+    the rows and columns. Further, ``max_cross`` can be set to limit the polynomial
+    coefficients for the cross terms.
+
+Algorithms
+----------
+
+poly (Regular Polynomial)
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+:meth:`~.Baseline2D.poly`:
+:ref:`explanation for the algorithm `. No plot
+will be shown since it is just a simple least-squares polynomial fitting.
+
+
+modpoly (Modified Polynomial)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+:meth:`~.Baseline2D.modpoly`:
+:ref:`explanation for the algorithm `.
+
+.. 
plot:: + :align: center + :context: reset + + import numpy as np + import matplotlib.pyplot as plt + from pybaselines.utils import gaussian2d + from pybaselines import Baseline2D + + + def create_data(): + x = np.linspace(-20, 20, 80) + z = np.linspace(-20, 20, 80) + X, Z = np.meshgrid(x, z, indexing='ij') + signal = ( + gaussian2d(X, Z, 12, -9, -9) + + gaussian2d(X, Z, 11, 3, 3) + + gaussian2d(X, Z, 13, 11, 11) + + gaussian2d(X, Z, 8, 5, -11, 1.5, 1) + + gaussian2d(X, Z, 16, -8, 8) + ) + baseline = 0.1 + 0.08 * X - 0.05 * Z + 0.005 * (Z + 20)**2 + noise = np.random.default_rng(0).normal(scale=0.1, size=signal.shape) + y = signal + baseline + noise + + return x, z, y, baseline + + + def create_plots(y, fit_baseline): + X, Z = np.meshgrid( + np.arange(y.shape[0]), np.arange(y.shape[1]), indexing='ij' + ) + + # 4 total plots: 2 countours and 2 projections + row_names = ('Raw Data', 'Baseline Corrected') + for i, dataset in enumerate((y, y - fit_baseline)): + fig = plt.figure(layout='constrained', figsize=plt.figaspect(0.5)) + fig.suptitle(row_names[i]) + ax = fig.add_subplot(1 ,2, 2) + ax.contourf(X, Z, dataset, cmap='coolwarm') + ax.set_xticks([]) + ax.set_yticks([]) + ax_2 = fig.add_subplot(1, 2, 1, projection='3d') + ax_2.plot_surface(X, Z, dataset, cmap='coolwarm') + ax_2.set_xticks([]) + ax_2.set_yticks([]) + ax_2.set_zticks([]) + if i == 0: + pass#ax.set_title('Contours') + #ax_2.set_title('3D Projections') + + + x, z, y, real_baseline = create_data() + baseline_fitter = Baseline2D(x, z, check_finite=False) + + baseline, params = baseline_fitter.modpoly(y, poly_order=(1, 2), max_cross=0) + create_plots(y, baseline) + +imodpoly (Improved Modified Polynomial) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:meth:`~.Baseline2D.imodpoly`: +:ref:`explanation for the algorithm `. + +.. plot:: + :align: center + :context: close-figs + + # to see contents of create_data function, look at the top-most algorithm's code + baseline, params = baseline_fitter.imodpoly(y, poly_order=(1, 2), max_cross=0) + create_plots(y, baseline) + + +penalized_poly (Penalized Polynomial) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:meth:`~.Baseline2D.penalized_poly`: +:ref:`explanation for the algorithm `. + +.. plot:: + :align: center + :context: close-figs + + # to see contents of create_data function, look at the top-most algorithm's code + baseline, params = baseline_fitter.penalized_poly(y, poly_order=(1, 2), max_cross=0) + create_plots(y, baseline) + + +quant_reg (Quantile Regression) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:meth:`~.Baseline2D.quant_reg`: +:ref:`explanation for the algorithm `. + +.. plot:: + :align: center + :context: close-figs + + # to see contents of create_data function, look at the top-most algorithm's code + baseline, params = baseline_fitter.quant_reg( + y, poly_order=(1, 2), max_cross=0, quantile=0.3 + ) + create_plots(y, baseline) diff --git a/docs/algorithms_2d/smooth_2d.rst b/docs/algorithms_2d/smooth_2d.rst new file mode 100644 index 0000000..953211d --- /dev/null +++ b/docs/algorithms_2d/smooth_2d.rst @@ -0,0 +1,76 @@ +=================== +Smoothing Baselines +=================== + +.. note:: + The window size used for smoothing-based algorithms is index-based, rather + than based on the units of the data, so proper conversions must be done + by the user to get the desired window size. + + +Algorithms +---------- + +noise_median (Noise Median method) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:meth:`~.Baseline2D.noise_median`: +:ref:`explanation for the algorithm `. + +.. 
plot:: + :align: center + :context: reset + + import numpy as np + import matplotlib.pyplot as plt + from pybaselines.utils import gaussian2d + from pybaselines import Baseline2D + + + def create_data(): + x = np.linspace(-20, 20, 80) + z = np.linspace(-20, 20, 80) + X, Z = np.meshgrid(x, z, indexing='ij') + signal = ( + gaussian2d(X, Z, 12, -9, -9) + + gaussian2d(X, Z, 11, 3, 3) + + gaussian2d(X, Z, 13, 11, 11) + + gaussian2d(X, Z, 8, 5, -11, 1.5, 1) + + gaussian2d(X, Z, 16, -8, 8) + ) + baseline = 0.1 + 0.08 * X - 0.05 * Z + 0.005 * (Z + 20)**2 + noise = np.random.default_rng(0).normal(scale=0.1, size=signal.shape) + y = signal + baseline + noise + + return x, z, y, baseline + + + def create_plots(y, fit_baseline): + X, Z = np.meshgrid( + np.arange(y.shape[0]), np.arange(y.shape[1]), indexing='ij' + ) + + # 4 total plots: 2 countours and 2 projections + row_names = ('Raw Data', 'Baseline Corrected') + for i, dataset in enumerate((y, y - fit_baseline)): + fig = plt.figure(layout='constrained', figsize=plt.figaspect(0.5)) + fig.suptitle(row_names[i]) + ax = fig.add_subplot(1 ,2, 2) + ax.contourf(X, Z, dataset, cmap='coolwarm') + ax.set_xticks([]) + ax.set_yticks([]) + ax_2 = fig.add_subplot(1, 2, 1, projection='3d') + ax_2.plot_surface(X, Z, dataset, cmap='coolwarm') + ax_2.set_xticks([]) + ax_2.set_yticks([]) + ax_2.set_zticks([]) + if i == 0: + pass#ax.set_title('Contours') + #ax_2.set_title('3D Projections') + + + x, z, y, real_baseline = create_data() + baseline_fitter = Baseline2D(x, z, check_finite=False) + + baseline, params = baseline_fitter.noise_median(y, half_window=12, smooth_half_window=5) + create_plots(y, baseline) diff --git a/docs/algorithms_2d/spline_2d.rst b/docs/algorithms_2d/spline_2d.rst new file mode 100644 index 0000000..cb34386 --- /dev/null +++ b/docs/algorithms_2d/spline_2d.rst @@ -0,0 +1,182 @@ +================ +Spline Baselines +================ + +Introduction +------------ + +The two dimensional extension of penalized splines (P-splines) for baseline correction +within pybaselines follows the framework of Eilers, Currie, and Durbán +from `[1] `_. The exact equations will be +omitted here (those interested should read the paper, it is very good), but the end result +is that the normal equation for solving the penalized system can be expressed as a +`generalized linear array model `_ +which allows directly using the matrices of the measured data, :math:`Y`, and the weights, +:math:`W`, rather than flattening them, which significantly reduces the required +memory and computation time. + + +Algorithms +---------- + +mixture_model (Mixture Model) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:meth:`~.Baseline2D.mixture_model`: +:ref:`explanation for the algorithm `. + +.. 
plot:: + :align: center + :context: reset + + import numpy as np + import matplotlib.pyplot as plt + from pybaselines.utils import gaussian2d + from pybaselines import Baseline2D + + + def create_data(): + x = np.linspace(-20, 20, 80) + z = np.linspace(-20, 20, 80) + X, Z = np.meshgrid(x, z, indexing='ij') + signal = ( + gaussian2d(X, Z, 12, -9, -9) + + gaussian2d(X, Z, 11, 3, 3) + + gaussian2d(X, Z, 13, 11, 11) + + gaussian2d(X, Z, 8, 5, -11, 1.5, 1) + + gaussian2d(X, Z, 16, -8, 8) + ) + baseline = 0.1 + 0.08 * X - 0.05 * Z + 0.005 * (Z + 20)**2 + noise = np.random.default_rng(0).normal(scale=0.1, size=signal.shape) + y = signal + baseline + noise + + return x, z, y, baseline + + + def create_plots(y, fit_baseline): + X, Z = np.meshgrid( + np.arange(y.shape[0]), np.arange(y.shape[1]), indexing='ij' + ) + + # 4 total plots: 2 countours and 2 projections + row_names = ('Raw Data', 'Baseline Corrected') + for i, dataset in enumerate((y, y - fit_baseline)): + fig = plt.figure(layout='constrained', figsize=plt.figaspect(0.5)) + fig.suptitle(row_names[i]) + ax = fig.add_subplot(1 ,2, 2) + ax.contourf(X, Z, dataset, cmap='coolwarm') + ax.set_xticks([]) + ax.set_yticks([]) + ax_2 = fig.add_subplot(1, 2, 1, projection='3d') + ax_2.plot_surface(X, Z, dataset, cmap='coolwarm') + ax_2.set_xticks([]) + ax_2.set_yticks([]) + ax_2.set_zticks([]) + if i == 0: + pass#ax.set_title('Contours') + #ax_2.set_title('3D Projections') + + + x, z, y, real_baseline = create_data() + baseline_fitter = Baseline2D(x, z, check_finite=False) + + baseline, params = baseline_fitter.mixture_model(y, lam=(1e3, 1e2)) + create_plots(y, baseline) + + +irsqr (Iterative Reweighted Spline Quantile Regression) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:meth:`~.Baseline2D.irsqr`: +:ref:`explanation for the algorithm `. + +.. plot:: + :align: center + :context: close-figs + + baseline, params = baseline_fitter.irsqr(y, lam=(1e3, 1e2), quantile=0.3) + create_plots(y, baseline) + + +pspline_asls (Penalized Spline Asymmetric Least Squares) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:meth:`~.Baseline2D.pspline_asls`: +:ref:`explanation for the algorithm `. + +.. plot:: + :align: center + :context: close-figs + + baseline, params = baseline_fitter.pspline_asls(y, lam=(1e3, 1e0), p=0.005) + create_plots(y, baseline) + + +pspline_iasls (Penalized Spline Asymmetric Least Squares) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:meth:`~.Baseline2D.pspline_iasls`: +:ref:`explanation for the algorithm `. + +.. plot:: + :align: center + :context: close-figs + + baseline, params = baseline_fitter.pspline_iasls(y, lam=(1e2, 1e-2)) + create_plots(y, baseline) + + +pspline_airpls (Penalized Spline Asymmetric Least Squares) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:meth:`~.Baseline2D.pspline_airpls`: +:ref:`explanation for the algorithm `. + +.. plot:: + :align: center + :context: close-figs + + baseline, params = baseline_fitter.pspline_airpls(y, lam=(1e3, 1e-1)) + create_plots(y, baseline) + + +pspline_arpls (Penalized Spline Asymmetrically Reweighted Penalized Least Squares) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:meth:`~.Baseline2D.pspline_arpls`: +:ref:`explanation for the algorithm `. + +.. 
+.. plot::
+    :align: center
+    :context: close-figs
+
+    baseline, params = baseline_fitter.pspline_arpls(y, lam=(1e3, 5e0))
+    create_plots(y, baseline)
+
+
+pspline_iarpls (Penalized Spline Improved Asymmetrically Reweighted Penalized Least Squares)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+:meth:`~.Baseline2D.pspline_iarpls`:
+:ref:`explanation for the algorithm `.
+
+.. plot::
+    :align: center
+    :context: close-figs
+
+    baseline, params = baseline_fitter.pspline_iarpls(y, lam=(1e2, 1e0))
+    create_plots(y, baseline)
+
+
+pspline_psalsa (Penalized Spline Peaked Signal's Asymmetric Least Squares Algorithm)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+:meth:`~.Baseline2D.pspline_psalsa`:
+:ref:`explanation for the algorithm `.
+
+.. plot::
+    :align: center
+    :context: close-figs
+
+    baseline, params = baseline_fitter.pspline_psalsa(y, lam=(1e3, 5e0), k=0.5)
+    create_plots(y, baseline)
diff --git a/docs/algorithms_2d/whittaker_2d.rst b/docs/algorithms_2d/whittaker_2d.rst
new file mode 100644
index 0000000..f0a2da1
--- /dev/null
+++ b/docs/algorithms_2d/whittaker_2d.rst
@@ -0,0 +1,228 @@
+===================
+Whittaker Baselines
+===================
+
+Introduction
+------------
+
+Excellent introductory papers on two dimensional penalized least squares are
+`[1] `_ and
+`[2] `_. Whittaker-smoothing-based
+algorithms are extended to two dimensional data as follows:
+
+Let the number of rows be :math:`M` and the number of columns :math:`N` within the matrix
+of measured data :math:`Y`. Note that :math:`y` is the flattened array of matrix :math:`Y`
+with length :math:`M * N`. Analogous to the 1D case, the goal is to make the baseline match
+the measured data as well as it can while also penalizing the roughness of the baseline, resulting
+in the following minimization:
+
+.. math::
+
+    \sum\limits_{i}^M \sum\limits_{j}^N W_{ij} (Y_{ij} - V_{ij})^2
+    + \lambda_r \sum\limits_{i}^{M - d_r} (\Delta^{d_r} V_{i\bullet})^2
+    + \lambda_c \sum\limits_{j}^{N - d_c} (\Delta^{d_c} V_{\bullet j})^2
+
+where :math:`Y_{ij}` is the measured data, :math:`V_{ij}` is the estimated baseline,
+:math:`\lambda_r` is the penalty along the rows, :math:`\lambda_c` is the penalty along the columns,
+:math:`W_{ij}` is the weighting, :math:`\Delta^{d_r}` is the finite-difference operator of order
+:math:`d_r` along each row of :math:`V`, :math:`V_{i\bullet}`, and :math:`\Delta^{d_c}` is the
+finite-difference operator of order :math:`d_c` along each column of :math:`V`, :math:`V_{\bullet j}`.
+
+The resulting linear equation for solving the above minimization is:
+
+.. math::
+
+    (W_{diag} + \lambda_r D_{d_r}^{\top} D_{d_r} \otimes I_N + \lambda_c I_M \otimes D_{d_c}^{\top} D_{d_c}) v = w y
+
+where :math:`W_{diag}` is the diagonal matrix of the flattened weights, and :math:`D_d` is the matrix
+version of :math:`\Delta^d`, as already explained for the :ref:`1D case `.
+Further, :math:`\otimes` denotes the `Kronecker product `_,
+and :math:`I_M` and :math:`I_N` are the identity matrices of length :math:`M` and :math:`N`, respectively.
+After solving, the array :math:`v` can then be reshaped into the matrix :math:`V`.
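+
+As a concrete (but naive) illustration, the linear system above can be built and solved
+directly with SciPy's sparse module. The sketch below uses uniform weights and second
+order differences for brevity; it is for illustration only and is not the pybaselines
+implementation:
+
+.. code-block:: python
+
+    import numpy as np
+    from scipy.sparse import diags, identity, kron
+    from scipy.sparse.linalg import spsolve
+
+    def whittaker_2d(y, lam_r=1e3, lam_c=1e3):
+        """Solves the 2D Whittaker system with W = I and d_r = d_c = 2."""
+        M, N = y.shape
+        # second order finite difference matrices for the rows and columns
+        D_r = diags([1., -2., 1.], [0, 1, 2], shape=(M - 2, M))
+        D_c = diags([1., -2., 1.], [0, 1, 2], shape=(N - 2, N))
+        w = np.ones(M * N)  # uniform weights
+        lhs = (
+            diags(w)
+            + lam_r * kron(D_r.T @ D_r, identity(N))
+            + lam_c * kron(identity(M), D_c.T @ D_c)
+        )
+        v = spsolve(lhs.tocsc(), w * y.ravel())
+        return v.reshape(M, N)  # reshape the solution back into the matrix V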
+
+Since the analytical solution for 2D requires matrices of shape :math:`(M*N, M*N)`, it is
+quite memory and computationally expensive to solve. Although the left hand side of the
+equation is still sparse and symmetric, it cannot be solved as easily as in the 1D case
+since the bandwidth is no longer small due to the penalties along both the rows and columns
+(plus the sparse solver currently available in SciPy cannot make use of the symmetric
+nature of the matrix).
+
+
+.. note::
+   For two dimensional data, Whittaker-smoothing-based algorithms take a single ``lam``
+   parameter that can either be a single number, in which case both the rows and columns
+   will use the same smoothing parameter, i.e. :math:`\lambda_r = \lambda_c`, or a sequence
+   of two numbers (:math:`\lambda_r`, :math:`\lambda_c`)
+   to penalize the rows and columns with different values.
+
+Algorithms
+----------
+
+asls (Asymmetric Least Squares)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+:meth:`~.Baseline2D.asls`:
+:ref:`explanation for the algorithm `.
+
+.. plot::
+    :align: center
+    :context: reset
+
+    import numpy as np
+    import matplotlib.pyplot as plt
+    from pybaselines.utils import gaussian2d
+    from pybaselines import Baseline2D
+
+
+    def create_data():
+        x = np.linspace(-20, 20, 80)
+        z = np.linspace(-20, 20, 80)
+        X, Z = np.meshgrid(x, z, indexing='ij')
+        signal = (
+            gaussian2d(X, Z, 12, -9, -9)
+            + gaussian2d(X, Z, 11, 3, 3)
+            + gaussian2d(X, Z, 13, 11, 11)
+            + gaussian2d(X, Z, 8, 5, -11, 1.5, 1)
+            + gaussian2d(X, Z, 16, -8, 8)
+        )
+        baseline = 0.1 + 0.08 * X - 0.05 * Z + 0.005 * (Z + 20)**2
+        noise = np.random.default_rng(0).normal(scale=0.1, size=signal.shape)
+        y = signal + baseline + noise
+
+        return x, z, y, baseline
+
+
+    def create_plots(y, fit_baseline):
+        X, Z = np.meshgrid(
+            np.arange(y.shape[0]), np.arange(y.shape[1]), indexing='ij'
+        )
+
+        # 4 total plots: 2 contours and 2 projections
+        row_names = ('Raw Data', 'Baseline Corrected')
+        for i, dataset in enumerate((y, y - fit_baseline)):
+            fig = plt.figure(layout='constrained', figsize=plt.figaspect(0.5))
+            fig.suptitle(row_names[i])
+            ax = fig.add_subplot(1, 2, 2)
+            ax.contourf(X, Z, dataset, cmap='coolwarm')
+            ax.set_xticks([])
+            ax.set_yticks([])
+            ax_2 = fig.add_subplot(1, 2, 1, projection='3d')
+            ax_2.plot_surface(X, Z, dataset, cmap='coolwarm')
+            ax_2.set_xticks([])
+            ax_2.set_yticks([])
+            ax_2.set_zticks([])
+
+
+    x, z, y, real_baseline = create_data()
+    baseline_fitter = Baseline2D(x, z, check_finite=False)
+
+    baseline, params = baseline_fitter.asls(y, lam=(1e2, 1e1), p=0.001)
+    create_plots(y, baseline)
+
+
+iasls (Improved Asymmetric Least Squares)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+:meth:`~.Baseline2D.iasls`:
+:ref:`explanation for the algorithm `.
+
+.. plot::
+    :align: center
+    :context: close-figs
+
+    # to see contents of create_data function, look at the top-most algorithm's code
+    baseline, params = baseline_fitter.iasls(y, lam=(1e3, 1e0))
+    create_plots(y, baseline)
+
+
+airpls (Adaptive Iteratively Reweighted Penalized Least Squares)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+:meth:`~.Baseline2D.airpls`:
+:ref:`explanation for the algorithm `.
+
+.. plot::
+    :align: center
+    :context: close-figs
+
+    # to see contents of create_plots function, look at the top-most algorithm's code
+    baseline, params = baseline_fitter.airpls(y, lam=(1e3, 1e1))
+    create_plots(y, baseline)
+
+
+arpls (Asymmetrically Reweighted Penalized Least Squares)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+:meth:`~.Baseline2D.arpls`:
+:ref:`explanation for the algorithm `.
+ +.. plot:: + :align: center + :context: close-figs + + # to see contents of create_plots function, look at the top-most algorithm's code + baseline, params = baseline_fitter.arpls(y, lam=(1e4, 1e2)) + create_plots(y, baseline) + + +drpls (Doubly Reweighted Penalized Least Squares) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:meth:`~.Baseline2D.drpls`: +:ref:`explanation for the algorithm `. + +.. plot:: + :align: center + :context: close-figs + + # to see contents of create_plots function, look at the top-most algorithm's code + baseline, params = baseline_fitter.drpls(y, lam=(1e3, 1e2)) + create_plots(y, baseline) + + +iarpls (Improved Asymmetrically Reweighted Penalized Least Squares) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:meth:`~.Baseline2D.iarpls`: +:ref:`explanation for the algorithm `. + +.. plot:: + :align: center + :context: close-figs + + # to see contents of create_plots function, look at the top-most algorithm's code + baseline, params = baseline_fitter.iarpls(y, lam=(1e3, 1e2)) + create_plots(y, baseline) + + +aspls (Adaptive Smoothness Penalized Least Squares) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:meth:`~.Baseline2D.aspls`: +:ref:`explanation for the algorithm `. + +.. plot:: + :align: center + :context: close-figs + + # to see contents of create_plots function, look at the top-most algorithm's code + baseline, params = baseline_fitter.aspls(y, lam=(1e3, 1e2)) + create_plots(y, baseline) + + +psalsa (Peaked Signal's Asymmetric Least Squares Algorithm) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:meth:`~.Baseline2D.psalsa`: +:ref:`explanation for the algorithm `. + +.. plot:: + :align: center + :context: close-figs + + # to see contents of create_plots function, look at the top-most algorithm's code + baseline, params = baseline_fitter.psalsa(y, lam=(1e3, 1e2), k=0.5) + create_plots(y, baseline) diff --git a/docs/index.rst b/docs/index.rst index 80c3737..6718773 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -22,6 +22,7 @@ pybaselines is a library of algorithms for the baseline correction of experiment installation quickstart algorithms/index + algorithms_2d/index examples/index api/index contributing diff --git a/pybaselines/spline.py b/pybaselines/spline.py index 52f7f5a..a9de87c 100644 --- a/pybaselines/spline.py +++ b/pybaselines/spline.py @@ -425,7 +425,7 @@ def pspline_asls(self, data, lam=1e3, p=1e-2, num_knots=100, spline_degree=3, di See Also -------- - pybaselines.whittaker.asls + Baseline.asls References ---------- @@ -515,7 +515,7 @@ def pspline_iasls(self, data, lam=1e1, p=1e-2, lam_1=1e-4, num_knots=100, See Also -------- - pybaselines.whittaker.iasls + Baseline.iasls References ---------- @@ -615,7 +615,7 @@ def pspline_airpls(self, data, lam=1e3, num_knots=100, spline_degree=3, See Also -------- - pybaselines.whittaker.airpls + Baseline.airpls References ---------- @@ -719,7 +719,7 @@ def pspline_arpls(self, data, lam=1e3, num_knots=100, spline_degree=3, diff_orde See Also -------- - pybaselines.whittaker.arpls + Baseline.arpls References ---------- @@ -802,7 +802,7 @@ def pspline_drpls(self, data, lam=1e3, eta=0.5, num_knots=100, spline_degree=3, See Also -------- - pybaselines.whittaker.drpls + Baseline.drpls References ---------- @@ -909,7 +909,7 @@ def pspline_iarpls(self, data, lam=1e3, num_knots=100, spline_degree=3, diff_ord See Also -------- - pybaselines.whittaker.iarpls + Baseline.iarpls References ---------- @@ -1002,7 +1002,7 @@ def pspline_aspls(self, data, 
lam=1e4, num_knots=100, spline_degree=3, diff_orde See Also -------- - pybaselines.whittaker.aspls + Baseline.aspls Notes ----- @@ -1116,7 +1116,7 @@ def pspline_psalsa(self, data, lam=1e3, p=0.5, k=None, num_knots=100, spline_deg See Also -------- - pybaselines.whittaker.psalsa + Baseline.psalsa References ---------- @@ -1221,7 +1221,7 @@ def pspline_derpsalsa(self, data, lam=1e2, p=1e-2, k=None, num_knots=100, spline See Also -------- - pybaselines.whittaker.derpsalsa + Baseline.derpsalsa References ---------- @@ -1346,6 +1346,10 @@ def pspline_mpls(self, data, half_window=None, lam=1e3, p=0.0, num_knots=100, sp ValueError Raised if p is not between 0 and 1. + See Also + -------- + Baseline.mpls + References ---------- .. [32] Li, Zhong, et al. Morphological weighted penalized least squares for diff --git a/pybaselines/two_d/__init__.py b/pybaselines/two_d/__init__.py index fe19827..d2f0c51 100644 --- a/pybaselines/two_d/__init__.py +++ b/pybaselines/two_d/__init__.py @@ -13,7 +13,6 @@ * imodpoly (Improved Modified Polynomial) * penalized_poly (Penalized Polynomial) * quant_reg (Quantile Regression) - * goldindec (Goldindec Method) * Whittaker-smoothing-based methods (:mod:`pybaselines.two_d.whittaker`) diff --git a/pybaselines/two_d/spline.py b/pybaselines/two_d/spline.py index 599b2c9..fa94ae2 100644 --- a/pybaselines/two_d/spline.py +++ b/pybaselines/two_d/spline.py @@ -349,7 +349,7 @@ def pspline_asls(self, data, lam=1e3, p=1e-2, num_knots=25, spline_degree=3, dif See Also -------- - pybaselines.whittaker.asls + Baseline2D.asls References ---------- @@ -439,7 +439,7 @@ def pspline_iasls(self, data, lam=1e3, p=1e-2, lam_1=1e-4, num_knots=25, See Also -------- - pybaselines.whittaker.iasls + Baseline2D.iasls References ---------- @@ -533,7 +533,7 @@ def pspline_airpls(self, data, lam=1e3, num_knots=25, spline_degree=3, See Also -------- - pybaselines.whittaker.airpls + Baseline2D.airpls References ---------- @@ -637,7 +637,7 @@ def pspline_arpls(self, data, lam=1e3, num_knots=25, spline_degree=3, diff_order See Also -------- - pybaselines.whittaker.arpls + Baseline2D.arpls References ---------- @@ -714,7 +714,7 @@ def pspline_iarpls(self, data, lam=1e3, num_knots=25, spline_degree=3, diff_orde See Also -------- - pybaselines.whittaker.iarpls + Baseline2D.iarpls References ---------- @@ -816,7 +816,7 @@ def pspline_psalsa(self, data, lam=1e3, p=0.5, k=None, num_knots=25, spline_degr See Also -------- - pybaselines.whittaker.psalsa + Baseline2D.psalsa References ---------- diff --git a/pybaselines/utils.py b/pybaselines/utils.py index 60cf53f..65bf1f2 100644 --- a/pybaselines/utils.py +++ b/pybaselines/utils.py @@ -112,10 +112,22 @@ def gaussian2d(x, z, height=1.0, center_x=0.0, center_z=0.0, sigma_x=1.0, sigma_ Returns ------- - numpy.ndarray - The Gaussian distribution evaluated with x. + numpy.ndarray, shape (M, N) + The Gaussian distribution evaluated with x and z. + + Raises + ------ + ValueError + Raised if the input `x` or `z` are not two dimensional. + + Notes + ----- + The input `x` and `z` should be two dimensional arrays, which can be gotten + from their one dimensional counterparts by using :func:`numpy.meshgrid`. """ + if x.ndim != 2 or z.ndim != 2: + raise ValueError('x and z should be two dimensional') return height * gaussian(x, 1, center_x, sigma_x) * gaussian(z, 1, center_z, sigma_z) @@ -728,8 +740,8 @@ def optimize_window(data, increment=1, max_hits=3, window_tol=1e-6, Parameters ---------- - data : array-like, shape (N,) - The measured data values. 
+ data : array-like + The measured data values. Can be one or two dimensional. increment : int, optional The step size for iterating half windows. Default is 1. max_hits : int, optional @@ -747,8 +759,10 @@ def optimize_window(data, increment=1, max_hits=3, window_tol=1e-6, Returns ------- - half_window : int - The optimized half window size. + half_window : int or numpy.ndarray[int, int] + The optimized half window size(s). If `data` is one dimensional, the + output is a single integer, and if `data` is two dimensional, the output + is an array of two integers. Notes ----- diff --git a/tests/test_utils.py b/tests/test_utils.py index df663c5..1b6808c 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -21,7 +21,13 @@ @pytest.fixture(scope='module') def _x_data(): """x-values for testing.""" - return np.linspace(-20, 20) + return np.linspace(-20, 20, 50) + + +@pytest.fixture(scope='module') +def _z_data(): + """z-values for testing.""" + return np.linspace(-10, 10, 30) @pytest.mark.parametrize('sigma', [0.1, 1, 10]) @@ -68,6 +74,33 @@ def test_gaussian_kernel_0_windowsize(data_fixture): assert_array_equal(y, out) +@pytest.mark.parametrize('sigma_x', [0.1, 1, 10]) +@pytest.mark.parametrize('center_x', [-10, 0, 10]) +@pytest.mark.parametrize('sigma_z', [0.1, 1, 10]) +@pytest.mark.parametrize('center_z', [-10, 0, 10]) +@pytest.mark.parametrize('height', [0.1, 1, 10]) +def test_gaussian2d(_x_data, _z_data, height, center_x, center_z, sigma_x, sigma_z): + """Ensures that gaussian2d function in pybaselines.utils is correct.""" + X, Z = np.meshgrid(_x_data, _z_data) + + expected = height * gaussian(X, 1, center_x, sigma_x) * gaussian(Z, 1, center_z, sigma_z) + assert_allclose( + utils.gaussian2d(X, Z, height, center_x, center_z, sigma_x, sigma_z), + expected, 1e-12, 1e-12 + ) + + +def test_gaussian2d_1d_raises(_x_data, _z_data): + """Ensures that gaussian2d function raises an error if the input is one dimensional.""" + X, Z = np.meshgrid(_x_data, _z_data) + with pytest.raises(ValueError): + utils.gaussian2d(_x_data, _z_data) + with pytest.raises(ValueError): + utils.gaussian2d(X, _z_data) + with pytest.raises(ValueError): + utils.gaussian2d(_x_data, Z) + + @pytest.mark.parametrize('sign', (1, -1)) def test_relative_difference_scalar(sign): """Tests relative_difference to ensure it uses abs for scalars.""" From 587afec5003dccd2abdd10a2ea69d2b5c0bb610e Mon Sep 17 00:00:00 2001 From: Donnie Erb <55961724+derb12@users.noreply.github.com> Date: Sun, 4 Feb 2024 17:52:09 -0500 Subject: [PATCH 41/56] MAINT: Ignore x order for weight inputs test The check fails only for a meta test, but not worth keeping. 
--- tests/conftest.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index efec38a..e4ccf45 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -508,9 +508,9 @@ def test_input_weights(self, assertion_kwargs=None, **kwargs): weights = np.random.RandomState(0).normal(0.8, 0.05, self.y.size) weights = np.clip(weights, 0, 1).astype(float, copy=False) - if hasattr(self, 'two_d'): + if hasattr(self, 'two_d'): # BaseTester reverse_fitter = self.algorithm_base(self.x[::-1], assume_sorted=False) - else: + else: # BaseTester2D reverse_fitter = self.algorithm_base(self.x[::-1], self.z[::-1], assume_sorted=False) weights = weights.reshape(self.y.shape) @@ -522,9 +522,6 @@ def test_input_weights(self, assertion_kwargs=None, **kwargs): **self.kwargs, **kwargs ) - # sanity check, x should always be sorted correctly - assert_allclose(reverse_fitter.x, self.x, rtol=1e-14, atol=1e-14) - if assertion_kwargs is None: assertion_kwargs = {} if 'rtol' not in assertion_kwargs: From 60ca5eb733cf7a50c53d66034cb9cb1a3c0e287b Mon Sep 17 00:00:00 2001 From: Donnie Erb <55961724+derb12@users.noreply.github.com> Date: Thu, 8 Feb 2024 19:32:45 -0500 Subject: [PATCH 42/56] MAINT: Add _get_method helper method to Baseline and Baseline2D --- pybaselines/api.py | 29 +++++++++++++++++++++++++++++ pybaselines/two_d/api.py | 29 +++++++++++++++++++++++++++++ tests/test_api.py | 20 +++++++++++++++++++- tests/two_d/test_api.py | 24 +++++++++++++++++++++++- 4 files changed, 100 insertions(+), 2 deletions(-) diff --git a/pybaselines/api.py b/pybaselines/api.py index 025916f..1c5cdea 100644 --- a/pybaselines/api.py +++ b/pybaselines/api.py @@ -61,3 +61,32 @@ class Baseline( set to numpy.ndarray([-1, 1]). """ + + def _get_method(self, method): + """ + A helper function to allow accessing methods by their string. + + Parameters + ---------- + method : str + The name of the desired method as a string. Capitalization is ignored. For + example, both 'asls' and 'AsLS' would return :meth:`~.Baseline.asls`. + + Returns + ------- + output : Callable + The callable method corresponding to the input string. + + Raises + ------ + AttributeError + Raised if the input method does not exist. + + """ + method_string = method.lower() + if hasattr(self, method_string): + output = getattr(self, method_string) + else: + raise AttributeError(f'unknown method "{method}"') + + return output diff --git a/pybaselines/two_d/api.py b/pybaselines/two_d/api.py index c59374c..a48354e 100644 --- a/pybaselines/two_d/api.py +++ b/pybaselines/two_d/api.py @@ -71,3 +71,32 @@ class Baseline2D( set to numpy.ndarray([-1, 1]). """ + + def _get_method(self, method): + """ + A helper function to allow accessing methods by their string. + + Parameters + ---------- + method : str + The name of the desired method as a string. Capitalization is ignored. For + example, both 'asls' and 'AsLS' would return :meth:`~.Baseline2D.asls`. + + Returns + ------- + output : Callable + The callable method corresponding to the input string. + + Raises + ------ + AttributeError + Raised if the input method does not exist. 
+ + """ + method_string = method.lower() + if hasattr(self, method_string): + output = getattr(self, method_string) + else: + raise AttributeError(f'unknown method "{method}"') + + return output diff --git a/tests/test_api.py b/tests/test_api.py index b76e1b2..356afc0 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -46,7 +46,11 @@ def get_public_methods(klass): """ methods = [] for method in dir(klass): - if not (method.startswith('_') or method.startswith('pentapy_solver')): + if ( + not (method.startswith('_') + or method.startswith('pentapy_solver') + or method.startswith('get_method')) + ): methods.append(method) return methods @@ -147,3 +151,17 @@ def test_pentapy_solver(self): fitter.pentapy_solver = 3 assert fitter.whittaker_system.pentapy_solver == fitter.pentapy_solver + + def test_get_method(self): + """Ensures the get_method helper function works as intended.""" + method = self.algorithm._get_method('asls') + assert method == self.algorithm.asls + + # also ensure capitalization does not matter + method2 = self.algorithm._get_method('AsLS') + assert method2 == self.algorithm.asls + + def test_get_method_fails(self): + """Ensures the get_method helper function fails when an incorrect name is given.""" + with pytest.raises(AttributeError): + self.algorithm._get_method('aaaaaaaaaaaaa') diff --git a/tests/two_d/test_api.py b/tests/two_d/test_api.py index 63aebc5..d95f2e8 100644 --- a/tests/two_d/test_api.py +++ b/tests/two_d/test_api.py @@ -42,7 +42,15 @@ def get_public_methods(klass): The list of all public methods of the input class. """ - return [method for method in dir(klass) if not method.startswith('_')] + methods = [] + for method in dir(klass): + if ( + not (method.startswith('_') + or method.startswith('pentapy_solver') + or method.startswith('get_method')) + ): + methods.append(method) + return methods # will be like [('asls', whittaker._Whittaker), ('modpoly', polynomial._Polynomial), ...] @@ -129,3 +137,17 @@ def test_method_availability(self): # no additional methods should be available assert len(total_methods) == 0 + + def test_get_method(self): + """Ensures the get_method helper function works as intended.""" + method = self.algorithm._get_method('asls') + assert method == self.algorithm.asls + + # also ensure capitalization does not matter + method2 = self.algorithm._get_method('AsLS') + assert method2 == self.algorithm.asls + + def test_get_method_fails(self): + """Ensures the get_method helper function fails when an incorrect name is given.""" + with pytest.raises(AttributeError): + self.algorithm._get_method('aaaaaaaaaaaaa') From 523749e6230e9e5e4526cfe7eb542e93b8f4a627 Mon Sep 17 00:00:00 2001 From: Donnie Erb <55961724+derb12@users.noreply.github.com> Date: Thu, 8 Feb 2024 19:40:24 -0500 Subject: [PATCH 43/56] MAINT: Raise an error when 1d data is used for Baseline2D The exception that was raised was indirectly because of 1d input, so made the error easier to trace. Also no longer allow values less than or equal to 0 in gaussian. 
---
 pybaselines/_validation.py | 29 ++++++----
 pybaselines/two_d/_algorithm_setup.py | 3 +-
 pybaselines/utils.py | 9 ++-
 tests/test_utils.py | 6 ++
 tests/two_d/test_algorithm_setup.py | 79 +++++++++++++++++++++++++++
 5 files changed, 113 insertions(+), 13 deletions(-)

diff --git a/pybaselines/_validation.py b/pybaselines/_validation.py
index 10dd131..0018feb 100644
--- a/pybaselines/_validation.py
+++ b/pybaselines/_validation.py
@@ -113,7 +113,7 @@ def _check_scalar_variable(value, allow_zero=False, variable_name='lam', **asarr


 def _check_array(array, dtype=None, order=None, check_finite=False, ensure_1d=True,
-                 ensure_2d=False):
+                 ensure_2d=False, two_d=False):
     """
     Validates the shape and values of the input array and controls the output parameters.

@@ -162,21 +162,27 @@ def _check_array(array, dtype=None, order=None, check_finite=False, ensure_1d=Tr
             output = output.reshape(-1)
         elif dimensions != 1:
             raise ValueError('must be a one dimensional array')
-    elif ensure_2d:
+    elif two_d:
         output = np.array(output, copy=False, ndmin=2)
         dimensions = output.ndim
-        if dimensions == 3 and 1 in output.shape:
-            output_shape = np.array(output.shape)
-            flat_dims = ~np.equal(output_shape, 1)
-            output = output.reshape(output_shape[flat_dims]).shape
-        elif dimensions != 2:
-            raise ValueError('must be a two dimensional array')
+        if dimensions == 2 and 1 in output.shape:
+            raise ValueError(
+                'input data must be a two dimensional array with more than just one row or column'
+            )
+        if ensure_2d:
+            if dimensions == 3 and 1 in output.shape:
+                output_shape = np.array(output.shape)
+                flat_dims = ~np.equal(output_shape, 1)
+                output = output.reshape(output_shape[flat_dims])
+            elif dimensions != 2:
+                raise ValueError('must be a two dimensional array')

     return output


 def _check_sized_array(array, length, dtype=None, order=None, check_finite=False,
-                       ensure_1d=True, axis=-1, name='weights'):
+                       ensure_1d=True, axis=-1, name='weights', ensure_2d=False,
+                       two_d=False):
     """
     Validates the input array and ensures its length is correct.
@@ -214,7 +220,8 @@ def _check_sized_array(array, length, dtype=None, order=None, check_finite=False """ output = _check_array( - array, dtype=dtype, order=order, check_finite=check_finite, ensure_1d=ensure_1d + array, dtype=dtype, order=order, check_finite=check_finite, ensure_1d=ensure_1d, + ensure_2d=ensure_2d, two_d=two_d ) if not np.equal(output.shape[axis], length).all(): raise ValueError( @@ -321,7 +328,7 @@ def _yxz_arrays(data, x_data=None, z_data=None, check_finite=False, dtype=None, """ y = _check_array( data, dtype=dtype, order=order, check_finite=check_finite, ensure_1d=False, - ensure_2d=ensure_2d + ensure_2d=ensure_2d, two_d=True ) x_len = y.shape[x_axis] z_len = y.shape[z_axis] diff --git a/pybaselines/two_d/_algorithm_setup.py b/pybaselines/two_d/_algorithm_setup.py index 5f536a5..e4f208f 100644 --- a/pybaselines/two_d/_algorithm_setup.py +++ b/pybaselines/two_d/_algorithm_setup.py @@ -276,7 +276,8 @@ def inner(self, data=None, *args, **kwargs): axis = -1 y = _check_sized_array( data, expected_shape, check_finite=self._check_finite, dtype=dtype, - order=order, ensure_1d=False, axis=axis, name='data' + order=order, ensure_1d=False, axis=axis, name='data', ensure_2d=ensure_2d, + two_d=True ) else: y, self.x, self.z = _yxz_arrays( diff --git a/pybaselines/utils.py b/pybaselines/utils.py index 65bf1f2..ae61a1a 100644 --- a/pybaselines/utils.py +++ b/pybaselines/utils.py @@ -85,8 +85,15 @@ def gaussian(x, height=1.0, center=0.0, sigma=1.0): numpy.ndarray The Gaussian distribution evaluated with x. + Raises + ------ + ValueError + Raised if `sigma` is not greater than 0. + """ - return height * np.exp(-0.5 * ((x - center)**2) / max(sigma, _MIN_FLOAT)**2) + if sigma <= 0: + raise ValueError('sigma must be greater than 0') + return height * np.exp(-0.5 * ((x - center)**2) / sigma**2) def gaussian2d(x, z, height=1.0, center_x=0.0, center_z=0.0, sigma_x=1.0, sigma_z=1.0): diff --git a/tests/test_utils.py b/tests/test_utils.py index 1b6808c..d5f06d5 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -40,6 +40,12 @@ def test_gaussian(_x_data, height, center, sigma): gaussian(_x_data, height, center, sigma), 1e-12, 1e-12 ) +@pytest.mark.parametrize('sigma', (0, -1)) +def test_gaussian_non_positive_sigma(_x_data, sigma): + """Ensures a sigma value not greater than 0 raises an exception.""" + with pytest.raises(ValueError): + utils.gaussian(_x_data, sigma=sigma) + @pytest.mark.parametrize('window_size', (1, 20, 100)) @pytest.mark.parametrize('sigma', (1, 2, 5)) diff --git a/tests/two_d/test_algorithm_setup.py b/tests/two_d/test_algorithm_setup.py index 80072e3..3da7060 100644 --- a/tests/two_d/test_algorithm_setup.py +++ b/tests/two_d/test_algorithm_setup.py @@ -719,6 +719,85 @@ def func4(self, data, *args, **kwargs): assert_array_equal(value, output_params4[key], err_msg=f'{key} failed') +def test_algorithm_register_no_data_fails(): + """Ensures an error is raised if the input data is None.""" + + class SubClass(_algorithm_setup._Algorithm2D): + + @_algorithm_setup._Algorithm2D._register + def func(self, data, *args, **kwargs): + """For checking empty decorator.""" + return data, {} + + @_algorithm_setup._Algorithm2D._register() + def func2(self, data, *args, **kwargs): + """For checking closed decorator.""" + return data, {} + + with pytest.raises(TypeError, match='"data" cannot be None'): + SubClass().func() + with pytest.raises(TypeError, match='"data" cannot be None'): + SubClass().func2() + + +def test_algorithm_register_1d_fails(data_fixture): + """Ensures an error is 
raised if 1D data is used for 2D algorithms.""" + + class SubClass(_algorithm_setup._Algorithm2D): + + @_algorithm_setup._Algorithm2D._register + def func(self, data, *args, **kwargs): + """For checking empty decorator.""" + return data, {} + + @_algorithm_setup._Algorithm2D._register() + def func2(self, data, *args, **kwargs): + """For checking closed decorator.""" + return data, {} + + x, y = data_fixture + algorithm = SubClass() + with pytest.raises(ValueError, match='input data must be a two dimensional'): + algorithm.func(y) + with pytest.raises(ValueError, match='input data must be a two dimensional'): + algorithm.func2(y) + + # also test when given x values + algorithm = SubClass(None, x) # x would correspond to the columns in 2D y + with pytest.raises(ValueError, match='input data must be a two dimensional'): + algorithm.func(y) + with pytest.raises(ValueError, match='input data must be a two dimensional'): + algorithm.func2(y) + + # and when y is 2D but only has one row + y_2d = np.atleast_2d(y) + algorithm = SubClass() + with pytest.raises(ValueError, match='input data must be a two dimensional'): + algorithm.func(y_2d) + with pytest.raises(ValueError, match='input data must be a two dimensional'): + algorithm.func2(y_2d) + + algorithm = SubClass(None, x) # x would correspond to the columns in 2D y + with pytest.raises(ValueError, match='input data must be a two dimensional'): + algorithm.func(y_2d) + with pytest.raises(ValueError, match='input data must be a two dimensional'): + algorithm.func2(y_2d) + + # and when y is 2D but only has one column + y_2d_transposed = np.atleast_2d(y).T + algorithm = SubClass() + with pytest.raises(ValueError, match='input data must be a two dimensional'): + algorithm.func(y_2d_transposed) + with pytest.raises(ValueError, match='input data must be a two dimensional'): + algorithm.func2(y_2d_transposed) + + algorithm = SubClass(x) # x now correspond to the rows in 2D y + with pytest.raises(ValueError, match='input data must be a two dimensional'): + algorithm.func(y_2d_transposed) + with pytest.raises(ValueError, match='input data must be a two dimensional'): + algorithm.func2(y_2d_transposed) + + def test_override_x(algorithm): """Ensures the `override_x` method correctly initializes with the new x values.""" new_len = 20 From f2a351eb653fba15c4ef358b67451f8138ada9a6 Mon Sep 17 00:00:00 2001 From: Donnie Erb <55961724+derb12@users.noreply.github.com> Date: Thu, 8 Feb 2024 20:13:56 -0500 Subject: [PATCH 44/56] ENH: Allow eigendecomposition for most 2D whittaker algorithms Using eigendecomposition to solve 2D whittaker baselines reduces the computation time significantly, and the computation time scales relatively linear with data size since the number of eigenvalues depends only on baseline curvature and does not increase with size. Need to add some tests and explanations in docstrings and the main docs about the eigendecomposition. Also renamed solve_pspline to just solve. 
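The core idea, roughly: the eigenvectors belonging to the smallest eigenvalues of the
1D penalty D.T @ D form a small basis that captures smooth baselines, so the full
(M*N, M*N) system can be projected onto a much smaller one. For a single dimension,
the basis is obtained as in the sketch below (mirroring the new
WhittakerSystem2D._calc_eigenvalues):

    from scipy.linalg import eig_banded
    from pybaselines._banded_utils import diff_penalty_diagonals

    num_points, diff_order, num_eigens = 1000, 2, 10
    bands = diff_penalty_diagonals(num_points, diff_order, lower_only=True)
    # smallest num_eigens eigenvalues/eigenvectors of D.T @ D
    eigenvalues, eigenvectors = eig_banded(
        bands, lower=True, select='i', select_range=(0, num_eigens - 1)
    )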
--- pybaselines/_banded_utils.py | 50 ++- pybaselines/spline.py | 4 +- pybaselines/two_d/_algorithm_setup.py | 19 +- pybaselines/two_d/_spline_utils.py | 30 +- pybaselines/two_d/_whittaker_utils.py | 534 ++++++++++++++++++-------- pybaselines/two_d/spline.py | 22 +- pybaselines/two_d/whittaker.py | 165 +++++--- pybaselines/whittaker.py | 4 +- tests/test_banded_utils.py | 31 ++ tests/two_d/test_spline.py | 11 +- tests/two_d/test_spline_utils.py | 6 +- tests/two_d/test_whittaker_utils.py | 98 ++--- 12 files changed, 625 insertions(+), 349 deletions(-) diff --git a/pybaselines/_banded_utils.py b/pybaselines/_banded_utils.py index 74520fe..a2a5f43 100644 --- a/pybaselines/_banded_utils.py +++ b/pybaselines/_banded_utils.py @@ -8,7 +8,7 @@ import numpy as np from scipy.linalg import solve_banded, solveh_banded -from scipy.sparse import identity, diags +from scipy.sparse import identity, diags, spdiags from ._compat import _HAS_PENTAPY, _pentapy_solve from ._validation import _check_lam @@ -484,6 +484,54 @@ def diff_penalty_diagonals(data_size, diff_order=2, lower_only=True, padding=0): return diagonals +def diff_penalty_matrix(data_size, diff_order=2, diff_format='csr'): + """ + Creates the finite difference penalty matrix. + + If `D` is the finite difference matrix, then the finite difference penalty + matrix is defined as ``D.T @ D``. + + Parameters + ---------- + data_size : int + The number of data points. + diff_order : int, optional + The integer differential order; must be >= 0. Default is 2. + diff_format : str or None, optional + The sparse format to use for the difference matrix. Default is 'csr'. + + Returns + ------- + penalty_matrix : scipy.sparse.base.spmatrix + The sparse difference penalty matrix. + + Raises + ------ + ValueError + Raised if `diff_order` is greater or equal to `data_size`. + + Notes + ----- + Equivalent to calling:: + + from pybaselines.utils import difference_matrix + diff_matrix = difference_matrix(data_size, diff_order) + penalty_matrix = diff_matrix.T @ diff_matrix + + but should be faster since the bands within the penalty matrix can be gotten + without the matrix multiplication. + + """ + if data_size <= diff_order: + raise ValueError('data size must be greater than or equal to the difference order.') + penalty_bands = diff_penalty_diagonals(data_size, diff_order, lower_only=False) + penalty_matrix = spdiags( + penalty_bands, np.arange(diff_order, -diff_order - 1, -1), data_size, data_size, + format=diff_format + ) + return penalty_matrix + + def _pentapy_solver(ab, y, check_output=False, pentapy_solver=2): """ Convenience function for calling pentapy's solver with defaults already set. diff --git a/pybaselines/spline.py b/pybaselines/spline.py index a9de87c..59a9e87 100644 --- a/pybaselines/spline.py +++ b/pybaselines/spline.py @@ -972,7 +972,7 @@ def pspline_aspls(self, data, lam=1e4, num_knots=100, spline_degree=3, diff_orde The order of the differential matrix. Must be greater than 0. Default is 2 (second order differential matrix). Typical values are 2 or 1. max_iter : int, optional - The max number of fit iterations. Default is 50. + The max number of fit iterations. Default is 100. tol : float, optional The exit criteria. Default is 1e-3. weights : array-like, shape (N,), optional @@ -2305,7 +2305,7 @@ def pspline_aspls(data, lam=1e4, num_knots=100, spline_degree=3, diff_order=2, The order of the differential matrix. Must be greater than 0. Default is 2 (second order differential matrix). Typical values are 2 or 1. 
max_iter : int, optional - The max number of fit iterations. Default is 50. + The max number of fit iterations. Default is 100. tol : float, optional The exit criteria. Default is 1e-3. weights : array-like, shape (N,), optional diff --git a/pybaselines/two_d/_algorithm_setup.py b/pybaselines/two_d/_algorithm_setup.py index e4f208f..ff197bd 100644 --- a/pybaselines/two_d/_algorithm_setup.py +++ b/pybaselines/two_d/_algorithm_setup.py @@ -22,7 +22,7 @@ _check_array, _check_half_window, _check_optional_array, _check_scalar, _check_scalar_variable, _check_sized_array, _yxz_arrays ) -from ._whittaker_utils import PenalizedSystem2D +from ._whittaker_utils import WhittakerSystem2D class _Algorithm2D: @@ -386,7 +386,7 @@ def _override_x(self, new_x, new_sort_order=None): self.pspline = old_pspline def _setup_whittaker(self, y, lam=1, diff_order=2, weights=None, copy_weights=False, - use_lower=True, use_banded=False): + eigenvalues=None): """ Sets the starting parameters for doing penalized least squares. @@ -449,15 +449,18 @@ def _setup_whittaker(self, y, lam=1, diff_order=2, weights=None, copy_weights=Fa ) if self._sort_order is not None and weights is not None: weight_array = weight_array[self._sort_order] - weight_array = weight_array.ravel() - if self.whittaker_system is not None: - self.whittaker_system.reset_diagonals(lam, diff_order, use_banded, use_lower) + + if self.whittaker_system is not None and self.whittaker_system.same_basis(diff_order, eigenvalues): + self.whittaker_system.update_penalty(lam) else: - self.whittaker_system = PenalizedSystem2D( - self._len, lam, diff_order, use_banded, use_lower + self.whittaker_system = WhittakerSystem2D( + self._len, lam, diff_order, eigenvalues ) + if not self.whittaker_system._using_svd: + y = y.ravel() + weight_array = weight_array.ravel() - return y.ravel(), weight_array + return y, weight_array def _setup_polynomial(self, y, weights=None, poly_order=2, calc_vander=False, calc_pinv=False, copy_weights=False, max_cross=None): diff --git a/pybaselines/two_d/_spline_utils.py b/pybaselines/two_d/_spline_utils.py index 30f2cc7..d23de3f 100644 --- a/pybaselines/two_d/_spline_utils.py +++ b/pybaselines/two_d/_spline_utils.py @@ -55,6 +55,13 @@ class PSpline2D(PenalizedSystem2D): z : numpy.ndarray, shape (M,) The z-values for the spline. + Notes + ----- + If the penalty is symmetric, the sparse system could be solved much faster using + CHOLMOD from SuiteSparse (https://github.com/DrTimothyAldenDavis/SuiteSparse) through + the python bindings provided by scikit-sparse (https://github.com/scikit-sparse/scikit-sparse), + but it is not worth implementing here since this code will rarely be used. + References ---------- Eilers, P., et al. Fast and compact smoothing on large multidimensional grids. Computational @@ -113,9 +120,8 @@ def __init__(self, x, z, num_knots=100, spline_degree=3, check_finite=False, lam self.knots_c = _spline_knots(self.z, self.num_knots[1], self.spline_degree[1], True) self.basis_c = _spline_basis(self.z, self.knots_c, self.spline_degree[1]) - super().__init__( - (self.basis_r.shape[1], self.basis_c.shape[1]), lam, diff_order, use_banded=False - ) + super().__init__((self.basis_r.shape[1], self.basis_c.shape[1]), lam, diff_order) + if (self.diff_order >= self._num_bases).any(): raise ValueError(( 'the difference order must be less than the number of basis ' @@ -168,22 +174,11 @@ def reset_penalty(self, lam=1, diff_order=2): smoother results. Must be greater than 0. Default is 1. 
diff_order : int or Sequence[int, int], optional The difference order of the penalty. Default is 2 (second order difference). - allow_lower : bool, optional - If True (default), will allow only using the lower bands of the penalty matrix, - which allows using :func:`scipy.linalg.solveh_banded` instead of the slightly - slower :func:`scipy.linalg.solve_banded`. - reverse_diags : bool, optional - If True, will reverse the order of the diagonals of the squared difference - matrix. If False (default), will never reverse the diagonals. - - Notes - ----- - `use_banded` is always set to False since the banded structure in 2D is not small. """ - self.reset_diagonals(lam, diff_order, use_banded=False) + self.reset_diagonals(lam, diff_order) - def solve_pspline(self, y, weights, penalty=None, rhs_extra=None): + def solve(self, y, weights, penalty=None, rhs_extra=None): """ Solves the coefficients for a weighted penalized spline. @@ -239,8 +234,7 @@ def solve_pspline(self, y, weights, penalty=None, rhs_extra=None): if rhs_extra is not None: rhs = rhs + rhs_extra - self.coef = spsolve(F + penalty, rhs, permc_spec='NATURAL') - + self.coef = spsolve(F + penalty, rhs) output = self.basis_r @ self.coef.reshape(self._num_bases) @ self.basis_c.T return output diff --git a/pybaselines/two_d/_whittaker_utils.py b/pybaselines/two_d/_whittaker_utils.py index 8da0d57..b716005 100644 --- a/pybaselines/two_d/_whittaker_utils.py +++ b/pybaselines/two_d/_whittaker_utils.py @@ -7,88 +7,24 @@ """ import numpy as np -from scipy.linalg import solve_banded, solveh_banded -from scipy.sparse import identity, kron, spdiags +from scipy.linalg import eig_banded, eigh_tridiagonal, solve +from scipy.sparse import identity, kron from scipy.sparse.linalg import spsolve -from .._banded_utils import _add_diagonals, diff_penalty_diagonals +from .._banded_utils import diff_penalty_diagonals, diff_penalty_matrix from .._validation import _check_lam, _check_scalar -def diff_penalty_matrix(data_size, diff_order=2): - """ - Creates the finite difference penalty matrix. - - If `D` is the finite difference matrix, then the finite difference penalty - matrix is defined as ``D.T @ D``. - - Parameters - ---------- - data_size : int - The number of data points. - diff_order : int, optional - The integer differential order; must be >= 0. Default is 2. - - Returns - ------- - penalty_matrix : scipy.sparse.base.spmatrix - The sparse difference penalty matrix. - - Raises - ------ - ValueError - Raised if `diff_order` is greater or equal to `data_size`. - - Notes - ----- - Equivalent to calling:: - - from pybaselines.utils import difference_matrix - diff_matrix = difference_matrix(data_size, diff_order) - penalty_matrix = diff_matrix.T @ diff_matrix - - but should be faster since the bands within the penalty matrix can be gotten - without the matrix multiplication. - - """ - if data_size <= diff_order: - raise ValueError('data size must be greater than or equal to the difference order.') - penalty_bands = diff_penalty_diagonals(data_size, diff_order, lower_only=False) - penalty_matrix = spdiags( - penalty_bands, np.arange(diff_order, -diff_order - 1, -1), data_size, data_size - ) - return penalty_matrix - - class PenalizedSystem2D: """ An object for setting up and solving penalized least squares linear systems. Attributes ---------- - banded : bool - If True, the penalty is an array of the bands within the sparse matrix. If False, - the penalty is a sparse matrix. 
- diff_order : numpy.array([int, int]) + diff_order : numpy.ndarray[int, int] The difference order of the penalty. - lower : bool - If True, the penalty uses only the lower bands of the symmetric banded penalty. Will - use :func:`scipy.linalg.solveh_banded` for solving. If False, contains both the upper - and lower bands of the penalty and will use either :func:`scipy.linalg.solve_banded` - (if `using_pentapy` is False) or :func:`._pentapy_solver` when solving. - main_diagonal_index : int - The index of the main diagonal for `penalty`. Is updated when adding additional matrices - to the penalty, and takes into account whether the penalty is only the lower bands or - the total bands. - num_bands : int - The number of bands in the penalty. The number of bands is assumbed to be symmetric, - so the number of upper and lower bands should both be equal to `num_bands`. - original_diagonals : numpy.ndarray - The original penalty diagonals before multiplying by `lam` or adding any padding. - Maintained so that repeated computations with different `lam` values can be quickly - set up. `original_diagonals` can be either the full or lower bands of the penalty, - and may be reveresed, it depends on the set up. Reset by calling - :meth:`~PenalizedSystem2D.reset_diagonals`. + main_diagonal : numpy.ndarray + The values along the main diagonal of the penalty matrix. penalty : scipy.sparse.base.spmatrix The current penalty. Originally is `original_diagonals` after multiplying by `lam` and applying padding, but can also be changed by calling @@ -97,11 +33,10 @@ class PenalizedSystem2D: Notes ----- - Setting up the linear system using banded matrices is faster, but the number of bands is - actually quite large (`data_size[1]`) due to the Kronecker products, although only - ``2 * diff_order[0] + 2 * diff_order[1] + 2`` bands are actually nonzero. Despite this, it is - still significantly faster than using the sparse solver and does not use more memory as - long as it is only lower banded. + If the penalty is symmetric, the sparse system could be solved much faster using + CHOLMOD from SuiteSparse (https://github.com/DrTimothyAldenDavis/SuiteSparse) through + the python bindings provided by scikit-sparse (https://github.com/scikit-sparse/scikit-sparse), + but it is not worth implementing here since this code will rarely be used. References ---------- @@ -110,7 +45,7 @@ class PenalizedSystem2D: """ - def __init__(self, data_size, lam=1, diff_order=2, use_banded=True, use_lower=True): + def __init__(self, data_size, lam=1, diff_order=2): """ Initializes the banded system. @@ -126,20 +61,10 @@ def __init__(self, data_size, lam=1, diff_order=2, use_banded=True, use_lower=Tr The difference order of the penalty for the rows and columns, respectively. If a single value is given, both will use the same value. Default is 2 (second order difference). - use_banded : bool, optional - If True (default), will do the setup for solving the system using banded - matrices rather than sparse matrices. - use_lower : bool, optional - If True (default), will allow only using the lower bands of the penalty matrix, - which allows using :func:`scipy.linalg.solveh_banded` instead of the slightly - slower :func:`scipy.linalg.solve_banded`. Only relevant if `use_banded` is True. 
""" self._num_bases = data_size - self.diff_order = [-1, -1] - self.lam = [-1, -1] - - self.reset_diagonals(lam, diff_order, use_banded, use_lower) + self.reset_diagonals(lam, diff_order) def add_penalty(self, penalty): """ @@ -156,10 +81,7 @@ def add_penalty(self, penalty): The updated `self.penalty`. """ - if self.banded: - self.penalty = _add_diagonals(self.penalty, penalty, lower_only=self.lower) - else: - self.penalty = self.penalty + penalty + self.penalty = self.penalty + penalty self._update_bands() return self.penalty @@ -171,18 +93,9 @@ def _update_bands(self): Only relevant if setup as a banded matrix. """ - if self.banded: - if self.lower: - self.num_bands = self.penalty.shape[0] - 1 - else: - self.num_bands = self.penalty.shape[0] // 2 - self.main_diagonal_index = 0 if self.lower else self.num_bands - self.main_diagonal = self.penalty[self.main_diagonal_index].copy() - else: - self.main_diagonal_index = 0 - self.main_diagonal = self.penalty.diagonal() + self.main_diagonal = self.penalty.diagonal() - def reset_diagonals(self, lam=1, diff_order=2, use_banded=True, use_lower=True): + def reset_diagonals(self, lam=1, diff_order=2): """ Resets the diagonals of the system and all of the attributes. @@ -198,15 +111,10 @@ def reset_diagonals(self, lam=1, diff_order=2, use_banded=True, use_lower=True): The difference order of the penalty for the rows and columns, respectively. If a single value is given, both will use the same value. Default is 2 (second order difference). - use_banded : bool, optional - If True (default), will do the setup for solving the system using banded - matrices rather than sparse matrices. """ self.diff_order = _check_scalar(diff_order, 2, True)[0] self.lam = [_check_lam(val) for val in _check_scalar(lam, 2, True)[0]] - self.lower = use_lower - self.banded = use_banded if (self.diff_order < 1).any(): raise ValueError('the difference order must be > 0') @@ -216,75 +124,47 @@ def reset_diagonals(self, lam=1, diff_order=2, use_banded=True, use_lower=True): # multiplying lam by the Kronecker product is the same as multiplying just D.T @ D with lam P_rows = kron(self.lam[0] * penalty_rows, identity(self._num_bases[1])) P_columns = kron(identity(self._num_bases[0]), self.lam[1] * penalty_columns) - penalty = P_rows + P_columns - if self.banded: - penalty = penalty.todia() - sparse_bands = (penalty).data - offsets = penalty.offsets - index_offset = np.max(offsets) - penalty_bands = np.zeros((index_offset * 2 + 1, sparse_bands.shape[1])) - for index, banded_index in enumerate(offsets): - penalty_bands[abs(banded_index - index_offset)] = sparse_bands[index] - self.penalty = penalty_bands - if self.lower: - self.penalty = self.penalty[self.penalty.shape[0] // 2:] - else: - self.penalty = penalty + self.penalty = P_rows + P_columns self._update_bands() - def solve(self, lhs, rhs, overwrite_ab=False, overwrite_b=False, - check_finite=False, l_and_u=None): + def solve(self, y, weights, penalty=None, rhs_extra=None): """ - Solves the equation ``A @ x = rhs``, given `A` in banded format as `lhs`. + Solves the equation ``A @ x = b``. Parameters ---------- - lhs : array-like, shape (M, N) - The left-hand side of the equation, in banded format. `lhs` is assumed to be - some slight modification of `self.penalty` in the same format (reversed, lower, - number of bands, etc. are all the same). - rhs : array-like, shape (N,) - The right-hand side of the equation. 
- overwrite_ab : bool, optional - Whether to overwrite `lhs` when using :func:`scipy.linalg.solveh_banded` or - :func:`scipy.linalg.solve_banded`. Default is False. - overwrite_b : bool, optional - Whether to overwrite `rhs` when using :func:`scipy.linalg.solveh_banded` or - :func:`scipy.linalg.solve_banded`. Default is False. - check_finite : bool, optional - Whether to check if the inputs are finite when using - :func:`scipy.linalg.solveh_banded` or :func:`scipy.linalg.solve_banded`. - Default is False. - l_and_u : Container(int, int), optional - The number of lower and upper bands in `lhs` when using - :func:`scipy.linalg.solve_banded`. Default is None, which uses - (``len(lhs) // 2``, ``len(lhs) // 2``). + y : numpy.ndarray + The y-values for fitting the spline. + weights : numpy.ndarray + The weights for each y-value. Will also be added to the diagonal of the + penalty. + penalty : numpy.ndarray + The penalty to use for solving. Default is None which uses the object's + penalty. + rhs_extra : float or numpy.ndarray, optional + If supplied, `rhs_extra` will be added to the right hand side + of the equation before solving. Default is None, which adds nothing. Returns ------- - output : numpy.ndarray, shape (N,) + numpy.ndarray, shape (N,) The solution to the linear system, `x`. """ - if self.banded: - if self.lower: - output = solveh_banded( - lhs, rhs, overwrite_ab=overwrite_ab, - overwrite_b=overwrite_b, lower=True, check_finite=check_finite - ) - else: - if l_and_u is None: - num_bands = len(lhs) // 2 - l_and_u = (num_bands, num_bands) - output = solve_banded( - l_and_u, lhs, rhs, overwrite_ab=overwrite_ab, - overwrite_b=overwrite_b, check_finite=check_finite - ) + if penalty is None: + lhs = self.add_diagonal(weights) else: - output = spsolve(lhs, rhs, permc_spec='NATURAL') + penalty.setdiag(penalty.diagonal() + weights) + lhs = penalty + rhs = weights * y + if rhs_extra is not None: + rhs = rhs + rhs_extra - return output + return self.direct_solve(lhs, rhs) + + def direct_solve(self, lhs, rhs): + return spsolve(lhs, rhs) def add_diagonal(self, value): """ @@ -301,15 +181,333 @@ def add_diagonal(self, value): The penalty matrix with the main diagonal updated. """ - if self.banded: - self.penalty[self.main_diagonal_index] = self.main_diagonal + value - else: - self.penalty.setdiag(self.main_diagonal + value) + self.penalty.setdiag(self.main_diagonal + value) return self.penalty def reset_diagonal(self): """Sets the main diagonal of the penalty matrix back to its original value.""" - if self.banded: - self.penalty[self.main_diagonal_index] = self.main_diagonal + self.penalty.setdiag(self.main_diagonal) + + +class WhittakerSystem2D(PenalizedSystem2D): + """ + Sets up and solves Whittaker smoothing using the analytical solution or eigendecomposition. + + Attributes + ---------- + basis_r : scipy.sparse.csr.csr_matrix, shape (N, P) + The spline basis for the rows. Has a shape of (`N,` `P`), where `N` is the number of + points in `x`, and `P` is the number of basis functions (equal to ``K - spline_degree - 1`` + or equivalently ``num_knots[0] + spline_degree[0] - 1``). + basis_c : scipy.sparse.csr.csr_matrix, shape (M, Q) + The spline basis for the columns. Has a shape of (`M,` `Q`), where `M` is the number of + points in `z`, and `Q` is the number of basis functions (equal to ``K - spline_degree - 1`` + or equivalently ``num_knots[1] + spline_degree[1] - 1``). + coef : None or numpy.ndarray, shape (M,) + The spline coefficients. 
Is None if :meth:`~PSpline2D.solve_pspline` has not been called + at least once. + + References + ---------- + Eilers, P., et al. Fast and compact smoothing on large multidimensional grids. Computational + Statistics and Data Analysis, 2006, 50(1), 61-76. + + Biessy, G. Revisiting Whittaker-Henderson Smoothing. https://hal.science/hal-04124043 + (Preprint), 2023. + + """ + + def __init__(self, data_size, lam=1, diff_order=2, max_eigens=None): + """ + Initializes the penalized spline by calculating the basis and penalty. + + Parameters + ---------- + data_size : Sequence[int, int] + The number of data points for the system. + lam : float or Sequence[int, int], optional + The penalty factor applied to the difference matrix for the rows and columns, + respectively. If a single value is given, both will use the same value. Larger + values produce smoother results. Must be greater than 0. Default is 1. + diff_order : int or Sequence[int, int], optional + The difference order of the penalty for the rows and columns, respectively. If + a single value is given, both will use the same value. + Default is 2 (second order difference). + max_eigens : int or Sequence[int, int] or None + The maximum number of eigenvalues for the rows and columns, respectively, to use + for the eigendecomposition. If None, will solve the linear system using the full + analytical solution, which is typically much slower. + + """ + # TODO should figure out a way to better merge PenalizedSystem2D, PSpline2D, and this class + self.coef = None + self._basis = None + self._num_points = data_size + self.diff_order = _check_scalar(diff_order, 2, True)[0] + if max_eigens is None or None in max_eigens: + self._num_bases = data_size + self._using_svd = False else: - self.penalty.setdiag(self.main_diagonal) + # TODO need to check to ensure max_eigens is <= data_size and otherwise emit + # an error; if max_eigens is >~ 40 should emit an error saying too many + self._num_bases = _check_scalar(max_eigens, 2, True, dtype=int)[0] + self._using_svd = True + self.reset_diagonals(lam, diff_order) + + if self._using_svd: + el = np.ones((1, self._num_bases[0])) + ek = np.ones((1, self._num_bases[1])) + self._G_r = kron(self.basis_r, el).multiply(kron(el, self.basis_r)) + self._G_c = kron(self.basis_c, ek).multiply(kron(ek, self.basis_c)) + + def reset_diagonals(self, lam=1, diff_order=2): + """ + Resets the diagonals of the system and all of the attributes. + + Useful for reusing the penalized system for a different `lam` value. + + Parameters + ---------- + lam : float or Sequence[int, int], optional + The penalty factor applied to the difference matrix for the rows and columns, + respectively. If a single value is given, both will use the same value. Larger + values produce smoother results. Must be greater than 0. Default is 1. + diff_order : int or Sequence[int, int], optional + The difference order of the penalty for the rows and columns, respectively. If + a single value is given, both will use the same value. + Default is 2 (second order difference). 
+ + """ + if not self._using_svd: + super().reset_diagonals(lam, diff_order) + return + + self.lam = [_check_lam(val) for val in _check_scalar(lam, 2, True)[0]] + self.diff_order = _check_scalar(diff_order, 2, True)[0] + if (self.diff_order < 1).any(): + raise ValueError('the difference order must be > 0') + + # initially need num_bases to point to the data shape; maybe set a second + # attribute insteaad + values_rows, vectors_rows = self._calc_eigenvalues( + self._num_points[0], self.diff_order[0], self._num_bases[0] + ) + # TODO if all else matches, just calc the max eigens and use indexing for the lower one + if ( + self.diff_order[0] == self.diff_order[1] + and self._num_points[0] == self._num_points[1] + and self._num_bases[0] == self._num_bases[1] + ): + values_columns, vectors_columns = values_rows, vectors_rows + else: + values_columns, vectors_columns = self._calc_eigenvalues( + self._num_points[1], self.diff_order[1], self._num_bases[1] + ) + # the eigenvalues are a diagonal matrix, so can simplify since + # kron(diagonal, identity(N)) == np.repeat(diagonal, N) and + # kron(identity(M), diaonal2) == np.tile(diagonal2, M) + self.penalty_rows = np.repeat(self.lam[0] * values_rows, self._num_bases[1]) + self.penalty_columns = np.tile(self.lam[1] * values_columns, self._num_bases[0]) + # penalty is a (_num_bases[0] * _num_bases[1],) array + self.penalty = self.penalty_rows + self.penalty_columns + + self.basis_r = vectors_rows + self.basis_c = vectors_columns + + def _calc_eigenvalues(self, data_points, diff_order, num_eigens): + # TODO the lowest diff_order eigenvalues should be zero, while they end up being + # ~ +- 1e-15, will this affect any calculations or can it be left as it? If it does + # need set to 0, do the eigenvectors likewise need updated for that? + penalty_bands = diff_penalty_diagonals(data_points, diff_order, lower_only=True) + if diff_order == 1: + eigenvalues, eigenvectors = eigh_tridiagonal( + penalty_bands[0], penalty_bands[1, :-1], select='i', + select_range=(0, num_eigens - 1) + ) + else: + eigenvalues, eigenvectors = eig_banded( + penalty_bands, lower=True, select='i', + select_range=(0, num_eigens - 1), overwrite_a_band=True + ) + return eigenvalues, eigenvectors + + def update_penalty(self, lam): + if not self._using_svd: + raise ValueError('Must call reset_diagonals if not using eigendecomposition') + lam = [_check_lam(val) for val in _check_scalar(lam, 2, True)[0]] + self.penalty_rows = (lam[0] / self.lam[0]) * self.penalty_rows + self.penalty_columns = (lam[1] / self.lam[1]) * self.penalty_columns + + self.lam = lam + self.penalty = self.penalty_rows + self.penalty_columns + + def same_basis(self, diff_order=2, max_eigens=None): + """ + Sees if the current basis is equivalent to the input number of eigenvalues and diff order. + + Always returns False if the previous setup did not use eigendecomposition or if + the input maximum number of eigenvalues is None. + + Parameters + ---------- + diff_order : int or Sequence[int, int], optional + The difference order of the penalty for the rows and columns, respectively. If + a single value is given, both will use the same value. + Default is 2 (second order difference). + max_eigens : int or Sequence[int, int] or None + The maximum number of eigenvalues for the rows and columns, respectively, to use + for the eigendecomposition. If None, will solve the linear system using the full + analytical solution, which is typically much slower. 
+
+        Returns
+        -------
+        bool
+            True if the input number of eigenvalues and difference order are equivalent to the
+            current setup for the object.
+
+        """
+        # TODO should give a way to update only one of the basis functions, which
+        # would also need to update the penalty
+        if max_eigens is None or not self._using_svd:
+            return False
+
+        max_eigens = _check_scalar(max_eigens, 2, True)[0]
+        diff_order = _check_scalar(diff_order, 2, True)[0]
+        return (
+            np.array_equal(diff_order, self.diff_order)
+            and np.array_equal(max_eigens, self._num_bases)
+        )
+
+    def reset_penalty(self, lam=1, diff_order=2):
+        """
+        Resets the penalty of the system and all of the attributes.
+
+        Useful for reusing the penalty diagonals without having to recalculate the spline basis.
+
+        Parameters
+        ----------
+        lam : float or Sequence[float, float], optional
+            The penalty factor applied to the difference matrix. Larger values produce
+            smoother results. Must be greater than 0. Default is 1.
+        diff_order : int or Sequence[int, int], optional
+            The difference order of the penalty. Default is 2 (second order difference).
+
+        """
+        # TODO is this even needed?
+        self.reset_diagonals(lam, diff_order)
+
+    def _make_btwb(self, weights):
+        """Computes ``Basis.T @ Weights @ Basis`` using a more efficient method.
+
+        References
+        ----------
+        Eilers, P., et al. Fast and compact smoothing on large multidimensional grids. Computational
+        Statistics and Data Analysis, 2006, 50(1), 61-76.
+
+        """
+        # do not save intermediate results since they are memory intensive for a high number of bases
+        # note to self: F is not sparse when the basis functions are eigenvectors since the
+        # eigenvector matrices are fully dense; it is however symmetric and positive definite
+        F = np.transpose(
+            (self._G_r.T @ weights @ self._G_c).reshape(
+                (self._num_bases[0], self._num_bases[0], self._num_bases[1], self._num_bases[1])
+            ),
+            [0, 2, 1, 3]
+        ).reshape(
+            (self._num_bases[0] * self._num_bases[1], self._num_bases[0] * self._num_bases[1])
+        )
+
+        return F
+
+    def solve(self, y, weights, penalty=None, rhs_extra=None, assume_a='pos'):
+        """
+        Solves for the coefficients of a weighted penalized spline.
+
+        Solves the linear equation ``(B.T @ W @ B + P) c = B.T @ W @ y`` for the spline
+        coefficients, `c`, given the spline basis, `B`, the weights (diagonal of `W`), the
+        penalty `P`, and `y`, and returns the resulting spline, ``B @ c``. When using the
+        eigendecomposition, ``B.T @ W @ B`` and ``B.T @ W @ y`` are computed directly within
+        the reduced eigenvector basis to speed up the calculation; otherwise, the parent
+        class's solver is used.
+
+        Parameters
+        ----------
+        y : numpy.ndarray, shape (M, N)
+            The y-values for fitting the spline.
+        weights : numpy.ndarray, shape (M, N)
+            The weights for each y-value.
+        penalty : numpy.ndarray, optional
+            The penalty to use in place of the object's penalty. When using the
+            eigendecomposition, this is the array added to the diagonal of the left-hand
+            side of the equation. Default is None, which uses the object's penalty.
+        rhs_extra : float or numpy.ndarray, shape (``M * N``,), optional
+            If supplied, `rhs_extra` will be added to the right hand side (``B.T @ W @ y``)
+            of the equation before solving. Default is None, which adds nothing.
+
+        Returns
+        -------
+        numpy.ndarray, shape (M, N)
+            The spline, corresponding to ``B @ c``, where `c` are the solved spline
+            coefficients and `B` is the spline basis.
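To make the `_make_btwb` shortcut above concrete: the row-wise Kronecker products stored
in `_G_r` and `_G_c` reproduce ``B.T @ W @ B`` without ever forming the full basis. A small
dense numpy sketch with made-up sizes (the class itself uses sparse kron products):

    import numpy as np

    rng = np.random.RandomState(0)
    M, N, kr, kc = 6, 5, 3, 2   # data shape and number of bases, made up
    Br, Bc, w = rng.rand(M, kr), rng.rand(N, kc), rng.rand(M, N)

    # straightforward computation with the full (M * N, kr * kc) basis
    expected = np.kron(Br, Bc).T @ np.diag(w.ravel()) @ np.kron(Br, Bc)

    el, ek = np.ones((1, kr)), np.ones((1, kc))
    Gr = np.kron(Br, el) * np.kron(el, Br)   # (M, kr * kr)
    Gc = np.kron(Bc, ek) * np.kron(ek, Bc)   # (N, kc * kc)
    F = np.transpose(
        (Gr.T @ w @ Gc).reshape(kr, kr, kc, kc), (0, 2, 1, 3)
    ).reshape(kr * kc, kr * kc)

    assert np.allclose(F, expected)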
+
+        Notes
+        -----
+        Uses the more efficient algorithm from Eilers's paper, although the memory usage
+        is higher than the straightforward method when the number of knots is high; however,
+        it is significantly faster and more memory efficient when the number of knots is
+        lower, which is the more typical use case.
+
+        """
+        if not self._using_svd:
+            return super().solve(y, weights, penalty, rhs_extra)
+
+        rhs = (self.basis_r.T @ (weights * y) @ self.basis_c).ravel()
+        if rhs_extra is not None:
+            rhs = rhs + rhs_extra
+
+        if penalty is None:
+            penalty = self.penalty
+
+        lhs = self._make_btwb(weights)
+        # TODO could use cho_factor and save the factorization to call within _calc_dof to
+        # speed up the call since it would only be used after the weights are finalized
+        np.fill_diagonal(lhs, lhs.diagonal() + penalty)
+        self.coef = solve(
+            lhs, rhs, lower=True, overwrite_a=True, overwrite_b=True, check_finite=False,
+            assume_a=assume_a
+        )
+
+        output = self.basis_r @ self.coef.reshape(self._num_bases) @ self.basis_c.T
+
+        return output
+
+    @property
+    def basis(self):
+        """
+        The full spline basis matrix.
+
+        This is a lazy implementation since the full basis is typically not needed for
+        computations.
+
+        """
+        if not self._using_svd:
+            # Could maybe just make a basis using identities? But this should not be called
+            # from outside so no reason to implement
+            raise ValueError('No basis matrix when not using eigendecomposition')
+
+        if self._basis is None:
+            self._basis = kron(self.basis_r, self.basis_c)
+        return self._basis
+
+    def _calc_dof(self, weights, assume_a='pos'):
+        if not self._using_svd:
+            # Could maybe just output a matrix of ones?
+            raise ValueError('Cannot calculate degrees of freedom when not using eigendecomposition')
+        lhs = self._make_btwb(weights)
+        rhs = lhs.copy()
+        np.fill_diagonal(lhs, lhs.diagonal() + self.penalty)
+        dof = solve(
+            lhs, rhs, lower=True, overwrite_a=True, overwrite_b=True, check_finite=False,
+            assume_a=assume_a
+        )
+
+        return dof.diagonal().reshape(self._num_bases)
diff --git a/pybaselines/two_d/spline.py b/pybaselines/two_d/spline.py
index fa94ae2..bf775f2 100644
--- a/pybaselines/two_d/spline.py
+++ b/pybaselines/two_d/spline.py
@@ -111,7 +111,7 @@ def mixture_model(self, data, lam=1e3, p=1e-2, num_knots=25, spline_degree=3, di
             y = np.polynomial.polyutils.mapdomain(y, y_domain, np.array([-1., 1.]))
 
         if weights is not None:
-            baseline = self.pspline.solve_pspline(y, weight_array)
+            baseline = self.pspline.solve(y, weight_array)
         else:
             # perform 2 iterations: first is a least-squares fit and second is initial
             # reweighted fit; 2 fits are needed to get weights to have a decent starting
@@ -124,7 +124,7 @@ def mixture_model(self, data, lam=1e3, p=1e-2, num_knots=25, spline_degree=3, di
                 ParameterWarning, stacklevel=2
             )
             for _ in range(2):
-                baseline = self.pspline.solve_pspline(y, weight_array)
+                baseline = self.pspline.solve(y, weight_array)
                 weight_array = _weighting._asls(y, baseline, p)
 
         # now perform the expectation-maximization
@@ -191,7 +191,7 @@ def mixture_model(self, data, lam=1e3, p=1e-2, num_knots=25, spline_degree=3, di
                 break
             weight_array = new_weights
 
-        baseline = self.pspline.solve_pspline(y, weight_array)
+        baseline = self.pspline.solve(y, weight_array)
        residual = y - baseline
 
         # TODO could potentially return a BSpline object from scipy.interpolate
@@ -279,7 +279,7 @@ def irsqr(self, data, lam=1e3, quantile=0.05, num_knots=25, spline_degree=3,
         old_coef = np.zeros(self.pspline._num_bases[0] * self.pspline._num_bases[1])
         tol_history = 
np.empty(max_iter + 1) for i in range(max_iter + 1): - baseline = self.pspline.solve_pspline(y, weight_array) + baseline = self.pspline.solve(y, weight_array) calc_difference = relative_difference(old_coef, self.pspline.coef) tol_history[i] = calc_difference if calc_difference < tol: @@ -370,7 +370,7 @@ def pspline_asls(self, data, lam=1e3, p=1e-2, num_knots=25, spline_degree=3, dif ) tol_history = np.empty(max_iter + 1) for i in range(max_iter + 1): - baseline = self.pspline.solve_pspline(y, weight_array) + baseline = self.pspline.solve(y, weight_array) new_weights = _weighting._asls(y, baseline, p) calc_difference = relative_difference(weight_array, new_weights) tol_history[i] = calc_difference @@ -467,7 +467,7 @@ def pspline_iasls(self, data, lam=1e3, p=1e-2, lam_1=1e-4, num_knots=25, ) # B.T @ P_1 @ B and B.T @ P_1 @ y - penalized_system_1 = PenalizedSystem2D(self._len, lam_1, diff_order=1, use_banded=False) + penalized_system_1 = PenalizedSystem2D(self._len, lam_1, diff_order=1) p1_partial_penalty = self.pspline.basis.T @ penalized_system_1.penalty partial_rhs = p1_partial_penalty @ y.ravel() @@ -475,7 +475,7 @@ def pspline_iasls(self, data, lam=1e3, p=1e-2, lam_1=1e-4, num_knots=25, tol_history = np.empty(max_iter + 1) for i in range(max_iter + 1): - baseline = self.pspline.solve_pspline(y, weight_array**2, rhs_extra=partial_rhs) + baseline = self.pspline.solve(y, weight_array**2, rhs_extra=partial_rhs) new_weights = _weighting._asls(y, baseline, p) calc_difference = relative_difference(weight_array, new_weights) tol_history[i] = calc_difference @@ -552,7 +552,7 @@ def pspline_airpls(self, data, lam=1e3, num_knots=25, spline_degree=3, tol_history = np.empty(max_iter + 1) for i in range(1, max_iter + 2): try: - output = self.pspline.solve_pspline(y, weight_array) + output = self.pspline.solve(y, weight_array) except np.linalg.LinAlgError: warnings.warn( ('error occurred during fitting, indicating that "tol"' @@ -653,7 +653,7 @@ def pspline_arpls(self, data, lam=1e3, num_knots=25, spline_degree=3, diff_order ) tol_history = np.empty(max_iter + 1) for i in range(max_iter + 1): - baseline = self.pspline.solve_pspline(y, weight_array) + baseline = self.pspline.solve(y, weight_array) new_weights = _weighting._arpls(y, baseline) calc_difference = relative_difference(weight_array, new_weights) tol_history[i] = calc_difference @@ -731,7 +731,7 @@ def pspline_iarpls(self, data, lam=1e3, num_knots=25, spline_degree=3, diff_orde ) tol_history = np.empty(max_iter + 1) for i in range(1, max_iter + 2): - baseline = self.pspline.solve_pspline(y, weight_array) + baseline = self.pspline.solve(y, weight_array) new_weights = _weighting._iarpls(y, baseline, i) calc_difference = relative_difference(weight_array, new_weights) tol_history[i - 1] = calc_difference @@ -838,7 +838,7 @@ def pspline_psalsa(self, data, lam=1e3, p=0.5, k=None, num_knots=25, spline_degr k = np.std(y) / 10 tol_history = np.empty(max_iter + 1) for i in range(max_iter + 1): - baseline = self.pspline.solve_pspline(y, weight_array) + baseline = self.pspline.solve(y, weight_array) new_weights = _weighting._psalsa(y, baseline, p, k, self._len) calc_difference = relative_difference(weight_array, new_weights) tol_history[i] = calc_difference diff --git a/pybaselines/two_d/whittaker.py b/pybaselines/two_d/whittaker.py index f6fb0e0..9f02dcc 100644 --- a/pybaselines/two_d/whittaker.py +++ b/pybaselines/two_d/whittaker.py @@ -14,19 +14,16 @@ from .. 
import _weighting from ._algorithm_setup import _Algorithm2D from ._whittaker_utils import PenalizedSystem2D -from ..utils import ( - ParameterWarning, relative_difference -) +from ..utils import ParameterWarning, relative_difference from .._validation import _check_optional_array class _Whittaker(_Algorithm2D): """A base class for all Whittaker-smoothing-based algorithms.""" - @_Algorithm2D._register( - sort_keys=('weights',), reshape_keys=('weights',), reshape_baseline=True - ) - def asls(self, data, lam=1e6, p=1e-2, diff_order=2, max_iter=50, tol=1e-3, weights=None): + @_Algorithm2D._register(sort_keys=('weights',)) + def asls(self, data, lam=1e6, p=1e-2, diff_order=2, max_iter=50, tol=1e-3, weights=None, + eigenvalues=(10, 10)): """ Fits the baseline using asymmetric least squares (AsLS) fitting. @@ -53,6 +50,12 @@ def asls(self, data, lam=1e6, p=1e-2, diff_order=2, max_iter=50, tol=1e-3, weigh weights : array-like, shape (M, N), optional The weighting array. If None (default), then the initial weights will be an array with shape equal to (M, N) and all values set to 1. + eigenvalues : int or Sequence[int, int] or None + The maximum number of eigenvalues for the rows and columns, respectively, to use + for eigendecomposition. Typical values are between 5 and 30, with higher values + needed for baselines with more curvature. If None, will solve the linear system + using the full analytical solution, which is typically much slower. + Default is (10, 10). Returns ------- @@ -81,18 +84,18 @@ def asls(self, data, lam=1e6, p=1e-2, diff_order=2, max_iter=50, tol=1e-3, weigh Eilers, P., et al. Baseline correction with asymmetric least squares smoothing. Leiden University Medical Centre Report, 2005, 1(1). + Biessy, G. Revisiting Whittaker-Henderson Smoothing. https://hal.science/hal-04124043 + (Preprint), 2023. 
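A usage sketch for the new `eigenvalues` parameter; the `Baseline2D` constructor arguments
and the synthetic data here are assumptions made for illustration:

    import numpy as np
    from pybaselines import Baseline2D

    x = np.linspace(-20, 20, 60)
    z = np.linspace(-25, 25, 70)
    X, Z = np.meshgrid(x, z, indexing='ij')
    # made-up data: a broad quadratic baseline plus a single Gaussian peak
    data = 1e-3 * (X**2 + Z**2) + np.exp(-0.5 * ((X - 2)**2 + (Z + 3)**2))

    fitter = Baseline2D(x, z)
    # eigendecomposition using 10 eigenvalues along each dimension (the default)
    baseline, params = fitter.asls(data, lam=1e4, p=1e-3, eigenvalues=(10, 10))
    # full analytical solution; typically much slower
    baseline_full, params_full = fitter.asls(data, lam=1e4, p=1e-3, eigenvalues=None)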
+ """ if not 0 < p < 1: raise ValueError('p must be between 0 and 1') y, weight_array = self._setup_whittaker( - data, lam, diff_order, weights, use_banded=True, use_lower=True + data, lam, diff_order, weights, eigenvalues=eigenvalues ) tol_history = np.empty(max_iter + 1) for i in range(max_iter + 1): - baseline = self.whittaker_system.solve( - self.whittaker_system.add_diagonal(weight_array), weight_array * y, - overwrite_b=True - ) + baseline = self.whittaker_system.solve(y, weight_array) new_weights = _weighting._asls(y, baseline, p) calc_difference = relative_difference(weight_array, new_weights) tol_history[i] = calc_difference @@ -100,6 +103,10 @@ def asls(self, data, lam=1e6, p=1e-2, diff_order=2, max_iter=50, tol=1e-3, weigh break weight_array = new_weights + if not self.whittaker_system._using_svd: + baseline = baseline.reshape(self._len) + weight_array = weight_array.reshape(self._len) + params = {'weights': weight_array, 'tol_history': tol_history[:i + 1]} return baseline, params @@ -180,18 +187,14 @@ def iasls(self, data, lam=1e6, p=1e-2, lam_1=1e-4, max_iter=50, tol=1e-3, weights = _weighting._asls(data, baseline.reshape(self._len), p) y, weight_array = self._setup_whittaker(data, lam, diff_order, weights) - penalized_system_1 = PenalizedSystem2D(self._len, lam_1, diff_order=1, use_banded=False) + penalized_system_1 = PenalizedSystem2D(self._len, lam_1, diff_order=1) # (W.T @ W + P_1) @ y -> P_1 @ y + W.T @ W @ y self.whittaker_system.add_penalty(penalized_system_1.penalty) p1_y = penalized_system_1.penalty @ y tol_history = np.empty(max_iter + 1) for i in range(max_iter + 1): - weight_squared = weight_array**2 - baseline = self.whittaker_system.solve( - self.whittaker_system.add_diagonal(weight_squared), - weight_squared * y + p1_y - ) + baseline = self.whittaker_system.solve(y, weight_array**2, rhs_extra=p1_y) new_weights = _weighting._asls(y, baseline, p) calc_difference = relative_difference(weight_array, new_weights) tol_history[i] = calc_difference @@ -203,10 +206,9 @@ def iasls(self, data, lam=1e6, p=1e-2, lam_1=1e-4, max_iter=50, tol=1e-3, return baseline, params - @_Algorithm2D._register( - sort_keys=('weights',), reshape_keys=('weights',), reshape_baseline=True - ) - def airpls(self, data, lam=1e6, diff_order=2, max_iter=50, tol=1e-3, weights=None): + @_Algorithm2D._register(sort_keys=('weights',)) + def airpls(self, data, lam=1e6, diff_order=2, max_iter=50, tol=1e-3, weights=None, + eigenvalues=(10, 10)): """ Adaptive iteratively reweighted penalized least squares (airPLS) baseline. @@ -229,6 +231,12 @@ def airpls(self, data, lam=1e6, diff_order=2, max_iter=50, tol=1e-3, weights=Non weights : array-like, shape (M, N), optional The weighting array. If None (default), then the initial weights will be an array with shape equal to (M, N) and all values set to 1. + eigenvalues : int or Sequence[int, int] or None + The maximum number of eigenvalues for the rows and columns, respectively, to use + for eigendecomposition. Typical values are between 5 and 30, with higher values + needed for baselines with more curvature. If None, will solve the linear system + using the full analytical solution, which is typically much slower. + Default is (10, 10). Returns ------- @@ -250,9 +258,12 @@ def airpls(self, data, lam=1e6, diff_order=2, max_iter=50, tol=1e-3, weights=Non Zhang, Z.M., et al. Baseline correction using adaptive iteratively reweighted penalized least squares. Analyst, 2010, 135(5), 1138-1146. + Biessy, G. Revisiting Whittaker-Henderson Smoothing. 
https://hal.science/hal-04124043 + (Preprint), 2023. + """ y, weight_array = self._setup_whittaker( - data, lam, diff_order, weights, copy_weights=True, use_banded=True, use_lower=True + data, lam, diff_order, weights, copy_weights=True, eigenvalues=eigenvalues ) y_l1_norm = np.abs(y).sum() @@ -263,10 +274,7 @@ def airpls(self, data, lam=1e6, diff_order=2, max_iter=50, tol=1e-3, weights=Non # sometimes not so have to also catch any errors from the solvers for i in range(1, max_iter + 2): try: - output = self.whittaker_system.solve( - self.whittaker_system.add_diagonal(weight_array), weight_array * y, - overwrite_b=True - ) + output = self.whittaker_system.solve(y, weight_array) except np.linalg.LinAlgError: warnings.warn( ('error occurred during fitting, indicating that "tol"' @@ -280,7 +288,7 @@ def airpls(self, data, lam=1e6, diff_order=2, max_iter=50, tol=1e-3, weights=Non residual = y - baseline neg_mask = residual < 0 neg_residual = residual[neg_mask] - if len(neg_residual) < 2: + if neg_residual.size < 2: # exit if there are < 2 negative residuals since all points or all but one # point would get a weight of 0, which fails the solver warnings.warn( @@ -300,14 +308,17 @@ def airpls(self, data, lam=1e6, diff_order=2, max_iter=50, tol=1e-3, weights=Non weight_array[neg_mask] = np.exp(i * neg_residual / residual_l1_norm) weight_array[~neg_mask] = 0 + if not self.whittaker_system._using_svd: + baseline = baseline.reshape(self._len) + weight_array = weight_array.reshape(self._len) + params = {'weights': weight_array, 'tol_history': tol_history[:i]} return baseline, params - @_Algorithm2D._register( - sort_keys=('weights',), reshape_keys=('weights',), reshape_baseline=True - ) - def arpls(self, data, lam=1e3, diff_order=2, max_iter=50, tol=1e-3, weights=None): + @_Algorithm2D._register(sort_keys=('weights',)) + def arpls(self, data, lam=1e3, diff_order=2, max_iter=50, tol=1e-3, weights=None, + eigenvalues=(10, 10)): """ Asymmetrically reweighted penalized least squares smoothing (arPLS). @@ -330,6 +341,12 @@ def arpls(self, data, lam=1e3, diff_order=2, max_iter=50, tol=1e-3, weights=None weights : array-like, shape (M, N), optional The weighting array. If None (default), then the initial weights will be an array with shape equal to (M, N) and all values set to 1. + eigenvalues : int or Sequence[int, int] or None + The maximum number of eigenvalues for the rows and columns, respectively, to use + for eigendecomposition. Typical values are between 5 and 30, with higher values + needed for baselines with more curvature. If None, will solve the linear system + using the full analytical solution, which is typically much slower. + Default is (10, 10). Returns ------- @@ -351,16 +368,16 @@ def arpls(self, data, lam=1e3, diff_order=2, max_iter=50, tol=1e-3, weights=None Baek, S.J., et al. Baseline correction using asymmetrically reweighted penalized least squares smoothing. Analyst, 2015, 140, 250-257. + Biessy, G. Revisiting Whittaker-Henderson Smoothing. https://hal.science/hal-04124043 + (Preprint), 2023. 
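Stepping back to the airPLS weight update above: only points with negative residuals
(points below the current baseline) receive nonzero weights, scaled exponentially by the
iteration number. A standalone sketch of that single step, with made-up values:

    import numpy as np

    rng = np.random.RandomState(0)
    y = rng.rand(100)
    baseline = y + rng.normal(0, 0.1, 100)   # stand-in for the current fit
    i = 2                                    # current iteration number

    residual = y - baseline
    neg_mask = residual < 0
    neg_residual = residual[neg_mask]
    residual_l1_norm = abs(neg_residual.sum())
    weights = np.zeros_like(y)
    # neg_residual is negative, so the exponent is <= 0 and the weights lie in (0, 1]
    weights[neg_mask] = np.exp(i * neg_residual / residual_l1_norm)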
+ """ y, weight_array = self._setup_whittaker( - data, lam, diff_order, weights, use_banded=True, use_lower=True + data, lam, diff_order, weights, eigenvalues=eigenvalues ) tol_history = np.empty(max_iter + 1) for i in range(max_iter + 1): - baseline = self.whittaker_system.solve( - self.whittaker_system.add_diagonal(weight_array), weight_array * y, - overwrite_b=True - ) + baseline = self.whittaker_system.solve(y, weight_array) new_weights = _weighting._arpls(y, baseline) calc_difference = relative_difference(weight_array, new_weights) tol_history[i] = calc_difference @@ -368,6 +385,10 @@ def arpls(self, data, lam=1e3, diff_order=2, max_iter=50, tol=1e-3, weights=None break weight_array = new_weights + if not self.whittaker_system._using_svd: + baseline = baseline.reshape(self._len) + weight_array = weight_array.reshape(self._len) + params = {'weights': weight_array, 'tol_history': tol_history[:i + 1]} return baseline, params @@ -435,7 +456,7 @@ def drpls(self, data, lam=1e5, eta=0.5, max_iter=50, tol=1e-3, weights=None, dif raise ValueError('diff_order must be 2 or greater') y, weight_array = self._setup_whittaker(data, lam, diff_order, weights) - penalized_system_1 = PenalizedSystem2D(self._len, 1, diff_order=1, use_banded=False) + penalized_system_1 = PenalizedSystem2D(self._len, 1, diff_order=1) # W + P_1 + (I - eta * W) @ P_n -> P_1 + P_n + W @ (I - eta * P_n) partial_penalty = self.whittaker_system.penalty + penalized_system_1.penalty partial_penalty_2 = -eta * self.whittaker_system.penalty @@ -443,8 +464,8 @@ def drpls(self, data, lam=1e5, eta=0.5, max_iter=50, tol=1e-3, weights=None, dif weight_matrix = diags(weight_array) tol_history = np.empty(max_iter + 1) for i in range(1, max_iter + 2): - baseline = self.whittaker_system.solve( - partial_penalty + weight_matrix @ partial_penalty_2, weight_array * y, + baseline = self.whittaker_system.direct_solve( + partial_penalty + weight_matrix @ partial_penalty_2, weight_array * y ) new_weights = _weighting._drpls(y, baseline, i) calc_difference = relative_difference(weight_array, new_weights) @@ -471,10 +492,9 @@ def drpls(self, data, lam=1e5, eta=0.5, max_iter=50, tol=1e-3, weights=None, dif return baseline, params - @_Algorithm2D._register( - sort_keys=('weights',), reshape_keys=('weights',), reshape_baseline=True - ) - def iarpls(self, data, lam=1e5, diff_order=2, max_iter=50, tol=1e-3, weights=None): + @_Algorithm2D._register(sort_keys=('weights',)) + def iarpls(self, data, lam=1e5, diff_order=2, max_iter=50, tol=1e-3, weights=None, + eigenvalues=(10, 10)): """ Improved asymmetrically reweighted penalized least squares smoothing (IarPLS). @@ -497,6 +517,12 @@ def iarpls(self, data, lam=1e5, diff_order=2, max_iter=50, tol=1e-3, weights=Non weights : array-like, shape (M, N), optional The weighting array. If None (default), then the initial weights will be an array with shape equal to (M, N) and all values set to 1. + eigenvalues : int or Sequence[int, int] or None + The maximum number of eigenvalues for the rows and columns, respectively, to use + for eigendecomposition. Typical values are between 5 and 30, with higher values + needed for baselines with more curvature. If None, will solve the linear system + using the full analytical solution, which is typically much slower. + Default is (10, 10). Returns ------- @@ -519,16 +545,16 @@ def iarpls(self, data, lam=1e5, diff_order=2, max_iter=50, tol=1e-3, weights=Non reweighted penalized least squares for Raman spectrum. Applied Optics, 2020, 59, 10933-10943. + Biessy, G. 
Revisiting Whittaker-Henderson Smoothing. https://hal.science/hal-04124043 + (Preprint), 2023. + """ y, weight_array = self._setup_whittaker( - data, lam, diff_order, weights, use_banded=True, use_lower=True + data, lam, diff_order, weights, eigenvalues=eigenvalues ) tol_history = np.empty(max_iter + 1) for i in range(1, max_iter + 2): - baseline = self.whittaker_system.solve( - self.whittaker_system.add_diagonal(weight_array), weight_array * y, - overwrite_b=True - ) + baseline = self.whittaker_system.solve(y, weight_array) new_weights = _weighting._iarpls(y, baseline, i) calc_difference = relative_difference(weight_array, new_weights) tol_history[i - 1] = calc_difference @@ -549,6 +575,10 @@ def iarpls(self, data, lam=1e5, diff_order=2, max_iter=50, tol=1e-3, weights=Non break weight_array = new_weights + if not self.whittaker_system._using_svd: + baseline = baseline.reshape(self._len) + weight_array = weight_array.reshape(self._len) + params = {'weights': weight_array, 'tol_history': tol_history[:i]} return baseline, params @@ -628,11 +658,8 @@ def aspls(self, data, lam=1e5, diff_order=2, max_iter=100, tol=1e-3, alpha_matrix = diags(alpha_array.ravel(), format='csr') tol_history = np.empty(max_iter + 1) for i in range(max_iter + 1): - lhs = alpha_matrix @ self.whittaker_system.penalty - lhs.setdiag(lhs.diagonal() + weight_array) - baseline = self.whittaker_system.solve( - lhs, weight_array * y - ) + penalty = alpha_matrix @ self.whittaker_system.penalty + baseline = self.whittaker_system.solve(y, weight_array, penalty=penalty) new_weights, residual = _weighting._aspls(y, baseline) calc_difference = relative_difference(weight_array, new_weights) tol_history[i] = calc_difference @@ -648,11 +675,9 @@ def aspls(self, data, lam=1e5, diff_order=2, max_iter=100, tol=1e-3, return baseline, params - @_Algorithm2D._register( - sort_keys=('weights',), reshape_keys=('weights',), reshape_baseline=True - ) + @_Algorithm2D._register(sort_keys=('weights',)) def psalsa(self, data, lam=1e5, p=0.5, k=None, diff_order=2, max_iter=50, tol=1e-3, - weights=None): + weights=None, eigenvalues=(10, 10)): """ Peaked Signal's Asymmetric Least Squares Algorithm (psalsa). @@ -689,6 +714,12 @@ def psalsa(self, data, lam=1e5, p=0.5, k=None, diff_order=2, max_iter=50, tol=1e weights : array-like, shape (M, N), optional The weighting array. If None (default), then the initial weights will be an array with shape equal to (M, N) and all values set to 1. + eigenvalues : int or Sequence[int, int] or None + The maximum number of eigenvalues for the rows and columns, respectively, to use + for eigendecomposition. Typical values are between 5 and 30, with higher values + needed for baselines with more curvature. If None, will solve the linear system + using the full analytical solution, which is typically much slower. + Default is (10, 10). Returns ------- @@ -723,27 +754,33 @@ def psalsa(self, data, lam=1e5, p=0.5, k=None, diff_order=2, max_iter=50, tol=1e for analytical instruments. 2014 IEEE 11th International Multi-Conference on Systems, Signals, and Devices, 2014, 1-5. + Biessy, G. Revisiting Whittaker-Henderson Smoothing. https://hal.science/hal-04124043 + (Preprint), 2023. 
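The `aspls` loop above rescales the penalty by the `alpha` array before each solve. A
minimal standalone version of one such iteration, sketching the pattern rather than the
library's internals (sizes and values are made up):

    import numpy as np
    from scipy.sparse import diags
    from scipy.sparse.linalg import spsolve

    n = 50
    rng = np.random.RandomState(0)
    y = rng.rand(n)
    D = diags([1., -2., 1.], [0, 1, 2], shape=(n - 2, n))   # second-order differences
    penalty = 1e4 * (D.T @ D)
    weights = np.ones(n)
    alpha = np.ones(n)

    # one iteration: solve (W + diag(alpha) @ P) b = W y, then update alpha
    lhs = diags(weights) + diags(alpha) @ penalty
    baseline = spsolve(lhs.tocsr(), weights * y)
    residual = y - baseline
    alpha = np.abs(residual) / np.abs(residual).max()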
+ """ if not 0 < p < 1: raise ValueError('p must be between 0 and 1') y, weight_array = self._setup_whittaker( - data, lam, diff_order, weights, use_banded=True, use_lower=True + data, lam, diff_order, weights, eigenvalues=eigenvalues ) if k is None: k = np.std(y) / 10 + + shape = self._len if self.whittaker_system._using_svd else np.prod(self._len) tol_history = np.empty(max_iter + 1) for i in range(max_iter + 1): - baseline = self.whittaker_system.solve( - self.whittaker_system.add_diagonal(weight_array), weight_array * y, - overwrite_b=True - ) - new_weights = _weighting._psalsa(y, baseline, p, k, self._len[0] * self._len[1]) + baseline = self.whittaker_system.solve(y, weight_array) + new_weights = _weighting._psalsa(y, baseline, p, k, shape) calc_difference = relative_difference(weight_array, new_weights) tol_history[i] = calc_difference if calc_difference < tol: break weight_array = new_weights + if not self.whittaker_system._using_svd: + baseline = baseline.reshape(self._len) + weight_array = weight_array.reshape(self._len) + params = {'weights': weight_array, 'tol_history': tol_history[:i + 1]} return baseline, params diff --git a/pybaselines/whittaker.py b/pybaselines/whittaker.py index 284013a..5090c41 100644 --- a/pybaselines/whittaker.py +++ b/pybaselines/whittaker.py @@ -559,7 +559,7 @@ def aspls(self, data, lam=1e5, diff_order=2, max_iter=100, tol=1e-3, The order of the differential matrix. Must be greater than 0. Default is 2 (second order differential matrix). Typical values are 2 or 1. max_iter : int, optional - The max number of fit iterations. Default is 50. + The max number of fit iterations. Default is 100. tol : float, optional The exit criteria. Default is 1e-3. weights : array-like, shape (N,), optional @@ -1203,7 +1203,7 @@ def aspls(data, lam=1e5, diff_order=2, max_iter=100, tol=1e-3, weights=None, The order of the differential matrix. Must be greater than 0. Default is 2 (second order differential matrix). Typical values are 2 or 1. max_iter : int, optional - The max number of fit iterations. Default is 50. + The max number of fit iterations. Default is 100. tol : float, optional The exit criteria. Default is 1e-3. 
weights : array-like, shape (N,), optional
diff --git a/tests/test_banded_utils.py b/tests/test_banded_utils.py
index 667d386..d13fd1f 100644
--- a/tests/test_banded_utils.py
+++ b/tests/test_banded_utils.py
@@ -107,6 +107,37 @@ def test_diff_penalty_diagonals_datasize_too_small():
         _banded_utils.diff_penalty_diagonals(-1)
 
 
+@pytest.mark.parametrize('data_size', (10, 51))
+@pytest.mark.parametrize('diff_order', (1, 2, 3, 4))
+def test_diff_penalty_matrix(data_size, diff_order):
+    """Ensures the penalty matrix shortcut works correctly."""
+    diff_matrix = _banded_utils.difference_matrix(data_size, diff_order)
+    expected_matrix = diff_matrix.T @ diff_matrix
+
+    output = _banded_utils.diff_penalty_matrix(data_size, diff_order)
+
+    assert_allclose(expected_matrix.toarray(), output.toarray(), rtol=1e-12, atol=1e-12)
+
+
+@pytest.mark.parametrize('data_size', (3, 6))
+@pytest.mark.parametrize('diff_order', (1, 2, 3, 4))
+def test_diff_penalty_matrix_too_few_data(data_size, diff_order):
+    """Ensures the penalty matrix shortcut raises an error when the data size is too small."""
+    diff_matrix = _banded_utils.difference_matrix(data_size, diff_order)
+    expected_matrix = diff_matrix.T @ diff_matrix
+
+    if data_size <= diff_order:
+        with pytest.raises(ValueError):
+            _banded_utils.diff_penalty_matrix(data_size, diff_order)
+        # the actual matrix should be just zeros
+        actual_result = np.zeros((data_size, data_size))
+        assert_allclose(actual_result, expected_matrix.toarray(), rtol=1e-12, atol=1e-12)
+    else:
+        output = _banded_utils.diff_penalty_matrix(data_size, diff_order)
+        assert_allclose(output.toarray(), expected_matrix.toarray(), rtol=1e-12, atol=1e-12)
+
+
 def test_shift_rows_2_diags():
     """Ensures rows are correctly shifted for a matrix with two off-diagonals on either side."""
     matrix = np.array([
diff --git a/tests/two_d/test_spline.py b/tests/two_d/test_spline.py
index ea6bc14..f2aa74a 100644
--- a/tests/two_d/test_spline.py
+++ b/tests/two_d/test_spline.py
@@ -112,7 +112,7 @@ def test_mixture_pdf(fraction_pos, fraction_neg):
 
 
 def compare_pspline_whittaker(pspline_class, whittaker_func, data, lam=1e5,
-                              test_rtol=1e-6, test_atol=1e-12, **kwargs):
+                              test_rtol=1e-6, test_atol=1e-12, uses_eigenvalues=True, **kwargs):
     """
     Compares the output of the penalized spline (P-spline) versions of Whittaker functions.
 
@@ -122,9 +122,13 @@ def compare_pspline_whittaker(pspline_class, whittaker_func, data, lam=1e5,
     the weighting and linear systems were correctly set up.
""" + if uses_eigenvalues: + added_kwargs = {'eigenvalues': None} + else: + added_kwargs = {} whittaker_output = getattr( whittaker._Whittaker(pspline_class.x, pspline_class.z), whittaker_func - )(data, lam=lam, **kwargs)[0] + )(data, lam=lam, **kwargs, **added_kwargs)[0] num_knots = np.array(data.shape) + 1 if hasattr(pspline_class, 'class_func'): @@ -288,7 +292,8 @@ def test_diff_order_one_fails(self): def test_whittaker_comparison(self, lam, lam_1, p, diff_order): """Ensures the P-spline version is the same as the Whittaker version.""" compare_pspline_whittaker( - self, 'iasls', self.y, lam=lam, lam_1=lam_1, p=p, diff_order=diff_order, test_rtol=1e-5 + self, 'iasls', self.y, lam=lam, lam_1=lam_1, p=p, diff_order=diff_order, + uses_eigenvalues=False, test_rtol=1e-5 ) diff --git a/tests/two_d/test_spline_utils.py b/tests/two_d/test_spline_utils.py index f7a6b27..60eb535 100644 --- a/tests/two_d/test_spline_utils.py +++ b/tests/two_d/test_spline_utils.py @@ -75,7 +75,7 @@ def test_solve_psplines(data_fixture2d, num_knots, spline_degree, diff_order, la lam=lam, diff_order=diff_order, check_finite=False ) - output = pspline.solve_pspline(y, weights=weights.reshape(y.shape)) + output = pspline.solve(y, weights=weights.reshape(y.shape)) assert_allclose(output.flatten(), expected_result, rtol=1e-8, atol=1e-8) assert_allclose(pspline.coef, expected_coeffs, rtol=1e-8, atol=1e-8) @@ -225,7 +225,7 @@ def test_pspline_tck(data_fixture2d, num_knots, spline_degree, diff_order, lam): pspline = _spline_utils.PSpline2D( x, z, num_knots=num_knots, spline_degree=spline_degree, diff_order=diff_order, lam=lam ) - _ = pspline.solve_pspline(y, weights=np.ones_like(y)) + fit_spline = pspline.solve(y, weights=np.ones_like(y)) # ensure tck is the knots, coefficients, and spline degree assert len(pspline.tck) == 5 @@ -260,6 +260,6 @@ def test_pspline_tck_readonly(data_fixture2d): with pytest.raises(AttributeError): pspline.tck = (1, 2, 3) - pspline.solve_pspline(y, np.ones_like(y)) + pspline.solve(y, np.ones_like(y)) with pytest.raises(AttributeError): pspline.tck = (1, 2, 3) diff --git a/tests/two_d/test_whittaker_utils.py b/tests/two_d/test_whittaker_utils.py index 9957f51..145c1f6 100644 --- a/tests/two_d/test_whittaker_utils.py +++ b/tests/two_d/test_whittaker_utils.py @@ -20,9 +20,7 @@ @pytest.mark.parametrize('diff_order', (1, 2, 3, 4, (2, 3))) @pytest.mark.parametrize('lam', (1e-2, 1e2, (1e1, 1e2))) -@pytest.mark.parametrize('use_banded', (True, False)) -@pytest.mark.parametrize('use_lower', (True, False)) -def test_solve_penalized_system(small_data2d, diff_order, lam, use_banded, use_lower): +def test_solve_penalized_system(small_data2d, diff_order, lam): """ Tests the accuracy of the penalized system solver. 
@@ -44,8 +42,7 @@ def test_solve_penalized_system(small_data2d, diff_order, lam, use_banded, use_l penalty = P1 + P2 penalized_system = _whittaker_utils.PenalizedSystem2D( - small_data2d.shape, lam=lam, diff_order=diff_order, use_banded=use_banded, - use_lower=use_lower + small_data2d.shape, lam=lam, diff_order=diff_order ) # TODO replace with np.random.default_rng when min numpy version is >= 1.17 @@ -55,18 +52,14 @@ def test_solve_penalized_system(small_data2d, diff_order, lam, use_banded, use_l penalty.setdiag(penalty.diagonal() + weights) expected_result = spsolve(penalty, weights * small_data2d.flatten()) - output = penalized_system.solve( - penalized_system.add_diagonal(weights), weights * small_data2d.flatten() - ) + output = penalized_system.solve(small_data2d.flatten(), weights) assert_allclose(output.flatten(), expected_result, rtol=1e-8, atol=1e-8) @pytest.mark.parametrize('diff_order', (1, 2, 3, [1, 3])) @pytest.mark.parametrize('lam', (5, (3, 5))) -@pytest.mark.parametrize('use_banded', (True, False)) -@pytest.mark.parametrize('use_lower', (True, False)) -def test_penalized_system_setup(small_data2d, diff_order, lam, use_banded, use_lower): +def test_penalized_system_setup(small_data2d, diff_order, lam): """Ensure the PenalizedSystem2D setup is correct.""" *_, lam_x, lam_z, diff_order_x, diff_order_z = get_2dspline_inputs( lam=lam, diff_order=diff_order @@ -82,37 +75,15 @@ def test_penalized_system_setup(small_data2d, diff_order, lam, use_banded, use_l penalty = P1 + P2 penalized_system = _whittaker_utils.PenalizedSystem2D( - small_data2d.shape, lam=lam, diff_order=diff_order, use_banded=use_banded, - use_lower=use_lower + small_data2d.shape, lam=lam, diff_order=diff_order ) assert_array_equal(penalized_system._num_bases, num_bases) - if use_banded: - assert isinstance(penalized_system.penalty, np.ndarray) - penalty = penalty.todia() - penalty_bands = penalty.data[::-1] - # PenalizedSystem2D uses a more efficient way to assign bands, but - # this way is more clear of what is going on - offsets = list(penalty.offsets) - filler = np.zeros(penalty_bands.shape[1]) - values = [] - for i in range(offsets[0], offsets[-1] + 1): - if i in offsets: - values.append(penalty_bands[offsets.index(i)]) - else: - values.append(filler) - full_penalty_bands = np.vstack(values) - if use_lower: - full_penalty_bands = full_penalty_bands[full_penalty_bands.shape[0] // 2:] - assert_allclose( - penalized_system.penalty, full_penalty_bands, rtol=1e-12, atol=1e-12 - ) - else: - assert issparse(penalized_system.penalty) - assert_allclose( - penalized_system.penalty.toarray(), penalty.toarray(), rtol=1e-12, atol=1e-12 - ) + assert issparse(penalized_system.penalty) + assert_allclose( + penalized_system.penalty.toarray(), penalty.toarray(), rtol=1e-12, atol=1e-12 + ) assert_array_equal(penalized_system.diff_order, (diff_order_x, diff_order_z)) assert_array_equal(penalized_system.lam, (lam_x, lam_z)) @@ -134,9 +105,7 @@ def test_penalized_system_negative_lam_fails(small_data2d, lam): @pytest.mark.parametrize('diff_order', (1, 2, 3, [1, 3])) @pytest.mark.parametrize('lam', (5, (3, 5))) -@pytest.mark.parametrize('use_banded', (True, False)) -@pytest.mark.parametrize('use_lower', (True, False)) -def test_compare_to_psplines(data_fixture2d, lam, diff_order, use_banded, use_lower): +def test_compare_to_psplines(data_fixture2d, lam, diff_order): """ Ensures 2D Whittaker and PSpline outputs are the same for specific condition. 
@@ -161,46 +130,37 @@ def test_compare_to_psplines(data_fixture2d, lam, diff_order, use_banded, use_lo assert_array_equal(pspline.basis_c.shape, (len(z)), len(z)) whittaker_system = _whittaker_utils.PenalizedSystem2D( - y.shape, lam=lam, diff_order=diff_order, use_banded=use_banded, use_lower=use_lower + y.shape, lam=lam, diff_order=diff_order ) # TODO replace with np.random.default_rng when min numpy version is >= 1.17 weights = np.random.RandomState(0).normal(0.8, 0.05, y.shape) weights = np.clip(weights, 0, 1).astype(float, copy=False) - spline_output = pspline.solve_pspline(y, weights=weights) - whittaker_output = whittaker_system.solve( - whittaker_system.add_diagonal(weights.ravel()), weights.ravel() * y.ravel() - ) + spline_output = pspline.solve(y, weights=weights) + whittaker_output = whittaker_system.solve(y.ravel(), weights=weights.ravel()) assert_allclose(whittaker_output.reshape(y.shape), spline_output, rtol=1e-12, atol=1e-12) -@pytest.mark.parametrize('data_size', (10, 51)) @pytest.mark.parametrize('diff_order', (1, 2, 3, 4)) -def test_diff_penalty_matrix(data_size, diff_order): - """Ensures the penalty matrix shortcut works correctly.""" - diff_matrix = difference_matrix(data_size, diff_order) - expected_matrix = diff_matrix.T @ diff_matrix +def test_penalized_system_add_penalty(diff_order): + """Tests adding a penalty to a PenalizedSystem2D.""" + data_size = (40, 51) + lam = 5 - output = _whittaker_utils.diff_penalty_matrix(data_size, diff_order) + whittaker_system = _whittaker_utils.PenalizedSystem2D( + data_size, lam=lam, diff_order=diff_order + ) + added_penalty = 5 * identity(np.prod(data_size)) - assert_allclose(expected_matrix.toarray(), output.toarray(), rtol=1e-12, atol=1e-12) + expected_output = (added_penalty + whittaker_system.penalty).toarray() + expected_diagonal = expected_output.diagonal() + output = whittaker_system.add_penalty(added_penalty) -@pytest.mark.parametrize('data_size', (3, 6)) -@pytest.mark.parametrize('diff_order', (1, 2, 3, 4)) -def test_diff_penalty_matrix_too_few_data(data_size, diff_order): - """Ensures the penalty matrix shortcut works correctly.""" - diff_matrix = difference_matrix(data_size, diff_order) - expected_matrix = diff_matrix.T @ diff_matrix - - if data_size <= diff_order: - with pytest.raises(ValueError): - _whittaker_utils.diff_penalty_matrix(data_size, diff_order) - # the actual matrix should be just zeros - actual_result = np.zeros((data_size, data_size)) - assert_allclose(actual_result, expected_matrix.toarray(), rtol=1e-12, atol=1e-12) - else: - output = _whittaker_utils.diff_penalty_matrix(data_size, diff_order) - assert_allclose(output.toarray(), expected_matrix.toarray(), rtol=1e-12, atol=1e-12) + assert_allclose(output.toarray(), expected_output, rtol=1e-12, atol=1e-13) + # should also modify the penalty attribute + assert_allclose(whittaker_system.penalty.toarray(), expected_output, rtol=1e-12, atol=1e-13) + # and the main diagonal + assert_allclose(whittaker_system.main_diagonal, expected_diagonal, rtol=1e-12, atol=1e-13) From 9c2df3bdcbcd09aa037a8e6c08196e8d31a13512 Mon Sep 17 00:00:00 2001 From: Donnie Erb <55961724+derb12@users.noreply.github.com> Date: Thu, 8 Feb 2024 20:16:07 -0500 Subject: [PATCH 45/56] MAINT: Modified PSpline2D tck to work with scipy's NdBSpline --- pybaselines/two_d/_spline_utils.py | 17 ++++++++++++++--- tests/two_d/test_spline_utils.py | 23 ++++++++++++++++++----- 2 files changed, 32 insertions(+), 8 deletions(-) diff --git a/pybaselines/two_d/_spline_utils.py 
b/pybaselines/two_d/_spline_utils.py
index d23de3f..2e5550f 100644
--- a/pybaselines/two_d/_spline_utils.py
+++ b/pybaselines/two_d/_spline_utils.py
@@ -257,8 +257,9 @@ def tck(self):
         """
         The knots, spline coefficients, and spline degree to reconstruct the spline.
 
-        Convenience function for potentially reconstructing the last solved spline with outside
-        modules, although not sure if Scipy has a 2D equiavlent to its `BSpline`.
+        Convenience function for easily reconstructing the last solved spline with outside
+        modules, such as with Scipy's `NdBSpline`, to allow for other usages such as evaluating
+        with different x- and z-values.
 
         Raises
         ------
@@ -266,9 +267,19 @@ def tck(self):
             Raised if `solve_pspline` has not been called yet, meaning that the spline has not
             yet been constructed.
 
+        Notes
+        -----
+        To use with :class:`scipy.interpolate.NdBSpline`, the setup would look like:
+
+            from scipy.interpolate import NdBSpline
+            pspline = PSpline2D(x, z, ...)
+            pspline_fit = pspline.solve(...)
+            XZ = np.array(np.meshgrid(x, z)).T  # same as zipping the meshgrid and rearranging
+            fit = NdBSpline(pspline.tck)(XZ)  # fit == pspline_fit
+
         """
         if self.coef is None:
             raise ValueError('No spline coefficients, need to call "solve_pspline" first.')
         return (
-            self.knots_r, self.knots_c, self.coef, self.spline_degree[0], self.spline_degree[1]
+            (self.knots_r, self.knots_c), self.coef.reshape(self._num_bases), self.spline_degree
         )
diff --git a/tests/two_d/test_spline_utils.py b/tests/two_d/test_spline_utils.py
index 60eb535..c2038a9 100644
--- a/tests/two_d/test_spline_utils.py
+++ b/tests/two_d/test_spline_utils.py
@@ -228,12 +228,12 @@ def test_pspline_tck(data_fixture2d, num_knots, spline_degree, diff_order, lam):
     fit_spline = pspline.solve(y, weights=np.ones_like(y))
 
     # ensure tck is the knots, coefficients, and spline degree
-    assert len(pspline.tck) == 5
-    knots_r, knots_c, coeffs, degree_x, degree_z = pspline.tck
+    assert len(pspline.tck) == 3
+    (knots_r, knots_c), coeffs, (degree_x, degree_z) = pspline.tck
 
-    assert_allclose(knots_r, pspline.knots_r, rtol=1e-12)
-    assert_allclose(knots_c, pspline.knots_c, rtol=1e-12)
-    assert_allclose(coeffs, pspline.coef, rtol=1e-12)
+    assert_allclose(knots_r, pspline.knots_r, rtol=1e-12, atol=1e-12)
+    assert_allclose(knots_c, pspline.knots_c, rtol=1e-12, atol=1e-12)
+    assert_allclose(coeffs, pspline.coef.reshape(pspline._num_bases), rtol=1e-12, atol=1e-12)
     if isinstance(spline_degree, int):
         assert degree_x == spline_degree
         assert degree_z == spline_degree
@@ -241,6 +241,19 @@ def test_pspline_tck(data_fixture2d, num_knots, spline_degree, diff_order, lam):
         assert degree_x == spline_degree[0]
         assert degree_z == spline_degree[1]
 
+    # Now recreate the spline with scipy's NdBSpline and ensure it is the same;
+    # NdBSpline was introduced in scipy 1.12.0
+    import scipy
+    major, minor = [int(val) for val in scipy.__version__.split('.')[:2]]
+    if major > 1 or (major == 1 and minor >= 12):
+        from scipy.interpolate import NdBSpline
+        # np.array(np.meshgrid(x, z)).T is the same as doing
+        # np.array(np.meshgrid(x, z, indexing='ij')).transpose([1, 2, 0]), which
+        # is just zipping the meshgrid of each x and z value
+        recreated_spline = NdBSpline(*pspline.tck)(np.array(np.meshgrid(x, z)).T)
+
+        assert_allclose(recreated_spline, fit_spline, rtol=1e-10, atol=1e-12)
+
 
 def test_pspline_tck_none(data_fixture2d):
     """Ensures an exception is raised when tck attribute is accessed without first solving once."""

From ba7c95c9ea8e49252dcfbad64efbf08665cee290 Mon Sep 17 00:00:00 2001
From: Donnie
Erb <55961724+derb12@users.noreply.github.com> Date: Thu, 8 Feb 2024 20:19:13 -0500 Subject: [PATCH 46/56] DOCS: Speed up the whittaker solver example Makes the sparse solver just a tad faster to better represent it. Also mention that the sparse solution could be sped up with CHOLMOD in case others are interested. Also re-enable autosectionlabel since the branch rebase must have undone that. --- docs/conf.py | 4 +--- examples/misc/plot_beads_preprocessing.py | 2 +- examples/whittaker/plot_whittaker_solvers.py | 12 +++++++----- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 6fad0af..523555d 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -41,9 +41,7 @@ 'sphinx.ext.todo', 'sphinx.ext.mathjax', 'sphinx.ext.viewcode', - # austosectionlabel allows referencing sections by their titles, but was throwing lots of duplicate object warnings - # since changes.rst and CHANGELOG.rst have the same section headers - #'sphinx.ext.autosectionlabel', + 'sphinx.ext.autosectionlabel', 'matplotlib.sphinxext.plot_directive', 'sphinx_gallery.gen_gallery' ] diff --git a/examples/misc/plot_beads_preprocessing.py b/examples/misc/plot_beads_preprocessing.py index 4f3343b..d5ade81 100644 --- a/examples/misc/plot_beads_preprocessing.py +++ b/examples/misc/plot_beads_preprocessing.py @@ -9,7 +9,7 @@ the data to be at zero. This example will explore the consequences of this as well as a preprocessing step proposed by `Navarro-Huerta, J.A., et al. Assisted baseline subtraction in complex chromatograms using the BEADS algorithm. Journal of Chromatography -A, 2017, 1507, 1-10` that helps to address this issue. +A, 2017, 1507, 1-10` implemented in pybaselines that helps to address this issue. """ # sphinx_gallery_thumbnail_number = 4 diff --git a/examples/whittaker/plot_whittaker_solvers.py b/examples/whittaker/plot_whittaker_solvers.py index 65fa06a..a12e810 100644 --- a/examples/whittaker/plot_whittaker_solvers.py +++ b/examples/whittaker/plot_whittaker_solvers.py @@ -17,6 +17,10 @@ is ~50-70% faster and pentapy's banded solver is ~70-90% faster, ultimately reducing the computation time by about an order of magnitude. +Note that the performance of solving the sparse system can be improved by using +`CHOLMOD from SuiteSparse `_, which has +Python bindings provided by `scikit-sparse `_. 
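A sketch of the CHOLMOD route mentioned above, assuming scikit-sparse's `cholesky`
interface; the factorization of the symmetric positive-definite system is computed once and
then applied to the right-hand side:

    import numpy as np
    from scipy.sparse import diags
    from sksparse.cholmod import cholesky   # requires scikit-sparse and SuiteSparse

    n = 1000
    rng = np.random.RandomState(0)
    y = rng.rand(n)
    D = diags([1., -2., 1.], [0, 1, 2], shape=(n - 2, n))
    lhs = (diags(np.ones(n)) + 1e6 * (D.T @ D)).tocsc()   # W + lam * D.T @ D

    factor = cholesky(lhs)   # sparse Cholesky factorization via CHOLMOD
    smoothed = factor(y)     # solves lhs @ smoothed = y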
+ """ import time @@ -24,7 +28,6 @@ import matplotlib.pyplot as plt import numpy as np -from scipy.sparse import spdiags from scipy.sparse.linalg import spsolve from pybaselines import whittaker, _banded_utils @@ -51,12 +54,11 @@ def sparse_asls(data, lam=1e6, p=1e-2, diff_order=2, max_iter=50, tol=1e-3, weig diff_matrix = difference_matrix(num_y, diff_order, 'csc') penalty_matrix = lam * (diff_matrix.T @ diff_matrix) + original_diag = penalty_matrix.diagonal() tol_history = np.empty(max_iter + 1) for i in range(max_iter + 1): - baseline = spsolve( - spdiags(weight_array, 0, num_y, num_y, 'csr') + penalty_matrix, - weight_array * y, 'NATURAL' - ) + penalty_matrix.setdiag(weight_array + original_diag) + baseline = spsolve(penalty_matrix, weight_array * y, 'NATURAL') mask = y > baseline new_weights = p * mask + (1 - p) * (~mask) calc_difference = relative_difference(weight_array, new_weights) From 0f3780411de05a4ae3ae975339e2699f9e5b0458 Mon Sep 17 00:00:00 2001 From: Donnie Erb <55961724+derb12@users.noreply.github.com> Date: Sat, 10 Feb 2024 13:18:20 -0500 Subject: [PATCH 47/56] MAINT : Was not updating alpha_matrix for 2D aspls Also added a sanity check test for the 1D case to ensure the banded multiplication is the same as the matrix multiplication. --- pybaselines/two_d/whittaker.py | 12 +++++++++--- tests/test_whittaker.py | 18 ++++++++++++++++++ 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/pybaselines/two_d/whittaker.py b/pybaselines/two_d/whittaker.py index 9f02dcc..e4c628f 100644 --- a/pybaselines/two_d/whittaker.py +++ b/pybaselines/two_d/whittaker.py @@ -15,6 +15,7 @@ from ._algorithm_setup import _Algorithm2D from ._whittaker_utils import PenalizedSystem2D from ..utils import ParameterWarning, relative_difference +from ..utils import _MIN_FLOAT, ParameterWarning, relative_difference from .._validation import _check_optional_array @@ -461,7 +462,7 @@ def drpls(self, data, lam=1e5, eta=0.5, max_iter=50, tol=1e-3, weights=None, dif partial_penalty = self.whittaker_system.penalty + penalized_system_1.penalty partial_penalty_2 = -eta * self.whittaker_system.penalty partial_penalty_2.setdiag(partial_penalty_2.diagonal() + 1) - weight_matrix = diags(weight_array) + weight_matrix = diags(weight_array, format='csr') tol_history = np.empty(max_iter + 1) for i in range(1, max_iter + 2): baseline = self.whittaker_system.direct_solve( @@ -654,7 +655,9 @@ def aspls(self, data, lam=1e5, diff_order=2, max_iter=100, tol=1e-3, if self._sort_order is not None and alpha is not None: alpha_array = alpha_array[self._sort_order] - # use a sparse matrix to maintain sparsity after multiplication + # use a sparse matrix to maintain sparsity after multiplication; implementation note: + # could skip making an alpha matrix and just use alpha_array[:, None] * penalty once + # the scipy sparse_arrays become standard -> will have to check if timing is affected alpha_matrix = diags(alpha_array.ravel(), format='csr') tol_history = np.empty(max_iter + 1) for i in range(max_iter + 1): @@ -666,8 +669,11 @@ def aspls(self, data, lam=1e5, diff_order=2, max_iter=100, tol=1e-3, if calc_difference < tol: break weight_array = new_weights - abs_d = np.abs(residual) + # add _MIN_FLOAT so that no values are 0; otherwise, the sparsity of alpha @ penalty + # can change, which is inefficient + abs_d = np.abs(residual) + _MIN_FLOAT alpha_array = abs_d / abs_d.max() + alpha_matrix.setdiag(alpha_array) params = { 'weights': weight_array, 'alpha': alpha_array, 'tol_history': tol_history[:i + 1] diff --git 
a/tests/test_whittaker.py b/tests/test_whittaker.py index d6a5ad8..8e9400f 100644 --- a/tests/test_whittaker.py +++ b/tests/test_whittaker.py @@ -11,6 +11,7 @@ import numpy as np from numpy.testing import assert_allclose import pytest +from scipy.sparse import diags from pybaselines import _banded_utils, whittaker from pybaselines.utils import ParameterWarning @@ -278,6 +279,23 @@ def test_avoid_overflow_warning(self, no_noise_data_fixture): assert np.isfinite(baseline.dot(baseline)) + @pytest.mark.parametrize('diff_order', (1, 2, 3)) + def test_alpha_multiplication(self, diff_order): + """Ensures multiplication of the alpha array and banded penalty is handled correctly.""" + lam = 5. + num_points = len(self.y) + alpha = np.arange(num_points, dtype=float) + penalized_system = _banded_utils.PenalizedSystem( + num_points, lam=lam, diff_order=diff_order, allow_lower=False, reverse_diags=True + ) + penalty_matrix = lam * _banded_utils.diff_penalty_matrix(num_points, diff_order=diff_order) + + expected_result = (diags(alpha) @ penalty_matrix).todia().data[::-1] + + result = alpha * penalized_system.penalty + result = _banded_utils._shift_rows(result, diff_order, diff_order) + assert_allclose(result, expected_result, rtol=1e-13, atol=1e-13) + class TestPsalsa(WhittakerTester): """Class for testing psalsa baseline.""" From c622ed500a810cb84c4a9503973b2dc5505bb833 Mon Sep 17 00:00:00 2001 From: Donnie Erb <55961724+derb12@users.noreply.github.com> Date: Sat, 10 Feb 2024 15:47:45 -0500 Subject: [PATCH 48/56] OTH: Internally use scipy's sparse arrays when available Will use scipy's sparse arrays if the installed scipy version is 1.12 or newer. Thank goodness for unit tests catching the change in matrix multiplication. --- pybaselines/_banded_utils.py | 15 +-- pybaselines/_compat.py | 149 ++++++++++++++++++++- pybaselines/_spline_utils.py | 14 +- pybaselines/_validation.py | 6 +- pybaselines/misc.py | 21 +-- pybaselines/optimizers.py | 2 +- pybaselines/spline.py | 5 +- pybaselines/two_d/_spline_utils.py | 11 +- pybaselines/two_d/_whittaker_utils.py | 4 +- pybaselines/two_d/whittaker.py | 3 +- pybaselines/utils.py | 2 +- tests/test_algorithm_setup.py | 6 +- tests/test_banded_utils.py | 33 +++-- tests/test_classification.py | 3 +- tests/test_compat.py | 185 +++++++++++++++++++++++++- tests/test_misc.py | 81 +++++------ tests/test_spline.py | 4 +- tests/test_spline_utils.py | 11 +- tests/test_utils.py | 10 +- tests/test_whittaker.py | 2 +- tests/two_d/test_algorithm_setup.py | 3 +- tests/two_d/test_spline_utils.py | 3 +- tests/two_d/test_whittaker_utils.py | 3 +- 23 files changed, 452 insertions(+), 124 deletions(-) diff --git a/pybaselines/_banded_utils.py b/pybaselines/_banded_utils.py index a2a5f43..f97d76b 100644 --- a/pybaselines/_banded_utils.py +++ b/pybaselines/_banded_utils.py @@ -8,9 +8,8 @@ import numpy as np from scipy.linalg import solve_banded, solveh_banded -from scipy.sparse import identity, diags, spdiags -from ._compat import _HAS_PENTAPY, _pentapy_solve +from ._compat import _HAS_PENTAPY, _pentapy_solve, identity, diags, dia_object from ._validation import _check_lam @@ -205,7 +204,7 @@ def difference_matrix(data_size, diff_order=2, diff_format=None): Returns ------- - diff_matrix : scipy.sparse.base.spmatrix + diff_matrix : scipy.sparse.spmatrix or scipy.sparse._sparray The sparse difference matrix. 
Raises
@@ -502,7 +501,7 @@ def diff_penalty_matrix(data_size, diff_order=2, diff_format='csr'):
 
     Returns
     -------
-    penalty_matrix : scipy.sparse.base.spmatrix
+    penalty_matrix : scipy.sparse.spmatrix or scipy.sparse._sparray
         The sparse difference penalty matrix.
 
     Raises
@@ -525,10 +524,10 @@ def diff_penalty_matrix(data_size, diff_order=2, diff_format='csr'):
     if data_size <= diff_order:
         raise ValueError('data size must be greater than or equal to the difference order.')
     penalty_bands = diff_penalty_diagonals(data_size, diff_order, lower_only=False)
-    penalty_matrix = spdiags(
-        penalty_bands, np.arange(diff_order, -diff_order - 1, -1), data_size, data_size,
-        format=diff_format
-    )
+    penalty_matrix = dia_object(
+        (penalty_bands, np.arange(diff_order, -diff_order - 1, -1)), shape=(data_size, data_size),
+    ).asformat(diff_format)
+
     return penalty_matrix
 
 
diff --git a/pybaselines/_compat.py b/pybaselines/_compat.py
index cba7363..4243dda 100644
--- a/pybaselines/_compat.py
+++ b/pybaselines/_compat.py
@@ -6,9 +6,10 @@
 
 """
 
-from functools import wraps
+from functools import lru_cache, wraps
 
-from scipy import integrate
+import scipy
+from scipy import integrate, sparse
 
 
 try:
@@ -52,8 +53,150 @@ def wrapper(*args, **kwargs):
 # scipy.integrate.trapezoid was introduced in v1.6.0, while
 # scipy.integrate.trapz will be deprecated in v1.14.0.
 # Use scipy instead of numpy since numpy.trapz will be deprecated
-# in v2.0.0
+# in v2.0.0 -> the deprecation was stopped (delayed?), but rely
+# on scipy since there is no potential deprecation there
 if hasattr(integrate, 'trapezoid'):
     trapezoid = integrate.trapezoid
 else:
     trapezoid = integrate.trapz
+
+
+@lru_cache(maxsize=1)
+def _use_sparse_arrays():
+    """
+    Checks that the installed scipy version is new enough to use sparse arrays.
+
+    This check is wrapped into a function just in case it fails so that pybaselines
+    can still be imported without error. The result is cached so it only has to
+    be done once.
+
+    Returns
+    -------
+    bool
+        True if the installed scipy version is 1.12 or newer; False otherwise.
+
+    Notes
+    -----
+    Scipy introduced its sparse arrays in version 1.8, but the interface and helper
+    functions were not stable until version 1.12; a warning will be emitted in scipy
+    1.13 when using the matrix interface, so want to use the sparse array interface
+    as early as possible.
+
+    """
+    try:
+        _scipy_version = [int(val) for val in scipy.__version__.lstrip('v').split('.')[:2]]
+    except Exception:
+        # in case in the far future scipy stops using semantic versioning; probably
+        # bigger problems than this check at that point so just return True
+        return True
+
+    return _scipy_version[0] > 1 or (_scipy_version[0] == 1 and _scipy_version[1] >= 12)
+
+
+def dia_object(*args, **kwargs):
+    """
+    Handles creation of a sparse diagonal object.
+
+    Parameters
+    ----------
+    *args
+        Any arguments to pass to the creation functions.
+    **kwargs
+        Additional keyword arguments to pass to the creation functions.
+
+    Returns
+    -------
+    scipy.sparse.dia_matrix or scipy.sparse.dia_array
+        A sparse diagonal matrix if the installed scipy version is older than 1.12,
+        otherwise a sparse diagonal array.
+
+    """
+    if _use_sparse_arrays():
+        return sparse.dia_array(*args, **kwargs)
+    else:
+        return sparse.dia_matrix(*args, **kwargs)
+
+
+def csr_object(*args, **kwargs):
+    """
+    Handles creation of a sparse csr object.
+
+    Parameters
+    ----------
+    *args
+        Any arguments to pass to the creation functions.
+    **kwargs
+        Additional keyword arguments to pass to the creation functions.
+
+    Returns
+    -------
+    scipy.sparse.csr_matrix or scipy.sparse.csr_array
+        A sparse csr matrix if the installed scipy version is older than 1.12,
+        otherwise a sparse csr array.
+
+    """
+    if _use_sparse_arrays():
+        return sparse.csr_array(*args, **kwargs)
+    else:
+        return sparse.csr_matrix(*args, **kwargs)
+
+
+def identity(size, format=None, **kwargs):
+    """
+    Handles creation of a sparse square identity matrix.
+
+    Parameters
+    ----------
+    size : int
+        The length of the rows and columns of the sparse matrix.
+    format : str, optional
+        The sparse format to use for the identity matrix. Default is None, which
+        will use the default of the underlying functions.
+    **kwargs
+        Additional keyword arguments to pass to the creation functions.
+
+    Returns
+    -------
+    scipy.sparse.spmatrix or scipy.sparse._sparray
+        The sparse identity matrix.
+
+    Notes
+    -----
+    This function will need to be updated in the future to prefer sparse.identity again
+    once the sparse matrices are removed.
+
+    """
+    if _use_sparse_arrays():
+        return sparse.eye_array(size, size, format=format, **kwargs)
+    else:
+        return sparse.identity(size, format=format, **kwargs)
+
+
+def diags(data, offsets=0, **kwargs):
+    """
+    Handles creation of a sparse diagonal matrix.
+
+    Parameters
+    ----------
+    data : array-like
+        The data to be put in the diagonals.
+    offsets : int or Sequence[int], optional
+        The offsets for `data`. Default is 0, which is the main diagonal.
+    **kwargs
+        Additional keyword arguments to pass to the creation functions.
+
+    Returns
+    -------
+    scipy.sparse.spmatrix or scipy.sparse._sparray
+        The sparse diagonal matrix.
+
+    Notes
+    -----
+    This function will need to be updated in the future to prefer sparse.diags again
+    once the sparse matrices are removed.
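The net effect of these wrappers: the same call site transparently produces sparse arrays
on scipy 1.12 or newer and sparse matrices on older versions. A small usage sketch:

    import numpy as np
    from pybaselines._compat import dia_object, diags, identity

    eye = identity(4, format='csr')   # eye_array on scipy >= 1.12, else identity
    banded = diags([1., -2., 1.], offsets=[0, 1, 2], shape=(3, 5))
    dia = dia_object((np.ones((1, 4)), [0]), shape=(4, 4))
    # downstream code only relies on the interface shared by both sparse types
    product = eye @ dia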
+ + """ + if _use_sparse_arrays(): + return sparse.diags_array(data, offsets=offsets, **kwargs) + else: + return sparse.diags(data, offsets=offsets, **kwargs) diff --git a/pybaselines/_spline_utils.py b/pybaselines/_spline_utils.py index bfe3785..a83fba7 100644 --- a/pybaselines/_spline_utils.py +++ b/pybaselines/_spline_utils.py @@ -45,10 +45,9 @@ import numpy as np from scipy.interpolate import BSpline, splev from scipy.linalg import solve_banded, solveh_banded -from scipy.sparse import csc_matrix, csr_matrix, spdiags from ._banded_utils import _add_diagonals, _lower_to_full, PenalizedSystem -from ._compat import _HAS_NUMBA, jit +from ._compat import _HAS_NUMBA, csr_object, dia_object, jit from ._validation import _check_array @@ -224,7 +223,7 @@ def _make_design_matrix(x, knots, spline_degree): """ data, row_ind, col_ind = __make_design_matrix(x, knots, spline_degree) - return csr_matrix((data, (row_ind, col_ind)), (len(x), len(knots) - spline_degree - 1)) + return csr_object((data, (row_ind, col_ind)), (len(x), len(knots) - spline_degree - 1)) def _slow_design_matrix(x, knots, spline_degree): @@ -273,7 +272,7 @@ def _slow_design_matrix(x, knots, spline_degree): basis[spline_degree, 0] = small_float basis[-(spline_degree + 1), -1] = small_float - return csc_matrix(basis).T + return csr_object(basis.T) def _spline_knots(x, num_knots=10, spline_degree=3, penalized=True): @@ -575,7 +574,7 @@ def _solve_pspline(x, y, weights, basis, penalty, knots, spline_degree, rhs_extr # worst case scenario; have to convert weights to a sparse diagonal matrix, # do B.T @ W @ B, and convert back to lower banded len_y = len(y) - full_matrix = basis.T @ spdiags(weights, 0, len_y, len_y, 'csr') @ basis + full_matrix = basis.T @ dia_object((weights, 0), shape=(len_y, len_y)).tocsr() @ basis rhs = basis.T @ (weights * y) ab = full_matrix.todia().data[::-1] # take only the lower diagonals of the symmetric ab; cannot just do @@ -887,8 +886,11 @@ def solve_pspline(self, y, weights, penalty=None, rhs_extra=None): # worst case scenario; have to convert weights to a sparse diagonal matrix, # do B.T @ W @ B, and convert back to lower banded full_matrix = ( - self.basis.T @ spdiags(weights, 0, self._x_len, self._x_len, 'csr') @ self.basis + self.basis.T + @ dia_object((weights, 0), shape=(self._x_len, self._x_len)).tocsr() + @ self.basis ) + rhs = self.basis.T @ (weights * y) ab = full_matrix.todia().data[::-1] # take only the lower diagonals of the symmetric ab; cannot just do diff --git a/pybaselines/_validation.py b/pybaselines/_validation.py index 0018feb..b322e2d 100644 --- a/pybaselines/_validation.py +++ b/pybaselines/_validation.py @@ -350,7 +350,7 @@ def _yxz_arrays(data, x_data=None, z_data=None, check_finite=False, dtype=None, return y, x, z -def _check_lam(lam, allow_zero=False): +def _check_lam(lam, allow_zero=False, dtype=float): """ Ensures the regularization parameter `lam` is a scalar greater than 0. @@ -361,6 +361,8 @@ def _check_lam(lam, allow_zero=False): penalized splines. allow_zero : bool If False (default), only allows `lam` values > 0. If True, allows `lam` >= 0. + dtype : type or numpy.dtype, optional + The dtype to cast the lam value. Default is float. Returns ------- @@ -392,7 +394,7 @@ def _check_lam(lam, allow_zero=False): ``(diags(lam) @ D.T @ D).todia().data[::-1]``. 
""" - return _check_scalar_variable(lam, allow_zero) + return _check_scalar_variable(lam, allow_zero, dtype=dtype) def _check_half_window(half_window, allow_zero=False, two_d=False): diff --git a/pybaselines/misc.py b/pybaselines/misc.py index 8c23d28..5094cf6 100644 --- a/pybaselines/misc.py +++ b/pybaselines/misc.py @@ -68,11 +68,10 @@ from scipy.interpolate import interp1d from scipy.linalg import get_blas_funcs, solve_banded, solveh_banded from scipy.ndimage import uniform_filter1d -from scipy.sparse import spdiags from scipy.sparse.linalg import splu, spsolve from ._algorithm_setup import _Algorithm, _class_wrapper -from ._compat import _HAS_NUMBA, jit +from ._compat import _HAS_NUMBA, dia_object, jit from ._validation import _check_array, _check_lam from .utils import _MIN_FLOAT, relative_difference @@ -640,8 +639,8 @@ def _high_pass_filter(data_size, freq_cutoff=0.005, filter_type=1, full_matrix=F b_diags = np.repeat(b.reshape(1, -1), data_size, axis=0).T if full_matrix: offsets = np.arange(-filter_type, filter_type + 1) - A = spdiags(a_diags, offsets, data_size, data_size, 'csr') - B = spdiags(b_diags, offsets, data_size, data_size, 'csr') + A = dia_object((a_diags, offsets), shape=(data_size, data_size)).tocsr() + B = dia_object((b_diags, offsets), shape=(data_size, data_size)).tocsr() else: # add zeros on edges to create the actual banded structure; # creates same structure as diags(a[b]_diags, offsets).todia().data[::-1] @@ -915,7 +914,7 @@ def _sparse_beads(y, freq_cutoff=0.005, lam_0=1.0, lam_1=1.0, lam_2=1.0, asymmet # factorize A since A is unchanged in the function and its factorization # is used repeatedly; much faster than calling spsolve each time A_factor = splu(A.tocsc(), permc_spec='NATURAL') - BTB = B * B + BTB = B @ B x = y d1_x, d2_x = _abs_diff(x, smooth_half_window) @@ -929,7 +928,7 @@ def _sparse_beads(y, freq_cutoff=0.005, lam_0=1.0, lam_1=1.0, lam_2=1.0, asymmet tol_history = np.empty(max_iter + 1) for i in range(max_iter + 1): # calculate line 6 of Table 3 in beads paper using banded matrices rather - # than sparse matrices since it is much faster; Gamma + D.T * Lambda * D + # than sparse matrices since it is much faster; Gamma + D.T @ Lambda @ D # row 1 and 3 instead of 0 and 2 to account for zeros on top and bottom d1_diags[1][1:] = d1_diags[3][:-1] = -_beads_weighting(d1_x, use_v2_loss, eps_1) @@ -945,12 +944,14 @@ def _sparse_beads(y, freq_cutoff=0.005, lam_0=1.0, lam_1=1.0, lam_2=1.0, asymmet gamma[big_x] = gamma_factor / abs_x[big_x] d_diags[2] += gamma + # TODO check that 'NATURAL' is the appropriate permutation scheme for this x = A.dot( spsolve( - BTB + A.dot(spdiags(d_diags, offsets, num_y, num_y, 'csr').dot(A)), + BTB + A.dot(dia_object((d_diags, offsets), shape=(num_y, num_y)).tocsr()).dot(A), d, 'NATURAL' ) ) + h = B.dot(A_factor.solve(y - x)) d1_x, d2_x = _abs_diff(x, smooth_half_window) abs_x, big_x, theta = _beads_theta(x, asymmetry, eps_0) @@ -1063,11 +1064,11 @@ def _banded_beads(y, freq_cutoff=0.005, lam_0=1.0, lam_1=1.0, lam_2=1.0, asymmet A, B = _high_pass_filter(num_y, freq_cutoff, filter_type, False) # the number of lower and upper diagonals for both A and B ab_lu = (filter_type, filter_type) - # the shape of A and B, and D.T*D matrices in their full forms rather than banded forms + # the shape of A and B, and D.T @ D matrices in their full forms rather than banded forms full_shape = (num_y, num_y) A_lower = A[filter_type:] BTB = _banded_dot_banded(B, B, ab_lu, ab_lu, full_shape, full_shape, True) - # number of lower and upper diagonals 
of A.T * (D.T * D) * A + # number of lower and upper diagonals of A.T @ (D.T @ D) @ A num_diags = (2 * filter_type + 2, 2 * filter_type + 2) # line 2 of Table 3 in beads paper @@ -1091,7 +1092,7 @@ def _banded_beads(y, freq_cutoff=0.005, lam_0=1.0, lam_1=1.0, lam_2=1.0, asymmet tol_history = np.empty(max_iter + 1) for i in range(max_iter + 1): # calculate line 6 of Table 3 in beads paper using banded matrices rather - # than sparse matrices since it is much faster; Gamma + D.T * Lambda * D + # than sparse matrices since it is much faster; Gamma + D.T @ Lambda @ D # row 1 and 3 instead of 0 and 2 to account for zeros on top and bottom d1_diags[1][1:] = d1_diags[3][:-1] = -_beads_weighting(d1_x, use_v2_loss, eps_1) diff --git a/pybaselines/optimizers.py b/pybaselines/optimizers.py index a7a535e..4afb93e 100644 --- a/pybaselines/optimizers.py +++ b/pybaselines/optimizers.py @@ -16,7 +16,7 @@ from . import classification, misc, morphological, polynomial, smooth, spline, whittaker from ._algorithm_setup import _Algorithm, _class_wrapper from ._validation import _check_optional_array -from .utils import _check_scalar, _get_edges, _sort_array, gaussian, whittaker_smooth +from .utils import _check_scalar, _get_edges, _sort_array, gaussian class _Optimizers(_Algorithm): diff --git a/pybaselines/spline.py b/pybaselines/spline.py index 59a9e87..6cf4920 100644 --- a/pybaselines/spline.py +++ b/pybaselines/spline.py @@ -13,12 +13,11 @@ import numpy as np from scipy.ndimage import grey_opening from scipy.optimize import curve_fit -from scipy.sparse import spdiags from . import _weighting from ._algorithm_setup import _Algorithm, _class_wrapper from ._banded_utils import _add_diagonals, _shift_rows, diff_penalty_diagonals -from ._compat import _HAS_NUMBA, jit, trapezoid +from ._compat import _HAS_NUMBA, dia_object, jit, trapezoid from ._spline_utils import _basis_midpoints from ._validation import _check_lam, _check_optional_array from .utils import ( @@ -546,7 +545,7 @@ def pspline_iasls(self, data, lam=1e1, p=1e-2, lam_1=1e-4, num_knots=100, d1_penalty = _check_lam(lam_1) * diff_penalty_diagonals(self._len, 1, lower_only=False) d1_penalty = ( self.pspline.basis.T - @ spdiags(d1_penalty, np.array([1, 0, -1]), self._len, self._len, 'csr') + @ dia_object((d1_penalty, np.array([1, 0, -1])), shape=(self._len, self._len)).tocsr() ) partial_rhs = d1_penalty @ y # now change d1_penalty back to banded array diff --git a/pybaselines/two_d/_spline_utils.py b/pybaselines/two_d/_spline_utils.py index 2e5550f..4bba4ea 100644 --- a/pybaselines/two_d/_spline_utils.py +++ b/pybaselines/two_d/_spline_utils.py @@ -7,9 +7,10 @@ """ import numpy as np -from scipy import sparse +from scipy.sparse import kron from scipy.sparse.linalg import spsolve +from .._compat import csr_object from .._spline_utils import _spline_basis, _spline_knots from .._validation import _check_array, _check_scalar from ._whittaker_utils import PenalizedSystem2D @@ -130,8 +131,8 @@ def __init__(self, x, z, num_knots=100, spline_degree=3, check_finite=False, lam el = np.ones((1, self._num_bases[0])) ek = np.ones((1, self._num_bases[1])) - self._G_r = sparse.kron(self.basis_r, el).multiply(sparse.kron(el, self.basis_r)) - self._G_c = sparse.kron(self.basis_c, ek).multiply(sparse.kron(ek, self.basis_c)) + self._G_r = kron(self.basis_r, el).multiply(kron(el, self.basis_r)) + self._G_c = kron(self.basis_c, ek).multiply(kron(ek, self.basis_c)) def same_basis(self, num_knots=100, spline_degree=3): """ @@ -217,7 +218,7 @@ def solve(self, y, weights, 
penalty=None, rhs_extra=None): """ # do not save intermediate results since they are memory intensive for high number of knots - F = sparse.csr_matrix( + F = csr_object( np.transpose( (self._G_r.T @ weights @ self._G_c).reshape( (self._num_bases[0], self._num_bases[0], self._num_bases[1], self._num_bases[1]) @@ -249,7 +250,7 @@ def basis(self): """ if self._basis is None: - self._basis = sparse.kron(self.basis_r, self.basis_c) + self._basis = kron(self.basis_r, self.basis_c) return self._basis @property diff --git a/pybaselines/two_d/_whittaker_utils.py b/pybaselines/two_d/_whittaker_utils.py index b716005..322a482 100644 --- a/pybaselines/two_d/_whittaker_utils.py +++ b/pybaselines/two_d/_whittaker_utils.py @@ -8,10 +8,11 @@ import numpy as np from scipy.linalg import eig_banded, eigh_tridiagonal, solve -from scipy.sparse import identity, kron +from scipy.sparse import kron from scipy.sparse.linalg import spsolve from .._banded_utils import diff_penalty_diagonals, diff_penalty_matrix +from .._compat import identity from .._validation import _check_lam, _check_scalar @@ -250,6 +251,7 @@ def __init__(self, data_size, lam=1, diff_order=2, max_eigens=None): else: # TODO need to check to ensure max_eigens is <= data_size and otherwise emit # an error; if max_eigens is >~ 40 should emit an error saying too many + # also check that it is greater than 0 or maybe 1 self._num_bases = _check_scalar(max_eigens, 2, True, dtype=int)[0] self._using_svd = True self.reset_diagonals(lam, diff_order) diff --git a/pybaselines/two_d/whittaker.py b/pybaselines/two_d/whittaker.py index e4c628f..9239624 100644 --- a/pybaselines/two_d/whittaker.py +++ b/pybaselines/two_d/whittaker.py @@ -9,12 +9,11 @@ import warnings import numpy as np -from scipy.sparse import diags from .. import _weighting +from .. _compat import diags from ._algorithm_setup import _Algorithm2D from ._whittaker_utils import PenalizedSystem2D -from ..utils import ParameterWarning, relative_difference from ..utils import _MIN_FLOAT, ParameterWarning, relative_difference from .._validation import _check_optional_array diff --git a/pybaselines/utils.py b/pybaselines/utils.py index ae61a1a..cf944b9 100644 --- a/pybaselines/utils.py +++ b/pybaselines/utils.py @@ -712,7 +712,7 @@ def difference_matrix(data_size, diff_order=2, diff_format=None): Returns ------- - diff_matrix : scipy.sparse.base.spmatrix + diff_matrix : scipy.sparse.spmatrix or scipy.sparse._sparray The sparse difference matrix. 
Raises diff --git a/tests/test_algorithm_setup.py b/tests/test_algorithm_setup.py index e752622..003de3a 100644 --- a/tests/test_algorithm_setup.py +++ b/tests/test_algorithm_setup.py @@ -9,10 +9,10 @@ import numpy as np from numpy.testing import assert_allclose, assert_array_equal import pytest -from scipy.sparse import dia_matrix from pybaselines import _algorithm_setup, optimizers, polynomial, whittaker from pybaselines.utils import ParameterWarning +from pybaselines._compat import dia_object from .conftest import get_data @@ -48,7 +48,7 @@ def test_setup_whittaker_diff_matrix(small_data, algorithm, lam, diff_order, ) numpy_diff = np.diff(np.eye(small_data.shape[0]), diff_order, 0) - desired_diagonals = dia_matrix(lam * (numpy_diff.T @ numpy_diff)).data[::-1] + desired_diagonals = dia_object(lam * (numpy_diff.T @ numpy_diff)).data[::-1] if allow_lower and not algorithm.whittaker_system.using_pentapy: # only include the lower diagonals desired_diagonals = desired_diagonals[diff_order:] @@ -270,7 +270,7 @@ def test_setup_spline_diff_matrix(small_data, lam, diff_order, spline_degree, nu num_bases = num_knots + spline_degree - 1 numpy_diff = np.diff(np.eye(num_bases), diff_order, axis=0) - desired_diagonals = lam * dia_matrix(numpy_diff.T @ numpy_diff).data[::-1][diff_order:] + desired_diagonals = lam * dia_object(numpy_diff.T @ numpy_diff).data[::-1][diff_order:] if diff_order < spline_degree: padding = np.zeros((spline_degree - diff_order, desired_diagonals.shape[1])) desired_diagonals = np.concatenate((desired_diagonals, padding)) diff --git a/tests/test_banded_utils.py b/tests/test_banded_utils.py index d13fd1f..fbc2543 100644 --- a/tests/test_banded_utils.py +++ b/tests/test_banded_utils.py @@ -9,10 +9,10 @@ import numpy as np from numpy.testing import assert_allclose, assert_array_equal import pytest -from scipy.sparse import diags, identity, spdiags from scipy.sparse.linalg import spsolve from pybaselines import _banded_utils, _spline_utils +from pybaselines._compat import diags, dia_object, identity from .conftest import has_pentapy @@ -304,14 +304,14 @@ def test_add_diagonals(diff_order_1, diff_order_2, lower_only): a_offsets = np.arange(diff_order_1, -diff_order_1 - 1, -1) b_offsets = np.arange(diff_order_2, -diff_order_2 - 1, -1) - a_matrix = spdiags( - _banded_utils.diff_penalty_diagonals(points, diff_order_1, False), - a_offsets, points, points, 'csr' - ) - b_matrix = spdiags( - _banded_utils.diff_penalty_diagonals(points, diff_order_2, False), - b_offsets, points, points, 'csr' - ) + a_matrix = dia_object( + (_banded_utils.diff_penalty_diagonals(points, diff_order_1, False), a_offsets), + shape=(points, points) + ).tocsr() + b_matrix = dia_object( + (_banded_utils.diff_penalty_diagonals(points, diff_order_2, False), b_offsets), + shape=(points, points) + ).tocsr() expected_output = (a_matrix + b_matrix).todia().data[::-1] if lower_only: expected_output = expected_output[len(expected_output) // 2:] @@ -575,11 +575,10 @@ def test_penalized_system_solve(data_fixture, diff_order, allow_lower, allow_pen expected_penalty = _banded_utils.diff_penalty_diagonals( data_size, diff_order=diff_order, lower_only=False ) - sparse_penalty = spdiags( - lam * expected_penalty, np.arange(diff_order, -(diff_order + 1), -1), - data_size, data_size, 'csr' - - ) + sparse_penalty = dia_object( + (lam * expected_penalty, np.arange(diff_order, -(diff_order + 1), -1)), + shape=(data_size, data_size) + ).tocsr() expected_solution = spsolve(identity(data_size, format='csr') + sparse_penalty, y) 
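    # note: dia_object((bands, offsets), shape=(n, n)).tocsr() constructs the same
    # matrix as the removed spdiags(bands, offsets, n, n, 'csr') call did, so the
    # expected solution computed above is unaffected by the migration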
penalized_system = _banded_utils.PenalizedSystem(
@@ -718,9 +717,9 @@ def test_penalized_system_add_diagonal_after_penalty(data_size, diff_order, allo
     additional_penalty = _banded_utils.diff_penalty_diagonals(
         data_size, penalty_order, lower_only=False
     )
-    additional_penalty_matrix = spdiags(
-        additional_penalty, np.arange(penalty_order, -penalty_order - 1, -1), data_size,
-        data_size
+    additional_penalty_matrix = dia_object(
+        (additional_penalty, np.arange(penalty_order, -penalty_order - 1, -1)),
+        shape=(data_size, data_size)
     )
     total_penalty = penalty + additional_penalty_matrix
diff --git a/tests/test_classification.py b/tests/test_classification.py
index d6c9009..7dabe32 100644
--- a/tests/test_classification.py
+++ b/tests/test_classification.py
@@ -10,7 +10,6 @@
 from numpy.testing import assert_allclose, assert_array_equal
 import pytest
 import scipy
-from scipy.signal import cwt

 from pybaselines import classification
 from pybaselines.utils import ParameterWarning, whittaker_smooth
@@ -261,7 +260,7 @@ def test_haar_cwt_comparison_to_pywavelets(scale):
     y = np.zeros(100)
     y[50] = 1

-    haar_cwt = cwt(y, classification._haar, [scale])[0]
+    haar_cwt = classification._cwt(y, classification._haar, [scale])[0]
     # test absolute tolerance rather than relative tolerance since
     # some values are very close to 0
     assert_allclose(haar_cwt**2, PYWAVELETS_HAAR[scale]**2, 0, 1e-14)
diff --git a/tests/test_compat.py b/tests/test_compat.py
index 8f90ae4..adc588a 100644
--- a/tests/test_compat.py
+++ b/tests/test_compat.py
@@ -6,9 +6,13 @@

 """

-from numpy.testing import assert_array_equal
+from unittest import mock
+
+import numpy as np
+from numpy.testing import assert_array_equal, assert_allclose
 import pytest
-from scipy import integrate
+import scipy
+from scipy import integrate, sparse

 from pybaselines import _compat

@@ -145,8 +149,179 @@ def _add4(a, b):

 def test_trapezoid():
-    """Ensures the trapezoid integration function within scipy is correctly used."""
+    """
+    Ensures the trapezoid integration function within scipy is correctly used.
+
+    Rather than checking equality with the expected function, just check that
+    it works correctly.
+    """
+    data = [1., 2., 3.]
+    output = _compat.trapezoid(data)
+    assert_allclose(output, 4.0, rtol=0, atol=1e-14)
+
     if hasattr(integrate, 'trapezoid'):
-        assert _compat.trapezoid is integrate.trapezoid
+        comparison_func = integrate.trapezoid
+    else:
+        comparison_func = integrate.trapz
+
+    assert_allclose(output, comparison_func(data), rtol=0, atol=1e-14)
+
+
+def _scipy_below_1_12():
+    """
+    Checks whether the installed scipy version is too old to use sparse arrays.
+
+    This is the inverse of the check within ``pybaselines._compat._use_sparse_arrays``
+    and is wrapped into a function just in case it fails so that the test module can
+    still be imported without error; unlike the ``_compat`` version, the result is
+    not cached.
+
+    Returns
+    -------
+    bool
+        True if the installed scipy version is below 1.12; False otherwise.
+
+    Notes
+    -----
+    Scipy introduced its sparse arrays in version 1.8, but the interface and helper
+    functions were not stable until version 1.12; a warning will be emitted in scipy
+    1.13 when using the matrix interface, so we want to use the sparse array interface
+    as early as possible.
+
+    """
+    try:
+        _scipy_version = [int(val) for val in scipy.__version__.lstrip('v').split('.')[:2]]
+    except Exception:
+        # in case in the far future scipy stops using semantic versioning; probably
+        # bigger problems than this check at that point so just return False to
+        # mirror _compat._use_sparse_arrays returning True
+        return False
+
+    return not (_scipy_version[0] > 1 or (_scipy_version[0] == 1 and _scipy_version[1] >= 12))
+
+
+def test_use_sparse_arrays():
+    """
+    Ensures the scipy version check works correctly.
+
+    Use try-finally so that even if the test fails, the mocked values do
+    not remain, which would cause subsequent tests to fail.
+    """
+    try:
+        _compat._use_sparse_arrays.cache_clear()
+        # sanity check that cache was cleared
+        assert _compat._use_sparse_arrays.cache_info().currsize == 0
+        with mock.patch.object(scipy, '__version__', '0.1'):
+            assert not _compat._use_sparse_arrays()
+
+        _compat._use_sparse_arrays.cache_clear()
+        # sanity check that cache was cleared
+        assert _compat._use_sparse_arrays.cache_info().currsize == 0
+        with mock.patch.object(scipy, '__version__', '1.11'):
+            assert not _compat._use_sparse_arrays()
+
+        _compat._use_sparse_arrays.cache_clear()
+        # sanity check that cache was cleared
+        assert _compat._use_sparse_arrays.cache_info().currsize == 0
+        with mock.patch.object(scipy, '__version__', '1.12'):
+            assert _compat._use_sparse_arrays()
+
+        _compat._use_sparse_arrays.cache_clear()
+        # sanity check that cache was cleared
+        assert _compat._use_sparse_arrays.cache_info().currsize == 0
+        with mock.patch.object(scipy, '__version__', '2.0'):
+            assert _compat._use_sparse_arrays()
+
+        _compat._use_sparse_arrays.cache_clear()
+        # sanity check that cache was cleared
+        assert _compat._use_sparse_arrays.cache_info().currsize == 0
+        # check that it returns True when an error reading the scipy version occurs
+        with mock.patch.object(scipy, '__version__', 'abc'):
+            assert _compat._use_sparse_arrays()
+    finally:
+        _compat._use_sparse_arrays.cache_clear()
+        # ensure the cache is cleared so the correct value can be filled so the next call
+        # to it is correct
+        assert _compat._use_sparse_arrays.cache_info().currsize == 0
+
+
+def test_dia_object():
+    """Ensures the compatibility for dia_matrix and dia_array works as intended."""
+    data = np.array([
+        [1, 2, 0],
+        [4, 5, 6],
+        [0, 8, 9]
+    ])
+    offsets = [-1, 0, 1]
+    output = _compat.dia_object((data, offsets), shape=(3, 3))
+
+    expected_output = np.array([
+        [4, 8, 0],
+        [1, 5, 9],
+        [0, 2, 6]
+    ])
+
+    assert sparse.issparse(output)
+    assert output.format == 'dia'
+    assert_allclose(output.toarray(), expected_output, rtol=0, atol=1e-14)
+    if _scipy_below_1_12():
+        assert sparse.isspmatrix(output)
+    else:
+        assert not sparse.isspmatrix(output)
+
+
+def test_csr_object():
+    """Ensures the compatibility for csr_matrix and csr_array works as intended."""
+    row = np.array([0, 1, 1, 2])
+    col = np.array([0, 0, 2, 0])
+    data = np.array([3, 5, 7, 9])
+    output = _compat.csr_object((data, (row, col)), shape=(3, 3))
+
+    expected_output = np.array([
+        [3, 0, 0],
+        [5, 0, 7],
+        [9, 0, 0]
+    ])
+
+    assert sparse.issparse(output)
+    assert output.format == 'csr'
+    assert_allclose(output.toarray(), expected_output, rtol=0, atol=1e-14)
+    if _scipy_below_1_12():
+        assert sparse.isspmatrix(output)
+    else:
+        assert not sparse.isspmatrix(output)
+
+
+@pytest.mark.parametrize('sparse_format', ('csc', 'csr', 'dia'))
+@pytest.mark.parametrize('size', (1, 3, 6))
+def test_identity(size, sparse_format):
+    """Ensures the sparse identity function works correctly."""
+    output = _compat.identity(size, 
format=sparse_format) + + assert sparse.issparse(output) + assert output.format == sparse_format + assert_allclose(output.toarray(), np.eye(size), rtol=0, atol=1e-14) + if _scipy_below_1_12(): + assert sparse.isspmatrix(output) + else: + assert not sparse.isspmatrix(output) + + +@pytest.mark.parametrize('sparse_format', ('csc', 'csr', 'dia')) +def test_diags(sparse_format): + """Ensures the sparse diags function works as intended.""" + data = [-1, 2, 1] + offsets = [-1, 0, 1] + output = _compat.diags(data, offsets=offsets, shape=(3, 3), format=sparse_format) + + expected_output = np.array([ + [2, 1, 0], + [-1, 2, 1], + [0, -1, 2] + ]) + + assert sparse.issparse(output) + assert output.format == sparse_format + assert_allclose(output.toarray(), expected_output, rtol=0, atol=1e-14) + if _scipy_below_1_12(): + assert sparse.isspmatrix(output) else: - assert _compat.trapezoid is integrate.trapz + assert not sparse.isspmatrix(output) diff --git a/tests/test_misc.py b/tests/test_misc.py index 2b31b90..35759c0 100644 --- a/tests/test_misc.py +++ b/tests/test_misc.py @@ -11,9 +11,10 @@ import numpy as np from numpy.testing import assert_allclose, assert_array_equal import pytest -from scipy.sparse import dia_matrix, diags, spdiags, vstack +from scipy.sparse import vstack from pybaselines import _banded_utils, misc +from pybaselines._compat import dia_object, diags from .conftest import BaseTester, get_data @@ -162,7 +163,7 @@ def test_array_lam_fails(self): def test_banded_dot_vector(): """Ensures the dot product of a banded matrix and a vector is correct.""" # random, square, non-symmetric banded matrix - matrix_1 = dia_matrix(np.array([ + matrix_1 = dia_object(np.array([ [0, 1, 0, 0, 0], [1, 3, 4, 0, 0], [2, 4, 9, 8, 0], @@ -175,11 +176,11 @@ def test_banded_dot_vector(): banded_output_1 = misc._banded_dot_vector( bands_1, vector_1, (3, 1), matrix_1.shape ) - assert_array_equal(banded_output_1, matrix_1 * vector_1) + assert_array_equal(banded_output_1, matrix_1 @ vector_1) # random, square, symmetric banded matrix - matrix_2 = dia_matrix(np.array([ - [0, 1, 22, 0, 0, 0.0], + matrix_2 = dia_object(np.array([ + [0, 1., 22, 0, 0, 0.0], [1, 3, 4, 5, 0, 0], [22, 4, 9, 97, -3, 0], [0, 5, 97, -4, 19, 12], @@ -192,16 +193,16 @@ def test_banded_dot_vector(): banded_output_2 = misc._banded_dot_vector( bands_2, vector_2, (2, 2), matrix_2.shape ) - assert_allclose(banded_output_2, matrix_2 * vector_2, rtol=1e-11) + assert_allclose(banded_output_2, matrix_2 @ vector_2, rtol=1e-11) def test_banded_dot_banded(): """Ensures the dot product of two square banded matrices is correct.""" # random, square, non-symmetric banded matrix; tests that the number of upper and # lower diagonals in the output is capped by the shape of the matrix rather than the - # number of diagonals, since matrix_1 * matrix_1 would otherwise have more diagonals + # number of diagonals, since matrix_1 @ matrix_1 would otherwise have more diagonals # than allowed in the shape - matrix_1 = dia_matrix(np.array([ + matrix_1 = dia_object(np.array([ [0, 1, 0, 0, 0], [1, 3, 4, 0, 0], [2, 4, 9, 8, 0], @@ -210,14 +211,14 @@ def test_banded_dot_banded(): ])) bands_1 = matrix_1.todia().data[::-1] - actual_output_1 = (matrix_1 * matrix_1).todia().data[::-1] + actual_output_1 = (matrix_1 @ matrix_1).todia().data[::-1] banded_output_1 = misc._banded_dot_banded( bands_1, bands_1, (3, 1), (3, 1), matrix_1.shape, matrix_1.shape ) assert_array_equal(banded_output_1, actual_output_1) # random, square, symmetric banded matrix - matrix_2 = dia_matrix(np.array([ 
+ matrix_2 = dia_object(np.array([ [0, 1, 22, 0, 0, 0], [1, 3, 4, 5, 0, 0], [22, 4, 9, 97, -3, 0], @@ -227,13 +228,13 @@ def test_banded_dot_banded(): ])) bands_2 = matrix_2.todia().data[::-1] - actual_output_2 = (matrix_2 * matrix_2).todia().data[::-1] + actual_output_2 = (matrix_2 @ matrix_2).todia().data[::-1] banded_output_2 = misc._banded_dot_banded( bands_2, bands_2, (2, 2), (2, 2), matrix_2.shape, matrix_2.shape ) assert_array_equal(banded_output_2, actual_output_2) - # also test symmetric_output=True since matrix_2 * matrix_2 is also symmetric + # also test symmetric_output=True since matrix_2 @ matrix_2 is also symmetric banded_output_3 = misc._banded_dot_banded( bands_2, bands_2, (2, 2), (2, 2), matrix_2.shape, matrix_2.shape, True ) @@ -289,8 +290,8 @@ def test_high_pass_filter_simple(filter_type): [0., 4.58885438, 10.35541753, 27.53312629, 10.35541753], [0., 0., 4.58885438, 10.35541753, 27.53312629] ]) - desired_A_banded = dia_matrix(desired_A_full).data[::-1] - desired_B_banded = dia_matrix(desired_B_full).data[::-1] + desired_A_banded = dia_object(desired_A_full).data[::-1] + desired_B_banded = dia_object(desired_B_full).data[::-1] A_sparse, B_sparse = misc._high_pass_filter(num_points, freq_cutoff, filter_type, True) A_banded, B_banded = misc._high_pass_filter(num_points, freq_cutoff, filter_type, False) @@ -392,15 +393,15 @@ def beads_data(): @pytest.mark.parametrize('freq_cutoff', (0.49, 0.01, 0.001)) def test_beads_diff_matrix_calculation(beads_data, filter_type, freq_cutoff): """ - Check that the lam * (D.T * Lam * D) and A.T * M * A calculations are correct. + Check that the lam * (D.T @ Lam @ D) and A.T @ M @ A calculations are correct. D is the stacked first and second order difference matrices, Lam is a diagonal matrix, - and lam is a scalar. M is the output of Gamma + lam * (D.T * Lam * D), and can let + and lam is a scalar. M is the output of Gamma + lam * (D.T @ Lam @ D), and can let Gamma just be 0 for the test. - The actual calculation for D.T * Lam * D uses just the banded structure, which allows + The actual calculation for D.T @ Lam @ D uses just the banded structure, which allows using arrays rather than having to use and update three separate sparse matrices (the - full calculation is Gamma + D.T * Lam * D, where both Gamma and Lam are sparse matrices + full calculation is Gamma + D.T @ Lam @ D, where both Gamma and Lam are sparse matrices with one diagonal that gets updated each iteration), which is much faster and has no significant effect on memory. @@ -420,23 +421,23 @@ def test_beads_diff_matrix_calculation(beads_data, filter_type, freq_cutoff): d_y = np.concatenate((d1_y, d2_y)) diff_matrix = vstack((diff_1_matrix, diff_2_matrix)) # the full difference matrix, D - # D.T * diags(weight_function(derivative of y)) * D, + # D.T @ diags(weight_function(derivative of y)) @ D, # let weight_function(d_y) just return d_y since it doesn't matter. 
# the calculation as written in the beads paper (see docstring of beads function for reference)
     true_calculation = (
-        lam_1 * diff_1_matrix.T * diags(d1_y) * diff_1_matrix
-        + lam_2 * diff_2_matrix.T * diags(d2_y) * diff_2_matrix
+        lam_1 * diff_1_matrix.T @ diags(d1_y) @ diff_1_matrix
+        + lam_2 * diff_2_matrix.T @ diags(d2_y) @ diff_2_matrix
     )
     # the calculation as written in the MATLAB beads function, puts lam_1 and lam_2 within Lam
-    matlab_calculation = diff_matrix.T * diags(lam_12_array * d_y) * diff_matrix
+    matlab_calculation = diff_matrix.T @ diags(lam_12_array * d_y) @ diff_matrix

     assert_allclose(true_calculation.toarray(), matlab_calculation.toarray())

     # now do the same calculation, using the banded matrices
     diff_1_banded = np.zeros((5, num_points))
     diff_2_banded = np.zeros((5, num_points))
-    # D.T * L * D == D_1.T * L_1 * D_1 + D_2.T * L_2 + D_2, so can calculate the
+    # D.T @ L @ D == D_1.T @ L_1 @ D_1 + D_2.T @ L_2 @ D_2, so can calculate the
     # individual differences separately
     d1_y_output, d2_y_output = misc._abs_diff(y)
     diff_1_banded[1][1:] = diff_1_banded[3][:-1] = -d1_y_output
@@ -456,15 +457,17 @@

     assert_allclose(matlab_calculation.todia().data[::-1], banded_calculation)

-    # now test calculation of A.T * M * A where A is the D.T * Lam * D results
-    ATMA_actual = A.T * true_calculation * A
+    # now test calculation of A.T @ M @ A where A is the D.T @ Lam @ D results
+    ATMA_actual = A.T @ true_calculation @ A
     ATMA_actual_bands = ATMA_actual.todia().data[::-1]

-    sparse_DTD = spdiags(banded_calculation, np.arange(2, -3, -1), num_points, num_points)
+    sparse_DTD = dia_object(
+        (banded_calculation, np.arange(2, -3, -1)), shape=(num_points, num_points)
+    )

-    assert_allclose(ATMA_actual.toarray(), (A.T * sparse_DTD * A).toarray())
+    assert_allclose(ATMA_actual.toarray(), (A.T @ sparse_DTD @ A).toarray())
     # also check without transposing A since A is symmetric and that's what is used in pybaselines
-    assert_allclose(ATMA_actual.toarray(), (A * sparse_DTD * A).toarray())
+    assert_allclose(ATMA_actual.toarray(), (A @ sparse_DTD @ A).toarray())

     # now check banded result; banded calculation also uses A instead of A.T
     ATMA_banded = misc._banded_dot_banded(
@@ -493,7 +496,7 @@
 @pytest.mark.parametrize('freq_cutoff', (0.49, 0.01, 0.001))
 def test_beads_BTB(beads_data, filter_type, freq_cutoff):
     """
-    Check that B.T * B calculation is correct for sparse and banded matrices.
+    Check that B.T @ B calculation is correct for sparse and banded matrices.

     The calculation used in pybaselines does not use the transpose of B since it
     should be symmetric.
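
As an aside, the banded/sparse equivalence these beads tests rely on can be
sketched in a few standalone lines; this is only an illustration using scipy's
sparse array API directly (assuming scipy >= 1.12 for ``diags_array``), not the
test code itself:

    import numpy as np
    from scipy.sparse import diags_array

    n = 8
    # second order difference matrix D with shape (n - 2, n)
    D = diags_array([1., -2., 1.], offsets=[0, 1, 2], shape=(n - 2, n))
    weights = np.linspace(1., 2., n - 2)
    # the sparse route: the full D.T @ Lam @ D product with Lam = diag(weights)
    full = (D.T @ diags_array(weights) @ D).tocsr()
    # the banded route only needs the five central diagonals of that product
    banded = full.todia().data[::-1]
    assert banded.shape == (5, n)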
@@ -504,11 +507,11 @@
     A, B = misc._high_pass_filter(num_points, freq_cutoff, filter_type, True)
     A_banded, B_banded = misc._high_pass_filter(num_points, freq_cutoff, filter_type, False)

-    # check that B.T * B is the same as B * B since B is symmetric
-    actual_BTB = B.T * B
+    # check that B.T @ B is the same as B @ B since B is symmetric
+    actual_BTB = B.T @ B
     actual_BTB_banded = actual_BTB.todia().data[::-1]

-    assert_allclose(actual_BTB.toarray(), (B * B).toarray())
+    assert_allclose(actual_BTB.toarray(), (B @ B).toarray())

     banded_BTB = misc._banded_dot_banded(
         B_banded, B_banded, (filter_type, filter_type), (filter_type, filter_type),
@@ -531,7 +534,7 @@
 @pytest.mark.parametrize('freq_cutoff', (0.49, 0.01, 0.001))
 def test_beads_ATb(beads_data, filter_type, freq_cutoff):
     """
-    Check that the lam_0 * A.T * b calculation is correct.
+    Check that the lam_0 * A.T @ b calculation is correct.

     The calculation used in pybaselines does not use the transpose of A since it
     should be symmetric, and it puts lam_0 into b to skip a multiplication step.
@@ -544,11 +547,11 @@
     fill_value = -5
     b = np.full(num_points, fill_value)

-    # first just check A.T * b
-    ATb_actual = A.T * b
+    # first just check A.T @ b
+    ATb_actual = A.T @ b

     # check that the transpose is unnecessary since A is symmetric
-    assert_allclose(ATb_actual, A * b)
+    assert_allclose(ATb_actual, A @ b)

     # check the banded solution
     ATb_banded = misc._banded_dot_vector(
@@ -558,14 +561,14 @@
     # use rtol=1.5e-7 with an atol since values are very small for d=2 and small freq_cutoff
     assert_allclose(ATb_actual, ATb_banded, rtol=1.5e-7, atol=1e-14)

-    # now check lam_0 * A.T * b
-    lam_ATb_actual = lam_0 * A.T * b
+    # now check lam_0 * A.T @ b
+    lam_ATb_actual = lam_0 * A.T @ b

     # actual calculation places lam_0 in the vector so that an additional
     # multiplication step can be skipped
     b_2 = np.full(num_points, lam_0 * fill_value)

-    assert_allclose(lam_ATb_actual, A * b_2)
+    assert_allclose(lam_ATb_actual, A @ b_2)

     # check the banded solution
     lam_ATb_banded = misc._banded_dot_vector(
diff --git a/tests/test_spline.py b/tests/test_spline.py
index 2bb01ca..2d28df2 100644
--- a/tests/test_spline.py
+++ b/tests/test_spline.py
@@ -105,7 +105,7 @@ def test_mixture_pdf(fraction_pos, fraction_neg):
         + fraction_neg * neg_uniform
     )

-    assert_allclose(expected_pdf, output_pdf, 1e-12, 1e-12)
+    assert_allclose(expected_pdf, output_pdf, rtol=1e-12, atol=1e-12)

     # ensure pdf has an area of 1, ie total probability is 100%; accuracy is limited
     # by number of x-values
@@ -113,7 +113,7 @@ def test_mixture_pdf(fraction_pos, fraction_neg):
         trapezoid = integrate.trapezoid
     else:
         trapezoid = integrate.trapz
-    assert_allclose(1.0, trapezoid(output_pdf, x), 1e-3)
+    assert_allclose(1.0, trapezoid(output_pdf, x), rtol=1e-3, atol=1e-10)


 def compare_pspline_whittaker(pspline_class, whittaker_func, data, lam=1e5,
diff --git a/tests/test_spline_utils.py b/tests/test_spline_utils.py
index 85573bb..884b48e 100644
--- a/tests/test_spline_utils.py
+++ b/tests/test_spline_utils.py
@@ -12,10 +12,11 @@
 from numpy.testing import assert_allclose, assert_array_equal
 import pytest
 from scipy.interpolate import BSpline, splev
-from scipy.sparse import diags, issparse, spdiags
+from scipy.sparse import issparse
 from scipy.sparse.linalg import spsolve

 from pybaselines import 
_banded_utils, _spline_utils +from pybaselines._compat import diags, dia_object def _nieve_basis_matrix(x, knots, spline_degree): @@ -230,10 +231,10 @@ def test_solve_psplines(data_fixture, num_knots, spline_degree, diff_order, lowe basis = _spline_utils._spline_basis(x, knots, spline_degree) num_bases = basis.shape[1] penalty = _banded_utils.diff_penalty_diagonals(num_bases, diff_order, lower_only) - penalty_matrix = spdiags( - _banded_utils.diff_penalty_diagonals(num_bases, diff_order, False), - np.arange(diff_order, -(diff_order + 1), -1), num_bases, num_bases, 'csr' - ) + penalty_matrix = dia_object( + (_banded_utils.diff_penalty_diagonals(num_bases, diff_order, False), + np.arange(diff_order, -(diff_order + 1), -1)), shape=(num_bases, num_bases) + ).tocsr() expected_coeffs = spsolve( basis.T @ diags(weights, format='csr') @ basis + penalty_matrix, diff --git a/tests/test_utils.py b/tests/test_utils.py index d5f06d5..2ec23c2 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -10,10 +10,10 @@ from numpy.testing import assert_allclose, assert_array_equal import pytest from scipy.interpolate import BSpline -from scipy.sparse import diags, identity, spdiags from scipy.sparse.linalg import spsolve from pybaselines import _banded_utils, _spline_utils, utils +from pybaselines._compat import diags, dia_object, identity from .conftest import gaussian @@ -795,10 +795,10 @@ def test_pspline_smooth(data_fixture, diff_order, num_knots, spline_degree): knots = _spline_utils._spline_knots(x, num_knots, spline_degree, True) basis = _spline_utils._spline_basis(x, knots, spline_degree) num_bases = basis.shape[1] - penalty_matrix = spdiags( - _banded_utils.diff_penalty_diagonals(num_bases, diff_order, lower_only=False), - np.arange(diff_order, -(diff_order + 1), -1), num_bases, num_bases, 'csr' - ) + penalty_matrix = dia_object( + (_banded_utils.diff_penalty_diagonals(num_bases, diff_order, lower_only=False), + np.arange(diff_order, -(diff_order + 1), -1)), shape=(num_bases, num_bases) + ).tocsr() weights = diags(np.ones(len_y), format='csr') # solve the simple case for all weights are 1 diff --git a/tests/test_whittaker.py b/tests/test_whittaker.py index 8e9400f..28297b2 100644 --- a/tests/test_whittaker.py +++ b/tests/test_whittaker.py @@ -11,9 +11,9 @@ import numpy as np from numpy.testing import assert_allclose import pytest -from scipy.sparse import diags from pybaselines import _banded_utils, whittaker +from pybaselines._compat import diags from pybaselines.utils import ParameterWarning from .conftest import BaseTester, InputWeightsMixin, has_pentapy diff --git a/tests/two_d/test_algorithm_setup.py b/tests/two_d/test_algorithm_setup.py index 3da7060..b61d398 100644 --- a/tests/two_d/test_algorithm_setup.py +++ b/tests/two_d/test_algorithm_setup.py @@ -9,8 +9,9 @@ import numpy as np from numpy.testing import assert_allclose, assert_array_equal import pytest -from scipy.sparse import identity, kron +from scipy.sparse import kron +from pybaselines._compat import identity from pybaselines.two_d import _algorithm_setup, optimizers, polynomial, whittaker from pybaselines.utils import ParameterWarning, difference_matrix diff --git a/tests/two_d/test_spline_utils.py b/tests/two_d/test_spline_utils.py index c2038a9..9a974bb 100644 --- a/tests/two_d/test_spline_utils.py +++ b/tests/two_d/test_spline_utils.py @@ -9,11 +9,12 @@ import numpy as np from numpy.testing import assert_allclose, assert_array_equal import pytest -from scipy.sparse import identity, issparse, kron +from scipy.sparse import 
issparse, kron from scipy.sparse.linalg import spsolve from pybaselines.two_d import _spline_utils from pybaselines.utils import difference_matrix +from pybaselines._compat import identity from ..conftest import get_2dspline_inputs diff --git a/tests/two_d/test_whittaker_utils.py b/tests/two_d/test_whittaker_utils.py index 145c1f6..3e20af9 100644 --- a/tests/two_d/test_whittaker_utils.py +++ b/tests/two_d/test_whittaker_utils.py @@ -9,9 +9,10 @@ import numpy as np from numpy.testing import assert_allclose, assert_array_equal import pytest -from scipy.sparse import identity, issparse, kron +from scipy.sparse import issparse, kron from scipy.sparse.linalg import spsolve +from pybaselines._compat import identity from pybaselines.two_d import _spline_utils, _whittaker_utils from pybaselines.utils import difference_matrix From 4cb72cbf714a7d15f9c6c48d32fc1610b932ee91 Mon Sep 17 00:00:00 2001 From: Donnie Erb <55961724+derb12@users.noreply.github.com> Date: Sat, 10 Feb 2024 17:32:20 -0500 Subject: [PATCH 49/56] MAINT: Move all metadata to pyproject and update CI Put all metadata into pyproject.toml, so setup.py and setup.cfg can be removed. Switched from flake8 to ruff and bump2version to bump-my-version. Will need to update pinned requirements once ready to release. Made a separate CI job for linting so that linting can fail but will at least show up now instead of ignoring. --- .bumpversion.cfg | 24 --- .github/PULL_REQUEST_TEMPLATE.md | 9 +- .github/workflows/python-test.yml | 40 ++--- .readthedocs.yaml | 2 +- MANIFEST.in | 17 --- docs/Makefile | 1 - docs/conf.py | 4 +- docs/contributing.rst | 22 +-- docs/make.bat | 1 - pyproject.toml | 160 +++++++++++++++++++- requirements/README.rst | 19 +++ requirements/requirements-development.txt | 8 +- requirements/requirements-documentation.txt | 3 +- setup.cfg | 79 ---------- setup.py | 23 --- 15 files changed, 222 insertions(+), 190 deletions(-) delete mode 100644 .bumpversion.cfg delete mode 100644 MANIFEST.in create mode 100644 requirements/README.rst delete mode 100644 setup.cfg delete mode 100644 setup.py diff --git a/.bumpversion.cfg b/.bumpversion.cfg deleted file mode 100644 index 7b002d2..0000000 --- a/.bumpversion.cfg +++ /dev/null @@ -1,24 +0,0 @@ -[bumpversion] -current_version = 1.0.0 -commit = False -tag = False - -[bumpversion:file:setup.cfg] -search = version = {current_version} -replace = version = {new_version} - -[bumpversion:file:pybaselines/__init__.py] -search = __version__ = '{current_version}' -replace = __version__ = '{new_version}' - -[bumpversion:file:docs/conf.py] -search = version = '{current_version}' -replace = version = '{new_version}' - -[bumpversion:file:CITATION.cff] -search = version: {current_version} -replace = version: {new_version} - -[bumpversion:file:docs/citing.rst] -search = version = {{{current_version}}} -replace = version = {{{new_version}}} diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 992bd03..fb02457 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -5,9 +5,8 @@ are required and/or what problems they fix. Link or add the issue number for any existing issues that the pull request solves. -Note that unsolicited pull requests will most likely be closed. 
-Please file an issue first, so that details can be discussed/finalized -before a pull request is created.--> +Note that it is preferred to file an issue first, so that details can be +discussed/finalized before a pull request is created.--> ### Type of Pull Request @@ -24,9 +23,9 @@ before a pull request is created.--> To run tests locally, type the following command within the pybaselines directory: pytest . -To lint files using flake8 to see if they pass PEP 8 standards and that +To lint files using ruff to see if they pass PEP 8 standards and that docstrings are okay, run the following command within the pybaselines -directory: flake8 . +directory: ruff check . To build documentation locally, type the following command within the docs directory: make html diff --git a/.github/workflows/python-test.yml b/.github/workflows/python-test.yml index 5857446..3e43282 100644 --- a/.github/workflows/python-test.yml +++ b/.github/workflows/python-test.yml @@ -46,32 +46,16 @@ jobs: python -m pip install --upgrade pip python -m pip install "numpy>=1.18" "scipy>=1.5" pytest - # Only lint a single version; pick a recent, stable version - - name: Install linting dependencies - id: install-linters - if: matrix.python-version == '3.10' - run: | - python -m pip install flake8 flake8-comprehensions flake8-docstrings - - - name: Lint - if: steps.install-linters.outcome == 'success' - run: | - # stop the build if there are Python syntax errors or undefined names - flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics - # exit-zero treats all errors as warnings. - flake8 . --count --exit-zero --statistics - - name: Test with required dependencies run: pytest . - name: Install optional dependencies id: install-optional - # uncomment below in case this step ever needs skipped again + # uncomment below to allow skipping future versions #if: matrix.python-version != '3.13' run: python -m pip install "pentapy>=1.0" "numba>=0.49" - name: Test with optional dependencies - # uncomment below in case this step ever needs skipped again if: steps.install-optional.outcome == 'success' run: pytest . @@ -105,3 +89,25 @@ jobs: - name: Test with minimum optional dependencies run: pytest . + + lint: + + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ['3.11'] + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install linting dependencies + run: python -m pip install ruff + + - name: Lint + run: ruff check . 
--show-source
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
index ae4b173..1433d70 100644
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -8,7 +8,7 @@ version: 2
 build:
   os: ubuntu-22.04
   tools:
-    python: "3.9"
+    python: "3.11"

 # Path to sphinx's configuration file
 sphinx:
diff --git a/MANIFEST.in b/MANIFEST.in
deleted file mode 100644
index e126a92..0000000
--- a/MANIFEST.in
+++ /dev/null
@@ -1,17 +0,0 @@
-include LICENSE.txt
-include README.rst
-include LICENSES_bundled.txt
-include CHANGELOG.rst
-
-exclude .*
-
-recursive-exclude .git *
-recursive-exclude .github *
-recursive-exclude .pytest_cache *
-recursive-exclude docs *
-recursive-exclude examples *
-recursive-exclude tests *
-recursive-exclude requirements *
-recursive-exclude tools *
-recursive-exclude * __pycache__
-recursive-exclude * *.py[cod]
\ No newline at end of file
diff --git a/docs/Makefile b/docs/Makefile
index dcf792f..9410b27 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -4,7 +4,6 @@
 # You can set these variables from the command line.
 SPHINXOPTS    =
 SPHINXBUILD   = python -msphinx
-SPHINXPROJ    = mcetl
 SOURCEDIR     = .
 BUILDDIR      = _build

diff --git a/docs/conf.py b/docs/conf.py
index 523555d..669db3e 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -33,10 +33,10 @@
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
 extensions = [
     #'sphinx.ext.autodoc',
-    #'sphinx.ext.autosummary',
+    #'sphinx.ext.autosummary',  # use autoapi instead of autodoc and autosummary
     'autoapi.extension',
     'sphinx.ext.intersphinx',
-    #'sphinx.ext.napoleon',
+    #'sphinx.ext.napoleon',  # use numpydoc instead
     'numpydoc',
     'sphinx.ext.todo',
     'sphinx.ext.mathjax',
diff --git a/docs/contributing.rst b/docs/contributing.rst
index ea6bb37..dc946d2 100644
--- a/docs/contributing.rst
+++ b/docs/contributing.rst
@@ -23,8 +23,7 @@ If you are proposing a feature:

 Pull Requests
 ~~~~~~~~~~~~~

-Pull requests are welcomed for this project, but please note that
-unsolicited pull requests are discouraged. Please file an issue first,
+Pull requests are welcomed for this project. Generally, it is preferred to file an issue first,
 so that details can be discussed/finalized before a pull request is created.

 Any new code or documentation must be able to be covered by the BSD 3-clause license
@@ -35,18 +34,21 @@ When submitting a pull request, follow similar procedures for a feature request,

 * Explain in detail how it works.
 * Keep the scope as narrow as possible to make it easier to incorporate.

+The following sections will detail how to set up a development environment for contributing
+code to pybaselines and all of the potential checks to run.

-Set Up Development Environment
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-To clone the GitHub repository and install the necessary libraries for development:
+Setting Up Development Environment
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+To clone the GitHub repository and install the necessary libraries for development,
+ensure `git `_ is installed and then run:

 .. code-block:: console

     git clone https://github.com/derb12/pybaselines.git
     cd pybaselines
-    pip install -r requirements/requirements-development.txt
-    pip install -e .
+    pip install -e .[dev]

 All sections below assume the above commands were run.

@@ -65,13 +67,13 @@ terminal while in the pybaselines directory:

 .. code-block:: console

-    flake8 . --statistics
+    ruff check .


 Testing
 ^^^^^^^

-If implementing a new feature, please add any necessary tests. To check that tests pass
+If adding new code, please add any necessary tests.
To check that tests pass locally, run the following command in the terminal while in the pybaselines directory: .. code-block:: console @@ -103,7 +105,7 @@ Documentation If submitting changes to the documentation or adding documentation for a new feature/algorithm, please ensure the documentation builds locally by running the following command while in the -docs directory: +``pybaselines/docs`` directory: .. code-block:: console diff --git a/docs/make.bat b/docs/make.bat index ef0cc78..7d2f9a1 100644 --- a/docs/make.bat +++ b/docs/make.bat @@ -9,7 +9,6 @@ if "%SPHINXBUILD%" == "" ( ) set SOURCEDIR=. set BUILDDIR=_build -set SPHINXPROJ=mcetl if "%1" == "" goto help diff --git a/pyproject.toml b/pyproject.toml index 7b4d62a..01a204a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,10 +1,90 @@ [build-system] -# setuptools v42.0.0 was first version to allow multiple license files -# (the license_files metadata field in setup.cfg). Also covers pep-517 -# and pep-518 since support was added to setuptools in v40.8.0. -requires = ["setuptools>=42", "wheel"] +# setuptools v61.2.0 was first version to allow using the project section to specify metadata. +requires = ["setuptools>=61.2"] build-backend = "setuptools.build_meta" +[project] +name = "pybaselines" +version = "1.0.0" +authors = [ + {name = "Donald Erb", email = "donnie.erb@gmail.com"}, +] +description = "A library of algorithms for the baseline correction of experimental data." +readme = "README.rst" +license = {file = "LICENSE.txt"} +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Science/Research", + "Intended Audience :: Developers", + "License :: OSI Approved :: BSD License", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Topic :: Scientific/Engineering", + "Topic :: Scientific/Engineering :: Chemistry", + "Topic :: Scientific/Engineering :: Information Analysis", + "Topic :: Scientific/Engineering :: Physics", +] +keywords = [ + "materials characterization", + "materials science", + "baseline", + "background", + "baseline correction", + "baseline subtraction", + "chemistry", + "spectroscopy", + "raman", +] +requires-python = ">=3.8" +dependencies = [ + # lowest versions supported for python 3.8 + "numpy>=1.18", + "scipy>=1.5", +] + +[project.urls] +Homepage = "https://github.com/derb12/pybaselines" +Documentation = "https://pybaselines.readthedocs.io" + +[project.optional-dependencies] +full = [ + "pentapy>=1.0", # first version with PTRANS-II solver and MIT license + "numba>=0.49", # first to allow usage with python 3.8 +] +test = [ + "pytest", + "ruff", +] +docs = [ + "sphinx", + "sphinx-rtd-theme", + "sphinx-autoapi", + "sphinx-gallery", + "matplotlib", + "numpydoc", +] +release = [ + "build", + "bump-my-version", + "twine", +] +dev = ["pybaselines[full, docs, test, release]"] + +[tool.setuptools] +# TODO license-files usage may change in the future once PEP 639 is accepted +license-files = [ + "LICENSE.txt", + "LICENSES_bundled.txt", +] + +[tool.setuptools.packages.find] +include = ["pybaselines", "pybaselines.*"] + [tool.isort] skip = "pybaselines/__init__.py" skip_glob = ["docs/*"] @@ -15,3 +95,75 @@ multi_line_output = 5 src_paths = ["pybaselines", "tests"] # example_helpers are locally used in doc examples known_local_folder = 
["example_helpers"] + +[tool.ruff] +exclude = ["docs/*"] +ignore = [ + "D401", # D401 first line should be in imperative mood; try rephrasing + "E731", # E731 do not assign a lambda expression, use a def +] +line-length = 100 +task-tags = ["TODO"] +select = [ + "B", # flake8-bugbear + "D", + "E", # pycodestyle errors + "F", # pyflakes + #"I", # isort + "W", # pycodestyle warnings +] + +[tool.ruff.lint.pycodestyle] +ignore-overlong-task-comments = true + +[tool.ruff.pydocstyle] +convention = "numpy" + +[tool.ruff.per-file-ignores] +"__init__.py" = [ + "F401", # F401: module imported but unused + "D205", # D205: 1 blank line required between summary line and description + +] +"examples/*" = [ + "B007", # B007: Loop control variable `name` not used within loop body; want to be explicit in examples + "D205", # D205: 1 blank line required between summary line and description + "D400", # D400: first line should end with a period +] +"tests/*" = [ + "F841", # F841: Local variable 'name' is assigned to but never used; want to be explicit within tests +] + +[tool.ruff.lint] +preview = true # for using experimental rules + +[tool.bumpversion] +current_version = "1.0.0" +commit = false +tag = false +message = "Bump version: {current_version} -> {new_version}" + +[[tool.bumpversion.files]] +filename = "pyproject.toml" +search = "version = \"{current_version}\"" +replace = "version = \"{new_version}\"" + +[[tool.bumpversion.files]] +filename = "pybaselines/__init__.py" +search = "__version__ = '{current_version}'" +replace = "__version__ = '{new_version}'" + +[[tool.bumpversion.files]] +filename = "docs/conf.py" +search = "version = '{current_version}'" +replace = "version = '{new_version}'" + +[[tool.bumpversion.files]] +filename = "CITATION.cff" +search = "version: {current_version}" +replace = "version: {new_version}" + +[[tool.bumpversion.files]] +filename = "docs/citing.rst" +search = "version = {{{current_version}}}" +replace = "version = {{{new_version}}}" diff --git a/requirements/README.rst b/requirements/README.rst new file mode 100644 index 0000000..e6f0366 --- /dev/null +++ b/requirements/README.rst @@ -0,0 +1,19 @@ +=================== +Pinned Requirements +=================== + +The requirements in this folder are pinned to specific versions to allow recreating +a specific build. This is useful in cases such as building documentation on readthedocs +or for debugging since this specific build is known to work on both Windows and Linux +with python 3.11. + +If you only want to install all of the development dependencies for pybaselines, it is +recommended to instead use: + +.. code-block:: console + + pip install pybaselines[dev] + +or install an editable version by following the +`installation guide `_ +in the documentation. 
diff --git a/requirements/requirements-development.txt b/requirements/requirements-development.txt index 982c9e1..db6939b 100644 --- a/requirements/requirements-development.txt +++ b/requirements/requirements-development.txt @@ -2,14 +2,12 @@ -r requirements-documentation.txt # for linting -flake8==6.0.0 -flake8-comprehensions==3.7.0 -flake8-docstrings==1.6.0 +ruff # for testing pytest==6.2.5 # for creating releases -bump2version==1.0.1 +bump-my-version twine==3.6.0 -wheel==0.37.0 +build diff --git a/requirements/requirements-documentation.txt b/requirements/requirements-documentation.txt index a7df4af..bc1fe50 100644 --- a/requirements/requirements-documentation.txt +++ b/requirements/requirements-documentation.txt @@ -8,4 +8,5 @@ sphinx-autoapi==1.8.4 sphinx-gallery==0.10.1 matplotlib==3.3.3 pentapy==1.1.2 -numba==0.54.1 +numba==0.56.0 +numpydoc \ No newline at end of file diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 0cfcf05..0000000 --- a/setup.cfg +++ /dev/null @@ -1,79 +0,0 @@ -[metadata] -name = pybaselines -version = 1.0.0 -author = Donald Erb -author_email = donnie.erb@gmail.com -description = A library of algorithms for the baseline correction of experimental data. -long_description = file: README.rst -long_description_content_type = text/x-rst -license = BSD-3-Clause -license_files = - LICENSE.txt - LICENSES_bundled.txt -classifiers = - Development Status :: 5 - Production/Stable - Intended Audience :: Science/Research - Intended Audience :: Developers - License :: OSI Approved :: BSD License - Operating System :: OS Independent - Programming Language :: Python :: 3 - Programming Language :: Python :: 3.8 - Programming Language :: Python :: 3.9 - Programming Language :: Python :: 3.10 - Programming Language :: Python :: 3.11 - Programming Language :: Python :: 3.12 - Topic :: Scientific/Engineering - Topic :: Scientific/Engineering :: Chemistry - Topic :: Scientific/Engineering :: Information Analysis - Topic :: Scientific/Engineering :: Physics -keywords = - materials characterization - materials science - baseline - background - baseline correction - baseline subtraction - chemistry - spectroscopy -url = https://github.com/derb12/pybaselines -project_urls = - Source Code = https://github.com/derb12/pybaselines - Documentation = https://pybaselines.readthedocs.io - -[options] -packages = find: -include_package_data = True -python_requires = >=3.8 -install_requires = - numpy>=1.18 - scipy>=1.5 -zip_safe = False - -[options.extras_require] -full = - pentapy>=1.0 - numba>=0.49 - -[options.packages.find] -include = pybaselines, pybaselines.* - -[flake8] -max-line-length = 100 -docstring-convention = numpy -exclude = - docs/* -ignore = - # E731 do not assign a lambda expression, use a def - # W503 line break before binary operator - # W504 line break after binary operator - # D401 first line should be in imperative mood; try rephrasing - E731, - W503, - W504, - D401 -per-file-ignores = - # F401: module imported but unused - # D205: 1 blank line required between summary line and description - __init__.py: F401, D205 - # D400: first line should end with a period - examples/*: D205, D400 diff --git a/setup.py b/setup.py deleted file mode 100644 index 11cc841..0000000 --- a/setup.py +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -"""The setup script. - -All metadata now exists in setup.cfg. setup.py is now only needed to allow -for editable installs when using older versions of pip. 
- - -Notes on minimum required versions for dependencies: - -numpy: >= 1.17 in order to use numpy.random.default_rng -scipy: >= 1.0 to use the blas function gbmv for banded matrix-vector dot product -pentapy: >= 1.0 to use solver #2 -numba: >= 0.45 in order to cache jit-ed functions with parallel=True - -""" - -from setuptools import setup - - -if __name__ == '__main__': - - setup() From 6d23edc1ab261ff1175f9d56a2de67f2b45f6cd3 Mon Sep 17 00:00:00 2001 From: Donnie Erb <55961724+derb12@users.noreply.github.com> Date: Sat, 10 Feb 2024 18:05:56 -0500 Subject: [PATCH 50/56] TEST: Add CI for testing against nightly numpy and scipy Ensures that any new changes in numpy or scipy will be caught early. Fixed one last place where numpy.trapz was used. Side note: pybaselines works with numpy 2.0, which is a huge relief. --- .github/workflows/python-test-latest.yml | 54 ++++++++++++++++++++++++ tests/test_spline.py | 7 +-- tests/two_d/test_spline.py | 5 ++- 3 files changed, 58 insertions(+), 8 deletions(-) create mode 100644 .github/workflows/python-test-latest.yml diff --git a/.github/workflows/python-test-latest.yml b/.github/workflows/python-test-latest.yml new file mode 100644 index 0000000..d62bcee --- /dev/null +++ b/.github/workflows/python-test-latest.yml @@ -0,0 +1,54 @@ +# For testing the nightly builds of numpy and scipy so that any new changes will not be +# a surprise. + +# Will only trigger if there is a change within pybaselines or tests directories. + +name: test-latest-dependencies + +on: + # allow manually activating the workflow + workflow_dispatch: + + push: + branches: [ main ] + paths: + - 'pybaselines/**' + - 'tests/**' + - '.github/workflows/**' + + pull_request: + # always trigger on a pull request, regardless of the branch + paths: + - 'pybaselines/**' + - 'tests/**' + - '.github/workflows/**' + +jobs: + test-nightly: + + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + # Choose the latest stable python version + python-version: ['3.12'] + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install required dependencies + run: | + python -m pip install --upgrade pip + python -m pip install pytest + python -m pip install -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy + python -m pip install -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple scipy + + - name: Test with required dependencies + # use -Werror so that any warnings will show up as errors -> want to be as stringent + # as possible + run: python -Werror -m pytest . 
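When a failure from this nightly workflow needs to be debugged, the same environment can be reproduced locally by mirroring the workflow steps above; a minimal sketch (assuming a shell with Python 3.12 available):

.. code-block:: console

    python -m pip install --upgrade pip
    python -m pip install pytest
    python -m pip install -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy
    python -m pip install -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple scipy
    python -Werror -m pytest .
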
diff --git a/tests/test_spline.py b/tests/test_spline.py
index 2d28df2..c701e3f 100644
--- a/tests/test_spline.py
+++ b/tests/test_spline.py
@@ -11,9 +11,9 @@
 import numpy as np
 from numpy.testing import assert_allclose, assert_array_equal
 import pytest
-from scipy import integrate

 from pybaselines import _banded_utils, morphological, spline, utils, whittaker
+from pybaselines._compat import trapezoid

 from .conftest import BaseTester, InputWeightsMixin

@@ -108,11 +108,6 @@ def test_mixture_pdf(fraction_pos, fraction_neg):
     assert_allclose(expected_pdf, output_pdf, rtol=1e-12, atol=1e-12)
     # ensure pdf has an area of 1, ie total probability is 100%; accuracy is limited
     # by number of x-values
-
-    if hasattr(integrate, 'trapezoid'):
-        trapezoid = integrate.trapezoid
-    else:
-        trapezoid = integrate.trapz
     assert_allclose(1.0, trapezoid(output_pdf, x), rtol=1e-3, atol=1e-10)

diff --git a/tests/two_d/test_spline.py b/tests/two_d/test_spline.py
index f2aa74a..ed05ae9 100644
--- a/tests/two_d/test_spline.py
+++ b/tests/two_d/test_spline.py
@@ -14,6 +14,7 @@
 from pybaselines import utils
 from pybaselines.two_d import spline, whittaker
+from pybaselines._compat import trapezoid

 from ..conftest import BaseTester2D, InputWeightsMixin

@@ -105,10 +106,10 @@ def test_mixture_pdf(fraction_pos, fraction_neg):
         + fraction_neg * neg_uniform
     )

-    assert_allclose(expected_pdf, output_pdf, 1e-12, 1e-12)
+    assert_allclose(expected_pdf, output_pdf, rtol=1e-12, atol=1e-12)
     # ensure pdf has an area of 1, ie total probability is 100%; accuracy is limited
     # by number of x-values
-    assert_allclose(1.0, np.trapz(output_pdf, x), 1e-3)
+    assert_allclose(1.0, trapezoid(output_pdf, x), rtol=1e-3)


 def compare_pspline_whittaker(pspline_class, whittaker_func, data, lam=1e5,

From 9c0c7281efc40905fabdfb02889ac9e7e621eddc Mon Sep 17 00:00:00 2001
From: Donnie Erb <55961724+derb12@users.noreply.github.com>
Date: Sun, 11 Feb 2024 12:53:18 -0500
Subject: [PATCH 51/56] MAINT: Extend _check_scalar_variable to allow 2d
 inputs

Simplifies the checking of 2d variables. Also added checks to ensure
polynomial orders are never negative.

---
 pybaselines/_algorithm_setup.py       |  10 +-
 pybaselines/_validation.py            |  70 ++++++++------
 pybaselines/two_d/_algorithm_setup.py |  27 +++---
 pybaselines/two_d/_spline_utils.py    |  21 +++--
 pybaselines/two_d/_whittaker_utils.py |  59 +++++++++---
 tests/test_algorithm_setup.py         |  18 ++++
 tests/test_validation.py              | 128 +++++++++++++++++++++++++-
 tests/two_d/test_algorithm_setup.py   |  65 ++++++++++++-
 8 files changed, 331 insertions(+), 67 deletions(-)

diff --git a/pybaselines/_algorithm_setup.py b/pybaselines/_algorithm_setup.py
index 88105c3..0f9f247 100644
--- a/pybaselines/_algorithm_setup.py
+++ b/pybaselines/_algorithm_setup.py
@@ -23,7 +23,8 @@
 from ._banded_utils import PenalizedSystem
 from ._spline_utils import PSpline
 from ._validation import (
-    _check_array, _check_half_window, _check_optional_array, _check_sized_array, _yx_arrays
+    _check_array, _check_half_window, _check_optional_array, _check_scalar_variable,
+    _check_sized_array, _yx_arrays
 )
 from .utils import (
     ParameterWarning, _determine_sorts, _inverted_sort, _sort_array, optimize_window, pad_edges
@@ -208,8 +209,8 @@ def _register(cls, func=None, *, sort_keys=(), dtype=None, order=None, ensure_1d
         If True (default), will raise an error if the shape of `array` is not a one dimensional
         array with shape (N,) or a two dimensional array with shape (N, 1) or (1, N).
skip_sorting : bool, optional - If True, will skip sorting the inputs and outputs, which is useful for algorithms that use - other algorithms so that sorting is already internally done. Default is False. + If True, will skip sorting the inputs and outputs, which is useful for algorithms that + use other algorithms so that sorting is already internally done. Default is False. Returns ------- @@ -455,6 +456,9 @@ def _setup_polynomial(self, y, weights=None, poly_order=2, calc_vander=False, ) if self._sort_order is not None and weights is not None: weight_array = weight_array[self._sort_order] + poly_order = _check_scalar_variable( + poly_order, allow_zero=True, variable_name='polynomial order', dtype=int + ) if calc_vander: if self.vandermonde is None or poly_order > self.poly_order: diff --git a/pybaselines/_validation.py b/pybaselines/_validation.py index b322e2d..0ecb89e 100644 --- a/pybaselines/_validation.py +++ b/pybaselines/_validation.py @@ -71,25 +71,29 @@ def _check_scalar(data, desired_length, fill_scalar=False, coerce_0d=True, **asa return output, is_scalar -def _check_scalar_variable(value, allow_zero=False, variable_name='lam', **asarray_kwargs): +def _check_scalar_variable(value, allow_zero=False, variable_name='lam', two_d=False, + **asarray_kwargs): """ Ensures the input is a scalar value. Parameters ---------- - value : float or array-like + value : numpy.Number or array-like The value to check. allow_zero : bool, optional If False (default), only allows `value` > 0. If True, allows `value` >= 0. variable_name : str, optional The name displayed if an error occurs. Default is 'lam'. + two_d : bool, optional + If True, will output an array with two values. If False (default), will + return a single scalar value. **asarray_kwargs : dict Additional keyword arguments to pass to :func:`numpy.asarray`. Returns ------- - output : float - The verified scalar value. + output : numpy.Number or numpy.ndarray[numpy.Number, numpy.Number] + The verified scalar value(s). Raises ------ @@ -98,7 +102,13 @@ def _check_scalar_variable(value, allow_zero=False, variable_name='lam', **asarr less than 0 if `allow_zero` is True. """ - output = _check_scalar(value, 1, fill_scalar=False, **asarray_kwargs)[0] + if two_d: + desired_length = 2 + fill_scalar = True + else: + desired_length = 1 + fill_scalar = False + output = _check_scalar(value, desired_length, fill_scalar=fill_scalar, **asarray_kwargs)[0] if allow_zero: operation = np.less text = 'greater than or equal to' @@ -108,7 +118,6 @@ def _check_scalar_variable(value, allow_zero=False, variable_name='lam', **asarr if np.any(operation(output, 0)): raise ValueError(f'{variable_name} must be {text} 0') - # use an empty tuple to get the single scalar value return output @@ -132,6 +141,12 @@ def _check_array(array, dtype=None, order=None, check_finite=False, ensure_1d=Tr ensure_1d : bool, optional If True (default), will raise an error if the shape of `array` is not a one dimensional array with shape (N,) or a two dimensional array with shape (N, 1) or (1, N). + ensure_2d : bool, optional + If True, will raise an error if `array` is not a two dimensional array or a three + dimensional array with shape (M, N, 1), (1, M, N), or (M, 1, N). Default is False. + two_d : bool, optional + If True, will raise an error if the shape of `array` is not a two dimensional array with + shape (M, N) where M or N must be greater than 1. 
Returns ------- @@ -147,7 +162,8 @@ def _check_array(array, dtype=None, order=None, check_finite=False, ensure_1d=Tr Notes ----- If `ensure_1d` is True and `array` has a shape of (N, 1) or (1, N), it is reshaped to - (N,) for better compatibility for all functions. + (N,) for better compatibility for all functions. Likewise, `ensure_2d` will flatten to + (M, N). """ if check_finite: @@ -176,6 +192,8 @@ def _check_array(array, dtype=None, order=None, check_finite=False, ensure_1d=Tr output = output.reshape(output_shape[flat_dims]).shape elif dimensions != 2: raise ValueError('must be a two dimensional array') + elif ensure_2d and not two_d: + raise ValueError('two_d must be True if using ensure_2d') return output @@ -350,7 +368,7 @@ def _yxz_arrays(data, x_data=None, z_data=None, check_finite=False, dtype=None, return y, x, z -def _check_lam(lam, allow_zero=False, dtype=float): +def _check_lam(lam, allow_zero=False, two_d=False, dtype=float): """ Ensures the regularization parameter `lam` is a scalar greater than 0. @@ -361,13 +379,16 @@ def _check_lam(lam, allow_zero=False, dtype=float): penalized splines. allow_zero : bool If False (default), only allows `lam` values > 0. If True, allows `lam` >= 0. + two_d : bool, optional + If True, will output an array with two values. If False (default), will + return a single scalar value. dtype : type or numpy.dtype, optional The dtype to cast the lam value. Default is float. Returns ------- - float - The scalar `lam` value. + numpy.Number or numpy.ndarray[numpy.Number, numpy.Number] + The verified `lam` value(s). Raises ------ @@ -394,7 +415,7 @@ def _check_lam(lam, allow_zero=False, dtype=float): ``(diags(lam) @ D.T @ D).todia().data[::-1]``. """ - return _check_scalar_variable(lam, allow_zero, dtype=dtype) + return _check_scalar_variable(lam, allow_zero, two_d=two_d, variable_name='lam', dtype=dtype) def _check_half_window(half_window, allow_zero=False, two_d=False): @@ -410,11 +431,14 @@ def _check_half_window(half_window, allow_zero=False, two_d=False): allow_zero : bool, optional If True, allows `half_window` to be 0; otherwise, `half_window` must be at least 1. Default is False. + two_d : bool, optional + If True, will output an array with two values. If False (default), will + return a single scalar value. Returns ------- - output_half_window : int - The verified half-window value. + output_half_window : int or numpy.ndarray[int, int] + The verified half-window value(s). Raises ------ @@ -423,18 +447,11 @@ def _check_half_window(half_window, allow_zero=False, two_d=False): `half_window`. """ - if two_d: - output_half_window = _check_scalar( - half_window, 2, fill_scalar=True, dtype=np.intp - )[0] - for val in output_half_window: - _check_scalar_variable(val, allow_zero, 'half_window') - else: - output_half_window = _check_scalar_variable( - half_window, allow_zero, 'half_window', dtype=np.intp - ) - if output_half_window != half_window: - raise TypeError('half_window must be an integer') + output_half_window = _check_scalar_variable( + half_window, allow_zero, variable_name='half_window', two_d=two_d, dtype=np.intp + ) + if not two_d and output_half_window != half_window: + raise TypeError('half_window must be an integer') return output_half_window @@ -496,7 +513,8 @@ def _get_row_col_values(value, **asarray_kwargs): Parameters ---------- value : numpy.number or Sequence[numpy.number, ...] - _description_ + The value(s) corresponding to the first row, last row, first column, and last + column. 
Returns ------- diff --git a/pybaselines/two_d/_algorithm_setup.py b/pybaselines/two_d/_algorithm_setup.py index ff197bd..baa07c5 100644 --- a/pybaselines/two_d/_algorithm_setup.py +++ b/pybaselines/two_d/_algorithm_setup.py @@ -19,7 +19,7 @@ ) from ._spline_utils import PSpline2D from .._validation import ( - _check_array, _check_half_window, _check_optional_array, _check_scalar, _check_scalar_variable, + _check_array, _check_half_window, _check_optional_array, _check_scalar_variable, _check_sized_array, _yxz_arrays ) from ._whittaker_utils import WhittakerSystem2D @@ -432,12 +432,10 @@ def _setup_whittaker(self, y, lam=1, diff_order=2, weights=None, copy_weights=Fa Raised if `diff_order` is greater than 3. """ - diff_order = _check_scalar(diff_order, 2, True)[0] - if (diff_order < 1).any(): - raise ValueError( - 'the difference order must be > 0 for Whittaker-smoothing-based methods' - ) - elif (diff_order > 3).any(): + diff_order = _check_scalar_variable( + diff_order, allow_zero=False, variable_name='difference order', two_d=True, dtype=int + ) + if (diff_order > 3).any(): warnings.warn( ('difference orders greater than 3 can have numerical issues;' ' consider using a difference order of 2 or 1 instead'), @@ -450,7 +448,10 @@ def _setup_whittaker(self, y, lam=1, diff_order=2, weights=None, copy_weights=Fa if self._sort_order is not None and weights is not None: weight_array = weight_array[self._sort_order] - if self.whittaker_system is not None and self.whittaker_system.same_basis(diff_order, eigenvalues): + if ( + self.whittaker_system is not None + and self.whittaker_system.same_basis(diff_order, eigenvalues) + ): self.whittaker_system.update_penalty(lam) else: self.whittaker_system = WhittakerSystem2D( @@ -526,10 +527,12 @@ def _setup_polynomial(self, y, weights=None, poly_order=2, calc_vander=False, if self._sort_order is not None and weights is not None: weight_array = weight_array[self._sort_order] weight_array = weight_array.ravel() - poly_orders = _check_scalar(poly_order, 2, True)[0] + poly_orders = _check_scalar_variable( + poly_order, allow_zero=True, variable_name='polynomial order', two_d=True, dtype=int + ) if max_cross is not None: max_cross = _check_scalar_variable( - max_cross, allow_zero=True, variable_name='max_cross' + max_cross, allow_zero=True, variable_name='max_cross', dtype=int ) if calc_vander: if ( @@ -637,7 +640,9 @@ def _setup_spline(self, y, weights=None, spline_degree=3, num_knots=10, ) if self._sort_order is not None and weights is not None: weight_array = weight_array[self._sort_order] - diff_order = _check_scalar(diff_order, 2, True)[0] + diff_order = _check_scalar_variable( + diff_order, allow_zero=False, variable_name='difference order', two_d=True, dtype=int + ) if make_basis: if (diff_order > 4).any(): warnings.warn( diff --git a/pybaselines/two_d/_spline_utils.py b/pybaselines/two_d/_spline_utils.py index 4bba4ea..5a61836 100644 --- a/pybaselines/two_d/_spline_utils.py +++ b/pybaselines/two_d/_spline_utils.py @@ -12,7 +12,7 @@ from .._compat import csr_object from .._spline_utils import _spline_basis, _spline_knots -from .._validation import _check_array, _check_scalar +from .._validation import _check_array, _check_scalar_variable from ._whittaker_utils import PenalizedSystem2D @@ -109,11 +109,12 @@ def __init__(self, x, z, num_knots=100, spline_degree=3, check_finite=False, lam self.x = _check_array(x, dtype=float, check_finite=check_finite, ensure_1d=True) self.z = _check_array(z, dtype=float, check_finite=check_finite, ensure_1d=True) - 
self.num_knots = _check_scalar(num_knots, 2, True)[0] - self.spline_degree = _check_scalar(spline_degree, 2, True)[0] - - if (self.spline_degree < 0).any(): - raise ValueError('spline degree must be >= 0') + self.num_knots = _check_scalar_variable( + num_knots, allow_zero=False, variable_name='number of knots', two_d=True, dtype=int + ) + self.spline_degree = _check_scalar_variable( + spline_degree, allow_zero=True, variable_name='spline degree', two_d=True, dtype=int + ) self.knots_r = _spline_knots(self.x, self.num_knots[0], self.spline_degree[0], True) self.basis_r = _spline_basis(self.x, self.knots_r, self.spline_degree[0]) @@ -154,8 +155,12 @@ def same_basis(self, num_knots=100, spline_degree=3): """ # TODO should give a way to update only one of the basis functions, which # would also need to update the penalty - num_knots = _check_scalar(num_knots, 2, True)[0] - spline_degree = _check_scalar(spline_degree, 2, True)[0] + num_knots = _check_scalar_variable( + num_knots, allow_zero=False, variable_name='number of knots', two_d=True, dtype=int + ) + spline_degree = _check_scalar_variable( + spline_degree, allow_zero=True, variable_name='spline degree', two_d=True, dtype=int + ) return ( np.array_equal(num_knots, self.num_knots) diff --git a/pybaselines/two_d/_whittaker_utils.py b/pybaselines/two_d/_whittaker_utils.py index 322a482..55638cf 100644 --- a/pybaselines/two_d/_whittaker_utils.py +++ b/pybaselines/two_d/_whittaker_utils.py @@ -13,7 +13,7 @@ from .._banded_utils import diff_penalty_diagonals, diff_penalty_matrix from .._compat import identity -from .._validation import _check_lam, _check_scalar +from .._validation import _check_lam, _check_scalar_variable class PenalizedSystem2D: @@ -114,10 +114,10 @@ def reset_diagonals(self, lam=1, diff_order=2): Default is 2 (second order difference). 
""" - self.diff_order = _check_scalar(diff_order, 2, True)[0] - self.lam = [_check_lam(val) for val in _check_scalar(lam, 2, True)[0]] - if (self.diff_order < 1).any(): - raise ValueError('the difference order must be > 0') + self.diff_order = _check_scalar_variable( + diff_order, allow_zero=False, variable_name='difference order', two_d=True, dtype=int + ) + self.lam = _check_lam(lam, two_d=True) penalty_rows = diff_penalty_matrix(self._num_bases[0], self.diff_order[0]) penalty_columns = diff_penalty_matrix(self._num_bases[1], self.diff_order[1]) @@ -244,7 +244,6 @@ def __init__(self, data_size, lam=1, diff_order=2, max_eigens=None): self.coef = None self._basis = None self._num_points = data_size - self.diff_order = _check_scalar(diff_order, 2, True)[0] if max_eigens is None or None in max_eigens: self._num_bases = data_size self._using_svd = False @@ -252,7 +251,9 @@ def __init__(self, data_size, lam=1, diff_order=2, max_eigens=None): # TODO need to check to ensure max_eigens is <= data_size and otherwise emit # an error; if max_eigens is >~ 40 should emit an error saying too many # also check that it is greater than 0 or maybe 1 - self._num_bases = _check_scalar(max_eigens, 2, True, dtype=int)[0] + self._num_bases = _check_scalar_variable( + max_eigens, allow_zero=False, variable_name='eigenvalues', two_d=True, dtype=int + ) self._using_svd = True self.reset_diagonals(lam, diff_order) @@ -284,10 +285,10 @@ def reset_diagonals(self, lam=1, diff_order=2): super().reset_diagonals(lam, diff_order) return - self.lam = [_check_lam(val) for val in _check_scalar(lam, 2, True)[0]] - self.diff_order = _check_scalar(diff_order, 2, True)[0] - if (self.diff_order < 1).any(): - raise ValueError('the difference order must be > 0') + self.diff_order = _check_scalar_variable( + diff_order, allow_zero=False, variable_name='difference order', two_d=True, dtype=int + ) + self.lam = _check_lam(lam, two_d=True) # initially need num_bases to point to the data shape; maybe set a second # attribute insteaad @@ -317,6 +318,31 @@ def reset_diagonals(self, lam=1, diff_order=2): self.basis_c = vectors_columns def _calc_eigenvalues(self, data_points, diff_order, num_eigens): + """ + Calculate the eigenvalues and eigenvectors for the corresponding penalty matrix. + + Parameters + ---------- + data_points : _type_ + _description_ + diff_order : _type_ + _description_ + num_eigens : int + The + + Returns + ------- + eigenvalues : np.ndarray, shape (num_eigens,) + The eigenvalues of the penalty matrix for the corresponding difference order. + eigenvectors : np.ndarray, shape (data_points, num_eigens) + The eigenvectors for the penalty matrix. + + Notes + ----- + The penalty matrix has a matrix rank (number of nonzero eigenvalues) of + ``data_points - num_eigens``. + + """ # TODO the lowest diff_order eigenvalues should be zero, while they end up being # ~ +- 1e-15, will this affect any calculations or can it be left as it? If it does # need set to 0, do the eigenvectors likewise need updated for that? 
@@ -336,7 +362,7 @@ def _calc_eigenvalues(self, data_points, diff_order, num_eigens): def update_penalty(self, lam): if not self._using_svd: raise ValueError('Must call reset_diagonals if not using eigendecomposition') - lam = [_check_lam(val) for val in _check_scalar(lam, 2, True)[0]] + lam = _check_lam(lam, two_d=True) self.penalty_rows = (lam[0] / self.lam[0]) * self.penalty_rows self.penalty_columns = (lam[1] / self.lam[1]) * self.penalty_columns @@ -373,8 +399,13 @@ def same_basis(self, diff_order=2, max_eigens=None): if max_eigens is None or not self._using_svd: return False - max_eigens = _check_scalar(max_eigens, 2, True)[0] - diff_order = _check_scalar(diff_order, 2, True)[0] + diff_order = _check_scalar_variable( + diff_order, allow_zero=False, variable_name='difference order', two_d=True, dtype=int + ) + + max_eigens = _check_scalar_variable( + max_eigens, allow_zero=False, variable_name='eigenvalues', two_d=True, dtype=int + ) return ( np.array_equal(diff_order, self.diff_order) and np.array_equal(max_eigens, self._num_bases) diff --git a/tests/test_algorithm_setup.py b/tests/test_algorithm_setup.py index 003de3a..c2f4fa2 100644 --- a/tests/test_algorithm_setup.py +++ b/tests/test_algorithm_setup.py @@ -202,6 +202,24 @@ def test_setup_polynomial_vandermonde(small_data, algorithm, vander_enum, includ assert_allclose(desired_pinv, pinv_matrix, 1e-10) +def test_setup_polynomial_negative_polyorder_fails(small_data, algorithm): + """Ensures a negative poly_order raises an exception.""" + with pytest.raises(ValueError): + algorithm._setup_polynomial(small_data, poly_order=-1) + + +def test_setup_polynomial_too_large_polyorder_fails(small_data, algorithm): + """Ensures an exception is raised if poly_order has more than one value.""" + with pytest.raises(ValueError): + algorithm._setup_polynomial(small_data, poly_order=[1, 2]) + + with pytest.raises(ValueError): + algorithm._setup_polynomial(small_data, poly_order=[1, 2, 3]) + + with pytest.raises(ValueError): + algorithm._setup_polynomial(small_data, poly_order=np.array([1, 2])) + + def test_setup_smooth_shape(small_data, algorithm): """Ensures output y is correctly padded.""" pad_length = 4 diff --git a/tests/test_validation.py b/tests/test_validation.py index c0712c5..fd9caf1 100644 --- a/tests/test_validation.py +++ b/tests/test_validation.py @@ -209,11 +209,49 @@ def test_check_scalar_length_none(): _validation._check_scalar(data, desired_length=10000) +def test_check_scalar_variable_single(): + """Ensures _check_scalar_variable returns a float value for the simple 1d case.""" + value = 3.2 + + output = _validation._check_scalar_variable(value) + assert isinstance(output, float) + assert_allclose(output, value, rtol=0, atol=1e-14) + + output = _validation._check_scalar_variable([value]) + assert isinstance(output, float) + assert_allclose(output, value, rtol=0, atol=1e-14) + + output = _validation._check_scalar_variable(np.array([value])) + assert isinstance(output, float) + assert_allclose(output, value, rtol=0, atol=1e-14) + + +def test_check_scalar_variable_twod(): + """Ensures _check_scalar_variable returns a length 2 numpy array for the simple 2d case.""" + value = 3.2 + expected_output = np.array([value, value]) + + output = _validation._check_scalar_variable(value, two_d=True) + assert_allclose(output, expected_output, rtol=0, atol=1e-14) + + output = _validation._check_scalar_variable([value], two_d=True) + assert_allclose(output, expected_output, rtol=0, atol=1e-14) + + output = 
_validation._check_scalar_variable(np.array([value]), two_d=True)
+    assert_allclose(output, expected_output, rtol=0, atol=1e-14)
+
+    output = _validation._check_scalar_variable([value, value], two_d=True)
+    assert_allclose(output, expected_output, rtol=0, atol=1e-14)
+
+    output = _validation._check_scalar_variable(np.array([value, value]), two_d=True)
+    assert_allclose(output, expected_output, rtol=0, atol=1e-14)
+
+
 @pytest.mark.parametrize('lam', (5, [5], (5,), [[5]], np.array(5), np.array([5]), np.array([[5]])))
 def test_check_lam(lam):
     """Ensures scalar lam values are correctly processed."""
     output_lam = _validation._check_lam(lam)
-    assert output_lam == 5
+    assert_allclose(output_lam, 5, rtol=0, atol=1e-14)


 def test_check_lam_failures():
@@ -228,10 +266,50 @@ def test_check_lam_failures():
             _validation._check_lam(lam)

     # test that is allows zero if allow_zero is True
-    _validation._check_lam(0, True)
+    _validation._check_lam(0, allow_zero=True)
     for lam in range(-5, 0):
         with pytest.raises(ValueError):
-            _validation._check_lam(lam, True)
+            _validation._check_lam(lam, allow_zero=True)
+
+
+@pytest.mark.parametrize(
+    'lam', (
+        5, [5], (5,), [[5]], np.array(5), np.array([5]), np.array([[5]]),
+        [5, 5], np.array([5, 5])
+    )
+)
+def test_check_lam_twod(lam):
+    """Ensures scalar and two-item lam values are correctly processed for two dimensional inputs."""
+    output_lam = _validation._check_lam(lam, two_d=True)
+    assert_allclose(output_lam, np.array([5, 5]), rtol=0, atol=1e-14)
+
+
+def test_check_lam_twod_allow_zero():
+    """Ensures _check_lam allows zero for two dimensional inputs when allowed."""
+    expected_output = np.array([0, 0])
+
+    output = _validation._check_lam(0, allow_zero=True, two_d=True)
+    assert_allclose(output, expected_output, rtol=0, atol=1e-14)
+
+    output = _validation._check_lam([0, 0], allow_zero=True, two_d=True)
+    assert_allclose(output, expected_output, rtol=0, atol=1e-14)
+
+
+@pytest.mark.parametrize('allow_zero', (True, False))
+def test_check_lam_twod_negative_failures(allow_zero):
+    """Ensures _check_lam fails for negative values with two dimensional inputs."""
+    max_val = 0 if allow_zero else 1
+
+    # check scalar inputs
+    for lam in range(-5, max_val):
+        with pytest.raises(ValueError):
+            _validation._check_lam(lam, allow_zero=allow_zero, two_d=True)
+
+    # check array-like inputs
+    for lam_1 in range(-5, max_val):
+        for lam_2 in range(-5, max_val):
+            with pytest.raises(ValueError):
+                _validation._check_lam([lam_1, lam_2], allow_zero=allow_zero, two_d=True)


 @pytest.mark.parametrize(
@@ -259,12 +337,54 @@ def test_check_half_window_failures():
         _validation._check_half_window(0, True)
     for half_window in range(-5, 0):
         with pytest.raises(ValueError):
-            _validation._check_half_window(half_window, True)
+            _validation._check_half_window(half_window, allow_zero=True)

     # fails due to non-integer input
     with pytest.raises(TypeError):
         _validation._check_half_window(5.01)


+@pytest.mark.parametrize(
+    'half_window', (
+        5, 5.0, [5], (5,), [[5]], np.array(5), np.array([5]), np.array([[5]]),
+        np.array([5, 5]), [5, 5], [5.0, 5.0]
+    )
+)
+def test_check_half_window_twod(half_window):
+    """Ensures _check_half_window works for two dimensional inputs."""
+    output_half_window = _validation._check_half_window(half_window, two_d=True)
+    assert_allclose(output_half_window, np.array([5, 5], dtype=np.intp))
+    assert output_half_window.dtype == np.intp
+
+
+def test_check_half_window_twod_allow_zero():
+    """Ensures _check_half_window allows zero for two dimensional inputs when allowed."""
+    expected_output = np.array([0,
0], dtype=np.intp)
+
+    output = _validation._check_half_window(0, allow_zero=True, two_d=True)
+    assert_allclose(output, expected_output, rtol=0, atol=1e-14)
+
+    output = _validation._check_half_window([0, 0], allow_zero=True, two_d=True)
+    assert_allclose(output, expected_output, rtol=0, atol=1e-14)
+
+
+@pytest.mark.parametrize('allow_zero', (True, False))
+def test_check_half_window_twod_negative_failures(allow_zero):
+    """Ensures _check_half_window fails for negative values with two dimensional inputs."""
+    max_val = 0 if allow_zero else 1
+
+    # check scalar inputs
+    for half_window in range(-5, max_val):
+        with pytest.raises(ValueError):
+            _validation._check_half_window(half_window, allow_zero=allow_zero, two_d=True)
+
+    # check array-like inputs
+    for half_window_1 in range(-5, max_val):
+        for half_window_2 in range(-5, max_val):
+            with pytest.raises(ValueError):
+                _validation._check_half_window(
+                    [half_window_1, half_window_2], allow_zero=allow_zero, two_d=True
+                )


 @pytest.mark.parametrize('list_input', (True, False))
 def test_check_array_dtype(small_data, list_input):
diff --git a/tests/two_d/test_algorithm_setup.py b/tests/two_d/test_algorithm_setup.py
index b61d398..c472039 100644
--- a/tests/two_d/test_algorithm_setup.py
+++ b/tests/two_d/test_algorithm_setup.py
@@ -166,7 +166,7 @@ def test_setup_polynomial_wrong_weight_shape(small_data2d, algorithm):
         algorithm._setup_polynomial(small_data2d, weights=weights)


-@pytest.mark.parametrize('poly_order', (2, 4, (2, 4)))
+@pytest.mark.parametrize('poly_order', (0, 2, 4, (2, 4)))
 @pytest.mark.parametrize('vander_enum', (0, 1, 2, 3))
 @pytest.mark.parametrize('include_pinv', (True, False))
 def test_setup_polynomial_vandermonde(small_data2d, algorithm, vander_enum, include_pinv,
@@ -212,6 +212,69 @@ def test_setup_polynomial_vandermonde(small_data2d, algorithm, vander_enum, incl
         assert_allclose(desired_pinv, pinv_matrix, 1e-10)


+def test_setup_polynomial_negative_polyorder_fails(small_data2d, algorithm):
+    """Ensures a negative poly_order raises an exception."""
+    with pytest.raises(ValueError):
+        algorithm._setup_polynomial(small_data2d, poly_order=-1)
+
+    with pytest.raises(ValueError):
+        algorithm._setup_polynomial(small_data2d, poly_order=[1, -1])
+
+    with pytest.raises(ValueError):
+        algorithm._setup_polynomial(small_data2d, poly_order=[-1, 1])
+
+    with pytest.raises(ValueError):
+        algorithm._setup_polynomial(small_data2d, poly_order=[-1, -1])
+
+
+def test_setup_polynomial_too_large_polyorder_fails(small_data2d, algorithm):
+    """Ensures an exception is raised if poly_order has more than two values."""
+    with pytest.raises(ValueError):
+        algorithm._setup_polynomial(small_data2d, poly_order=[1, 2, 3])
+
+    with pytest.raises(ValueError):
+        algorithm._setup_polynomial(small_data2d, poly_order=[1, 2, 3, 4])
+
+    with pytest.raises(ValueError):
+        algorithm._setup_polynomial(small_data2d, poly_order=np.array([1, 2, 3]))
+
+
+def test_setup_polynomial_maxcross(small_data2d, algorithm):
+    """Ensures the _max_cross attribute is updated after calling _setup_polynomial."""
+    algorithm._setup_polynomial(small_data2d, max_cross=[1])
+    assert algorithm._max_cross == 1
+
+    algorithm._setup_polynomial(small_data2d, max_cross=1)
+    assert algorithm._max_cross == 1
+
+    algorithm._setup_polynomial(small_data2d, max_cross=0)
+    assert algorithm._max_cross == 0
+
+    algorithm._setup_polynomial(small_data2d, max_cross=None)
+    assert algorithm._max_cross is None
+
+
+def test_setup_polynomial_too_large_maxcross_fails(small_data2d, algorithm):
+    """Ensures an
exception is raised if max_cross has more than one value.""" + with pytest.raises(ValueError): + algorithm._setup_polynomial(small_data2d, max_cross=[1, 2]) + + with pytest.raises(ValueError): + algorithm._setup_polynomial(small_data2d, max_cross=[1, 2, 3]) + + with pytest.raises(ValueError): + algorithm._setup_polynomial(small_data2d, max_cross=np.array([1, 2])) + + +def test_setup_polynomial_negative_maxcross_fails(small_data2d, algorithm): + """Ensures an exception is raised if max_cross is negative.""" + with pytest.raises(ValueError): + algorithm._setup_polynomial(small_data2d, max_cross=[-1]) + + with pytest.raises(ValueError): + algorithm._setup_polynomial(small_data2d, max_cross=-2) + + def test_setup_smooth_shape(small_data2d, algorithm): """Ensures output y is correctly padded.""" pad_length = 4 From cb3e18acba21402f91454b0156f2a9d5912e809d Mon Sep 17 00:00:00 2001 From: Donnie Erb <55961724+derb12@users.noreply.github.com> Date: Sun, 11 Feb 2024 15:08:11 -0500 Subject: [PATCH 52/56] MAINT: Update contributor guide No longer mention editable installs in the contribution guide since it requires passing additional options to setuptools in order to work, which could be confusing for new contributors. Update min setuptools for building to allow editable installs, and update ruff settings. --- docs/contributing.rst | 2 +- pyproject.toml | 24 +++++++++++++----------- requirements/README.rst | 2 +- 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/docs/contributing.rst b/docs/contributing.rst index dc946d2..c200ee4 100644 --- a/docs/contributing.rst +++ b/docs/contributing.rst @@ -48,7 +48,7 @@ ensure `git `_ is installed and then run: git clone https://github.com/derb12/pybaselines.git cd pybaselines - pip install -e .[dev] + pip install .[dev] All sections below assume the above commands were ran. diff --git a/pyproject.toml b/pyproject.toml index 01a204a..df0085b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [build-system] -# setuptools v61.2.0 was first version to allow using the project section to specify metadata. 
-requires = ["setuptools>=61.2"] +# setuptools v64.0.0 was first version to allow creating editable installs with only pyproject.toml +requires = ["setuptools>=64"] build-backend = "setuptools.build_meta" [project] @@ -98,12 +98,11 @@ known_local_folder = ["example_helpers"] [tool.ruff] exclude = ["docs/*"] -ignore = [ - "D401", # D401 first line should be in imperative mood; try rephrasing - "E731", # E731 do not assign a lambda expression, use a def -] line-length = 100 -task-tags = ["TODO"] +fix = false + +[tool.ruff.lint] +preview = true # for using experimental rules select = [ "B", # flake8-bugbear "D", @@ -112,14 +111,19 @@ select = [ #"I", # isort "W", # pycodestyle warnings ] +ignore = [ + "D401", # D401 first line should be in imperative mood; try rephrasing + "E731", # E731 do not assign a lambda expression, use a def +] +task-tags = ["TODO"] [tool.ruff.lint.pycodestyle] ignore-overlong-task-comments = true -[tool.ruff.pydocstyle] +[tool.ruff.lint.pydocstyle] convention = "numpy" -[tool.ruff.per-file-ignores] +[tool.ruff.lint.per-file-ignores] "__init__.py" = [ "F401", # F401: module imported but unused "D205", # D205: 1 blank line required between summary line and description @@ -134,8 +138,6 @@ convention = "numpy" "F841", # F841: Local variable 'name' is assigned to but never used; want to be explicit within tests ] -[tool.ruff.lint] -preview = true # for using experimental rules [tool.bumpversion] current_version = "1.0.0" diff --git a/requirements/README.rst b/requirements/README.rst index e6f0366..350a77c 100644 --- a/requirements/README.rst +++ b/requirements/README.rst @@ -14,6 +14,6 @@ recommended to instead use: pip install pybaselines[dev] -or install an editable version by following the +or clone the repository by following the `installation guide `_ in the documentation. From e200d59b71df58eba2f6fae54abcf8b5a2156397 Mon Sep 17 00:00:00 2001 From: Donnie Erb <55961724+derb12@users.noreply.github.com> Date: Sun, 11 Feb 2024 19:11:53 -0500 Subject: [PATCH 53/56] DOCS: Finish the 2D algorithms section for whittaker and splines --- docs/algorithms/whittaker.rst | 3 +- docs/algorithms_2d/spline_2d.rst | 59 ++++++++++++++++++++++++++--- docs/algorithms_2d/whittaker_2d.rst | 48 ++++++++++++++++++++++- docs/contributing.rst | 3 +- 4 files changed, 105 insertions(+), 8 deletions(-) diff --git a/docs/algorithms/whittaker.rst b/docs/algorithms/whittaker.rst index f1ed4d6..513cf01 100644 --- a/docs/algorithms/whittaker.rst +++ b/docs/algorithms/whittaker.rst @@ -3,7 +3,8 @@ Whittaker Baselines =================== The contents of :mod:`pybaselines.whittaker` contain Whittaker-smoothing-based -algorithms for fitting the baseline. +algorithms for fitting the baseline. Note that Whittaker smoothing is often +also referred to as Whittaker-Henderson smoothing. Introduction ------------ diff --git a/docs/algorithms_2d/spline_2d.rst b/docs/algorithms_2d/spline_2d.rst index cb34386..a771657 100644 --- a/docs/algorithms_2d/spline_2d.rst +++ b/docs/algorithms_2d/spline_2d.rst @@ -7,14 +7,63 @@ Introduction The two dimensional extension of penalized splines (P-splines) for baseline correction within pybaselines follows the framework of Eilers, Currie, and Durbán -from `[1] `_. The exact equations will be -omitted here (those interested should read the paper, it is very good), but the end result -is that the normal equation for solving the penalized system can be expressed as a +from `[1] `_. 
+ +Let the number of rows be :math:`M` and the number of columns :math:`N` within the matrix +of measured data :math:`Y`. Note that :math:`y` is the flattened array of matrix :math:`Y` +with length :math:`M * N`. Let :math:`Y` be a function of :math:`x` along the rows and :math:`z` +along the columns, ie. :math:`Y_{ij} = f(x_i, z_j)`, and :math:`B_r(x_i)` and :math:`B_c(z_j)` represent +the spline basis matrices along the rows and columns, respectively, each with a number of +knots :math:`g` and `h`. Finally, let :math:`B = B_c \otimes B_r` denote the kronecker product of +the basis matrices for the columns and rows, which represents the overall two dimensional tensor +product spline basis. Analogous to the 1D case, the goal is to make the baseline, :math:`V` match the measured +data as well as it can while also penalizing the difference between spline coefficients, resulting +in the following minimization: + +.. math:: + + \sum\limits_{i}^M \sum\limits_{j}^N W_{ij} (Y_{ij} - \sum\limits_{k}^g \sum\limits_{l}^h B_{r,k}(x_i) B_{c,l}(z_j) c_{kl})^2 + + \lambda_r \sum\limits_{i}^{g - d_r} (c_{i\bullet} \Delta^{d_r})^2 + + \lambda_c \sum\limits_{j}^{h - d_c} (\Delta^{d_c} c_{j\bullet})^2 + +and + +.. math:: + + v = \sum\limits_{i}^g \sum\limits_{j}^h B_{r,i} B_{c,j} c_{ij} + + +where :math:`Y_{ij}` is the measured data, :math:`v` is the flattened estimated baseline, :math:`c` +is the matrix of spline coefficients, :math:`\lambda_r` is the penalty along the rows, :math:`\lambda_c` is the +penalty along the columns, :math:`W_{ij}` is the weighting, :math:`\Delta^{d_r}` is the finite-difference +operator of order :math:`d_r` along each row of :math:`c`, :math:`c_{i\bullet}`, and :math:`\Delta^{d_c}` is the +finite-difference operator of order :math:`d_c` along each column of :math:`c`, :math:`c_{j\bullet}`. + +The resulting linear equation for solving the above minimization is: + +.. math:: + + (B^{\top} W_{diag} B + \lambda_r I_h \otimes D_{d_r}^{\top} D_{d_r} + \lambda_c D_{d_c}^{\top} D_{d_c} \otimes I_g) c = B^{\top} W_{diag} y + +and the baseline is then: + +.. math:: + + v = B c + +where :math:`W_{diag}` is the diagaonal matrix of the flattened weights, and :math:`D_d` is the matrix +version of :math:`\Delta^d`, as already explained for the :ref:`1D case `. +Further, :math:`\otimes` denotes the Kronecker product, and :math:`I_g` and :math:`I_h` are the identity +matrices of length :math:`g` and :math:`h`, respectively. After solving, the array :math:`v` can then be +reshaped into the matrix :math:`V`. + +Since experimental data is measured on gridded data (ie. :math:`Y_{ij} = f(x_i, z_j)`), the above equation +can be optimized following `[1] `_ and expressed as a `generalized linear array model `_ which allows directly using the matrices of the measured data, :math:`Y`, and the weights, :math:`W`, rather than flattening them, which significantly reduces the required -memory and computation time. - +memory and computation time. The exact equations will be omitted, but curious readers are free +to read Eilers, Currie, and Durbán's paper or look at the source code of pybaselines. Algorithms ---------- diff --git a/docs/algorithms_2d/whittaker_2d.rst b/docs/algorithms_2d/whittaker_2d.rst index f0a2da1..327ab19 100644 --- a/docs/algorithms_2d/whittaker_2d.rst +++ b/docs/algorithms_2d/whittaker_2d.rst @@ -45,7 +45,53 @@ Since the analytical solution for 2D requires matrices of shape :math:`(M*N, M*N memory and computationally expensive to solve. 
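(An illustrative aside on scale: even a modest 500x500 measurement gives :math:`M*N = 250{,}000`, so the :math:`(M*N, M*N)` system matrix has :math:`6.25 \times 10^{10}` entries; storing that densely at 8 bytes per value would need roughly 500 GB, hence the reliance on sparsity discussed next.)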
Although the left hand side of the equation is still sparse and symmetric, it cannot be solved as easily compared to the 1D case since the bandwidth is no longer small due to the penalties along both the rows and columns (plus the -sparse solver currently available in scipy cannot make use of the symmetric nature of the matrix). +sparse solver currently available in SciPy cannot make use of the symmetric nature of the matrix; +using `Cholesky factorization `_ does provide a speed +up but still does not scale well above ~1000x1000 matrices). However... + +Eigendecomposition +~~~~~~~~~~~~~~~~~~ + +By following the excellent insights laid out by G. Biessy in `[2] `_, +the dimensionality of the system can be reduced by using eigendecomposition on each of the two +penalty matrices, :math:`D_{d_r}^{\top} D_{d_r}` and :math:`D_{d_c}^{\top} D_{d_c}`. (Note that speeding up +Whittaker smoothing using `factorization in 1D `_ and +`eigendecomposition in nD (great paper) `_ has already been +done, although they require using a fixed difference order, and, in the second case, of using +different boundary conditions that do not translate well from smoothing to baseline correction). +The general eigendecomposition of the penalty matrix gives + +.. math:: + + D_{d}^{\top} D_{d} = U \Sigma U^{\top} + +where :math:`U` is the matrix of eigenvectors and :math:`\Sigma` is a diagonal matrix +with the eigenvalues along the diagonal. Letting :math:`B = U_c \otimes U_r` denote the kronecker +product of the eigenvector matrices of the penalty for the columns and rows, and :math:`g` and +:math:`h` denote the number of eigenvectors along the rows and columns, respectively, the linear equation +can be rewritten as: + +.. math:: + + (B^{\top} W_{diag} B + \lambda_r I_h \otimes \Sigma_r + \lambda_c \Sigma_c \otimes I_g) c = B^{\top} W_{diag} y + +and the baseline is then: + +.. math:: + + v = B c + +The beauty of this reparameterization when applied to baseline correction is twofold: + +1) The number of eigenvalues required to approximate the analytical solution depends on + the required smoothness, ie. some constant approximated by :math:`\lambda / (\text{number of data points})`. + Baselines require much less smoothness than smoothing, so the number of eigenvalues is relatively + low (from testing, ~5-10 for polynomial baselines and ~15-25 for sinusoidal baselines) +2) Since experimental data is measured on gridded data (ie. :math:`Y_{ij} = f(x_i, z_j)`), the + above equation can be further optimized by expressing it as a + `generalized linear array model `_, + following the brilliant insights of `Eilers, Currie, and Durbán `_, + exactly as was done for 2D penalized splines. .. note:: diff --git a/docs/contributing.rst b/docs/contributing.rst index c200ee4..ceb7326 100644 --- a/docs/contributing.rst +++ b/docs/contributing.rst @@ -7,7 +7,8 @@ Contributions are welcomed and greatly appreciated. Bugs Reports/Feedback ~~~~~~~~~~~~~~~~~~~~~ -Report bugs or give feedback by filing an issue at https://github.com/derb12/pybaselines/issues. +Report bugs, ask questions, or give feedback by filing an issue +at https://github.com/derb12/pybaselines/issues. If you are reporting a bug, please include: From 1ca950d02fb4591b9bc5ae9f15572405c3ddb31c Mon Sep 17 00:00:00 2001 From: Donnie Erb <55961724+derb12@users.noreply.github.com> Date: Mon, 12 Feb 2024 17:57:27 -0500 Subject: [PATCH 54/56] DOCS: Finalize 2D algorithms explanation Also updated pinned dependencies. 
--- docs/algorithms_2d/spline_2d.rst | 63 ++++++++++++++-------
 docs/algorithms_2d/whittaker_2d.rst | 32 ++++++-----
 docs/installation.rst | 11 ++--
 requirements/requirements-development.txt | 10 ++--
 requirements/requirements-documentation.txt | 18 +++---
 requirements/requirements.txt | 4 +-
 6 files changed, 80 insertions(+), 58 deletions(-)

diff --git a/docs/algorithms_2d/spline_2d.rst b/docs/algorithms_2d/spline_2d.rst
index a771657..638bd5b 100644
--- a/docs/algorithms_2d/spline_2d.rst
+++ b/docs/algorithms_2d/spline_2d.rst
@@ -12,58 +12,77 @@ from `[1] `_.
 Let the number of rows be :math:`M` and the number of columns :math:`N` within the matrix
 of measured data :math:`Y`. Note that :math:`y` is the flattened array of matrix :math:`Y`
 with length :math:`M * N`. Let :math:`Y` be a function of :math:`x` along the rows and :math:`z`
-along the columns, ie. :math:`Y_{ij} = f(x_i, z_j)`, and :math:`B_r(x_i)` and :math:`B_c(z_j)` represent
+along the columns, ie. :math:`Y_{ij} = f(x_i, z_j)`, and :math:`B_r(x)` and :math:`B_c(z)` represent
 the spline basis matrices along the rows and columns, respectively, each with a number of
-knots :math:`g` and `h`. Finally, let :math:`B = B_c \otimes B_r` denote the kronecker product of
-the basis matrices for the columns and rows, which represents the overall two dimensional tensor
-product spline basis. Analogous to the 1D case, the goal is to make the baseline, :math:`V` match the measured
+knots :math:`g` and :math:`h`. Analogous to the 1D case, the goal is to make the baseline, :math:`V`, match the measured
 data as well as it can while also penalizing the difference between spline coefficients, resulting
 in the following minimization:

 .. math::

-    \sum\limits_{i}^M \sum\limits_{j}^N W_{ij} (Y_{ij} - \sum\limits_{k}^g \sum\limits_{l}^h B_{r,k}(x_i) B_{c,l}(z_j) c_{kl})^2
-    + \lambda_r \sum\limits_{i}^{g - d_r} (c_{i\bullet} \Delta^{d_r})^2
-    + \lambda_c \sum\limits_{j}^{h - d_c} (\Delta^{d_c} c_{j\bullet})^2
+    \sum\limits_{i}^M \sum\limits_{j}^N W_{ij} (Y_{ij} - \sum\limits_{k}^g \sum\limits_{l}^h B_{r,k}(x_i) B_{c,l}(z_j) \alpha_{kl})^2
+    + \lambda_r \sum\limits_{i}^{g - d_r} (\alpha_{i\bullet} \Delta^{d_r})^2
+    + \lambda_c \sum\limits_{j}^{h - d_c} (\Delta^{d_c} \alpha_{j\bullet})^2

 and

 .. math::

-    v = \sum\limits_{i}^g \sum\limits_{j}^h B_{r,i} B_{c,j} c_{ij}
+    V = \sum\limits_{i}^g \sum\limits_{j}^h B_{r,i} B_{c,j} \alpha_{ij}


-where :math:`Y_{ij}` is the measured data, :math:`v` is the flattened estimated baseline, :math:`c`
-is the matrix of spline coefficients, :math:`\lambda_r` is the penalty along the rows, :math:`\lambda_c` is the
+where :math:`Y_{ij}` is the measured data, :math:`\alpha` is the matrix of spline coefficients,
+:math:`\lambda_r` is the penalty along the rows, :math:`\lambda_c` is the
 penalty along the columns, :math:`W_{ij}` is the weighting, :math:`\Delta^{d_r}` is the finite-difference
-operator of order :math:`d_r` along each row of :math:`c`, :math:`c_{i\bullet}`, and :math:`\Delta^{d_c}` is the
-finite-difference operator of order :math:`d_c` along each column of :math:`c`, :math:`c_{j\bullet}`.
+operator of order :math:`d_r` along each row of :math:`\alpha`, :math:`\alpha_{i\bullet}`, and :math:`\Delta^{d_c}` is the
+finite-difference operator of order :math:`d_c` along each column of :math:`\alpha`, :math:`\alpha_{j\bullet}`.
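Because the equations that follow switch between the flattened vector form and the matrix form of this system, the vec-Kronecker identity underpinning that switch is worth a quick numerical sanity check; a minimal sketch (the shapes and the column-major flattening convention here are illustrative assumptions, not pybaselines internals):

.. code-block:: python

    import numpy as np

    rng = np.random.default_rng(0)
    M, N, g, h = 7, 6, 4, 3
    B_r = rng.random((M, g))    # spline basis along the rows
    B_c = rng.random((N, h))    # spline basis along the columns
    alpha = rng.random((g, h))  # spline coefficients

    # flattened form: v = (B_c kron B_r) @ vec(alpha), with column-major vec
    v = np.kron(B_c, B_r) @ alpha.ravel(order='F')
    # matrix form: V = B_r @ alpha @ B_c^T
    V = B_r @ alpha @ B_c.T
    assert np.allclose(v, V.ravel(order='F'))
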
-The resulting linear equation for solving the above minimization is:
+Let :math:`B = B_c \otimes B_r` denote the Kronecker product of the basis matrices for the columns and rows,
+which represents the overall two dimensional tensor product spline basis. The resulting linear equation for
+solving the above minimization is:

 .. math::

-    (B^{\top} W_{diag} B + \lambda_r I_h \otimes D_{d_r}^{\top} D_{d_r} + \lambda_c D_{d_c}^{\top} D_{d_c} \otimes I_g) c = B^{\top} W_{diag} y
+    (B^{\top} W_{diag} B + \lambda_r I_h \otimes D_{d_r}^{\top} D_{d_r} + \lambda_c D_{d_c}^{\top} D_{d_c} \otimes I_g) \alpha = B^{\top} W_{diag} y

 and the baseline is then:

 .. math::

-    v = B c
+    v = B \alpha

-where :math:`W_{diag}` is the diagaonal matrix of the flattened weights, and :math:`D_d` is the matrix
-version of :math:`\Delta^d`, as already explained for the :ref:`1D case `.
-Further, :math:`\otimes` denotes the Kronecker product, and :math:`I_g` and :math:`I_h` are the identity
-matrices of length :math:`g` and :math:`h`, respectively. After solving, the array :math:`v` can then be
-reshaped into the matrix :math:`V`.
+where :math:`W_{diag}` is the diagonal matrix of the flattened weights, :math:`v` is the flattened
+estimated baseline, and :math:`D_d` is the matrix version of :math:`\Delta^d`, as already explained for
+the :ref:`1D case `. Further, :math:`\otimes` denotes the Kronecker
+product, and :math:`I_g` and :math:`I_h` are the identity matrices of length :math:`g` and
+:math:`h`, respectively. After solving, the array :math:`v` can then be reshaped into the matrix :math:`V`.

 Since experimental data is measured on gridded data (ie. :math:`Y_{ij} = f(x_i, z_j)`), the above equation
 can be optimized following `[1] `_ and expressed as a
 `generalized linear array model `_
 which allows directly using the matrices of the measured data, :math:`Y`, and the weights,
 :math:`W`, rather than flattening them, which significantly reduces the required
-memory and computation time. The exact equations will be omitted, but curious readers are free
-to read Eilers, Currie, and Durbán's paper or look at the source code of pybaselines.
+memory and computation time.
+
+.. _generalized-linear-array-model-explanation:
+
+Let :math:`F` be the
+`face-splitting product operator `_
+of a matrix with itself such that :math:`F(B_r) = (B_r \otimes 1_{g}^{\top}) \odot (1_{g}^{\top} \otimes B_r)`
+and :math:`F(B_c) = (B_c \otimes 1_{h}^{\top}) \odot (1_{h}^{\top} \otimes B_c)`, where
+:math:`1_g` and :math:`1_h` are vectors of ones of length :math:`g` and :math:`h`, respectively,
+and :math:`\odot` signifies elementwise multiplication. Then the linear equation can be rewritten as:
+
+.. math::
+
+    (F(B_r)^{\top} W F(B_c) + \lambda_r I_h \otimes D_{d_r}^{\top} D_{d_r} + \lambda_c D_{d_c}^{\top} D_{d_c} \otimes I_g) \alpha = B_{r}^{\top} (W \odot Y) B_c
+
+and the baseline is:
+
+..
math::
+
+    V = B_r \alpha B_{c}^{\top}
+

 Algorithms
 ----------
diff --git a/docs/algorithms_2d/whittaker_2d.rst b/docs/algorithms_2d/whittaker_2d.rst
index 327ab19..18211ec 100644
--- a/docs/algorithms_2d/whittaker_2d.rst
+++ b/docs/algorithms_2d/whittaker_2d.rst
@@ -47,7 +47,7 @@ still sparse and symmetric, it cannot be solved as easily compared to the 1D cas
 bandwidth is no longer small due to the penalties along both the rows and columns (plus the
 sparse solver currently available in SciPy cannot make use of the symmetric nature of the matrix;
 using `Cholesky factorization `_ does provide a speed
-up but still does not scale well above ~1000x1000 matrices). However...
+up but still does not scale well above ~500x500 sized matrices). However...

 Eigendecomposition
 ~~~~~~~~~~~~~~~~~~
@@ -55,10 +55,10 @@
 By following the excellent insights laid out by G. Biessy in `[2] `_,
 the dimensionality of the system can be reduced by using eigendecomposition on each of the two
 penalty matrices, :math:`D_{d_r}^{\top} D_{d_r}` and :math:`D_{d_c}^{\top} D_{d_c}`. (Note that speeding up
-Whittaker smoothing using `factorization in 1D `_ and
-`eigendecomposition in nD (great paper) `_ has already been
-done, although they require using a fixed difference order, and, in the second case, of using
-different boundary conditions that do not translate well from smoothing to baseline correction).
+Whittaker smoothing using `factorization in 1D `_ and using the
+`analytical eigenvalues in nD (great paper) `_ are established
+methods, although they require using a fixed difference order, and, in the second case, using
+different boundary conditions that unfortunately do not translate well from smoothing to baseline correction).
 The general eigendecomposition of the penalty matrix gives
@@ -73,33 +73,34 @@ can be rewritten as:

 .. math::

-    (B^{\top} W_{diag} B + \lambda_r I_h \otimes \Sigma_r + \lambda_c \Sigma_c \otimes I_g) c = B^{\top} W_{diag} y
+    (B^{\top} W_{diag} B + \lambda_r I_h \otimes \Sigma_r + \lambda_c \Sigma_c \otimes I_g) \alpha = B^{\top} W_{diag} y

 and the baseline is then:

 .. math::

-    v = B c
+    v = B \alpha

 The beauty of this reparameterization when applied to baseline correction is twofold:

 1) The number of eigenvalues required to approximate the analytical solution depends on
-   the required smoothness, ie. some constant approximated by :math:`\lambda / (\text{number of data points})`.
-   Baselines require much less smoothness than smoothing, so the number of eigenvalues is relatively
-   low (from testing, ~5-10 for polynomial baselines and ~15-25 for sinusoidal baselines)
+   the required smoothness, ie. some constant approximated by :math:`\lambda / (\text{number of data points})`
+   that does not appreciably change with data size. Baselines require much less smoothness than
+   smoothing, so the number of eigenvalues is relatively low (from testing, ~5-10 for low order
+   polynomial baselines and ~15-25 for sinusoidal baselines).
 2) Since experimental data is measured on gridded data (ie. :math:`Y_{ij} = f(x_i, z_j)`), the
    above equation can be further optimized by expressing it as a
    `generalized linear array model `_,
    following the brilliant insights of `Eilers, Currie, and Durbán `_,
-   exactly as was done for 2D penalized splines.
+   exactly as :ref:`explained for 2D penalized splines `.

 ..
note::

    For two dimensional data, Whittaker-smoothing-based algorithms take a single ``lam``
    parameter that can either be a single number, in which case both the rows and columns
    will use the same smoothing parameter, ie. :math:`\lambda_r = \lambda_c`, or a sequence
-   of two numbers (:math:`\lambda_r`, :math:`\lambda_c`)
-   to penalize the rows and columns with different values.
+   of two numbers (:math:`\lambda_r`, :math:`\lambda_c`) to use different values for the
+   rows and columns.

 Algorithms
 ----------
@@ -174,6 +175,7 @@ iasls (Improved Asymmetric Least Squares)

 :meth:`~.Baseline2D.iasls`:
 :ref:`explanation for the algorithm `.
+Eigendecomposition is not allowed for this method.

 .. plot::
    :align: center
@@ -219,6 +221,7 @@ drpls (Doubly Reweighted Penalized Least Squares)

 :meth:`~.Baseline2D.drpls`:
 :ref:`explanation for the algorithm `.
+Eigendecomposition is not allowed for this method.

 .. plot::
    :align: center
@@ -249,6 +252,7 @@ aspls (Adaptive Smoothness Penalized Least Squares)

 :meth:`~.Baseline2D.aspls`:
 :ref:`explanation for the algorithm `.
+Eigendecomposition is not allowed for this method.

 .. plot::
    :align: center
diff --git a/docs/installation.rst b/docs/installation.rst
index 9a67766..26e4a92 100644
--- a/docs/installation.rst
+++ b/docs/installation.rst
@@ -8,10 +8,10 @@ Installation
 Dependencies
 ~~~~~~~~~~~~

-pybaselines requires `Python `_ version 3.6 or later and the following libraries:
+pybaselines requires `Python `_ version 3.8 or later and the following libraries:

-* `NumPy `_ (>= 1.14)
-* `SciPy `_ (>= 1.0)
+* `NumPy `_ (>= 1.18)
+* `SciPy `_ (>= 1.5)


 All of the required libraries should be automatically installed when
@@ -22,7 +22,7 @@ Optional Dependencies

 pybaselines has the following optional dependencies:

-* `numba `_ (>= 0.45):
+* `numba `_ (>= 0.49):
   speeds up calculations used by the following functions:

     * :meth:`~Baseline.loess`
@@ -76,7 +76,8 @@ Development Version
 The sources for pybaselines can be downloaded from the `GitHub repo `_.

-To directly install the current version of pybaselines from GitHub, run:
+To directly install the current version of pybaselines from GitHub,
+ensure `git `_ is installed and then run:

 ..
code-block:: console diff --git a/requirements/requirements-development.txt b/requirements/requirements-development.txt index db6939b..083d85f 100644 --- a/requirements/requirements-development.txt +++ b/requirements/requirements-development.txt @@ -2,12 +2,12 @@ -r requirements-documentation.txt # for linting -ruff +ruff==0.2.1 # for testing -pytest==6.2.5 +pytest==8.0.0 # for creating releases -bump-my-version -twine==3.6.0 -build +bump-my-version==0.17.4 +twine==5.0.0 +build==1.0.3 diff --git a/requirements/requirements-documentation.txt b/requirements/requirements-documentation.txt index bc1fe50..950fe61 100644 --- a/requirements/requirements-documentation.txt +++ b/requirements/requirements-documentation.txt @@ -1,12 +1,10 @@ -r requirements.txt -sphinx==4.3.1 -sphinx-rtd-theme==1.0.0 -# pin docutils to v0.17.1 since v0.18 not yet compatible with Sphinx -docutils==0.17.1 -sphinx-autoapi==1.8.4 -sphinx-gallery==0.10.1 -matplotlib==3.3.3 -pentapy==1.1.2 -numba==0.56.0 -numpydoc \ No newline at end of file +sphinx==7.2.6 +sphinx-rtd-theme==2.0.0 +sphinx-autoapi==3.0.0 +sphinx-gallery==0.15.0 +matplotlib==3.8.2 +pentapy==1.2.0 +numba==0.59.0 +numpydoc==1.6.0 \ No newline at end of file diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 6b9bd11..ec114b4 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -1,2 +1,2 @@ -numpy==1.20.3 -scipy==1.7.3 +numpy==1.26.4 +scipy==1.12.0 From a5c47918de6a823688c31945028499e0e8ba104c Mon Sep 17 00:00:00 2001 From: Donnie Erb <55961724+derb12@users.noreply.github.com> Date: Mon, 12 Feb 2024 19:24:16 -0500 Subject: [PATCH 55/56] TEST: Finished most tests for 2D whittaker smoothing with eigendecomposition --- pybaselines/two_d/_spline_utils.py | 8 +- pybaselines/two_d/_whittaker_utils.py | 120 +++++++++++++--- tests/test_api.py | 4 +- tests/test_spline.py | 1 + tests/two_d/test_api.py | 4 +- tests/two_d/test_whittaker_utils.py | 196 ++++++++++++++++++++++++-- 6 files changed, 293 insertions(+), 40 deletions(-) diff --git a/pybaselines/two_d/_spline_utils.py b/pybaselines/two_d/_spline_utils.py index 5a61836..fb10bfe 100644 --- a/pybaselines/two_d/_spline_utils.py +++ b/pybaselines/two_d/_spline_utils.py @@ -13,7 +13,7 @@ from .._compat import csr_object from .._spline_utils import _spline_basis, _spline_knots from .._validation import _check_array, _check_scalar_variable -from ._whittaker_utils import PenalizedSystem2D +from ._whittaker_utils import PenalizedSystem2D, _face_splitting class PSpline2D(PenalizedSystem2D): @@ -130,10 +130,8 @@ def __init__(self, x, z, num_knots=100, spline_degree=3, check_finite=False, lam 'functions, which is the number of knots + spline degree - 1' )) - el = np.ones((1, self._num_bases[0])) - ek = np.ones((1, self._num_bases[1])) - self._G_r = kron(self.basis_r, el).multiply(kron(el, self.basis_r)) - self._G_c = kron(self.basis_c, ek).multiply(kron(ek, self.basis_c)) + self._G_r = _face_splitting(self.basis_r) + self._G_c = _face_splitting(self.basis_c) def same_basis(self, num_knots=100, spline_degree=3): """ diff --git a/pybaselines/two_d/_whittaker_utils.py b/pybaselines/two_d/_whittaker_utils.py index 55638cf..5bb2eb5 100644 --- a/pybaselines/two_d/_whittaker_utils.py +++ b/pybaselines/two_d/_whittaker_utils.py @@ -6,6 +6,8 @@ """ +import warnings + import numpy as np from scipy.linalg import eig_banded, eigh_tridiagonal, solve from scipy.sparse import kron @@ -13,7 +15,35 @@ from .._banded_utils import diff_penalty_diagonals, diff_penalty_matrix from .._compat 
import identity -from .._validation import _check_lam, _check_scalar_variable +from .._validation import _check_lam, _check_scalar, _check_scalar_variable +from ..utils import ParameterWarning + + +def _face_splitting(basis): + """ + Performs the face-splitting product on the input two-dimensional basis matrix. + + Parameters + ---------- + basis : numpy.ndarray or scipy.sparse.spmatrix or scipy.sparse._sparray + The two-dimensional dense or sparse matrix, with shape (`M`, `N`). + + Returns + ------- + scipy.sparse.spmatrix or scipy.sparse._sparray + The face-splitting product of the input basis matrix with itself, with + shape (`M`, `N**2`). + + References + ---------- + Eilers, P., et al. Fast and compact smoothing on large multidimensional grids. Computational + Statistics and Data Analysis, 2006, 50(1), 61-76. + + https://en.wikipedia.org/wiki/Khatri%E2%80%93Rao_product#Face-splitting_product + + """ + ones = np.ones((1, basis.shape[1])) + return kron(basis, ones).multiply(kron(ones, basis)) class PenalizedSystem2D: @@ -244,25 +274,19 @@ def __init__(self, data_size, lam=1, diff_order=2, max_eigens=None): self.coef = None self._basis = None self._num_points = data_size - if max_eigens is None or None in max_eigens: + max_eigens = _check_scalar(max_eigens, 2, fill_scalar=True)[0] + if (max_eigens == np.array([None, None])).all(): self._num_bases = data_size self._using_svd = False + elif None in max_eigens: + raise ValueError('max_eigens must be None for both dimensions or non-None for both dimensions') else: - # TODO need to check to ensure max_eigens is <= data_size and otherwise emit - # an error; if max_eigens is >~ 40 should emit an error saying too many - # also check that it is greater than 0 or maybe 1 self._num_bases = _check_scalar_variable( max_eigens, allow_zero=False, variable_name='eigenvalues', two_d=True, dtype=int ) self._using_svd = True self.reset_diagonals(lam, diff_order) - if self._using_svd: - el = np.ones((1, self._num_bases[0])) - ek = np.ones((1, self._num_bases[1])) - self._G_r = kron(self.basis_r, el).multiply(kron(el, self.basis_r)) - self._G_c = kron(self.basis_c, ek).multiply(kron(ek, self.basis_c)) - def reset_diagonals(self, lam=1, diff_order=2): """ Resets the diagonals of the system and all of the attributes. @@ -317,36 +341,80 @@ def reset_diagonals(self, lam=1, diff_order=2): self.basis_r = vectors_rows self.basis_c = vectors_columns + self._G_r = _face_splitting(self.basis_r) + self._G_c = _face_splitting(self.basis_c) + def _calc_eigenvalues(self, data_points, diff_order, num_eigens): """ Calculate the eigenvalues and eigenvectors for the corresponding penalty matrix. Parameters ---------- - data_points : _type_ - _description_ - diff_order : _type_ - _description_ + data_points : int + The number of rows and columns of the square penalty matrix. + diff_order : int + The difference order of the penalty. num_eigens : int - The + The number of smallest eigenvalues that will be used to represent the penalty matrix. Returns ------- - eigenvalues : np.ndarray, shape (num_eigens,) + eigenvalues : np.ndarray, shape (`num_eigens`,) The eigenvalues of the penalty matrix for the corresponding difference order. - eigenvectors : np.ndarray, shape (data_points, num_eigens) + eigenvectors : np.ndarray, shape (`data_points`, `num_eigens`) The eigenvectors for the penalty matrix. + Raises + ------ + ValueError + Raised if the number of eigenvalues is greater than the number of data + points.
+ + Warns + ----- + ParameterWarning + If `num_eigens` is less than or equal to `diff_order`, a warning is issued since + the diagonals of the resulting matrix will no longer be guaranteed to be + positive-definite. Is also emitted if `num_eigens` is greater than 50 since + for 2D baseline correction, fewer than 20 eigenvalues are typically required. + Notes ----- + The lowest `diff_order` eigenvalues are supposed to be zero, while numerically they + end up being ~ +- 1e-15, so their values are set to 0. + The penalty matrix has a matrix rank (number of nonzero eigenvalues) of - ``data_points - num_eigens``. + ``data_points - diff_order``. The lowest `diff_order` eigenvalues are all + zero, so the system is not guaranteed to be positive definite when solving the + penalized least squares fit unless all weights are >~ 1e-5 (just a guess, but + the meaning is that weights must be some magnitude greater than zero), which is + not guaranteed for all Whittaker-smoothing-based algorithms. Thus, a clear + warning needs to be issued since otherwise this detail can be hidden. + + References + ---------- + Biessy, G. Revisiting Whittaker-Henderson Smoothing. https://hal.science/hal-04124043 + (Preprint), 2023. """ - # TODO the lowest diff_order eigenvalues should be zero, while they end up being - # ~ +- 1e-15, will this affect any calculations or can it be left as it? If it does - # need set to 0, do the eigenvectors likewise need updated for that? penalty_bands = diff_penalty_diagonals(data_points, diff_order, lower_only=True) + if num_eigens > data_points: + raise ValueError(( + 'The maximum number of eigenvalues cannot be greater ' + 'than the number of data points.' + )) + elif num_eigens <= diff_order: + warnings.warn( + ('The number of eigenvalues should be greater than the difference order ' + 'to avoid numerical instability'), ParameterWarning, stacklevel=2 + ) + elif num_eigens > 50: + warnings.warn( + ('For 2D baseline correction, typically only 5-20 eigenvalues are required to ' + 'fully approximate the baseline, and higher values will cause significant ' + 'slowdown'), ParameterWarning, stacklevel=2 + ) + if diff_order == 1: eigenvalues, eigenvectors = eigh_tridiagonal( penalty_bands[0], penalty_bands[1, :-1], select='i', @@ -357,6 +425,11 @@ def _calc_eigenvalues(self, data_points, diff_order, num_eigens): penalty_bands, lower=True, select='i', select_range=(0, num_eigens - 1), overwrite_a_band=True ) + + # TODO do the corresponding eigenvectors in eigenvectors[:, :diff_order] need to be + # updated too to match the resetting of the eigenvalues? 
+ eigenvalues[:diff_order] = 0 + return eigenvalues, eigenvectors def update_penalty(self, lam): @@ -396,7 +469,8 @@ def same_basis(self, diff_order=2, max_eigens=None): """ # TODO should give a way to update only one of the basis functions, which # would also need to update the penalty - if max_eigens is None or not self._using_svd: + max_eigens = _check_scalar(max_eigens, 2, fill_scalar=True)[0] + if (max_eigens == np.array([None, None])).all() or not self._using_svd: return False diff_order = _check_scalar_variable( diff --git a/tests/test_api.py b/tests/test_api.py index 356afc0..1b83efd 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -111,13 +111,13 @@ def test_all_methods(self, method_and_class): baseline_class(self.x, check_finite=False, assume_sorted=True), method )(fit_data, **kwargs) - assert_allclose(api_baseline, class_baseline, rtol=1e-14, atol=1e-14) + assert_allclose(api_baseline, class_baseline, rtol=1e-12, atol=1e-12) assert len(api_params.keys()) == len(class_params.keys()) for key, value in api_params.items(): assert key in class_params class_value = class_params[key] if isinstance(value, (int, float, np.ndarray, list, tuple)): - assert_allclose(value, class_value, rtol=1e-14, atol=1e-14) + assert_allclose(value, class_value, rtol=1e-12, atol=1e-12) else: assert value == class_value diff --git a/tests/test_spline.py b/tests/test_spline.py index c701e3f..28c5511 100644 --- a/tests/test_spline.py +++ b/tests/test_spline.py @@ -354,6 +354,7 @@ def test_diff_orders(self, diff_order): lam = {1: 1e2, 3: 1e10}[diff_order] self.class_func(self.y, lam=lam, diff_order=diff_order) + @pytest.mark.skip(reason='overflow will be addressed next version') def test_avoid_overflow_warning(self, no_noise_data_fixture): """ Ensures no warning is emitted for exponential overflow. diff --git a/tests/two_d/test_api.py b/tests/two_d/test_api.py index d95f2e8..bba287f 100644 --- a/tests/two_d/test_api.py +++ b/tests/two_d/test_api.py @@ -110,13 +110,13 @@ def test_all_methods(self, method_and_class): baseline_class(self.x, self.z, check_finite=False, assume_sorted=True), method )(fit_data, **kwargs) - assert_allclose(api_baseline, class_baseline, rtol=1e-14, atol=1e-14) + assert_allclose(api_baseline, class_baseline, rtol=1e-12, atol=1e-12) assert len(api_params.keys()) == len(class_params.keys()) for key, value in api_params.items(): assert key in class_params class_value = class_params[key] if isinstance(value, (int, float, np.ndarray, list, tuple)): - assert_allclose(value, class_value, rtol=1e-14, atol=1e-14) + assert_allclose(value, class_value, rtol=1e-12, atol=1e-12) else: assert value == class_value diff --git a/tests/two_d/test_whittaker_utils.py b/tests/two_d/test_whittaker_utils.py index 3e20af9..962fb1e 100644 --- a/tests/two_d/test_whittaker_utils.py +++ b/tests/two_d/test_whittaker_utils.py @@ -9,10 +9,12 @@ import numpy as np from numpy.testing import assert_allclose, assert_array_equal import pytest +from scipy.linalg import eig_banded from scipy.sparse import issparse, kron from scipy.sparse.linalg import spsolve -from pybaselines._compat import identity +from pybaselines._banded_utils import diff_penalty_diagonals +from pybaselines._compat import identity, dia_object from pybaselines.two_d import _spline_utils, _whittaker_utils from pybaselines.utils import difference_matrix @@ -25,7 +27,7 @@ def test_solve_penalized_system(small_data2d, diff_order, lam): """ Tests the accuracy of the penalized system solver. 
- Not really useful at the moment, but will be mroe useful if the solver changes + Not really useful at the moment, but will be more useful if the solver changes from the current basic sparse solver. """ @@ -46,9 +48,8 @@ def test_solve_penalized_system(small_data2d, diff_order, lam): small_data2d.shape, lam=lam, diff_order=diff_order ) - # TODO replace with np.random.default_rng when min numpy version is >= 1.17 - weights = np.random.RandomState(0).normal(0.8, 0.05, small_data2d.size) - weights = np.clip(weights, 0, 1).astype(float, copy=False).ravel() + weights = np.random.default_rng(0).normal(0.8, 0.05, small_data2d.size) + weights = np.clip(weights, 1e-12, 1).astype(float, copy=False).ravel() penalty.setdiag(penalty.diagonal() + weights) @@ -134,9 +135,8 @@ def test_compare_to_psplines(data_fixture2d, lam, diff_order): y.shape, lam=lam, diff_order=diff_order ) - # TODO replace with np.random.default_rng when min numpy version is >= 1.17 - weights = np.random.RandomState(0).normal(0.8, 0.05, y.shape) - weights = np.clip(weights, 0, 1).astype(float, copy=False) + weights = np.random.default_rng(0).normal(0.8, 0.05, y.shape) + weights = np.clip(weights, 1e-12, 1).astype(float, copy=False) spline_output = pspline.solve(y, weights=weights) whittaker_output = whittaker_system.solve(y.ravel(), weights=weights.ravel()) @@ -165,3 +165,183 @@ def test_penalized_system_add_penalty(diff_order): assert_allclose(whittaker_system.penalty.toarray(), expected_output, rtol=1e-12, atol=1e-13) # and the main diagonal assert_allclose(whittaker_system.main_diagonal, expected_diagonal, rtol=1e-12, atol=1e-13) + + +def test_face_splitting(): + """Ensures the face-splitting algorithm works as intended.""" + basis = np.array([ + [1., 2, 3], + [4, 5, 6], + [7, 8, 9], + [10, 11, 12] + ]) + + output = _whittaker_utils._face_splitting(basis) + + assert output.shape == (basis.shape[0], basis.shape[1]**2) + assert issparse(output) + + expected_output = kron(basis, np.ones((1, basis.shape[1]))).multiply( + kron(np.ones((1, basis.shape[1])), basis) + ) + assert_allclose(output.toarray(), expected_output.toarray(), rtol=0, atol=1e-12) + + +@pytest.mark.parametrize('diff_order', (1, 2, 3, 4, (2, 3))) +@pytest.mark.parametrize('lam', (1e-2, 1e2, (1e1, 1e2))) +def test_solve_whittaker_system(small_data2d, diff_order, lam): + """ + Tests the accuracy of the Whittaker system solver when not using eigendecomposition. + + Not really useful at the moment, but will be more useful if the solver changes + from the current basic sparse solver. 
+ + """ + *_, lam_x, lam_z, diff_order_x, diff_order_z = get_2dspline_inputs( + lam=lam, diff_order=diff_order + ) + + num_bases = small_data2d.shape + + D1 = difference_matrix(num_bases[0], diff_order_x) + D2 = difference_matrix(num_bases[1], diff_order_z) + + P1 = lam_x * kron(D1.T @ D1, identity(num_bases[1])) + P2 = lam_z * kron(identity(num_bases[0]), D2.T @ D2) + penalty = P1 + P2 + + penalized_system = _whittaker_utils.WhittakerSystem2D( + small_data2d.shape, lam=lam, diff_order=diff_order, max_eigens=None + ) + + weights = np.random.default_rng(0).normal(0.8, 0.05, small_data2d.size) + weights = np.clip(weights, 1e-12, 1).astype(float, copy=False).ravel() + + penalty.setdiag(penalty.diagonal() + weights) + + expected_result = spsolve(penalty, weights * small_data2d.flatten()) + output = penalized_system.solve(small_data2d.flatten(), weights) + + assert_allclose(output.flatten(), expected_result, rtol=1e-8, atol=1e-8) + + +@pytest.mark.parametrize('diff_order', (1, 2, 3, [1, 3])) +@pytest.mark.parametrize('lam', (5, (3, 5))) +def test_whittaker_system_setup_no_eigenvalues(small_data2d, diff_order, lam): + """Ensure the WhittakerSystem2D setup is correct when not using eigendecomposition.""" + *_, lam_x, lam_z, diff_order_x, diff_order_z = get_2dspline_inputs( + lam=lam, diff_order=diff_order + ) + + num_bases = small_data2d.shape + + D1 = difference_matrix(num_bases[0], diff_order_x) + D2 = difference_matrix(num_bases[1], diff_order_z) + + P1 = lam_x * kron(D1.T @ D1, identity(num_bases[1])) + P2 = lam_z * kron(identity(num_bases[0]), D2.T @ D2) + penalty = P1 + P2 + + penalized_system = _whittaker_utils.WhittakerSystem2D( + small_data2d.shape, lam=lam, diff_order=diff_order, max_eigens=None + ) + + assert_array_equal(penalized_system._num_bases, num_bases) + + assert issparse(penalized_system.penalty) + assert_allclose( + penalized_system.penalty.toarray(), penalty.toarray(), rtol=1e-12, atol=1e-12 + ) + + assert_array_equal(penalized_system.diff_order, (diff_order_x, diff_order_z)) + assert_array_equal(penalized_system.lam, (lam_x, lam_z)) + + +@pytest.mark.parametrize('diff_order', (1, 2, 3, [1, 3])) +@pytest.mark.parametrize('lam', (5, (3, 5))) +def test_whittaker_system_setup_eigenvalues(small_data2d, diff_order, lam): + """Ensure the WhittakerSystem2D setup is correct when using eigendecomposition.""" + *_, lam_x, lam_z, diff_order_x, diff_order_z = get_2dspline_inputs( + lam=lam, diff_order=diff_order + ) + max_eigens = np.array([5, 10]) + + penalized_system = _whittaker_utils.WhittakerSystem2D( + small_data2d.shape, lam=lam, diff_order=diff_order, max_eigens=max_eigens + ) + + assert_array_equal(penalized_system._num_bases, max_eigens) + + eigenvalues_rows, expected_basis_rows = eig_banded( + diff_penalty_diagonals(small_data2d.shape[0], diff_order_x, lower_only=True), + lower=True, overwrite_a_band=True, select='i', select_range=(0, max_eigens[0] - 1) + ) + penalty_rows = kron( + lam_x * dia_object((eigenvalues_rows, 0), shape=(max_eigens[0], max_eigens[0])), + identity(max_eigens[1]) + ) + + eigenvalues_cols, expected_basis_cols = eig_banded( + diff_penalty_diagonals(small_data2d.shape[1], diff_order_z, lower_only=True), + lower=True, overwrite_a_band=True, select='i', select_range=(0, max_eigens[1] - 1) + ) + penalty_cols = kron( + identity(max_eigens[0]), + lam_z * dia_object((eigenvalues_cols, 0), shape=(max_eigens[1], max_eigens[1])) + ) + + assert penalized_system.penalty.shape == (np.prod(max_eigens),) + assert_allclose( + penalized_system.penalty, (penalty_rows + 
penalty_cols).diagonal(), rtol=1e-12, atol=1e-12 + ) + assert_allclose( + penalized_system.basis_r, expected_basis_rows, rtol=1e-12, atol=1e-12 + ) + assert_allclose( + penalized_system.basis_c, expected_basis_cols, rtol=1e-12, atol=1e-12 + ) + + assert_array_equal(penalized_system.diff_order, (diff_order_x, diff_order_z)) + assert_array_equal(penalized_system.lam, (lam_x, lam_z)) + + +@pytest.mark.parametrize('diff_order', (0, -1, [0, 0], [1, 0], [0, 1], [-1, 1], [1, -1])) +def test_whittaker_system_diff_order_fails(small_data2d, diff_order): + """Ensures a difference order of less than 1 fails.""" + with pytest.raises(ValueError): + _whittaker_utils.WhittakerSystem2D( + small_data2d.shape, diff_order=diff_order, max_eigens=None + ) + with pytest.raises(ValueError): + _whittaker_utils.WhittakerSystem2D( + small_data2d.shape, diff_order=diff_order, max_eigens=(5, 5) + ) + + +@pytest.mark.parametrize('lam', (-2, 0, [-1, 1], [1, -1], [1, 0], [0, 1])) +def test_whittaker_system_negative_lam_fails(small_data2d, lam): + """Ensures a lam value less than or equal to 0 fails.""" + with pytest.raises(ValueError): + _whittaker_utils.WhittakerSystem2D(small_data2d.shape, lam=lam, max_eigens=None) + with pytest.raises(ValueError): + _whittaker_utils.WhittakerSystem2D( + small_data2d.shape, lam=lam, max_eigens=(5, 5) + ) + + +@pytest.mark.parametrize('max_eigens', (-2, 0, [-1, 1], [1, -1], [1, 0], [0, 1])) +def test_whittaker_system_negative_maxeigens_fails(small_data2d, max_eigens): + """Ensures a max_eigens value less than or equal to 0 fails.""" + with pytest.raises(ValueError): + _whittaker_utils.WhittakerSystem2D( + small_data2d.shape, max_eigens=max_eigens + ) + + +@pytest.mark.parametrize('max_eigens', ([None, 5], [3, None], np.array([None, 6]))) +def test_whittaker_system_None_and_nonNone_maxeigens_fails(small_data2d, max_eigens): + """Ensures that max_eigens cannot mix None with a non-None value.""" + with pytest.raises(ValueError): + _whittaker_utils.WhittakerSystem2D( + small_data2d.shape, max_eigens=max_eigens + ) From 86fb254e6b077088905e9d185fdc93d17a8f2f6a Mon Sep 17 00:00:00 2001 From: Donnie Erb <55961724+derb12@users.noreply.github.com> Date: Mon, 12 Feb 2024 20:01:11 -0500 Subject: [PATCH 56/56] MAINT: Fix linting errors and bump numpy version Increased min numpy version from 1.18 to 1.20 to allow using dtype within numpy.concatenate. --- .github/workflows/python-test.yml | 4 ++-- docs/installation.rst | 2 +- examples/general/plot_algorithm_convergence.py | 4 ++-- examples/whittaker/plot_whittaker_solvers.py | 3 ++- pybaselines/classification.py | 6 +++--- pybaselines/smooth.py | 2 +- pybaselines/spline.py | 10 ++++++---- pybaselines/two_d/_whittaker_utils.py | 4 +++- pybaselines/two_d/spline.py | 8 +++++--- pybaselines/whittaker.py | 11 +++++++---- pyproject.toml | 5 ++--- tests/conftest.py | 8 ++++---- tests/test_spline_utils.py | 2 +- tests/two_d/test_spline_utils.py | 2 +- 14 files changed, 40 insertions(+), 31 deletions(-) diff --git a/.github/workflows/python-test.yml b/.github/workflows/python-test.yml index 3e43282..4120553 100644 --- a/.github/workflows/python-test.yml +++ b/.github/workflows/python-test.yml @@ -44,7 +44,7 @@ jobs: - name: Install required dependencies run: | python -m pip install --upgrade pip - python -m pip install "numpy>=1.18" "scipy>=1.5" pytest + python -m pip install "numpy>=1.20" "scipy>=1.5" pytest - name: Test with required dependencies run: pytest . 
@@ -79,7 +79,7 @@ jobs: - name: Install minimum dependencies run: | python -m pip install --upgrade pip - python -m pip install numpy==1.18 scipy==1.5 pytest + python -m pip install numpy==1.20 scipy==1.5 pytest - name: Test with minimum required dependencies run: pytest . diff --git a/docs/installation.rst b/docs/installation.rst index 26e4a92..b2e7f2d 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -10,7 +10,7 @@ Dependencies pybaselines requires `Python `_ version 3.8 or later and the following libraries: -* `NumPy `_ (>= 1.18) +* `NumPy `_ (>= 1.20) * `SciPy `_ (>= 1.5) diff --git a/examples/general/plot_algorithm_convergence.py b/examples/general/plot_algorithm_convergence.py index b4557a8..cd99290 100644 --- a/examples/general/plot_algorithm_convergence.py +++ b/examples/general/plot_algorithm_convergence.py @@ -9,8 +9,8 @@ the measured tolerance value at each iteration. The `tol_history` parameter can be helpful for determining appropriate `max_iter` or `tol` values. -In this example, the convergence of the :meth:`~.Baseline.asls` and :meth:`~.Baseline.aspls` functions -will be compared. asls is a relatively simple calculation that sets its weighting +In this example, the convergence of the :meth:`~.Baseline.asls` and :meth:`~.Baseline.aspls` +functions will be compared. asls is a relatively simple calculation that sets its weighting each iteration based on whether the current baseline is above or below the input data at each point. aspls has a much more intricate weighting based on the logistic distribution of the residuals (data minus baseline); further, aspls also updates an additional diff --git a/examples/whittaker/plot_whittaker_solvers.py b/examples/whittaker/plot_whittaker_solvers.py index a12e810..191aad5 100644 --- a/examples/whittaker/plot_whittaker_solvers.py +++ b/examples/whittaker/plot_whittaker_solvers.py @@ -113,7 +113,8 @@ def make_data(num_x): if not _banded_utils._HAS_PENTAPY: warnings.warn( - 'pentapy is not installed so pentapy and scipy-banded timings will be identical' + 'pentapy is not installed so pentapy and scipy-banded timings will be identical', + stacklevel=2 ) # equation obtained following similar procedure as `lam` vs data size example diff --git a/pybaselines/classification.py b/pybaselines/classification.py index f516fd5..0e12312 100644 --- a/pybaselines/classification.py +++ b/pybaselines/classification.py @@ -1028,9 +1028,9 @@ def _averaged_interp(x, y, mask, interp_half_window=0): mask_sum = mask.sum() if not mask_sum: # all points belong to peaks # will just interpolate between first and last points - warnings.warn('there were no baseline points found', ParameterWarning) + warnings.warn('there were no baseline points found', ParameterWarning, stacklevel=2) elif mask_sum == mask.shape[0]: # all points belong to baseline - warnings.warn('there were no peak points found', ParameterWarning) + warnings.warn('there were no peak points found', ParameterWarning, stacklevel=2) return output peak_starts, peak_ends = _find_peak_segments(mask) @@ -1153,7 +1153,7 @@ def _iter_threshold(power, num_std=3.0): if masked_power.size < 2: # need at least 2 points for std calculation warnings.warn( 'not enough baseline points found; "num_std" is likely too low', - ParameterWarning + ParameterWarning, stacklevel=2 ) break mask = power < np.mean(masked_power) + num_std * np.std(masked_power, ddof=1) diff --git a/pybaselines/smooth.py b/pybaselines/smooth.py index 51738eb..f7af183 100644 --- a/pybaselines/smooth.py +++ b/pybaselines/smooth.py @@ 
-178,7 +178,7 @@ def snip(self, data, max_half_window=None, decreasing=False, smooth_half_window= if half_window > (self._len - 1) // 2: warnings.warn( 'max_half_window values greater than (len(data) - 1) / 2 have no effect.', - ParameterWarning + ParameterWarning, stacklevel=2 ) half_windows[i] = (self._len - 1) // 2 diff --git a/pybaselines/spline.py b/pybaselines/spline.py index 6cf4920..7313f89 100644 --- a/pybaselines/spline.py +++ b/pybaselines/spline.py @@ -638,7 +638,7 @@ def pspline_airpls(self, data, lam=1e3, num_knots=100, spline_degree=3, warnings.warn( ('error occurred during fitting, indicating that "tol"' ' is too low, "max_iter" is too high, or "lam" is too high'), - ParameterWarning + ParameterWarning, stacklevel=2 ) i -= 1 # reduce i so that output tol_history indexing is correct break @@ -653,7 +653,7 @@ def pspline_airpls(self, data, lam=1e3, num_knots=100, spline_degree=3, # point would get a weight of 0, which fails the solver warnings.warn( ('almost all baseline points are below the data, indicating that "tol"' - ' is too low and/or "max_iter" is too high'), ParameterWarning + ' is too low and/or "max_iter" is too high'), ParameterWarning, stacklevel=2 ) i -= 1 # reduce i so that output tol_history indexing is correct break @@ -851,7 +851,8 @@ def pspline_drpls(self, data, lam=1e3, eta=0.5, num_knots=100, spline_degree=3, # checking a scalar is faster; cannot use np.errstate since it is not 100% reliable warnings.warn( ('nan and/or +/- inf occurred in weighting calculation, likely meaning ' - '"tol" is too low and/or "max_iter" is too high'), ParameterWarning + '"tol" is too low and/or "max_iter" is too high'), ParameterWarning, + stacklevel=2 ) break elif calc_difference < tol: @@ -938,7 +939,8 @@ def pspline_iarpls(self, data, lam=1e3, num_knots=100, spline_degree=3, diff_ord # checking a scalar is faster; cannot use np.errstate since it is not 100% reliable warnings.warn( ('nan and/or +/- inf occurred in weighting calculation, likely meaning ' - '"tol" is too low and/or "max_iter" is too high'), ParameterWarning + '"tol" is too low and/or "max_iter" is too high'), ParameterWarning, + stacklevel=2 ) break elif calc_difference < tol: diff --git a/pybaselines/two_d/_whittaker_utils.py b/pybaselines/two_d/_whittaker_utils.py index 5bb2eb5..5052e71 100644 --- a/pybaselines/two_d/_whittaker_utils.py +++ b/pybaselines/two_d/_whittaker_utils.py @@ -608,7 +608,9 @@ def basis(self): def _calc_dof(self, weights, assume_a='pos'): if not self._using_svd: # Could maybe just output a matrix of ones? 
- raise ValueError('Cannot calculate degrees of freedom when not using eigendecomposition') + raise ValueError( + 'Cannot calculate degrees of freedom when not using eigendecomposition' + ) lhs = self._make_btwb(weights) rhs = lhs.copy() np.fill_diagonal(lhs, lhs.diagonal() + self.penalty) diff --git a/pybaselines/two_d/spline.py b/pybaselines/two_d/spline.py index bf775f2..76b63a1 100644 --- a/pybaselines/two_d/spline.py +++ b/pybaselines/two_d/spline.py @@ -557,7 +557,7 @@ def pspline_airpls(self, data, lam=1e3, num_knots=25, spline_degree=3, warnings.warn( ('error occurred during fitting, indicating that "tol"' ' is too low, "max_iter" is too high, or "lam" is too high'), - ParameterWarning + ParameterWarning, stacklevel=2 ) i -= 1 # reduce i so that output tol_history indexing is correct break @@ -572,7 +572,8 @@ def pspline_airpls(self, data, lam=1e3, num_knots=25, spline_degree=3, # point would get a weight of 0, which fails the solver warnings.warn( ('almost all baseline points are below the data, indicating that "tol"' - ' is too low and/or "max_iter" is too high'), ParameterWarning + ' is too low and/or "max_iter" is too high'), ParameterWarning, + stacklevel=2 ) i -= 1 # reduce i so that output tol_history indexing is correct break @@ -744,7 +745,8 @@ def pspline_iarpls(self, data, lam=1e3, num_knots=25, spline_degree=3, diff_orde # checking a scalar is faster; cannot use np.errstate since it is not 100% reliable warnings.warn( ('nan and/or +/- inf occurred in weighting calculation, likely meaning ' - '"tol" is too low and/or "max_iter" is too high'), ParameterWarning + '"tol" is too low and/or "max_iter" is too high'), ParameterWarning, + stacklevel=2 ) break elif calc_difference < tol: diff --git a/pybaselines/whittaker.py b/pybaselines/whittaker.py index 5090c41..b5fa7d2 100644 --- a/pybaselines/whittaker.py +++ b/pybaselines/whittaker.py @@ -264,7 +264,7 @@ def airpls(self, data, lam=1e6, diff_order=2, max_iter=50, tol=1e-3, weights=Non warnings.warn( ('error occurred during fitting, indicating that "tol"' ' is too low, "max_iter" is too high, or "lam" is too high'), - ParameterWarning + ParameterWarning, stacklevel=2 ) i -= 1 # reduce i so that output tol_history indexing is correct break @@ -278,7 +278,8 @@ def airpls(self, data, lam=1e6, diff_order=2, max_iter=50, tol=1e-3, weights=Non # point would get a weight of 0, which fails the solver warnings.warn( ('almost all baseline points are below the data, indicating that "tol"' - ' is too low and/or "max_iter" is too high'), ParameterWarning + ' is too low and/or "max_iter" is too high'), ParameterWarning, + stacklevel=2 ) i -= 1 # reduce i so that output tol_history indexing is correct break @@ -454,7 +455,8 @@ def drpls(self, data, lam=1e5, eta=0.5, max_iter=50, tol=1e-3, weights=None, dif # checking a scalar is faster; cannot use np.errstate since it is not 100% reliable warnings.warn( ('nan and/or +/- inf occurred in weighting calculation, likely meaning ' - '"tol" is too low and/or "max_iter" is too high'), ParameterWarning + '"tol" is too low and/or "max_iter" is too high'), ParameterWarning, + stacklevel=2 ) break elif calc_difference < tol: @@ -530,7 +532,8 @@ def iarpls(self, data, lam=1e5, diff_order=2, max_iter=50, tol=1e-3, weights=Non # checking a scalar is faster; cannot use np.errstate since it is not 100% reliable warnings.warn( ('nan and/or +/- inf occurred in weighting calculation, likely meaning ' - '"tol" is too low and/or "max_iter" is too high'), ParameterWarning + '"tol" is too low and/or "max_iter" is 
too high'), ParameterWarning, + stacklevel=2 ) break elif calc_difference < tol: diff --git a/pyproject.toml b/pyproject.toml index df0085b..64601ac 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,9 +42,8 @@ keywords = [ ] requires-python = ">=3.8" dependencies = [ - # lowest versions supported for python 3.8 - "numpy>=1.18", - "scipy>=1.5", + "numpy>=1.20", # lowest version to allow dtype for np.concatenate + "scipy>=1.5", # lowest versions supported for python 3.8 ] [project.urls] diff --git a/tests/conftest.py b/tests/conftest.py index e4ccf45..73a95dc 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -441,10 +441,10 @@ def test_output(self, additional_keys=None, **kwargs): # check all entries in output param dictionary for key in total_keys: if key not in output[1]: - assert False, f'key "{key}" missing from param dictionary' + raise AssertionError(f'key "{key}" missing from param dictionary') output[1].pop(key) if output[1]: - assert False, f'unchecked keys in param dictionary: {output[1]}' + raise AssertionError(f'unchecked keys in param dictionary: {output[1]}') def test_x_ordering(self, assertion_kwargs=None, **kwargs): """Ensures arrays are correctly sorted within the function.""" @@ -688,10 +688,10 @@ def test_output(self, additional_keys=None, **kwargs): # check all entries in output param dictionary for key in total_keys: if key not in output[1]: - assert False, f'key "{key}" missing from param dictionary' + raise AssertionError(f'key "{key}" missing from param dictionary') output[1].pop(key) if output[1]: - assert False, f'unchecked keys in param dictionary: {output[1]}' + raise AssertionError(f'unchecked keys in param dictionary: {output[1]}') def test_xz_ordering(self, assertion_kwargs=None, **kwargs): """Ensures arrays are correctly sorted within the function.""" diff --git a/tests/test_spline_utils.py b/tests/test_spline_utils.py index 884b48e..3f0c53e 100644 --- a/tests/test_spline_utils.py +++ b/tests/test_spline_utils.py @@ -422,7 +422,7 @@ def test_pspline_tck_none(data_fixture): assert pspline.coef is None with pytest.raises(ValueError): - pspline.tck + tck = pspline.tck def test_pspline_tck_readonly(data_fixture): diff --git a/tests/two_d/test_spline_utils.py b/tests/two_d/test_spline_utils.py index 9a974bb..0195717 100644 --- a/tests/two_d/test_spline_utils.py +++ b/tests/two_d/test_spline_utils.py @@ -263,7 +263,7 @@ def test_pspline_tck_none(data_fixture2d): assert pspline.coef is None with pytest.raises(ValueError): - pspline.tck + tck = pspline.tck def test_pspline_tck_readonly(data_fixture2d):
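
The face-splitting product that patch 55 factors out as _face_splitting can be illustrated standalone. The sketch below assumes only NumPy and SciPy; face_splitting is a hypothetical copy of the private helper, not part of the pybaselines API.

    import numpy as np
    from scipy.sparse import csr_matrix, kron

    def face_splitting(basis):
        # Row-wise Khatri-Rao (face-splitting) product of the basis with itself.
        ones = np.ones((1, basis.shape[1]))
        return kron(basis, ones).multiply(kron(ones, basis))

    basis = csr_matrix(np.arange(1.0, 7.0).reshape(3, 2))  # shape (3, 2)
    product = face_splitting(basis)  # shape (3, 4)

    # Each row of the product is the Kronecker product of the matching basis
    # row with itself, which is what allows the weighted normal equations to
    # be assembled without forming the full tensor-product basis.
    dense = basis.toarray()
    for row, basis_row in zip(product.toarray(), dense):
        assert np.allclose(row, np.kron(basis_row, basis_row))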
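
The eigendecomposition path of WhittakerSystem2D keeps only the num_eigens smallest eigenpairs of each one-dimensional penalty. A minimal sketch of that step, using the same SciPy call as the patch and the internal diff_penalty_diagonals helper that the new tests also import (so it assumes a checkout with this series applied); the sizes here are arbitrary.

    import numpy as np
    from scipy.linalg import eig_banded
    from pybaselines._banded_utils import diff_penalty_diagonals

    data_points, diff_order, num_eigens = 100, 2, 10
    # Lower bands of the banded difference penalty D.T @ D.
    penalty_bands = diff_penalty_diagonals(data_points, diff_order, lower_only=True)
    eigenvalues, eigenvectors = eig_banded(
        penalty_bands, lower=True, select='i', select_range=(0, num_eigens - 1)
    )
    # The lowest diff_order eigenvalues are zero in exact arithmetic but come
    # out near +/- 1e-15, so the patch resets them to exactly zero.
    eigenvalues[:diff_order] = 0

    assert eigenvectors.shape == (data_points, num_eigens)
    assert np.all(eigenvalues >= 0)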
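
The reference penalty that the new Whittaker tests build is the Kronecker sum of two one-dimensional difference penalties acting on the flattened data. A self-contained sketch of that construction and the weighted sparse solve the tests compare against; the shapes, lam values, and weights are arbitrary.

    import numpy as np
    from scipy.sparse import identity, kron
    from scipy.sparse.linalg import spsolve
    from pybaselines.utils import difference_matrix

    rows, cols = 20, 30
    lam_rows, lam_cols = 1e2, 1e3
    D1 = difference_matrix(rows, 2)
    D2 = difference_matrix(cols, 2)
    # Penalize differences along each dimension of the flattened (rows * cols) data.
    penalty = (
        lam_rows * kron(D1.T @ D1, identity(cols))
        + lam_cols * kron(identity(rows), D2.T @ D2)
    ).tocsr()

    y = np.random.default_rng(0).normal(1.0, 0.1, (rows, cols))
    weights = np.full(y.size, 0.5)
    penalty.setdiag(penalty.diagonal() + weights)
    baseline = spsolve(penalty, weights * y.ravel()).reshape(rows, cols)
    assert baseline.shape == y.shape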
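
Patch 56 threads stacklevel=2 through the library's warnings.warn calls so that each warning is attributed to the user's call site rather than a line inside pybaselines. A minimal, self-contained illustration; correct_baseline is hypothetical and not pybaselines code.

    import warnings

    def correct_baseline(data):
        # stacklevel=2 attributes the warning to the caller of correct_baseline,
        # which is where the problematic parameter value actually originates.
        warnings.warn('"tol" is too low', UserWarning, stacklevel=2)
        return data

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter('always')
        correct_baseline([1.0, 2.0, 3.0])  # the warning points at this line

    print(caught[0].filename, caught[0].lineno)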