From 2978e0fadad8202f0fca4b9be43a5187d5160a11 Mon Sep 17 00:00:00 2001
From: Donnie Erb <55961724+derb12@users.noreply.github.com>
Date: Sat, 27 Jan 2024 19:33:29 -0500
Subject: [PATCH] MAINT: Use binning instead of direct truncation for custom_bc

Also finished the tests for custom_bc.
---
 pybaselines/optimizers.py | 116 ++++++++++++++++++++++++++------------
 tests/test_optimizers.py  |  64 ++++++++++++++++++---
 2 files changed, 136 insertions(+), 44 deletions(-)

diff --git a/pybaselines/optimizers.py b/pybaselines/optimizers.py
index f8604a8..d5e3c5c 100644
--- a/pybaselines/optimizers.py
+++ b/pybaselines/optimizers.py
@@ -489,11 +489,15 @@ def adaptive_minmax(self, data, poly_order=None, method='modpoly', weights=None,
         return np.maximum.reduce(baselines), params
 
     @_Algorithm._register
-    def custom_bc(self, data, method='asls', rois=(None, None), sampling=1, lam=None,
+    def custom_bc(self, data, method='asls', regions=((None, None),), sampling=1, lam=None,
                   diff_order=2, method_kwargs=None):
         """
         Customized baseline correction for fine tuned stiffness of the baseline at specific regions.
 
+        Divides the data into regions with a variable number of data points and then uses
+        other baseline algorithms to fit the truncated data. Regions with fewer points
+        effectively make the fitted baseline stiffer in those regions.
+
         Parameters
         ----------
         data : array-like, shape (N,)
@@ -501,15 +505,15 @@ def custom_bc(self, data, method='asls', rois=(None, None), sampling=1, lam=None
         method : str, optional
             A string indicating the algorithm to use for fitting the baseline; can be any
            non-optimizer algorithm in pybaselines. Default is 'asls'.
-        rois : array-like, shape (M, 2), optional
+        regions : array-like, shape (M, 2), optional
            The two dimensional array containing the start and stop indices for each region of
-            interest. Each region is defined as ``data[start:stop]``. Default is (None, None),
+            interest. Each region is defined as ``data[start:stop]``. Default is ((None, None),),
            which will use all points.
         sampling : int or array-like, optional
-            The sampling step size for each region defined in `rois`. If `sampling` is an integer,
-            then all regions in `rois` will use the same index step size; if `sampling` is an
-            array-like, its length must be equal to `M`, the first dimension in `rois`. Default
-            is 1, which will use all points.
+            The sampling step size for each region defined in `regions`. If `sampling` is an
+            integer, then all regions will use the same index step size; if `sampling` is an
+            array-like, its length must be equal to `M`, the first dimension in `regions`.
+            Default is 1, which will use all points.
         lam : float or None, optional
            The value for smoothing the calculated interpolated baseline using Whittaker
            smoothing, in order to reduce the kinks between regions. Default is None, which
@@ -537,14 +541,18 @@ def custom_bc(self, data, method='asls', rois=(None, None), sampling=1, lam=None
         Raises
         ------
         ValueError
-            Raised if `rois` is not two dimensional or if `sampling` is not the same length
-            as `rois.shape[0]`.
+            Raised if `regions` is not two dimensional, if `sampling` is not the same length
+            as `regions.shape[0]`, if any value in `sampling` or `regions` is less than 1, if
+            segments in `regions` overlap, or if any value in `regions` is greater than the
+            length of the input data.
 
         Notes
         -----
         Uses Whittaker smoothing to smooth the transitions between regions rather than LOESS
-        as used in [31]_ since LOESS introduces additional kinks in the regions between the
-        smoothed and non-smoothed regions.
+        as used in [31]_.
+
+        Uses binning rather than direct truncation of the regions in order to get better
+        results for noisy data.
 
         References
         ----------
@@ -557,8 +565,8 @@ def custom_bc(self, data, method='asls', rois=(None, None), sampling=1, lam=None
             (classification, misc, morphological, polynomial, smooth, spline, whittaker),
             method_kwargs, True
         )
-        roi_array = np.atleast_2d(rois)
-        roi_shape = roi_array.shape
+        roi = np.atleast_2d(regions)
+        roi_shape = roi.shape
         if len(roi_shape) != 2 or roi_shape[1] != 2:
             raise ValueError('rois must be a two dimensional sequence of (start, stop) values')
 
@@ -566,22 +574,50 @@ def custom_bc(self, data, method='asls', rois=(None, None), sampling=1, lam=None
         if np.any(steps < 1):
             raise ValueError('all step sizes in "sampling" must be >= 1')
 
-        # TODO maybe use binning (sum or average) to get a better signal to noise ratio rather
-        # than just indexing
-        # TODO should ensure steps are in increasing index order and do not overlap
         x_sections = []
+        y_sections = []
         x_mask = np.ones(self._len, dtype=bool)
-        for (start, stop), step in zip(roi_array, steps):
+        last_stop = -1
+        include_first = True
+        include_last = True
+        for (start, stop), step in zip(roi, steps):
             if start is None:
                 start = 0
             if stop is None:
                 stop = self._len
-            x_sections.append(np.arange(start, stop, step, dtype=np.intp))
+            if start < last_stop:
+                raise ValueError('Sections cannot overlap')
+            else:
+                last_stop = stop
+            if start < 0 or stop < 0:
+                raise ValueError('values in regions must be positive')
+            elif stop > self._len:
+                raise ValueError('values in regions must be less than len(data)')
+
+            sections = (stop - start) // step
+            if sections == 0:
+                sections = 1  # will create one section using the midpoint
+            indices = np.linspace(start, stop, sections + 1, dtype=np.intp)
+            for left_idx, right_idx in zip(indices[:-1], indices[1:]):
+                if left_idx == 0 and right_idx == 1:
+                    include_first = False
+                elif right_idx == self._len and left_idx == self._len - 1:
+                    include_last = False
+                y_sections.append(np.mean(y[left_idx:right_idx]))
+                x_sections.append(np.mean(self.x[left_idx:right_idx]))
             x_mask[start:stop] = False
-        x_sections.append(np.arange(self._len, dtype=np.intp)[x_mask])
-        indices = np.unique(np.clip(np.concatenate(x_sections), 0, self._len - 1))
-        y_fit = y[indices]
-        x_fit = self.x[indices]
+
+        # ensure first and last indices are included in the fit to avoid edge effects
+        if include_first:
+            x_mask[0] = True
+        if include_last:
+            x_mask[-1] = True
+        x_sections.extend(self.x[x_mask])
+        y_sections.extend(y[x_mask])
+        x_fit = np.array(x_sections)
+        sort_order = np.argsort(x_fit, kind='mergesort')
+        x_fit = x_fit[sort_order]
+        y_fit = np.array(y_sections)[sort_order]
 
         # param sorting will be wrong, but most params that need sorting will have
         # no meaning since they correspond to a truncated dataset
@@ -889,29 +925,31 @@ def adaptive_minmax(data, x_data=None, poly_order=None, method='modpoly',
 
 
 @_optimizers_wrapper
-def custom_bc(data, x_data=None, method='asls', rois=(None, None), sampling=1, lam=None,
+def custom_bc(data, x_data=None, method='asls', regions=((None, None),), sampling=1, lam=None,
               diff_order=2, method_kwargs=None):
     """
     Customized baseline correction for fine tuned stiffness of the baseline at specific regions.
 
+    Divides the data into regions with a variable number of data points and then uses
+    other baseline algorithms to fit the truncated data. Regions with fewer points
+    effectively make the fitted baseline stiffer in those regions.
+
     Parameters
     ----------
     data : array-like, shape (N,)
        The y-values of the measured data, with N data points.
     x_data : array-like, shape (N,), optional
        The x-values of the measured data. Default is None, which will create an
        array from -1 to 1 with N points.
     method : str, optional
-        [description]. Default is 'asls'.
-    rois : array-like, shape (M, 2), optional
+        A string indicating the algorithm to use for fitting the baseline; can be any
+        non-optimizer algorithm in pybaselines. Default is 'asls'.
+    regions : array-like, shape (M, 2), optional
        The two dimensional array containing the start and stop indices for each region of
-        interest. Each region is defined as ``data[start:stop]``. Default is (None, None),
+        interest. Each region is defined as ``data[start:stop]``. Default is ((None, None),),
        which will use all points.
     sampling : int or array-like, optional
-        The sampling step size for each region defined in `rois`. If `sampling` is an integer,
-        then all regions in `rois` will use the same index step size; if `sampling` is an
-        array-like, its length must be equal to `M`, the first dimension in `rois`. Default
-        is 1, which will use all points.
+        The sampling step size for each region defined in `regions`. If `sampling` is an
+        integer, then all regions will use the same index step size; if `sampling` is an
+        array-like, its length must be equal to `M`, the first dimension in `regions`.
+        Default is 1, which will use all points.
     lam : float or None, optional
        The value for smoothing the calculated interpolated baseline using Whittaker
        smoothing, in order to reduce the kinks between regions. Default is None, which
@@ -939,14 +977,18 @@ def custom_bc(data, x_data=None, method='asls', rois=(None, None), sampling=1, l
     Raises
     ------
     ValueError
-        Raised if `rois` is not two dimensional or if `sampling` is not the same length
-        as `rois.shape[0]`.
+        Raised if `regions` is not two dimensional, if `sampling` is not the same length
+        as `regions.shape[0]`, if any value in `sampling` or `regions` is less than 1, if
+        segments in `regions` overlap, or if any value in `regions` is greater than the
+        length of the input data.
 
     Notes
     -----
     Uses Whittaker smoothing to smooth the transitions between regions rather than LOESS
-    as used in [4]_ since LOESS introduces additional kinks in the regions between the
-    smoothed and non-smoothed regions.
+    as used in [4]_.
+
+    Uses binning rather than direct truncation of the regions in order to get better
+    results for noisy data.
 
     References
     ----------
diff --git a/tests/test_optimizers.py b/tests/test_optimizers.py
index b31b051..6e54528 100644
--- a/tests/test_optimizers.py
+++ b/tests/test_optimizers.py
@@ -10,7 +10,7 @@
 from numpy.testing import assert_allclose, assert_array_equal
 import pytest
 
-from pybaselines import optimizers, polynomial, utils
+from pybaselines import optimizers, polynomial, utils, Baseline
 
 from .conftest import BaseTester, InputWeightsMixin
 
@@ -378,15 +378,65 @@ def test_x_ordering(self):
         super().test_x_ordering(assertion_kwargs={'rtol': 1e-6})
 
     @pytest.mark.parametrize('lam', (None, 1))
-    def test_lam_inputs(self, lam):
+    def test_output_smoothing(self, lam):
         """Ensures the smoothing is done properly if specified."""
         diff_order = 2
-        output = self.class_func(self.y, lam=lam, diff_order=diff_order)[0]
+        output, params = self.class_func(self.y, method='asls', lam=lam, diff_order=diff_order)
+        truncated_baseline = Baseline(params['x_fit']).asls(params['y_fit'])[0]
+        expected_baseline = np.interp(self.x, params['x_fit'], truncated_baseline)
         if lam is not None:
-            non_smooth_output = self.class_func(self.y, lam=None, diff_order=diff_order)[0]
-            smoothed_output = utils.whittaker_smooth(
-                non_smooth_output, lam=lam, diff_order=diff_order
+            expected_baseline = utils.whittaker_smooth(
+                expected_baseline, lam=lam, diff_order=diff_order
             )
-            assert_allclose(output, smoothed_output, rtol=1e-8, atol=1e-8)
+        assert_allclose(output, expected_baseline, rtol=1e-8, atol=1e-8)
+
+    @pytest.mark.parametrize('roi_and_samplings', (
+        [((None, None),), 5],
+        [((None, None),), 1],
+        [((None, None),), 10000000],
+        [((0, 20), (20, 30)), (3, 2)],
+        [((0, 1), (20, 30)), (3, 2)],
+        [((0, 20), (20, 30)), (33,)],
+        [((0, 20), (20, 30), (30, None)), (33, 5, 50)],
+    ))
+    def test_unique_x(self, roi_and_samplings):
+        """Ensures the fit uses only unique values and that x and y match dimensions."""
+        regions, sampling = roi_and_samplings
+        output, params = self.class_func(self.y, regions=regions, sampling=sampling)
+
+        assert_allclose(params['x_fit'], np.unique(params['x_fit']), rtol=1e-12, atol=1e-14)
+        assert params['x_fit'].shape == params['y_fit'].shape
+        assert len(params['x_fit']) > 2  # should at least include first, middle, and last values
+
+    def test_roi_sampling_mismatch_fails(self):
+        """Ensures an exception is raised if regions and sampling do not have the same shape."""
+        with pytest.raises(ValueError):
+            self.class_func(self.y, regions=((None, None),), sampling=[1, 2])
+        with pytest.raises(ValueError):
+            self.class_func(self.y, regions=((None, 10), (20, 30), (30, 40)), sampling=[1, 2])
+
+    @pytest.mark.parametrize('sampling', (-1, [-1], [5, -5]))
+    def test_negative_sampling_fails(self, sampling):
+        """Ensures an exception is raised if sampling is negative."""
+        if isinstance(sampling, int):
+            num_samplings = 1
+        else:
+            num_samplings = len(sampling)
+        regions = []
+        for i in range(num_samplings):
+            regions.append([i * 10, (i + 1) * 10])
+        with pytest.raises(ValueError):
+            self.class_func(self.y, regions=regions, sampling=sampling)
+
+    @pytest.mark.parametrize('regions', (((-1, 5),), ((0, 10), (20, -30)), ((0, 10000),)))
+    def test_bad_region_values_fails(self, regions):
+        """Ensures an exception is raised if regions has a negative or too large value."""
+        with pytest.raises(ValueError):
+            self.class_func(self.y, regions=regions)
+
+    def test_overlapping_regions_fails(self):
+        """Ensures an exception is raised if regions overlap."""
+        with pytest.raises(ValueError):
+            self.class_func(self.y, regions=((0, 10), (9, 13)))
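
The short standalone sketch below is not part of the patch. It only illustrates the rationale stated in the commit subject and the new Notes entries: averaging the points inside each bin, as the added np.linspace-based loop does, lowers the noise of the truncated data handed to the underlying fit, while the old every-step-th-point truncation keeps the full noise level. The array length, bin width, and noise level are arbitrary values chosen for the demonstration.

import numpy as np

rng = np.random.default_rng(1)
y = 5.0 + rng.normal(0.0, 1.0, 1000)  # a flat "baseline" plus unit-variance noise
step = 25

# old approach: direct truncation, keep every 25th point
truncated = y[::step]

# new approach: bin edges via linspace, then average each bin (mirrors the patched loop)
edges = np.linspace(0, y.size, y.size // step + 1, dtype=np.intp)
binned = np.array([y[lo:hi].mean() for lo, hi in zip(edges[:-1], edges[1:])])

print(round(truncated.std(), 2))  # ~1.0, noise is unchanged by truncation
print(round(binned.std(), 2))     # ~0.2, roughly 1/sqrt(25) of the original noise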
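
For reference, a minimal usage sketch of the reworked keyword arguments follows; it is likewise not part of the patch. It assumes a pybaselines build that contains this change, and the synthetic signal, region boundaries, sampling steps, and lam values are illustrative assumptions rather than recommended settings.

import numpy as np
from pybaselines import Baseline

# synthetic spectrum: two Gaussian peaks on a curved baseline with added noise
x = np.linspace(0, 1000, 1000)
peaks = (
    100 * np.exp(-0.5 * ((x - 300) / 10) ** 2)
    + 80 * np.exp(-0.5 * ((x - 700) / 15) ** 2)
)
true_baseline = 10 + 0.02 * x + 5e-5 * (x - 500) ** 2
y = peaks + true_baseline + np.random.default_rng(0).normal(0.0, 1.0, x.size)

fitter = Baseline(x)
# keep full resolution below index 400 and bin every 25 points above it, which makes
# the fitted baseline stiffer in the sparsely sampled region; the outer `lam` then
# applies Whittaker smoothing to the interpolated result to reduce kinks between regions
baseline, params = fitter.custom_bc(
    y, method='asls', regions=((None, 400), (400, None)), sampling=(1, 25),
    lam=1e2, method_kwargs={'lam': 1e6}
)

# params['x_fit'] and params['y_fit'] hold the binned data that was actually fit,
# as exercised by the new tests in this patch
print(len(params['x_fit']), len(params['y_fit']), baseline.shape)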