MAINT: Use binning instead of direct truncation for custom_bc
Also finished the tests for custom_bc.
derb12 committed Jan 28, 2024
1 parent 0383117 commit 2978e0f
Showing 2 changed files with 136 additions and 44 deletions.
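The motivation for this change can be shown with a small, self-contained sketch (the synthetic data, variable names, and bin width below are illustrative only, not taken from the commit): direct truncation keeps single noisy samples at a strided set of indices, while binning averages each group of points before the fit, reducing the noise of every retained value by roughly the square root of the bin width.

import numpy as np

rng = np.random.default_rng(0)
x = np.linspace(0, 10, 1000)
y = np.exp(-0.2 * x) + rng.normal(0, 0.1, x.size)  # smooth curve plus noise

start, stop, step = 0, 1000, 25

# Direct truncation (old behavior): keep every step-th sample; each kept value
# still carries the full noise of a single measurement.
truncated_x = x[start:stop:step]
truncated_y = y[start:stop:step]

# Binning (new behavior): average x and y within bins of ~step points, which
# lowers the noise of each kept value by roughly sqrt(step).
num_bins = max((stop - start) // step, 1)
edges = np.linspace(start, stop, num_bins + 1, dtype=np.intp)
binned_x = np.array([x[i:j].mean() for i, j in zip(edges[:-1], edges[1:])])
binned_y = np.array([y[i:j].mean() for i, j in zip(edges[:-1], edges[1:])])

print(np.std(truncated_y - np.exp(-0.2 * truncated_x)))  # ~0.1
print(np.std(binned_y - np.exp(-0.2 * binned_x)))         # ~0.02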
116 changes: 79 additions & 37 deletions pybaselines/optimizers.py
@@ -489,27 +489,31 @@ def adaptive_minmax(self, data, poly_order=None, method='modpoly', weights=None,
return np.maximum.reduce(baselines), params

@_Algorithm._register
def custom_bc(self, data, method='asls', rois=(None, None), sampling=1, lam=None,
def custom_bc(self, data, method='asls', regions=((None, None),), sampling=1, lam=None,
diff_order=2, method_kwargs=None):
"""
Customized baseline correction for fine-tuned stiffness of the baseline at specific regions.
Divides the data into regions with a variable number of data points and then uses other
baseline algorithms to fit the truncated data. Regions with fewer points effectively
make the fitted baseline stiffer in those regions.
Parameters
----------
data : array-like, shape (N,)
The y-values of the measured data, with N data points.
method : str, optional
A string indicating the algorithm to use for fitting the baseline; can be any
non-optimizer algorithm in pybaselines. Default is 'asls'.
rois : array-like, shape (M, 2), optional
regions : array-like, shape (M, 2), optional
The two dimensional array containing the start and stop indices for each region of
interest. Each region is defined as ``data[start:stop]``. Default is (None, None),
interest. Each region is defined as ``data[start:stop]``. Default is ((None, None),),
which will use all points.
sampling : int or array-like, optional
The sampling step size for each region defined in `rois`. If `sampling` is an integer,
then all regions in `rois` will use the same index step size; if `sampling` is an
array-like, its length must be equal to `M`, the first dimension in `rois`. Default
is 1, which will use all points.
The sampling step size for each region defined in `regions`. If `sampling` is an
integer, then all regions will use the same index step size; if `sampling` is an
array-like, its length must be equal to `M`, the first dimension in `regions`.
Default is 1, which will use all points.
lam : float or None, optional
The value for smoothing the calculated interpolated baseline using Whittaker
smoothing, in order to reduce the kinks between regions. Default is None, which
@@ -537,14 +541,18 @@ def custom_bc(self, data, method='asls', rois=(None, None), sampling=1, lam=None
Raises
------
ValueError
Raised if `rois` is not two dimensional or if `sampling` is not the same length
as `rois.shape[0]`.
Raised if `regions` is not two dimensional, if `sampling` is not the same length
as `regions.shape[0]`, if any value in `sampling` is less than 1, if any value in
`regions` is negative, if segments in `regions` overlap, or if any value in
`regions` is greater than the
length of the input data.
Notes
-----
Uses Whittaker smoothing to smooth the transitions between regions rather than LOESS
as used in [31]_ since LOESS introduces additional kinks in the regions between the
smoothed and non-smoothed regions.
as used in [31]_.
Uses binning rather than direct truncation of the regions in order to get better
results for noisy data.
References
----------
@@ -557,31 +565,59 @@ def custom_bc(self, data, method='asls', rois=(None, None), sampling=1, lam=None
(classification, misc, morphological, polynomial, smooth, spline, whittaker),
method_kwargs, True
)
roi_array = np.atleast_2d(rois)
roi_shape = roi_array.shape
roi = np.atleast_2d(regions)
roi_shape = roi.shape
if len(roi_shape) != 2 or roi_shape[1] != 2:
raise ValueError('regions must be a two dimensional sequence of (start, stop) values')

steps = _check_scalar(sampling, roi_shape[0], fill_scalar=True, dtype=np.intp)[0]
if np.any(steps < 1):
raise ValueError('all step sizes in "sampling" must be >= 1')

# TODO maybe use binning (sum or average) to get a better signal to noise ratio rather
# than just indexing
# TODO should ensure steps are in increasing index order and do not overlap
x_sections = []
y_sections = []
x_mask = np.ones(self._len, dtype=bool)
for (start, stop), step in zip(roi_array, steps):
last_stop = -1
include_first = True
include_last = True
for (start, stop), step in zip(roi, steps):
if start is None:
start = 0
if stop is None:
stop = self._len
x_sections.append(np.arange(start, stop, step, dtype=np.intp))
if start < last_stop:
raise ValueError('Sections cannot overlap')
else:
last_stop = stop
if start < 0 or stop < 0:
raise ValueError('values in regions must be positive')
elif stop > self._len:
raise ValueError('values in regions must be less than len(data)')

sections = (stop - start) // step
if sections == 0:
sections = 1 # will create one section using the midpoint
indices = np.linspace(start, stop, sections + 1, dtype=np.intp)
for left_idx, right_idx in zip(indices[:-1], indices[1:]):
if left_idx == 0 and right_idx == 1:
include_first = False
elif right_idx == self._len and left_idx == self._len - 1:
include_last = False
y_sections.append(np.mean(y[left_idx:right_idx]))
x_sections.append(np.mean(self.x[left_idx:right_idx]))
x_mask[start:stop] = False
x_sections.append(np.arange(self._len, dtype=np.intp)[x_mask])
indices = np.unique(np.clip(np.concatenate(x_sections), 0, self._len - 1))
y_fit = y[indices]
x_fit = self.x[indices]

# ensure first and last indices are included in the fit to avoid edge effects
if include_first:
x_mask[0] = True
if include_last:
x_mask[-1] = True
x_sections.extend(self.x[x_mask])
y_sections.extend(y[x_mask])
x_fit = np.array(x_sections)
sort_order = np.argsort(x_fit, kind='mergesort')
x_fit = x_fit[sort_order]
y_fit = np.array(y_sections)[sort_order]

# param sorting will be wrong, but most params that need sorting will have
# no meaning since they correspond to a truncated dataset
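For reference, a rough standalone sketch of the binning step performed in the loop above, assuming plain x/y arrays and hypothetical names in place of the class attributes (``self.x``, ``self._len``); the real method additionally handles the point mask, overlap checks, and edge points.

import numpy as np

def bin_region(x, y, start, stop, step):
    # Split x[start:stop] into (stop - start) // step evenly spaced bins
    # (at least one) and average x and y within each bin.
    sections = max((stop - start) // step, 1)
    edges = np.linspace(start, stop, sections + 1, dtype=np.intp)
    x_binned = np.array([x[i:j].mean() for i, j in zip(edges[:-1], edges[1:])])
    y_binned = np.array([y[i:j].mean() for i, j in zip(edges[:-1], edges[1:])])
    return x_binned, y_binned

x = np.linspace(-1, 1, 100)
y = x**2
x_b, y_b = bin_region(x, y, 10, 60, 7)  # 7 bins spanning indices 10 through 60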
@@ -889,29 +925,31 @@ def adaptive_minmax(data, x_data=None, poly_order=None, method='modpoly',


@_optimizers_wrapper
def custom_bc(data, x_data=None, method='asls', rois=(None, None), sampling=1, lam=None,
def custom_bc(data, x_data=None, method='asls', regions=((None, None),), sampling=1, lam=None,
diff_order=2, method_kwargs=None):
"""
Customized baseline correction for fine-tuned stiffness of the baseline at specific regions.
Divides the data into regions with a variable number of data points and then uses other
baseline algorithms to fit the truncated data. Regions with fewer points effectively
make the fitted baseline stiffer in those regions.
Parameters
----------
data : array-like, shape (N,)
The y-values of the measured data, with N data points.
x_data : array-like, shape (N,), optional
The x-values of the measured data. Default is None, which will create an
array from -1 to 1 with N points.
method : str, optional
[description]. Default is 'asls'.
rois : array-like, shape (M, 2), optional
A string indicating the algorithm to use for fitting the baseline; can be any
non-optimizer algorithm in pybaselines. Default is 'asls'.
regions : array-like, shape (M, 2), optional
The two dimensional array containing the start and stop indices for each region of
interest. Each region is defined as ``data[start:stop]``. Default is (None, None),
interest. Each region is defined as ``data[start:stop]``. Default is ((None, None),),
which will use all points.
sampling : int or array-like, optional
The sampling step size for each region defined in `rois`. If `sampling` is an integer,
then all regions in `rois` will use the same index step size; if `sampling` is an
array-like, its length must be equal to `M`, the first dimension in `rois`. Default
is 1, which will use all points.
The sampling step size for each region defined in `regions`. If `sampling` is an
integer, then all regions will use the same index step size; if `sampling` is an
array-like, its length must be equal to `M`, the first dimension in `regions`.
Default is 1, which will use all points.
lam : float or None, optional
The value for smoothing the calculated interpolated baseline using Whittaker
smoothing, in order to reduce the kinks between regions. Default is None, which
@@ -939,14 +977,18 @@ def custom_bc(data, x_data=None, method='asls', rois=(None, None), sampling=1, l
Raises
------
ValueError
Raised if `rois` is not two dimensional or if `sampling` is not the same length
as `rois.shape[0]`.
Raised if `regions` is not two dimensional, if `sampling` is not the same length
as `regions.shape[0]`, if any value in `sampling` is less than 1, if any value in
`regions` is negative, if segments in `regions` overlap, or if any value in
`regions` is greater than the
length of the input data.
Notes
-----
Uses Whittaker smoothing to smooth the transitions between regions rather than LOESS
as used in [4]_ since LOESS introduces additional kinks in the regions between the
smoothed and non-smoothed regions.
as used in [4]_.
Uses binning rather than direct truncation of the regions in order to get better
results for noisy data.
References
----------
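A hedged usage sketch of the renamed functional API, based only on the signature and docstring in this diff (the synthetic data, region bounds, and keyword values are illustrative assumptions, not from the commit): the region past index 800 is binned with a step of 20 so the baseline is stiffer there, the rest of the data is kept at full resolution, and the joined result is lightly smoothed via `lam`.

import numpy as np
from pybaselines import optimizers

rng = np.random.default_rng(1)
x = np.arange(1000.)
y = (np.exp(-0.005 * (x - 400)**2)      # a peak
     + 2 + 1e-3 * x                     # a sloped baseline
     + rng.normal(0, 0.05, x.size))     # noise

baseline, params = optimizers.custom_bc(
    y, x, method='asls', regions=((800, None),), sampling=20, lam=1e3,
    method_kwargs={'lam': 1e6}
)
corrected = y - baseline
# params['x_fit'] and params['y_fit'] hold the binned data that was actually fit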
64 changes: 57 additions & 7 deletions tests/test_optimizers.py
@@ -10,7 +10,7 @@
from numpy.testing import assert_allclose, assert_array_equal
import pytest

from pybaselines import optimizers, polynomial, utils
from pybaselines import optimizers, polynomial, utils, Baseline

from .conftest import BaseTester, InputWeightsMixin

@@ -378,15 +378,65 @@ def test_x_ordering(self):
super().test_x_ordering(assertion_kwargs={'rtol': 1e-6})

@pytest.mark.parametrize('lam', (None, 1))
def test_lam_inputs(self, lam):
def test_output_smoothing(self, lam):
"""Ensures the smoothing is done properly if specified."""
diff_order = 2
output = self.class_func(self.y, lam=lam, diff_order=diff_order)[0]
output, params = self.class_func(self.y, method='asls', lam=lam, diff_order=diff_order)

truncated_baseline = Baseline(params['x_fit']).asls(params['y_fit'])[0]
expected_baseline = np.interp(self.x, params['x_fit'], truncated_baseline)
if lam is not None:
non_smooth_output = self.class_func(self.y, lam=None, diff_order=diff_order)[0]
smoothed_output = utils.whittaker_smooth(
non_smooth_output, lam=lam, diff_order=diff_order
expected_baseline = utils.whittaker_smooth(
expected_baseline, lam=lam, diff_order=diff_order
)

assert_allclose(output, smoothed_output, rtol=1e-8, atol=1e-8)
assert_allclose(output, expected_baseline, rtol=1e-8, atol=1e-8)

@pytest.mark.parametrize('roi_and_samplings', (
[((None, None),), 5],
[((None, None),), 1],
[((None, None),), 10000000],
[((0, 20), (20, 30)), (3, 2)],
[((0, 1), (20, 30)), (3, 2)],
[((0, 20), (20, 30)), (33,)],
[((0, 20), (20, 30), (30, None)), (33, 5, 50)],
))
def test_unique_x(self, roi_and_samplings):
"""Ensures the fit uses only unique values and that x and y match dimensions."""
regions, sampling = roi_and_samplings
output, params = self.class_func(self.y, regions=regions, sampling=sampling)

assert_allclose(params['x_fit'], np.unique(params['x_fit']), rtol=1e-12, atol=1e-14)
assert params['x_fit'].shape == params['y_fit'].shape
assert len(params['x_fit']) > 2 # should at least include first, middle, and last values

def test_roi_sampling_mixmatch_fails(self):
"""Ensures an exception is raised if regions and sampling do not have the same shape."""
with pytest.raises(ValueError):
self.class_func(self.y, regions=((None, None),), sampling=[1, 2])
with pytest.raises(ValueError):
self.class_func(self.y, regions=((None, 10), (20, 30), (30, 40)), sampling=[1, 2])

@pytest.mark.parametrize('sampling', (-1, [-1], [5, -5]))
def test_negative_sampling_fails(self, sampling):
"""Ensures an exception is raised if sampling is negative."""
if isinstance(sampling, int):
num_samplings = 1
else:
num_samplings = len(sampling)
regions = []
for i in range(num_samplings):
regions.append([i * 10, (i + 1) * 10])
with pytest.raises(ValueError):
self.class_func(self.y, regions=regions, sampling=sampling)

@pytest.mark.parametrize('regions', (((-1, 5),), ((0, 10), (20, -30)), ((0, 10000),)))
def test_bad_region_values_fails(self, regions):
"""Ensures an exception is raised if regions has a negative value or a too large value."""
with pytest.raises(ValueError):
self.class_func(self.y, regions=regions)

def test_overlapping_regions_fails(self):
"""Ensures an exception is raised if regions overlap."""
with pytest.raises(ValueError):
self.class_func(self.y, regions=((0, 10), (9, 13)))
