diff --git a/.bumpversion.cfg b/.bumpversion.cfg deleted file mode 100644 index 7b002d2..0000000 --- a/.bumpversion.cfg +++ /dev/null @@ -1,24 +0,0 @@ -[bumpversion] -current_version = 1.0.0 -commit = False -tag = False - -[bumpversion:file:setup.cfg] -search = version = {current_version} -replace = version = {new_version} - -[bumpversion:file:pybaselines/__init__.py] -search = __version__ = '{current_version}' -replace = __version__ = '{new_version}' - -[bumpversion:file:docs/conf.py] -search = version = '{current_version}' -replace = version = '{new_version}' - -[bumpversion:file:CITATION.cff] -search = version: {current_version} -replace = version: {new_version} - -[bumpversion:file:docs/citing.rst] -search = version = {{{current_version}}} -replace = version = {{{new_version}}} diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 992bd03..fb02457 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -5,9 +5,8 @@ are required and/or what problems they fix. Link or add the issue number for any existing issues that the pull request solves. -Note that unsolicited pull requests will most likely be closed. -Please file an issue first, so that details can be discussed/finalized -before a pull request is created.--> +Note that it is preferred to file an issue first, so that details can be +discussed/finalized before a pull request is created.--> ### Type of Pull Request @@ -24,9 +23,9 @@ before a pull request is created.--> To run tests locally, type the following command within the pybaselines directory: pytest . -To lint files using flake8 to see if they pass PEP 8 standards and that +To lint files using ruff to see if they pass PEP 8 standards and that docstrings are okay, run the following command within the pybaselines -directory: flake8 . +directory: ruff check . To build documentation locally, type the following command within the docs directory: make html diff --git a/.github/workflows/python-test-latest.yml b/.github/workflows/python-test-latest.yml new file mode 100644 index 0000000..d62bcee --- /dev/null +++ b/.github/workflows/python-test-latest.yml @@ -0,0 +1,54 @@ +# For testing the nightly builds of numpy and scipy so that any new changes will not be +# a surprise. + +# Will only trigger if there is a change within pybaselines or tests directories. 
+ +name: test-latest-dependencies + +on: + # allow manually activating the workflow + workflow_dispatch: + + push: + branches: [ main ] + paths: + - 'pybaselines/**' + - 'tests/**' + - '.github/workflows/**' + + pull_request: + # always trigger on a pull request, regardless of the branch + paths: + - 'pybaselines/**' + - 'tests/**' + - '.github/workflows/**' + +jobs: + test-nightly: + + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + # Choose the latest stable python version + python-version: ['3.12'] + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install required dependencies + run: | + python -m pip install --upgrade pip + python -m pip install pytest + python -m pip install -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy + python -m pip install -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple scipy + + - name: Test with required dependencies + # use -Werror so that any warnings will show up as errors -> want to be as stringent + # as possible + run: python -Werror -m pytest . diff --git a/.github/workflows/python-test.yml b/.github/workflows/python-test.yml index 6b7ffcd..4120553 100644 --- a/.github/workflows/python-test.yml +++ b/.github/workflows/python-test.yml @@ -26,87 +26,88 @@ on: jobs: test: - runs-on: ubuntu-20.04 + runs-on: ubuntu-latest strategy: fail-fast: false matrix: # Use strings since yaml considers 3.10 equal to 3.1 - python-version: ['3.6', '3.7', '3.8', '3.9', '3.10', '3.11', '3.12'] + python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install required dependencies run: | python -m pip install --upgrade pip - python -m pip install "numpy>=1.14" "scipy>=1.0" pytest - - # Only lint a single version; pick a recent, stable version - - name: Install linting dependencies - id: install-linters - if: matrix.python-version == '3.10' - run: | - python -m pip install flake8 flake8-comprehensions flake8-docstrings - - - name: Lint - if: steps.install-linters.outcome == 'success' - run: | - # stop the build if there are Python syntax errors or undefined names - flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics - # exit-zero treats all errors as warnings. - flake8 . --count --exit-zero --statistics + python -m pip install "numpy>=1.20" "scipy>=1.5" pytest - name: Test with required dependencies run: pytest . - name: Install optional dependencies id: install-optional - # uncomment below in case this step ever needs skipped again - if: matrix.python-version != '3.12' - run: python -m pip install "pentapy>=1.0" "numba>=0.45" + # uncomment below to allow skipping future versions + #if: matrix.python-version != '3.13' + run: python -m pip install "pentapy>=1.0" "numba>=0.49" - name: Test with optional dependencies - # uncomment below in case this step ever needs skipped again if: steps.install-optional.outcome == 'success' run: pytest . 
test-min-dependencies: - runs-on: ubuntu-20.04 + runs-on: ubuntu-latest strategy: fail-fast: false matrix: - python-version: ['3.6'] + python-version: ['3.8'] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install minimum dependencies - # Use numpy 1.14.5 rather than 1.14.0 since the optional - # dependency pentapy requires numpy>=1.14.5; no relevant difference - # between 1.14.0 and 1.14.5. run: | python -m pip install --upgrade pip - python -m pip install numpy==1.14.5 scipy==1.0 pytest + python -m pip install numpy==1.20 scipy==1.5 pytest - name: Test with minimum required dependencies run: pytest . - name: Install minimum optional dependencies - # Have to pin llvmlite to 0.30.0 since it otherwise gets a more recent - # version that is imcompatible with numba v0.45 - run: python -m pip install pentapy==1.0 numba==0.45 llvmlite==0.30.0 + run: python -m pip install pentapy==1.0 numba==0.49 - name: Test with minimum optional dependencies run: pytest . + + lint: + + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ['3.11'] + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install linting dependencies + run: python -m pip install ruff + + - name: Lint + run: ruff check . --show-source diff --git a/.readthedocs.yaml b/.readthedocs.yaml index ae4b173..1433d70 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -8,7 +8,7 @@ version: 2 build: os: ubuntu-22.04 tools: - python: "3.9" + python: "3.11" # Path to sphinx's configuration file sphinx: diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index e126a92..0000000 --- a/MANIFEST.in +++ /dev/null @@ -1,17 +0,0 @@ -include LICENSE.txt -include README.rst -include LICENSES_bundled.txt -include CHANGELOG.rst - -exclude .* - -recursive-exclude .git * -recursive-exclude .github * -recursive-exclude .pytest_cache * -recursive-exclude docs * -recursive-exclude examples * -recursive-exclude tests * -recursive-exclude requirements * -recursive-exclude tools * -recursive-exclude * __pycache__ -recursive-exclude * *.py[cod] \ No newline at end of file diff --git a/docs/Makefile b/docs/Makefile index dcf792f..9410b27 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -4,7 +4,6 @@ # You can set these variables from the command line. SPHINXOPTS = SPHINXBUILD = python -msphinx -SPHINXPROJ = mcetl SOURCEDIR = . BUILDDIR = _build diff --git a/docs/_templates/autoapi/python/package.rst b/docs/_templates/autoapi/python/package.rst index 592ccd0..29155bc 100644 --- a/docs/_templates/autoapi/python/package.rst +++ b/docs/_templates/autoapi/python/package.rst @@ -14,22 +14,6 @@ {% endif %} -{% block subpackages %} -{% set visible_subpackages = obj.subpackages|selectattr("display")|list %} -{% if visible_subpackages %} -Subpackages ------------ -.. 
toctree:: - :titlesonly: - :maxdepth: 3 - -{% for subpackage in visible_subpackages %} - {{ subpackage.short_name }}/index.rst -{% endfor %} - - -{% endif %} -{% endblock %} {% block submodules %} {% set visible_submodules = obj.submodules|selectattr("display")|list %} {% if visible_submodules %} @@ -47,6 +31,24 @@ Submodules {% endif %} {% endblock %} + +{% block subpackages %} +{% set visible_subpackages = obj.subpackages|selectattr("display")|list %} +{% if visible_subpackages %} +Subpackages +----------- +.. toctree:: + :titlesonly: + :maxdepth: 3 + +{% for subpackage in visible_subpackages %} + {{ subpackage.short_name }}/index.rst +{% endfor %} + + +{% endif %} +{% endblock %} + {% block content %} {% if obj.all is not none %} {% set visible_children = obj.children|selectattr("short_name", "in", obj.all)|list %} diff --git a/docs/algorithms/classification.rst b/docs/algorithms/classification.rst index 2de976f..da53dc7 100644 --- a/docs/algorithms/classification.rst +++ b/docs/algorithms/classification.rst @@ -65,7 +65,7 @@ Algorithms dietrich (Dietrich's Classification Method) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.dietrich` calculates the power spectrum of the data as the squared derivative +:meth:`~.Baseline.dietrich` calculates the power spectrum of the data as the squared derivative of the data. Then baseline points are identified by iteratively removing points where the mean of the power spectrum is less than a multiple of the standard deviation of the power spectrum. The baseline is created by first interpolating through all baseline @@ -197,7 +197,7 @@ points, and then iteratively fitting a polynomial to the interpolated baseline. golotvin (Golotvin's Classification Method) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.golotvin` divides the data into sections and takes the minimum standard +:meth:`~.Baseline.golotvin` divides the data into sections and takes the minimum standard deviation of all the sections as the noise's standard deviation for the entire data. It then classifies any point where the rolling max minus min is less than a multiple of the noise's standard deviation as belonging to the baseline. @@ -224,7 +224,7 @@ the noise's standard deviation as belonging to the baseline. std_distribution (Standard Deviation Distribution) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.std_distribution` identifies baseline segments by analyzing the rolling +:meth:`~.Baseline.std_distribution` identifies baseline segments by analyzing the rolling standard deviation distribution. The rolling standard deviations are split into two distributions, with the smaller distribution assigned to noise. Baseline points are then identified as any point where the rolling standard deviation is less than a multiple @@ -253,8 +253,8 @@ of the median of the noise's standard deviation distribution. fastchrom (FastChrom's Baseline Method) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.fastchrom` identifies baseline segments by analyzing the rolling standard -deviation distribution, similar to :meth:`.std_distribution`. Baseline points are +:meth:`~.Baseline.fastchrom` identifies baseline segments by analyzing the rolling standard +deviation distribution, similar to :meth:`~.Baseline.std_distribution`. Baseline points are identified as any point where the rolling standard deviation is less than the specified threshold, and peak regions are iteratively interpolated until the baseline is below the data.
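For orientation, the classification methods above all share the same calling pattern through the ``Baseline`` class that this changeset documents; a minimal sketch (the ``half_window`` value is illustrative only, and it is assumed here that the classified baseline points are returned under ``params['mask']``):

.. code-block:: python

    import numpy as np
    from pybaselines import Baseline
    from pybaselines.utils import gaussian

    x = np.linspace(0, 1000, 1000)
    # synthetic data: a single peak on a flat baseline plus noise
    y = gaussian(x, 9, 450, 10) + 2 + np.random.default_rng(0).normal(0, 0.1, x.size)

    baseline_fitter = Baseline(x)
    baseline, params = baseline_fitter.fastchrom(y, half_window=15)
    baseline_mask = params['mask']  # boolean array of points classified as baseline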
@@ -279,7 +279,7 @@ threshold, and peak regions are iteratively interpolated until the baseline is b cwt_br (Continuous Wavelet Transform Baseline Recognition) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.cwt_br` identifies baseline segments by performing a continous wavelet +:meth:`~.Baseline.cwt_br` identifies baseline segments by performing a continuous wavelet transform (CWT) on the input data at various scales, and picks the scale with the first local minimum in the Shannon entropy. The threshold for baseline points is obtained by fitting a Gaussian to the histogram of the CWT at the optimal scale, and the final baseline is fit @@ -315,8 +315,8 @@ other points have a weight of 0. fabc (Fully Automatic Baseline Correction) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.fabc` identifies baseline segments by thresholding the squared first derivative -of the data, similar to :meth:`.dietrich`. However, fabc approximates the first derivative +:meth:`~.Baseline.fabc` identifies baseline segments by thresholding the squared first derivative +of the data, similar to :meth:`~.Baseline.dietrich`. However, fabc approximates the first derivative using a continuous wavelet transform with the Haar wavelet, which is more robust to noise than the numerical derivative in Dietrich's method. The baseline is then fit using Whittaker smoothing with all baseline points having a weight of 1 and all other points diff --git a/docs/algorithms/index.rst b/docs/algorithms/index.rst index 3594d1a..2b044df 100644 --- a/docs/algorithms/index.rst +++ b/docs/algorithms/index.rst @@ -2,15 +2,23 @@ Algorithms ========== -The currently available baseline correction algorithms in pybaselines are split into -polynomial, whittaker, morphological, smooth, spline, classification, optimizers, -and miscellaneous (misc). Note that this is more for grouping code and not meant as -a hard-classification of the algorithms. +The currently available baseline correction algorithms in pybaselines can broadly be categorized +as polynomial, whittaker, morphological, smooth, spline, classification, optimizers, +and miscellaneous (misc) methods. Note that this is simply for grouping code and helping to +explain the internals of this library and **NOT** meant as a hard classification of the +field of baseline correction (Please stop blindly copying this section in papers. There are +numerous types of baseline correction algorithms that are not included within pybaselines, which +is why baseline correction in general is such an absolutely fascinating field! Besides, miscellaneous +is obviously not an actual type of baseline correction...). This section of the documentation is to help provide some context for each algorithm. In addition, most algorithms will have a figure that shows how well the algorithm fits -various baselines to help choose the correct algorithm for a particular baseline. Refer -to the :doc:`API section <../api/index>` of the documentation for the full parameter and +various datasets to help choose the correct algorithm for a particular baseline. These datasets +include noisy data, data with both positive and negative peaks, data with overlapping peaks, +and concave data, and they serve as a way to quickly filter out algorithms that would not +work for a particular dataset. + +Refer to the :doc:`API section <../api/index>` of the documentation for the full parameter and reference listing for any algorithm.
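Since the figures on the following pages are intended to help filter algorithms by dataset, a minimal sketch of how such a comparison could be scripted (the methods chosen and the use of their default parameters are arbitrary examples, not recommendations):

.. code-block:: python

    import numpy as np
    from pybaselines import Baseline

    x = np.linspace(0, 1000, 1000)
    # stand-in for measured data: a sloped baseline plus noise
    y = 5 + 0.01 * x + np.random.default_rng(0).normal(0, 0.1, x.size)

    baseline_fitter = Baseline(x)
    # one representative method from several of the categories above
    for method in ('modpoly', 'asls', 'mor', 'snip'):
        fit_baseline, params = getattr(baseline_fitter, method)(y)
        print(method, np.sum((y - fit_baseline)**2))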
diff --git a/docs/algorithms/misc.rst b/docs/algorithms/misc.rst index 3dcd6ff..c5ec012 100644 --- a/docs/algorithms/misc.rst +++ b/docs/algorithms/misc.rst @@ -11,7 +11,7 @@ Algorithms interp_pts (Interpolation between points) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.interp_pts` interpolates between input points using line segments +:meth:`~.Baseline.interp_pts` interpolates between input points using line segments or splines of different orders. The function is mainly intended for usage with user interfaces and is not encouraged otherwise. @@ -70,7 +70,7 @@ since it solely depends on the user-defined anchor points. beads (Baseline Estimation And Denoising with Sparsity) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.beads` decomposes the input data into baseline and pure, noise-free signal by +:meth:`~.Baseline.beads` decomposes the input data into baseline and pure, noise-free signal by modeling the baseline as a low pass filter and by considering the signal and its derivatives as sparse. diff --git a/docs/algorithms/morphological.rst b/docs/algorithms/morphological.rst index f634614..52202da 100644 --- a/docs/algorithms/morphological.rst +++ b/docs/algorithms/morphological.rst @@ -25,12 +25,12 @@ Algorithms mpls (Morphological Penalized Least Squares) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.mpls` uses both morphological operations and Whittaker-smoothing +:meth:`~.Baseline.mpls` uses both morphological operations and Whittaker-smoothing to create the baseline. First, a morphological opening is performed on the data. Then, the index of the minimum data value between each flat region of the opened data is selected as a baseline anchor point and given a weighting of :math:`1 - p`, while all other points are given a weight of :math:`p`. The data -and weights are then used to calculate the baseline, similar to the :meth:`.asls` +and weights are then used to calculate the baseline, similar to the :meth:`~.Baseline.asls` method. .. plot:: @@ -156,7 +156,7 @@ method. mor (Morphological) ~~~~~~~~~~~~~~~~~~~ -:meth:`.mor` performs a morphological opening on the data and then selects +:meth:`~.Baseline.mor` performs a morphological opening on the data and then selects the element-wise minimum between the opening and the average of a morphological erosion and dilation of the opening to create the baseline. @@ -183,7 +183,7 @@ erosion and dilation of the opening to create the baseline. imor (Improved Morphological) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.imor` is an attempt to improve the mor method, and iteratively selects the element-wise +:meth:`~.Baseline.imor` is an attempt to improve the mor method, and iteratively selects the element-wise minimum between the original data and the average of a morphological erosion and dilation of the opening of either the data (first iteration) or previous iteration's baseline to create the baseline. @@ -202,7 +202,7 @@ create the baseline. mormol (Morphological and Mollified Baseline) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.mormol` iteratively convolves the erosion of the data with a mollifying (smoothing) +:meth:`~.Baseline.mormol` iteratively convolves the erosion of the data with a mollifying (smoothing) kernel, to produce a smooth baseline. .. plot:: @@ -225,7 +225,7 @@ kernel, to produce a smooth baseline. 
amormol (Averaging Morphological and Mollified Baseline) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.amormol` iteratively convolves a mollifying (smoothing) kernel with the +:meth:`~.Baseline.amormol` iteratively convolves a mollifying (smoothing) kernel with the element-wise minimum of the data and the average of the morphological closing and opening of either the data (first iteration) or previous iteration's baseline. @@ -243,7 +243,7 @@ and opening of either the data (first iteration) or previous iteration's baselin rolling_ball (Rolling Ball) ~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.rolling_ball` performs a morphological opening on the data and +:meth:`~.Baseline.rolling_ball` performs a morphological opening on the data and then smooths the result with a moving average, giving a baseline that resembles rolling a ball across the data. @@ -265,7 +265,7 @@ resembles rolling a ball across the data. mwmv (Moving Window Minimum Value) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.mwmv` performs a morphological erosion on the data and +:meth:`~.Baseline.mwmv` performs a morphological erosion on the data and then smooths the result with a moving average. .. plot:: @@ -286,7 +286,7 @@ then smooths the result with a moving average. tophat (Top-hat Transformation) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.tophat` performs a morphological opening on the data. +:meth:`~.Baseline.tophat` performs a morphological opening on the data. .. note:: The baseline from the tophat method is not smooth. Smoothing is left to the @@ -311,13 +311,13 @@ tophat (Top-hat Transformation) mpspline (Morphology-Based Penalized Spline) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.mpspline` uses both morphological operations and penalized splines +:meth:`~.Baseline.mpspline` uses both morphological operations and penalized splines to create the baseline. First, the data is smoothed by fitting a penalized spline to the closing of the data with a window of 3. Then baseline points are identified where the smoothed data is equal to the element-wise minimum between the opening of the smoothed data and the average of a morphological erosion and dilation of the opening. The baseline points are given a weighting of :math:`1 - p`, while all -other points are given a weight of :math:`p`, similar to the :meth:`.mpls` method. +other points are given a weight of :math:`p`, similar to the :meth:`~.Baseline.mpls` method. Finally, a penalized spline is fit to the smoothed data with the assigned weighting. .. plot:: @@ -349,7 +349,7 @@ Finally, a penalized spline is fit to the smoothed data with the assigned weight jbcd (Joint Baseline Correction and Denoising) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.jbcd` uses regularized least-squares fitting combined with morphological operations +:meth:`~.Baseline.jbcd` uses regularized least-squares fitting combined with morphological operations to simultaneously obtain the baseline and denoised signal. 
Minimized function: diff --git a/docs/algorithms/optimizers.rst b/docs/algorithms/optimizers.rst index c5b1eef..b43a97f 100644 --- a/docs/algorithms/optimizers.rst +++ b/docs/algorithms/optimizers.rst @@ -11,7 +11,7 @@ Algorithms optimize_extended_range ~~~~~~~~~~~~~~~~~~~~~~~ -The :meth:`.optimize_extended_range` function is based on the `Extended Range +The :meth:`~.Baseline.optimize_extended_range` function is based on the `Extended Range Penalized Least Squares (erPLS) method `_, but extends its usage to all Whittaker-smoothing-based, polynomial, and spline algorithms. @@ -201,7 +201,7 @@ added linear regions is selected as the optimal parameter. collab_pls (Collaborative Penalized Least Squares) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.collab_pls` is intended for fitting multiple datasets of related data, +:meth:`~.Baseline.collab_pls` is intended for fitting multiple datasets of related data, and can use any Whittaker-smoothing-based or spline method. The general idea is that using multiple sets of data should give a better estimate of the overall baseline than individually fitting each set of data. @@ -258,7 +258,7 @@ since it requires multiple sets of data for each baseline type. adaptive_minmax (Adaptive MinMax) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.adaptive_minmax` uses two different polynomial orders and two different +:meth:`~.Baseline.adaptive_minmax` uses two different polynomial orders and two different weighting schemes to create a total of four fits. The polynomial order(s) can be specified by the user, or else they will be estimated by the signal-to-noise ratio of the data. The first weighting scheme is either all points weighted diff --git a/docs/algorithms/polynomial.rst b/docs/algorithms/polynomial.rst index b242952..75d2961 100644 --- a/docs/algorithms/polynomial.rst +++ b/docs/algorithms/polynomial.rst @@ -35,7 +35,7 @@ thresholding, or 3) penalizing outliers. Selective Masking ~~~~~~~~~~~~~~~~~ -Selective masking is the oldest and most basic of the techniques. There +Selective masking is the simplest of the techniques. There are two ways to use selective masking in pybaselines. First, the input dataset can be trimmed/masked (easy to do with numpy) to not @@ -43,55 +43,10 @@ include any peak regions, the masked data can be fit, and then the resulting polynomial coefficients (must set ``return_coef`` to True) can be used to create a polynomial that spans the entirety of the original dataset. -..
code-block:: python - - import numpy as np - import matplotlib.pyplot as plt - from pybaselines.utils import gaussian - from pybaselines.polynomial import poly - - x = np.linspace(1, 1000, 500) - signal = ( - gaussian(x, 6, 180, 5) - + gaussian(x, 8, 350, 10) - + gaussian(x, 15, 400, 8) - + gaussian(x, 13, 700, 12) - + gaussian(x, 9, 800, 10) - ) - real_baseline = 5 + 15 * np.exp(-x / 400) - noise = np.random.default_rng(1).normal(0, 0.2, x.size) - y = signal + real_baseline + noise - - # bitwise "or" (|) and "and" (&) operators for indexing numpy array - non_peaks = ( - (x < 150) | ((x > 210) & (x < 310)) - | ((x > 440) & (x < 650)) | (x > 840) - ) - x_masked = x[non_peaks] - y_masked = y[non_peaks] - - # fit only the masked x and y - _, params = poly(y_masked, x_masked, poly_order=3, return_coef=True) - # recreate the polynomial using numpy and the full x-data - baseline = np.polynomial.Polynomial(params['coef'])(x) - - fig, ax = plt.subplots(tight_layout={'pad': 0.2}) - data_handle = ax.plot(y) - baseline_handle = ax.plot(baseline, '--') - masked_y = y.copy() - masked_y[~non_peaks] = np.nan - masked_handle = ax.plot(masked_y) - ax.set_yticks([]) - ax.set_xticks([]) - ax.legend( - (data_handle[0], masked_handle[0], baseline_handle[0]), - ('data', 'non-peak regions', 'fit baseline'), frameon=False - ) - plt.show() - - .. plot:: :align: center + :context: reset + :include-source: True import numpy as np import matplotlib.pyplot as plt @@ -123,6 +78,9 @@ a polynomial that spans the entirety of the original dataset. # recreate the polynomial using numpy and the full x-data baseline = np.polynomial.Polynomial(params['coef'])(x) + # Alternatively, just use numpy: + # baseline = np.polynomial.Polynomial.fit(x_masked, y_masked, 3)(x) + fig, ax = plt.subplots(tight_layout={'pad': 0.2}) data_handle = ax.plot(y) baseline_handle = ax.plot(baseline, '--') @@ -141,80 +99,19 @@ a polynomial that spans the entirety of the original dataset. The second way is to keep the original data, and input a custom weight array into the fitting function with values equal to 0 in peak regions and 1 in baseline regions. -.. code-block:: python - - import numpy as np - import matplotlib.pyplot as plt - from pybaselines.utils import gaussian - from pybaselines.polynomial import poly - - x = np.linspace(1, 1000, 500) - signal = ( - gaussian(x, 6, 180, 5) - + gaussian(x, 8, 350, 10) - + gaussian(x, 15, 400, 8) - + gaussian(x, 13, 700, 12) - + gaussian(x, 9, 800, 10) - ) - real_baseline = 5 + 15 * np.exp(-x / 400) - noise = np.random.default_rng(1).normal(0, 0.2, x.size) - y = signal + real_baseline + noise - - # bitwise "or" (|) and "and" (&) operators for indexing numpy array - non_peaks = ( - (x < 150) | ((x > 210) & (x < 310)) - | ((x > 440) & (x < 650)) | (x > 840) - ) - weights = np.zeros(y.shape[0]) - weights[non_peaks] = 1 - # directly create baseline by inputting weights - baseline = poly(y, x, poly_order=3, weights=weights)[0] - - fig, ax = plt.subplots(tight_layout={'pad': 0.2}) - data_handle = ax.plot(y) - baseline_handle = ax.plot(baseline, '--') - masked_y = y.copy() - masked_y[~non_peaks] = np.nan - masked_handle = ax.plot(masked_y) - ax.set_yticks([]) - ax.set_xticks([]) - ax.legend( - (data_handle[0], masked_handle[0], baseline_handle[0]), - ('data', 'non-peak regions', 'fit baseline'), frameon=False - ) - plt.show() - - .. 
plot:: :align: center + :context: close-figs + :include-source: True - import numpy as np - import matplotlib.pyplot as plt - from pybaselines.utils import gaussian - from pybaselines.polynomial import poly - - x = np.linspace(1, 1000, 500) - signal = ( - gaussian(x, 6, 180, 5) - + gaussian(x, 8, 350, 10) - + gaussian(x, 15, 400, 8) - + gaussian(x, 13, 700, 12) - + gaussian(x, 9, 800, 10) - ) - real_baseline = 5 + 15 * np.exp(-x / 400) - noise = np.random.default_rng(1).normal(0, 0.2, x.size) - y = signal + real_baseline + noise - - # bitwise "or" (|) and "and" (&) operators for indexing numpy array - non_peaks = ( - (x < 150) | ((x > 210) & (x < 310)) - | ((x > 440) & (x < 650)) | (x > 840) - ) - weights = np.zeros(y.shape[0]) + weights = np.zeros(len(y)) weights[non_peaks] = 1 - + # directly create baseline by inputting weights baseline = poly(y, x, poly_order=3, weights=weights)[0] + # Alternatively, just use numpy: + # baseline = np.polynomial.Polynomial.fit(x, y, 3, w=weights)(x) + fig, ax = plt.subplots(tight_layout={'pad': 0.2}) data_handle = ax.plot(y) baseline_handle = ax.plot(baseline, '--') @@ -234,7 +131,7 @@ As seen above, both ways produce the same resulting baseline, but the second way (setting weights) is much easier and faster since the baseline is directly calculated. The only algorithm in pybaselines that requires using selective masking is -:meth:`.poly`, which is normal polynomial least-squares fitting as described +:meth:`~.Baseline.poly`, which is normal polynomial least-squares fitting as described above. However, all other polynomial techniques allow inputting custom weights in order to get better fits or to reduce the number of iterations. @@ -254,25 +151,8 @@ The figure below illustrates the iterative thresholding. .. plot:: :align: center - - import numpy as np - import matplotlib.pyplot as plt - from pybaselines.utils import gaussian - - x = np.linspace(1, 1000, 500) - signal = ( - gaussian(x, 6, 180, 5) - + gaussian(x, 8, 350, 10) - + gaussian(x, 6, 550, 5) - + gaussian(x, 9, 800, 10) - + gaussian(x, 9, 100, 12) - + gaussian(x, 15, 400, 8) - + gaussian(x, 13, 700, 12) - + gaussian(x, 9, 880, 8) - ) - real_baseline = 5 + 15 * np.exp(-x / 400) - noise = np.random.default_rng(1).normal(0, 0.2, x.size) - y = signal + real_baseline + noise + :context: close-figs + :include-source: False fig, axes = plt.subplots( 2, 2, gridspec_kw={'hspace': 0, 'wspace': 0}, @@ -296,15 +176,15 @@ plt.show() -The algorithms in pybaselines that use thresholding are :meth:`.modpoly`, -:meth:`.imodpoly`, and :meth:`.loess` (if ``use_threshold`` is True). +The algorithms in pybaselines that use thresholding are :meth:`~.Baseline.modpoly`, +:meth:`~.Baseline.imodpoly`, and :meth:`~.Baseline.loess` (if ``use_threshold`` is True). Penalizing Outliers ~~~~~~~~~~~~~~~~~~~ The algorithms in pybaselines that penalize outliers are -:meth:`.penalized_poly`, which incorporate the penalty directly into the -minimized cost function, and :meth:`.loess` (if ``use_threshold`` is False), +:meth:`~.Baseline.penalized_poly`, which incorporates the penalty directly into the +minimized cost function, and :meth:`~.Baseline.loess` (if ``use_threshold`` is False), which incorporates penalties by applying lower weights to outliers. Refer to the particular algorithms below for more details. @@ -315,7 +195,7 @@ Algorithms poly (Regular Polynomial) ~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.poly` is simple least-squares polynomial fitting.
Use selective +:meth:`~.Baseline.poly` is simple least-squares polynomial fitting. Use selective masking, as described above, in order to use it for baseline fitting. Note that the plots below are just the least-squares polynomial fitting @@ -441,7 +321,7 @@ of the data since masking is time-consuming. modpoly (Modified Polynomial) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.modpoly` uses thresholding, as explained above, to iteratively fit a polynomial +:meth:`~.Baseline.modpoly` uses thresholding, as explained above, to iteratively fit a polynomial baseline to data. `modpoly` is also sometimes called "ModPolyFit" in literature, and both `modpoly` and `imodpoly` are sometimes referred to as "IPF" or "Iterative Polynomial Fit". @@ -463,7 +343,7 @@ baseline to data. `modpoly` is also sometimes called "ModPolyFit" in literature, imodpoly (Improved Modified Polynomial) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.imodpoly` is an attempt to improve the modpoly algorithm for noisy data, +:meth:`~.Baseline.imodpoly` is an attempt to improve the modpoly algorithm for noisy data, by including the standard deviation of the residual (data - baseline) when performing the thresholding. The number of standard deviations included in the thresholding can be adjusted by setting ``num_std``. `imodpoly` is also sometimes called "IModPolyFit" in literature, @@ -492,7 +372,7 @@ and both `modpoly` and `imodpoly` are sometimes referred to as "IPF" or "Iterati penalized_poly (Penalized Polynomial) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.penalized_poly` (sometimes referred to as "backcor" in literature) fits a +:meth:`~.Baseline.penalized_poly` (sometimes referred to as "backcor" in literature) fits a polynomial baseline to data using non-quadratic cost functions. Compared to the quadratic cost function used in typical least-squares as discussed above, non-quadratic cost functions allow outliers above a user-defined threshold to have less effect on the fit. penalized_poly @@ -608,7 +488,7 @@ The plots below show the symmetric and asymmetric forms of the cost functions. loess (Locally Estimated Scatterplot Smoothing) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.loess` (sometimes referred to as "rbe" or "robust baseline estimate" in literature) +:meth:`~.Baseline.loess` (sometimes referred to as "rbe" or "robust baseline estimate" in literature) is similar to `traditional loess/lowess `_ but adapted for fitting the baseline. The baseline at each point is estimated by using polynomial regression on the k-nearest neighbors of the point, and the effect of outliers is reduced by iterative reweighting. @@ -651,7 +531,7 @@ quant_reg (Quantile Regression) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.quant_reg` fits a polynomial to the baseline using quantile regression. +:meth:`~.Baseline.quant_reg` fits a polynomial to the baseline using quantile regression. .. plot:: :align: center @@ -674,8 +554,8 @@ goldindec (Goldindec Method) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.goldindec` fits a polynomial baseline to data using non-quadratic cost functions, -similar to :meth:`.penalized_poly`, except that it only allows asymmetric cost functions. +:meth:`~.Baseline.goldindec` fits a polynomial baseline to data using non-quadratic cost functions, +similar to :meth:`~.Baseline.penalized_poly`, except that it only allows asymmetric cost functions. The optimal threshold value between quadratic and non-quadratic loss is iteratively optimized based on the input `peak_ratio` value.
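As a minimal sketch of the non-quadratic cost function selection discussed for penalized_poly and goldindec (the cost function name used here is the documented default for penalized_poly; the polynomial order and data are illustrative only):

.. code-block:: python

    import numpy as np
    from pybaselines import Baseline
    from pybaselines.utils import gaussian

    x = np.linspace(0, 1000, 1000)
    y = (
        gaussian(x, 9, 450, 10) + 10 * np.exp(-x / 400)
        + np.random.default_rng(0).normal(0, 0.1, x.size)
    )

    baseline_fitter = Baseline(x)
    # the asymmetric truncated-quadratic cost caps the influence of peak points
    baseline, params = baseline_fitter.penalized_poly(
        y, poly_order=3, cost_function='asymmetric_truncated_quadratic'
    )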
diff --git a/docs/algorithms/smooth.rst b/docs/algorithms/smooth.rst index d42e2f0..0cff0b7 100644 --- a/docs/algorithms/smooth.rst +++ b/docs/algorithms/smooth.rst @@ -5,10 +5,6 @@ Smoothing Baselines The contents of :mod:`pybaselines.smooth` contain algorithms that use smoothing to eliminate peaks and leave only the baseline. -.. note:: - The module pybaselines.smooth was named pybaselines.window until version 0.6.0. - - .. note:: The window size used for smoothing-based algorithms is index-based, rather than based on the units of the data, so proper conversions must be done @@ -21,7 +17,7 @@ Algorithms noise_median (Noise Median method) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.noise_median` estimates the baseline as the median value within +:meth:`~.Baseline.noise_median` estimates the baseline as the median value within a moving window. The resulting baseline is then smoothed by convolving with a Gaussian kernel. Note that this method does not perform well for tightly-grouped peaks. @@ -149,7 +145,7 @@ kernel. Note that this method does not perform well for tightly-grouped peaks. snip (Statistics-sensitive Non-linear Iterative Peak-clipping) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.snip` iteratively takes the element-wise minimimum of each value +:meth:`~.Baseline.snip` iteratively takes the element-wise minimum of each value and the average of the values at the left and right edge of a window centered at the value. The size of the half-window is incrementally increased from 1 to the specified maximum size, which should be set to approximately half of the @@ -195,7 +191,7 @@ data. The baselines when using decreasing window size and smoothing is shown bel swima (Small-Window Moving Average) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.swima` iteratively takes the element-wise minimum of either the +:meth:`~.Baseline.swima` iteratively takes the element-wise minimum of either the data (first iteration) or the previous iteration's baseline and the data/previous baseline smoothed with a moving average. The window used for the moving average smoothing is incrementally increased to smooth peaks until convergence is reached. @@ -219,7 +215,7 @@ incrementally increased to smooth peaks until convergence is reached. ipsa (Iterative Polynomial Smoothing Algorithm) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.ipsa` iteratively smooths the input data using a second-order +:meth:`~.Baseline.ipsa` iteratively smooths the input data using a second-order Savitzky–Golay filter until the exit criterion is reached. .. plot:: :align: center @@ -240,7 +236,7 @@ Savitzky–Golay filter until the exit criterion is reached. ria (Range Independent Algorithm) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.ria` first extrapolates a linear baseline from the left and/or +:meth:`~.Baseline.ria` first extrapolates a linear baseline from the left and/or right edges of the data and adds Gaussian peaks to these baselines, similar to the :ref:`optimize_extended_range ` function, and records their initial areas. The data is then iteratively smoothed using a diff --git a/docs/algorithms/spline.rst b/docs/algorithms/spline.rst index a9ed8b1..560f88f 100644 --- a/docs/algorithms/spline.rst +++ b/docs/algorithms/spline.rst @@ -66,7 +66,7 @@ Algorithms mixture_model (Mixture Model) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.mixture_model` considers the data as a mixture model composed of +:meth:`~.Baseline.mixture_model` considers the data as a mixture model composed of a baseline with noise and peaks.
The weighting for the penalized spline fitting the baseline is iteratively determined by fitting the residual with a normal distribution centered at 0 (representing the noise), and a uniform distribution @@ -203,7 +203,7 @@ residual belonging to the noise's normal distribution. irsqr (Iterative Reweighted Spline Quantile Regression) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.irsqr` uses penalized splines and iterative reweighted least squares +:meth:`~.Baseline.irsqr` uses penalized splines and iterative reweighted least squares to perform quantile regression on the data. .. plot:: :align: center @@ -227,7 +227,7 @@ to perform quantile regression on the data. corner_cutting (Corner-Cutting Method) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.corner_cutting` iteratively removes corner points and then creates +:meth:`~.Baseline.corner_cutting` iteratively removes corner points and then creates a quadratic Bezier spline from the remaining points. Continuity between the individual Bezier curves is maintained by adding control points halfway between all but the first and last non-corner points. @@ -253,7 +253,7 @@ between all but the first and last non-corner points. pspline_asls (Penalized Spline Asymmetric Least Squares) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.pspline_asls` is a penalized spline version of :meth:`.asls`. +:meth:`~.Baseline.pspline_asls` is a penalized spline version of :meth:`~.Baseline.asls`. Minimized function: @@ -301,7 +301,7 @@ Weighting: pspline_iasls (Penalized Spline Improved Asymmetric Least Squares) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.pspline_iasls` is a penalized spline version of :meth:`.iasls`. +:meth:`~.Baseline.pspline_iasls` is a penalized spline version of :meth:`~.Baseline.iasls`. Minimized function: @@ -354,7 +354,7 @@ Weighting: pspline_airpls (Penalized Spline Adaptive Iteratively Reweighted Penalized Least Squares) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.pspline_airpls` is a penalized spline version of :meth:`.airpls`. +:meth:`~.Baseline.pspline_airpls` is a penalized spline version of :meth:`~.Baseline.airpls`. Minimized function: @@ -401,7 +401,7 @@ values in the residual vector :math:`\mathbf r`, i.e. :math:`\sum\limits_{y_i - z pspline_arpls (Penalized Spline Asymmetrically Reweighted Penalized Least Squares) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.pspline_arpls` is a penalized spline version of :meth:`.arpls`. +:meth:`~.Baseline.pspline_arpls` is a penalized spline version of :meth:`~.Baseline.arpls`. Minimized function: @@ -445,7 +445,7 @@ values in the residual vector :math:`\mathbf r`. pspline_drpls (Penalized Spline Doubly Reweighted Penalized Least Squares) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.pspline_drpls` is a penalized spline version of :meth:`.drpls`. +:meth:`~.Baseline.pspline_drpls` is a penalized spline version of :meth:`~.Baseline.drpls`. Minimized function: @@ -501,7 +501,7 @@ respectively, of the negative values in the residual vector :math:`\mathbf r`. pspline_iarpls (Penalized Spline Improved Asymmetrically Reweighted Penalized Least Squares) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.pspline_iarpls` is a penalized spline version of :meth:`.iarpls`. +:meth:`~.Baseline.pspline_iarpls` is a penalized spline version of :meth:`~.Baseline.iarpls`. Minimized function: @@ -549,7 +549,7 @@ the residual vector :math:`\mathbf r`.
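A minimal sketch of how a penalized spline variant parallels its Whittaker counterpart (the ``lam`` and ``num_knots`` values are illustrative only; as noted in the Whittaker section, suitable ``lam`` values depend strongly on the number of data points):

.. code-block:: python

    import numpy as np
    from pybaselines import Baseline

    x = np.linspace(0, 1000, 1000)
    y = 5 + 0.005 * x + np.random.default_rng(0).normal(0, 0.1, x.size)

    baseline_fitter = Baseline(x)
    # same arpls weighting scheme; only the baseline representation differs
    whittaker_baseline, _ = baseline_fitter.arpls(y, lam=1e5)
    spline_baseline, _ = baseline_fitter.pspline_arpls(y, lam=1e3, num_knots=100)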
pspline_aspls (Penalized Spline Adaptive Smoothness Penalized Least Squares) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.pspline_aspls` is a penalized spline version of :meth:`.aspls`. +:meth:`~.Baseline.pspline_aspls` is a penalized spline version of :meth:`~.Baseline.aspls`. Minimized function: @@ -614,7 +614,7 @@ of the asPLS paper closer than the factor of 2 and fits noisy data much better). pspline_psalsa (Penalized Spline Peaked Signal's Asymmetric Least Squares Algorithm) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.pspline_psalsa` is a penalized spline version of :meth:`.psalsa`. +:meth:`~.Baseline.pspline_psalsa` is a penalized spline version of :meth:`~.Baseline.psalsa`. Minimized function: @@ -661,7 +661,7 @@ be considered a peak. pspline_derpsalsa (Penalized Spline Derivative Peak-Screening Asymmetric Least Squares Algorithm) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.pspline_derpsalsa` is a penalized spline version of :meth:`.derpsalsa`. +:meth:`~.Baseline.pspline_derpsalsa` is a penalized spline version of :meth:`~.Baseline.derpsalsa`. Minimized function: @@ -723,7 +723,7 @@ respectively, of the smoothed data, :math:`y_{sm}`, and :math:`rms()` is the roo pspline_mpls (Penalized Spline Morphological Penalized Least Squares) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.pspline_mpls` is a penalized spline version of :meth:`.mpls`. +:meth:`~.Baseline.pspline_mpls` is a penalized spline version of :meth:`~.Baseline.mpls`. Minimized function: diff --git a/docs/algorithms/whittaker.rst b/docs/algorithms/whittaker.rst index cd3e821..513cf01 100644 --- a/docs/algorithms/whittaker.rst +++ b/docs/algorithms/whittaker.rst @@ -3,18 +3,21 @@ Whittaker Baselines =================== The contents of :mod:`pybaselines.whittaker` contain Whittaker-smoothing-based -algorithms for fitting the baseline. +algorithms for fitting the baseline. Note that Whittaker smoothing is often +also referred to as Whittaker-Henderson smoothing. Introduction ------------ -Whittaker-smoothing-based (WSB) algorithms are usually referred to in literature +Whittaker-smoothing-based algorithms are usually referred to in literature as weighted least squares, penalized least squares, or asymmetric least squares, -but are referred to as WSB in pybaselines to distinguish them from polynomial -techniques that also take advantage of weighted least squares (like :meth:`.loess`) -and penalized least squares (like :meth:`.penalized_poly`). +but are referred to as Whittaker-smoothing-based in pybaselines to distinguish them from polynomial +techniques that also take advantage of weighted least squares (like :meth:`~.Baseline.loess`) +and penalized least squares (like :meth:`~.Baseline.penalized_poly`). -The general idea behind WSB algorithms is to make the baseline match the measured +A great introduction to Whittaker smoothing is Paul Eilers's +`A Perfect Smoother paper `_. The general idea behind Whittaker +smoothing algorithms is to make the baseline match the measured data as well as it can while also penalizing the roughness of the baseline. The resulting general function that is minimized to determine the baseline is then @@ -57,21 +60,21 @@ and :math:`D_2` (second order difference matrix) is: 0 & 0 & 1 & -2 & 1 \\ \end{bmatrix} -Most WSB techniques recommend using the second order difference matrix, although -some techniques use both the first and second order difference matrices.
+Most Whittaker-smoothing-based techniques recommend using the second order difference matrix, +although some techniques use both the first and second order difference matrices. The baseline is iteratively calculated using the linear system above by solving for the baseline, :math:`z`, updating the weights, solving for the baseline using the new weights, and repeating until some exit criterion is met. -The difference between WSB algorithms is the selection of weights and/or the -function that is minimized. +The difference between Whittaker-smoothing-based algorithms is the selection of weights +and/or the function that is minimized. .. note:: - The :math:`\lambda` (``lam``) value required to fit a particular baseline for all WSB - methods will increase as the number of data points increases, with the relationship - being roughly :math:`\log(\lambda) \propto \log(\text{number of data points})`. For example, - a ``lam`` value of :math:`10^3` that fits a dataset with 100 points may have to be :math:`10^7` - to fit the same data with 1000 points, and :math:`10^{11}` for 10000 points. + The :math:`\lambda` (``lam``) value required to fit a particular baseline for all + Whittaker-smoothing-based methods will increase as the number of data points increases, with + the relationship being roughly :math:`\log(\lambda) \propto \log(\text{number of data points})`. + For example, a ``lam`` value of :math:`10^3` that fits a dataset with 100 points may have to + be :math:`10^7` to fit the same data with 1000 points, and :math:`10^{11}` for 10000 points. Algorithms ---------- asls (Asymmetric Least Squares) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The :meth:`.asls` (sometimes called "ALS" in literature) function is the +The :meth:`~.Baseline.asls` (sometimes called "ALS" in literature) function is the original implementation of Whittaker smoothing for baseline fitting. Minimized function: @@ -230,7 +233,7 @@ Weighting: iasls (Improved Asymmetric Least Squares) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.iasls` is an attempt to improve the asls algorithm by considering +:meth:`~.Baseline.iasls` is an attempt to improve the asls algorithm by considering both the roughness of the baseline and the first derivative of the residual (data - baseline). @@ -285,7 +288,7 @@ Weighting: airpls (Adaptive Iteratively Reweighted Penalized Least Squares) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.airpls` uses an exponential weighting of the negative residuals to +:meth:`~.Baseline.airpls` uses an exponential weighting of the negative residuals to attempt to provide a better fit than the asls method. Minimized function: @@ -326,7 +329,7 @@ values in the residual vector :math:`\mathbf r`, i.e. :math:`\sum\limits_{y_i - z arpls (Asymmetrically Reweighted Penalized Least Squares) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.arpls` uses a single weighting function that is designed to account +:meth:`~.Baseline.arpls` uses a single weighting function that is designed to account for noisy data. Minimized function: @@ -369,7 +372,7 @@ deviation, respectively, of the negative values in the residual vector :math:`\m drpls (Doubly Reweighted Penalized Least Squares) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.drpls` uses a single weighting function that is designed to account +:meth:`~.Baseline.drpls` uses a single weighting function that is designed to account for noisy data, similar to arpls.
Further, it takes into account both the first and second derivatives of the baseline and uses a parameter :math:`\eta` to adjust the fit in peak versus non-peak regions. @@ -426,7 +429,7 @@ respectively, of the negative values in the residual vector :math:`\mathbf r`. iarpls (Improved Asymmetrically Reweighted Penalized Least Squares) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.iarpls` is an attempt to improve the arpls method, which has a tendency +:meth:`~.Baseline.iarpls` is an attempt to improve the arpls method, which has a tendency to overestimate the baseline when fitting small peaks in noisy data, by using an adjusted weighting formula. @@ -471,7 +474,7 @@ the residual vector :math:`\mathbf r`. aspls (Adaptive Smoothness Penalized Least Squares) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.aspls`, similar to the iarpls method, is an attempt to improve the arpls method, +:meth:`~.Baseline.aspls`, similar to the iarpls method, is an attempt to improve the arpls method, which it does by using an adjusted weighting function and an additional parameter :math:`\alpha`. Minimized function: @@ -527,7 +530,7 @@ of the asPLS paper closer than the factor of 2 and fits noisy data much better). psalsa (Peaked Signal's Asymmetric Least Squares Algorithm) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.psalsa` is an attempt at improving the asls method to better fit noisy data +:meth:`~.Baseline.psalsa` is an attempt at improving the asls method to better fit noisy data by using an exponentially decaying weighting for positive residuals. Minimized function: @@ -573,7 +576,7 @@ be considered a peak. derpsalsa (Derivative Peak-Screening Asymmetric Least Squares Algorithm) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`.derpsalsa` is an attempt at improving the asls method to better fit noisy data +:meth:`~.Baseline.derpsalsa` is an attempt at improving the asls method to better fit noisy data by using an exponentially decaying weighting for positive residuals. Further, it calculates additional weights based on the first and second derivatives of the data. diff --git a/docs/algorithms_2d/index.rst b/docs/algorithms_2d/index.rst new file mode 100644 index 0000000..8720c68 --- /dev/null +++ b/docs/algorithms_2d/index.rst @@ -0,0 +1,26 @@ +============= +2D Algorithms +============= + +pybaselines extends a subset of the 1D baseline correction algorithms to work with +2D data. Note that this is only intended for data in which there is some global baseline; +otherwise, it is more appropriate and usually significantly faster to simply use the 1D +algorithms on each individual row and/or column in the data. + +This section of the documentation is to help provide some context for how the algorithms +were extended to work with two-dimensional data. It will not be as comprehensive as the +:doc:`1D Algorithms section <../algorithms/index>`, so to help understand any algorithm, +it is suggested to start there. Refer to the :doc:`API section <../api/index>` of the +documentation for the full parameter and reference listing for any algorithm. + + +..
toctree:: + :maxdepth: 2 + :caption: Contents: + + polynomial_2d + whittaker_2d + morphological_2d + spline_2d + smooth_2d + optimizers_2d diff --git a/docs/algorithms_2d/morphological_2d.rst b/docs/algorithms_2d/morphological_2d.rst new file mode 100644 index 0000000..c150a33 --- /dev/null +++ b/docs/algorithms_2d/morphological_2d.rst @@ -0,0 +1,123 @@ +======================= +Morphological Baselines +======================= + +.. note:: + All morphological algorithms use a ``half_window`` parameter to define the size + of the window used for the morphological operators. ``half_window`` is index-based, + rather than based on the units of the data, so proper conversions must be done + by the user to get the desired window size. + + +Algorithms +---------- + +mor (Morphological) +~~~~~~~~~~~~~~~~~~~ + +:meth:`~.Baseline2D.mor`: +:ref:`explanation for the algorithm `. + +.. plot:: + :align: center + :context: reset + + import numpy as np + import matplotlib.pyplot as plt + from pybaselines.utils import gaussian2d + from pybaselines import Baseline2D + + + def create_data(): + x = np.linspace(-20, 20, 80) + z = np.linspace(-20, 20, 80) + X, Z = np.meshgrid(x, z, indexing='ij') + signal = ( + gaussian2d(X, Z, 12, -9, -9) + + gaussian2d(X, Z, 11, 3, 3) + + gaussian2d(X, Z, 13, 11, 11) + + gaussian2d(X, Z, 8, 5, -11, 1.5, 1) + + gaussian2d(X, Z, 16, -8, 8) + ) + baseline = 0.1 + 0.08 * X - 0.05 * Z + 0.005 * (Z + 20)**2 + noise = np.random.default_rng(0).normal(scale=0.1, size=signal.shape) + y = signal + baseline + noise + + return x, z, y, baseline + + + def create_plots(y, fit_baseline): + X, Z = np.meshgrid( + np.arange(y.shape[0]), np.arange(y.shape[1]), indexing='ij' + ) + + # 4 total plots: 2 contours and 2 projections + row_names = ('Raw Data', 'Baseline Corrected') + for i, dataset in enumerate((y, y - fit_baseline)): + fig = plt.figure(layout='constrained', figsize=plt.figaspect(0.5)) + fig.suptitle(row_names[i]) + ax = fig.add_subplot(1, 2, 2) + ax.contourf(X, Z, dataset, cmap='coolwarm') + ax.set_xticks([]) + ax.set_yticks([]) + ax_2 = fig.add_subplot(1, 2, 1, projection='3d') + ax_2.plot_surface(X, Z, dataset, cmap='coolwarm') + ax_2.set_xticks([]) + ax_2.set_yticks([]) + ax_2.set_zticks([]) + if i == 0: + pass#ax.set_title('Contours') + #ax_2.set_title('3D Projections') + + + x, z, y, real_baseline = create_data() + baseline_fitter = Baseline2D(x, z, check_finite=False) + + baseline, params = baseline_fitter.mor(y, half_window=(6, 4)) + create_plots(y, baseline) + + +imor (Improved Morphological) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:meth:`~.Baseline2D.imor`: +:ref:`explanation for the algorithm `. + +.. plot:: + :align: center + :context: close-figs + + # to see contents of create_data function, look at the top-most algorithm's code + baseline, params = baseline_fitter.imor(y, half_window=(4, 2), tol=5e-3) + create_plots(y, baseline) + + +rolling_ball (Rolling Ball) +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:meth:`~.Baseline2D.rolling_ball`: +:ref:`explanation for the algorithm `. + +.. plot:: + :align: center + :context: close-figs + + # to see contents of create_data function, look at the top-most algorithm's code + baseline, params = baseline_fitter.rolling_ball(y, half_window=(8, 5), smooth_half_window=3) + create_plots(y, baseline) + + +tophat (Top-hat Transformation) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:meth:`~.Baseline2D.tophat`: +:ref:`explanation for the algorithm `. + + +..
plot:: + :align: center + :context: close-figs + + # to see contents of create_data function, look at the top-most algorithm's code + baseline, params = baseline_fitter.tophat(y, half_window=(8, 5)) + create_plots(y, baseline) diff --git a/docs/algorithms_2d/optimizers_2d.rst b/docs/algorithms_2d/optimizers_2d.rst new file mode 100644 index 0000000..84be92f --- /dev/null +++ b/docs/algorithms_2d/optimizers_2d.rst @@ -0,0 +1,77 @@ +=================== +Optimizer Baselines +=================== + +Algorithms +---------- + +collab_pls (Collaborative Penalized Least Squares) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:meth:`~.Baseline2D.collab_pls`: +:ref:`explanation for the algorithm `. +There is no figure showing a fit for this method since it requires multiple sets of data. + +adaptive_minmax (Adaptive MinMax) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:meth:`~.Baseline2D.adaptive_minmax`: +:ref:`explanation for the algorithm `. + +.. plot:: + :align: center + :context: reset + + import numpy as np + import matplotlib.pyplot as plt + from pybaselines.utils import gaussian2d + from pybaselines import Baseline2D + + + def create_data(): + x = np.linspace(-20, 20, 80) + z = np.linspace(-20, 20, 80) + X, Z = np.meshgrid(x, z, indexing='ij') + signal = ( + gaussian2d(X, Z, 12, -9, -9) + + gaussian2d(X, Z, 11, 3, 3) + + gaussian2d(X, Z, 13, 11, 11) + + gaussian2d(X, Z, 8, 5, -11, 1.5, 1) + + gaussian2d(X, Z, 16, -8, 8) + ) + baseline = 0.1 + 0.08 * X - 0.05 * Z + 0.005 * (Z + 20)**2 + noise = np.random.default_rng(0).normal(scale=0.1, size=signal.shape) + y = signal + baseline + noise + + return x, z, y, baseline + + + def create_plots(y, fit_baseline): + X, Z = np.meshgrid( + np.arange(y.shape[0]), np.arange(y.shape[1]), indexing='ij' + ) + + # 4 total plots: 2 contours and 2 projections + row_names = ('Raw Data', 'Baseline Corrected') + for i, dataset in enumerate((y, y - fit_baseline)): + fig = plt.figure(layout='constrained', figsize=plt.figaspect(0.5)) + fig.suptitle(row_names[i]) + ax = fig.add_subplot(1, 2, 2) + ax.contourf(X, Z, dataset, cmap='coolwarm') + ax.set_xticks([]) + ax.set_yticks([]) + ax_2 = fig.add_subplot(1, 2, 1, projection='3d') + ax_2.plot_surface(X, Z, dataset, cmap='coolwarm') + ax_2.set_xticks([]) + ax_2.set_yticks([]) + ax_2.set_zticks([]) + if i == 0: + pass#ax.set_title('Contours') + #ax_2.set_title('3D Projections') + + + x, z, y, real_baseline = create_data() + baseline_fitter = Baseline2D(x, z, check_finite=False) + + baseline, params = baseline_fitter.adaptive_minmax(y, poly_order=(2, 3)) + create_plots(y, baseline) diff --git a/docs/algorithms_2d/polynomial_2d.rst b/docs/algorithms_2d/polynomial_2d.rst new file mode 100644 index 0000000..1c60bff --- /dev/null +++ b/docs/algorithms_2d/polynomial_2d.rst @@ -0,0 +1,159 @@ +==================== +Polynomial Baselines +==================== + +Introduction +------------ + +In 2D, a polynomial can be expressed as + +.. math:: + + p(x, z) = \sum\limits_{i = 0}^{d_r} \sum\limits_{j = 0}^{d_c} {\beta_{i, j} x^i z^j} + +where :math:`\beta` is the matrix of coefficients for the polynomial and :math:`d_r` +and :math:`d_c` are the polynomial degrees for the rows (:math:`x`) and +columns (:math:`z`), respectively. + +For regular polynomial fitting, the polynomial coefficients that best fit data +are obtained by minimizing the least-squares: + +..
+For regular polynomial fitting, the polynomial coefficients that best fit the data
+are obtained by minimizing the least-squares expression:
+
+.. math::
+
+    \sum\limits_{i}^M \sum\limits_{j}^N w_{ij}^2 (y_{ij} - p(x_i, z_j))^2
+
+where :math:`y_{ij}` is the measured data at :math:`(x_i, z_j)`, :math:`p(x_i, z_j)` is
+the polynomial estimate at the same point, and :math:`w_{ij}` is the weighting.
+
+However, since only the baseline of the data is desired, the least-squares
+approach must be modified. For polynomial-based algorithms, this is done
+by 1) only fitting the data in regions where there is only baseline, 2)
+modifying the y-values being fit each iteration, or 3) penalizing outliers.
+
+.. note::
+    For two dimensional data, polynomial algorithms take a single ``poly_order``
+    parameter that can either be a single number, in which case both the rows and columns
+    will use the same polynomial degree, i.e. :math:`d_r = d_c`, or a sequence
+    of two numbers (:math:`d_r`, :math:`d_c`) to use different polynomials along
+    the rows and columns. Further, ``max_cross`` can be set to limit the polynomial
+    coefficients for the cross terms.
+
+Algorithms
+----------
+
+poly (Regular Polynomial)
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+:meth:`~.Baseline2D.poly`:
+:ref:`explanation for the algorithm `. No plot
+will be shown since it is just a simple least-squares polynomial fitting.
+
+
+modpoly (Modified Polynomial)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+:meth:`~.Baseline2D.modpoly`:
+:ref:`explanation for the algorithm `.
+
+.. plot::
+    :align: center
+    :context: reset
+
+    import numpy as np
+    import matplotlib.pyplot as plt
+    from pybaselines.utils import gaussian2d
+    from pybaselines import Baseline2D
+
+
+    def create_data():
+        x = np.linspace(-20, 20, 80)
+        z = np.linspace(-20, 20, 80)
+        X, Z = np.meshgrid(x, z, indexing='ij')
+        signal = (
+            gaussian2d(X, Z, 12, -9, -9)
+            + gaussian2d(X, Z, 11, 3, 3)
+            + gaussian2d(X, Z, 13, 11, 11)
+            + gaussian2d(X, Z, 8, 5, -11, 1.5, 1)
+            + gaussian2d(X, Z, 16, -8, 8)
+        )
+        baseline = 0.1 + 0.08 * X - 0.05 * Z + 0.005 * (Z + 20)**2
+        noise = np.random.default_rng(0).normal(scale=0.1, size=signal.shape)
+        y = signal + baseline + noise
+
+        return x, z, y, baseline
+
+
+    def create_plots(y, fit_baseline):
+        X, Z = np.meshgrid(
+            np.arange(y.shape[0]), np.arange(y.shape[1]), indexing='ij'
+        )
+
+        # 4 total plots: 2 contours and 2 projections
+        row_names = ('Raw Data', 'Baseline Corrected')
+        for i, dataset in enumerate((y, y - fit_baseline)):
+            fig = plt.figure(layout='constrained', figsize=plt.figaspect(0.5))
+            fig.suptitle(row_names[i])
+            ax = fig.add_subplot(1, 2, 2)
+            ax.contourf(X, Z, dataset, cmap='coolwarm')
+            ax.set_xticks([])
+            ax.set_yticks([])
+            ax_2 = fig.add_subplot(1, 2, 1, projection='3d')
+            ax_2.plot_surface(X, Z, dataset, cmap='coolwarm')
+            ax_2.set_xticks([])
+            ax_2.set_yticks([])
+            ax_2.set_zticks([])
+
+
+    x, z, y, real_baseline = create_data()
+    baseline_fitter = Baseline2D(x, z, check_finite=False)
+
+    baseline, params = baseline_fitter.modpoly(y, poly_order=(1, 2), max_cross=0)
+    create_plots(y, baseline)
+
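+As a quick sketch of the ``poly_order`` conventions described in the note above,
+using the fitter and data from the plot (the exact values are arbitrary and only
+for illustration):
+
+.. code-block:: python
+
+    # d_r = d_c = 2
+    baseline, params = baseline_fitter.modpoly(y, poly_order=2)
+    # d_r = 1 along the rows, d_c = 2 along the columns
+    baseline, params = baseline_fitter.modpoly(y, poly_order=(1, 2))
+    # additionally limit the x^i z^j cross terms to at most first degree
+    baseline, params = baseline_fitter.modpoly(y, poly_order=(1, 2), max_cross=1)
+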
+
+imodpoly (Improved Modified Polynomial)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+:meth:`~.Baseline2D.imodpoly`:
+:ref:`explanation for the algorithm `.
+
+.. plot::
+    :align: center
+    :context: close-figs
+
+    # to see contents of create_data function, look at the top-most algorithm's code
+    baseline, params = baseline_fitter.imodpoly(y, poly_order=(1, 2), max_cross=0)
+    create_plots(y, baseline)
+
+
+penalized_poly (Penalized Polynomial)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+:meth:`~.Baseline2D.penalized_poly`:
+:ref:`explanation for the algorithm `.
+
+.. plot::
+    :align: center
+    :context: close-figs
+
+    # to see contents of create_data function, look at the top-most algorithm's code
+    baseline, params = baseline_fitter.penalized_poly(y, poly_order=(1, 2), max_cross=0)
+    create_plots(y, baseline)
+
+
+quant_reg (Quantile Regression)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+:meth:`~.Baseline2D.quant_reg`:
+:ref:`explanation for the algorithm `.
+
+.. plot::
+    :align: center
+    :context: close-figs
+
+    # to see contents of create_data function, look at the top-most algorithm's code
+    baseline, params = baseline_fitter.quant_reg(
+        y, poly_order=(1, 2), max_cross=0, quantile=0.3
+    )
+    create_plots(y, baseline)
diff --git a/docs/algorithms_2d/smooth_2d.rst b/docs/algorithms_2d/smooth_2d.rst
new file mode 100644
index 0000000..953211d
--- /dev/null
+++ b/docs/algorithms_2d/smooth_2d.rst
@@ -0,0 +1,76 @@
+===================
+Smoothing Baselines
+===================
+
+.. note::
+    The window size used for smoothing-based algorithms is index-based, rather
+    than based on the units of the data, so proper conversions must be done
+    by the user to get the desired window size.
+
+
+Algorithms
+----------
+
+noise_median (Noise Median method)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+:meth:`~.Baseline2D.noise_median`:
+:ref:`explanation for the algorithm `.
+
+.. plot::
+    :align: center
+    :context: reset
+
+    import numpy as np
+    import matplotlib.pyplot as plt
+    from pybaselines.utils import gaussian2d
+    from pybaselines import Baseline2D
+
+
+    def create_data():
+        x = np.linspace(-20, 20, 80)
+        z = np.linspace(-20, 20, 80)
+        X, Z = np.meshgrid(x, z, indexing='ij')
+        signal = (
+            gaussian2d(X, Z, 12, -9, -9)
+            + gaussian2d(X, Z, 11, 3, 3)
+            + gaussian2d(X, Z, 13, 11, 11)
+            + gaussian2d(X, Z, 8, 5, -11, 1.5, 1)
+            + gaussian2d(X, Z, 16, -8, 8)
+        )
+        baseline = 0.1 + 0.08 * X - 0.05 * Z + 0.005 * (Z + 20)**2
+        noise = np.random.default_rng(0).normal(scale=0.1, size=signal.shape)
+        y = signal + baseline + noise
+
+        return x, z, y, baseline
+
+
+    def create_plots(y, fit_baseline):
+        X, Z = np.meshgrid(
+            np.arange(y.shape[0]), np.arange(y.shape[1]), indexing='ij'
+        )
+
+        # 4 total plots: 2 contours and 2 projections
+        row_names = ('Raw Data', 'Baseline Corrected')
+        for i, dataset in enumerate((y, y - fit_baseline)):
+            fig = plt.figure(layout='constrained', figsize=plt.figaspect(0.5))
+            fig.suptitle(row_names[i])
+            ax = fig.add_subplot(1, 2, 2)
+            ax.contourf(X, Z, dataset, cmap='coolwarm')
+            ax.set_xticks([])
+            ax.set_yticks([])
+            ax_2 = fig.add_subplot(1, 2, 1, projection='3d')
+            ax_2.plot_surface(X, Z, dataset, cmap='coolwarm')
+            ax_2.set_xticks([])
+            ax_2.set_yticks([])
+            ax_2.set_zticks([])
+
+
+    x, z, y, real_baseline = create_data()
+    baseline_fitter = Baseline2D(x, z, check_finite=False)
+
+    baseline, params = baseline_fitter.noise_median(y, half_window=12, smooth_half_window=5)
+    create_plots(y, baseline)
+
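+Since the window sizes are index-based, as stated in the note above, a rough
+sketch of converting a desired window span in data units into a half window
+for evenly spaced data could look like the following (the variable names are
+illustrative only):
+
+.. code-block:: python
+
+    import numpy as np
+
+    x = np.linspace(-20, 20, 80)
+    desired_span = 6.0  # total window width, in the same units as x
+    spacing = x[1] - x[0]  # assumes evenly spaced data
+    half_window = max(1, int(round(desired_span / (2 * spacing))))
+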
diff --git a/docs/algorithms_2d/spline_2d.rst b/docs/algorithms_2d/spline_2d.rst
new file mode 100644
index 0000000..638bd5b
--- /dev/null
+++ b/docs/algorithms_2d/spline_2d.rst
@@ -0,0 +1,250 @@
+================
+Spline Baselines
+================
+
+Introduction
+------------
+
+The two dimensional extension of penalized splines (P-splines) for baseline correction
+within pybaselines follows the framework of Eilers, Currie, and Durbán
+from `[1] `_.
+
+Let the number of rows be :math:`M` and the number of columns :math:`N` within the matrix
+of measured data :math:`Y`. Note that :math:`y` is the flattened array of matrix :math:`Y`
+with length :math:`M * N`. Let :math:`Y` be a function of :math:`x` along the rows and :math:`z`
+along the columns, i.e. :math:`Y_{ij} = f(x_i, z_j)`, and let :math:`B_r(x)` and :math:`B_c(z)` represent
+the spline basis matrices along the rows and columns, with :math:`g` and :math:`h` knots,
+respectively. Analogous to the 1D case, the goal is to make the baseline, :math:`V`, match the measured
+data as well as possible while also penalizing the difference between spline coefficients, resulting
+in the following minimization:
+
+.. math::
+
+    \sum\limits_{i}^M \sum\limits_{j}^N W_{ij} (Y_{ij} - \sum\limits_{k}^g \sum\limits_{l}^h B_{r,k}(x_i) B_{c,l}(z_j) \alpha_{kl})^2
+    + \lambda_r \sum\limits_{i}^{g - d_r} (\alpha_{i\bullet} \Delta^{d_r})^2
+    + \lambda_c \sum\limits_{j}^{h - d_c} (\Delta^{d_c} \alpha_{j\bullet})^2
+
+and
+
+.. math::
+
+    V = \sum\limits_{i}^g \sum\limits_{j}^h B_{r,i} B_{c,j} \alpha_{ij}
+
+
+where :math:`Y_{ij}` is the measured data, :math:`\alpha` is the matrix of spline coefficients,
+:math:`\lambda_r` is the penalty along the rows, :math:`\lambda_c` is the
+penalty along the columns, :math:`W_{ij}` is the weighting, :math:`\Delta^{d_r}` is the finite-difference
+operator of order :math:`d_r` along each row of :math:`\alpha`, :math:`\alpha_{i\bullet}`, and :math:`\Delta^{d_c}` is the
+finite-difference operator of order :math:`d_c` along each column of :math:`\alpha`, :math:`\alpha_{j\bullet}`.
+
+Let :math:`B = B_c \otimes B_r` denote the Kronecker product of the basis matrices for the columns and rows,
+which represents the overall two dimensional tensor product spline basis. The resulting linear equation for
+solving the above minimization is:
+
+.. math::
+
+    (B^{\top} W_{diag} B + \lambda_r I_h \otimes D_{d_r}^{\top} D_{d_r} + \lambda_c D_{d_c}^{\top} D_{d_c} \otimes I_g) \alpha = B^{\top} W_{diag} y
+
+and the baseline is then:
+
+.. math::
+
+    v = B \alpha
+
+where :math:`W_{diag}` is the diagonal matrix of the flattened weights, :math:`v` is the flattened
+estimated baseline, and :math:`D_d` is the matrix version of :math:`\Delta^d`, as already explained for
+the :ref:`1D case `. Further, :math:`\otimes` denotes the Kronecker
+product, and :math:`I_g` and :math:`I_h` are the identity matrices of length :math:`g` and
+:math:`h`, respectively. After solving, the array :math:`v` can then be reshaped into the matrix :math:`V`.
+
+Since experimental data is measured on gridded data (i.e. :math:`Y_{ij} = f(x_i, z_j)`), the above equation
+can be optimized following `[1] `_ and expressed as a
+`generalized linear array model `_
+which allows directly using the matrices of the measured data, :math:`Y`, and the weights,
+:math:`W`, rather than flattening them, which significantly reduces the required
+memory and computation time.
+
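+The reformulation relies on the standard tensor product identity that flattening
+:math:`B_r \alpha B_c^{\top}` in column-major order equals :math:`(B_c \otimes B_r)`
+times the flattened :math:`\alpha`, which the following short sketch verifies with
+random matrices (purely illustrative; pybaselines' internals are not shown here):
+
+.. code-block:: python
+
+    import numpy as np
+
+    rng = np.random.default_rng(0)
+    M, N, g, h = 6, 5, 4, 3
+    B_r = rng.random((M, g))  # basis along the rows
+    B_c = rng.random((N, h))  # basis along the columns
+    alpha = rng.random((g, h))  # spline coefficients
+
+    V = B_r @ alpha @ B_c.T  # the memory-efficient matrix form
+    v = np.kron(B_c, B_r) @ alpha.ravel(order='F')  # the flattened Kronecker form
+    assert np.allclose(V.ravel(order='F'), v)
+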
+.. _generalized-linear-array-model-explanation:
+
+Let :math:`F` be the
+`face-splitting product operator `_
+of a matrix with itself such that :math:`F(B_r) = (B_r \otimes 1_{g}^{\top}) \odot (1_{g}^{\top} \otimes B_r)`
+and :math:`F(B_c) = (B_c \otimes 1_{h}^{\top}) \odot (1_{h}^{\top} \otimes B_c)`, where
+:math:`1_g` and :math:`1_h` are vectors of ones of length :math:`g` and :math:`h`, respectively,
+and :math:`\odot` signifies elementwise multiplication. Then the linear equation can be rewritten as:
+
+.. math::
+
+    (F(B_r)^{\top} W F(B_c) + \lambda_r I_h \otimes D_{d_r}^{\top} D_{d_r} + \lambda_c D_{d_c}^{\top} D_{d_c} \otimes I_g) \alpha = B_{r}^{\top} (W \odot Y) B_c
+
+and the baseline is:
+
+.. math::
+
+    V = B_r \alpha B_{c}^{\top}
+
+
+Algorithms
+----------
+
+mixture_model (Mixture Model)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+:meth:`~.Baseline2D.mixture_model`:
+:ref:`explanation for the algorithm `.
+
+.. plot::
+    :align: center
+    :context: reset
+
+    import numpy as np
+    import matplotlib.pyplot as plt
+    from pybaselines.utils import gaussian2d
+    from pybaselines import Baseline2D
+
+
+    def create_data():
+        x = np.linspace(-20, 20, 80)
+        z = np.linspace(-20, 20, 80)
+        X, Z = np.meshgrid(x, z, indexing='ij')
+        signal = (
+            gaussian2d(X, Z, 12, -9, -9)
+            + gaussian2d(X, Z, 11, 3, 3)
+            + gaussian2d(X, Z, 13, 11, 11)
+            + gaussian2d(X, Z, 8, 5, -11, 1.5, 1)
+            + gaussian2d(X, Z, 16, -8, 8)
+        )
+        baseline = 0.1 + 0.08 * X - 0.05 * Z + 0.005 * (Z + 20)**2
+        noise = np.random.default_rng(0).normal(scale=0.1, size=signal.shape)
+        y = signal + baseline + noise
+
+        return x, z, y, baseline
+
+
+    def create_plots(y, fit_baseline):
+        X, Z = np.meshgrid(
+            np.arange(y.shape[0]), np.arange(y.shape[1]), indexing='ij'
+        )
+
+        # 4 total plots: 2 contours and 2 projections
+        row_names = ('Raw Data', 'Baseline Corrected')
+        for i, dataset in enumerate((y, y - fit_baseline)):
+            fig = plt.figure(layout='constrained', figsize=plt.figaspect(0.5))
+            fig.suptitle(row_names[i])
+            ax = fig.add_subplot(1, 2, 2)
+            ax.contourf(X, Z, dataset, cmap='coolwarm')
+            ax.set_xticks([])
+            ax.set_yticks([])
+            ax_2 = fig.add_subplot(1, 2, 1, projection='3d')
+            ax_2.plot_surface(X, Z, dataset, cmap='coolwarm')
+            ax_2.set_xticks([])
+            ax_2.set_yticks([])
+            ax_2.set_zticks([])
+
+
+    x, z, y, real_baseline = create_data()
+    baseline_fitter = Baseline2D(x, z, check_finite=False)
+
+    baseline, params = baseline_fitter.mixture_model(y, lam=(1e3, 1e2))
+    create_plots(y, baseline)
+
+
+irsqr (Iterative Reweighted Spline Quantile Regression)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+:meth:`~.Baseline2D.irsqr`:
+:ref:`explanation for the algorithm `.
+
+.. plot::
+    :align: center
+    :context: close-figs
+
+    baseline, params = baseline_fitter.irsqr(y, lam=(1e3, 1e2), quantile=0.3)
+    create_plots(y, baseline)
+
+
+pspline_asls (Penalized Spline Asymmetric Least Squares)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+:meth:`~.Baseline2D.pspline_asls`:
+:ref:`explanation for the algorithm `.
+
+.. plot::
+    :align: center
+    :context: close-figs
+
+    baseline, params = baseline_fitter.pspline_asls(y, lam=(1e3, 1e0), p=0.005)
+    create_plots(y, baseline)
+
+
+pspline_iasls (Penalized Spline Improved Asymmetric Least Squares)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+:meth:`~.Baseline2D.pspline_iasls`:
+:ref:`explanation for the algorithm `.
+
+.. plot::
+    :align: center
+    :context: close-figs
+
+    baseline, params = baseline_fitter.pspline_iasls(y, lam=(1e2, 1e-2))
+    create_plots(y, baseline)
+
+
+pspline_airpls (Penalized Spline Adaptive Iteratively Reweighted Penalized Least Squares)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+:meth:`~.Baseline2D.pspline_airpls`:
+:ref:`explanation for the algorithm `.
+
+.. plot::
+    :align: center
+    :context: close-figs
+
+    baseline, params = baseline_fitter.pspline_airpls(y, lam=(1e3, 1e-1))
+    create_plots(y, baseline)
+
+
+pspline_arpls (Penalized Spline Asymmetrically Reweighted Penalized Least Squares)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+:meth:`~.Baseline2D.pspline_arpls`:
+:ref:`explanation for the algorithm `.
+
+.. plot::
+    :align: center
+    :context: close-figs
+
+    baseline, params = baseline_fitter.pspline_arpls(y, lam=(1e3, 5e0))
+    create_plots(y, baseline)
+
+
+pspline_iarpls (Penalized Spline Improved Asymmetrically Reweighted Penalized Least Squares)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+:meth:`~.Baseline2D.pspline_iarpls`:
+:ref:`explanation for the algorithm `.
+
+.. plot::
+    :align: center
+    :context: close-figs
+
+    baseline, params = baseline_fitter.pspline_iarpls(y, lam=(1e2, 1e0))
+    create_plots(y, baseline)
+
+
+pspline_psalsa (Penalized Spline Peaked Signal's Asymmetric Least Squares Algorithm)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+:meth:`~.Baseline2D.pspline_psalsa`:
+:ref:`explanation for the algorithm `.
+
+.. plot::
+    :align: center
+    :context: close-figs
+
+    baseline, params = baseline_fitter.pspline_psalsa(y, lam=(1e3, 5e0), k=0.5)
+    create_plots(y, baseline)
diff --git a/docs/algorithms_2d/whittaker_2d.rst b/docs/algorithms_2d/whittaker_2d.rst
new file mode 100644
index 0000000..18211ec
--- /dev/null
+++ b/docs/algorithms_2d/whittaker_2d.rst
@@ -0,0 +1,278 @@
+===================
+Whittaker Baselines
+===================
+
+Introduction
+------------
+
+Excellent introductory papers on two dimensional penalized least squares are
+`[1] `_ and
+`[2] `_. Whittaker-smoothing-based
+algorithms are extended to two dimensional data as follows:
+
+Let the number of rows be :math:`M` and the number of columns :math:`N` within the matrix
+of measured data :math:`Y`. Note that :math:`y` is the flattened array of matrix :math:`Y`
+with length :math:`M * N`. Analogous to the 1D case, the goal is to make the baseline match
+the measured data as well as possible while also penalizing the roughness of the baseline, resulting
+in the following minimization:
+
+.. math::
+
+    \sum\limits_{i}^M \sum\limits_{j}^N W_{ij} (Y_{ij} - V_{ij})^2
+    + \lambda_r \sum\limits_{i}^{M - d_r} (V_{i\bullet} \Delta^{d_r})^2
+    + \lambda_c \sum\limits_{j}^{N - d_c} (\Delta^{d_c} V_{j\bullet})^2
+
+where :math:`Y_{ij}` is the measured data, :math:`V_{ij}` is the estimated baseline,
+:math:`\lambda_r` is the penalty along the rows, :math:`\lambda_c` is the penalty along the columns,
+:math:`W_{ij}` is the weighting, :math:`\Delta^{d_r}` is the finite-difference operator of order
+:math:`d_r` along each row of :math:`V`, :math:`V_{i\bullet}`, and :math:`\Delta^{d_c}` is the
+finite-difference operator of order :math:`d_c` along each column of :math:`V`, :math:`V_{j\bullet}`.
+
+The resulting linear equation for solving the above minimization is:
+
+.. math::
+
+    (W_{diag} + \lambda_r I_N \otimes D_{d_r}^{\top} D_{d_r} + \lambda_c D_{d_c}^{\top} D_{d_c} \otimes I_M) v = W_{diag} y
+
+
+where :math:`W_{diag}` is the diagonal matrix of the flattened weights, and :math:`D_d` is the matrix
+version of :math:`\Delta^d`, as already explained for the :ref:`1D case `.
+Further, :math:`\otimes` denotes the `Kronecker product `_,
+and :math:`I_M` and :math:`I_N` are the identity matrices of length :math:`M` and :math:`N`, respectively.
+After solving, the array :math:`v` can then be reshaped into the matrix :math:`V`.
+
+Since the analytical solution for 2D requires matrices of shape :math:`(M*N, M*N)`, it is quite
+expensive in both memory and computation to solve. Although the left hand side of the equation is
+still sparse and symmetric, it cannot be solved as easily compared to the 1D case since the
+bandwidth is no longer small due to the penalties along both the rows and columns (plus the
+sparse solver currently available in SciPy cannot make use of the symmetric nature of the matrix;
+using `Cholesky factorization `_ does provide a speed
+up but still does not scale well above ~500x500 sized matrices). However...
+
+Eigendecomposition
+~~~~~~~~~~~~~~~~~~
+
+By following the excellent insights laid out by G. Biessy in `[2] `_,
+the dimensionality of the system can be reduced by using eigendecomposition on each of the two
+penalty matrices, :math:`D_{d_r}^{\top} D_{d_r}` and :math:`D_{d_c}^{\top} D_{d_c}`. (Note that speeding up
+Whittaker smoothing using `factorization in 1D `_ and using the
+`analytical eigenvalues in nD (great paper) `_ are established
+methods, although they require using a fixed difference order, and, in the second case, using
+different boundary conditions that unfortunately do not translate well from smoothing to baseline correction.)
+The general eigendecomposition of the penalty matrix gives
+
+.. math::
+
+    D_{d}^{\top} D_{d} = U \Sigma U^{\top}
+
+where :math:`U` is the matrix of eigenvectors and :math:`\Sigma` is a diagonal matrix
+with the eigenvalues along the diagonal. Letting :math:`B = U_c \otimes U_r` denote the Kronecker
+product of the eigenvector matrices of the penalty for the columns and rows, and :math:`g` and
+:math:`h` denote the number of eigenvectors along the rows and columns, respectively, the linear equation
+can be rewritten as:
+
+.. math::
+
+    (B^{\top} W_{diag} B + \lambda_r I_h \otimes \Sigma_r + \lambda_c \Sigma_c \otimes I_g) \alpha = B^{\top} W_{diag} y
+
+and the baseline is then:
+
+.. math::
+
+    v = B \alpha
+
+The beauty of this reparameterization when applied to baseline correction is twofold:
+
+1) The number of eigenvalues required to approximate the analytical solution depends on
+   the required smoothness, i.e. some constant approximated by :math:`\lambda / (\text{number of data points})`
+   that does not appreciably change with data size. Baselines require much less smoothness than
+   smoothing, so the number of eigenvalues is relatively low (from testing, ~5-10 for low order
+   polynomial baselines and ~15-25 for sinusoidal baselines).
+2) Since experimental data is measured on gridded data (i.e. :math:`Y_{ij} = f(x_i, z_j)`), the
+   above equation can be further optimized by expressing it as a
+   `generalized linear array model `_,
+   following the brilliant insights of `Eilers, Currie, and Durbán `_,
+   exactly as :ref:`explained for 2D penalized splines `.
+
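+To make the analytical equation above concrete, the following sketch builds and
+solves the full system on a small grid with SciPy's sparse Kronecker products.
+This is for understanding only (it is not how pybaselines solves the problem, and
+it assumes column-major flattening and a recent SciPy with the sparse array API):
+
+.. code-block:: python
+
+    import numpy as np
+    from scipy import sparse
+    from scipy.sparse.linalg import spsolve
+
+    M, N = 30, 25  # keep the grid small since the system is (M*N, M*N)
+    lam_r, lam_c = 1e2, 1e1
+    Y = np.random.default_rng(0).random((M, N))
+    weights = np.ones(M * N)
+
+    # second order finite difference matrices along the rows and columns
+    D_r = sparse.csr_array(np.diff(np.eye(M), 2, axis=0))
+    D_c = sparse.csr_array(np.diff(np.eye(N), 2, axis=0))
+    lhs = (
+        sparse.diags_array(weights)
+        + lam_r * sparse.kron(sparse.eye_array(N), D_r.T @ D_r)
+        + lam_c * sparse.kron(D_c.T @ D_c, sparse.eye_array(M))
+    )
+    v = spsolve(lhs.tocsc(), weights * Y.ravel(order='F'))
+    V = v.reshape((M, N), order='F')  # reshape the baseline back into a matrix
+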
+.. note::
+    For two dimensional data, Whittaker-smoothing-based algorithms take a single ``lam``
+    parameter that can either be a single number, in which case both the rows and columns
+    will use the same smoothing parameter, i.e. :math:`\lambda_r = \lambda_c`, or a sequence
+    of two numbers (:math:`\lambda_r`, :math:`\lambda_c`) to use different values for the
+    rows and columns.
+
+Algorithms
+----------
+
+asls (Asymmetric Least Squares)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+:meth:`~.Baseline2D.asls`:
+:ref:`explanation for the algorithm `.
+
+.. plot::
+    :align: center
+    :context: reset
+
+    import numpy as np
+    import matplotlib.pyplot as plt
+    from pybaselines.utils import gaussian2d
+    from pybaselines import Baseline2D
+
+
+    def create_data():
+        x = np.linspace(-20, 20, 80)
+        z = np.linspace(-20, 20, 80)
+        X, Z = np.meshgrid(x, z, indexing='ij')
+        signal = (
+            gaussian2d(X, Z, 12, -9, -9)
+            + gaussian2d(X, Z, 11, 3, 3)
+            + gaussian2d(X, Z, 13, 11, 11)
+            + gaussian2d(X, Z, 8, 5, -11, 1.5, 1)
+            + gaussian2d(X, Z, 16, -8, 8)
+        )
+        baseline = 0.1 + 0.08 * X - 0.05 * Z + 0.005 * (Z + 20)**2
+        noise = np.random.default_rng(0).normal(scale=0.1, size=signal.shape)
+        y = signal + baseline + noise
+
+        return x, z, y, baseline
+
+
+    def create_plots(y, fit_baseline):
+        X, Z = np.meshgrid(
+            np.arange(y.shape[0]), np.arange(y.shape[1]), indexing='ij'
+        )
+
+        # 4 total plots: 2 contours and 2 projections
+        row_names = ('Raw Data', 'Baseline Corrected')
+        for i, dataset in enumerate((y, y - fit_baseline)):
+            fig = plt.figure(layout='constrained', figsize=plt.figaspect(0.5))
+            fig.suptitle(row_names[i])
+            ax = fig.add_subplot(1, 2, 2)
+            ax.contourf(X, Z, dataset, cmap='coolwarm')
+            ax.set_xticks([])
+            ax.set_yticks([])
+            ax_2 = fig.add_subplot(1, 2, 1, projection='3d')
+            ax_2.plot_surface(X, Z, dataset, cmap='coolwarm')
+            ax_2.set_xticks([])
+            ax_2.set_yticks([])
+            ax_2.set_zticks([])
+
+
+    x, z, y, real_baseline = create_data()
+    baseline_fitter = Baseline2D(x, z, check_finite=False)
+
+    baseline, params = baseline_fitter.asls(y, lam=(1e2, 1e1), p=0.001)
+    create_plots(y, baseline)
+
+
+iasls (Improved Asymmetric Least Squares)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+:meth:`~.Baseline2D.iasls`:
+:ref:`explanation for the algorithm `.
+Eigendecomposition is not allowed for this method.
+
+.. plot::
+    :align: center
+    :context: close-figs
+
+    # to see contents of create_data function, look at the top-most algorithm's code
+    baseline, params = baseline_fitter.iasls(y, lam=(1e3, 1e0))
+    create_plots(y, baseline)
+
+
+airpls (Adaptive Iteratively Reweighted Penalized Least Squares)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+:meth:`~.Baseline2D.airpls`:
+:ref:`explanation for the algorithm `.
+
+.. plot::
+    :align: center
+    :context: close-figs
+
+    # to see contents of create_plots function, look at the top-most algorithm's code
+    baseline, params = baseline_fitter.airpls(y, lam=(1e3, 1e1))
+    create_plots(y, baseline)
+
+
+arpls (Asymmetrically Reweighted Penalized Least Squares)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+:meth:`~.Baseline2D.arpls`:
+:ref:`explanation for the algorithm `.
+
+.. 
plot:: + :align: center + :context: close-figs + + # to see contents of create_plots function, look at the top-most algorithm's code + baseline, params = baseline_fitter.arpls(y, lam=(1e4, 1e2)) + create_plots(y, baseline) + + +drpls (Doubly Reweighted Penalized Least Squares) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:meth:`~.Baseline2D.drpls`: +:ref:`explanation for the algorithm `. +Eigendecomposition is not allowed for this method. + +.. plot:: + :align: center + :context: close-figs + + # to see contents of create_plots function, look at the top-most algorithm's code + baseline, params = baseline_fitter.drpls(y, lam=(1e3, 1e2)) + create_plots(y, baseline) + + +iarpls (Improved Asymmetrically Reweighted Penalized Least Squares) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:meth:`~.Baseline2D.iarpls`: +:ref:`explanation for the algorithm `. + +.. plot:: + :align: center + :context: close-figs + + # to see contents of create_plots function, look at the top-most algorithm's code + baseline, params = baseline_fitter.iarpls(y, lam=(1e3, 1e2)) + create_plots(y, baseline) + + +aspls (Adaptive Smoothness Penalized Least Squares) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:meth:`~.Baseline2D.aspls`: +:ref:`explanation for the algorithm `. +Eigendecomposition is not allowed for this method. + +.. plot:: + :align: center + :context: close-figs + + # to see contents of create_plots function, look at the top-most algorithm's code + baseline, params = baseline_fitter.aspls(y, lam=(1e3, 1e2)) + create_plots(y, baseline) + + +psalsa (Peaked Signal's Asymmetric Least Squares Algorithm) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:meth:`~.Baseline2D.psalsa`: +:ref:`explanation for the algorithm `. + +.. plot:: + :align: center + :context: close-figs + + # to see contents of create_plots function, look at the top-most algorithm's code + baseline, params = baseline_fitter.psalsa(y, lam=(1e3, 1e2), k=0.5) + create_plots(y, baseline) diff --git a/docs/conf.py b/docs/conf.py index 16a31f7..b8867a4 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -33,16 +33,15 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. extensions = [ #'sphinx.ext.autodoc', - #'sphinx.ext.autosummary', + #'sphinx.ext.autosummary', # use autoapi instead of autodoc and autosummary 'autoapi.extension', 'sphinx.ext.intersphinx', - 'sphinx.ext.napoleon', + #'sphinx.ext.napoleon', # use numpydoc instead + 'numpydoc', 'sphinx.ext.todo', 'sphinx.ext.mathjax', 'sphinx.ext.viewcode', - # austosectionlabel allows referencing sections by their titles, but was throwing lots of duplicate object warnings - # since changes.rst and CHANGELOG.rst have the same section headers - #'sphinx.ext.autosectionlabel', + 'sphinx.ext.autosectionlabel', 'matplotlib.sphinxext.plot_directive', 'sphinx_gallery.gen_gallery' ] @@ -136,15 +135,28 @@ #'special-members', # show things like __str__ #'imported-members', # document things imported within each module ] -autoapi_member_order = 'groupwise' # groups into classes, functions, etc. -autoapi_python_class_content = 'class' # include class docstring from class and/or __init__ -#autoapi_keep_files = True # keep the files after generation -#autoapi_add_toctree_entry = False # need to manually add to toctree if False -#autoapi_generate_api_docs = False # will not generate new docs when False +autoapi_member_order = 'groupwise' # groups into classes, functions, etc. 
+autoapi_python_class_content = 'class'  # include class docstring from class and/or __init__
+autoapi_keep_files = False  # keep the files after generation
+autoapi_add_toctree_entry = True  # need to manually add to toctree if False
+autoapi_generate_api_docs = True  # will not generate new docs when False
 
 # ignore an import warning from sphinx-autoapi due to double import of utils
-suppress_warnings = ['autoapi.python_import_resolution']
+suppress_warnings = ['autoapi.python_import_resolution', 'autosectionlabel']
 
+# -- Settings for matplotlib plot_directive extension ----------------------------
+
+plot_include_source = False
+
+plot_formats = ['png']
+
+# -- Settings for numpydoc extension ----------------------------
+
+# uses the matplotlib plot_directive extension when "import matplotlib" is in a docstring
+numpydoc_use_plots = True
+
+# creates cross references for types in docstrings
+numpydoc_xref_param_type = False
 
 # -- Settings for sphinx-gallery extension ----------------------------
@@ -197,7 +209,7 @@
     html_theme = 'nature'
 else:
     html_theme = 'sphinx_rtd_theme'
-    del sphinx_rtd_theme
+
 
 # Theme options are theme-specific and customize the look and feel of a
 # theme further. For a list of options available for each theme, see the
@@ -214,7 +226,6 @@
     #'_static'
 ]
 
-
 # -- Options for HTMLHelp output ---------------------------------------
 
 # Output file base name for HTML help builder.
diff --git a/docs/contributing.rst b/docs/contributing.rst
index ea6bb37..ceb7326 100644
--- a/docs/contributing.rst
+++ b/docs/contributing.rst
@@ -7,7 +7,8 @@ Contributions are welcomed and greatly appreciated.
 Bug Reports/Feedback
 ~~~~~~~~~~~~~~~~~~~~
 
-Report bugs or give feedback by filing an issue at https://github.com/derb12/pybaselines/issues.
+Report bugs, ask questions, or give feedback by filing an issue
+at https://github.com/derb12/pybaselines/issues.
 
 If you are reporting a bug, please include:
 
@@ -23,8 +24,7 @@ If you are proposing a feature:
 Pull Requests
 ~~~~~~~~~~~~~
 
-Pull requests are welcomed for this project, but please note that
-unsolicited pull requests are discouraged. Please file an issue first,
+Pull requests are welcomed for this project. Generally, it is preferred to file an issue first,
 so that details can be discussed/finalized before a pull request is created.
 
 Any new code or documentation must be able to be covered by the BSD 3-clause license
@@ -35,18 +35,21 @@ When submitting a pull request, follow similar procedures for a feature request,
 * Explain in detail how it works.
 * Keep the scope as narrow as possible to make it easier to incorporate.
 
+The following sections will detail how to set up a development environment for contributing
+code to pybaselines and all of the potential checks to run.
 
-Set Up Development Environment
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-To clone the GitHub repository and install the necessary libraries for development:
+Setting Up Development Environment
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+To clone the GitHub repository and install the necessary libraries for development,
+ensure `git `_ is installed and then run:
 
 .. code-block:: console
 
     git clone https://github.com/derb12/pybaselines.git
     cd pybaselines
-    pip install -r requirements/requirements-development.txt
-    pip install -e .
+    pip install .[dev]
 
 All sections below assume the above commands were run.
 
@@ -65,13 +68,13 @@
 terminal while in the pybaselines directory:
 
 .. code-block:: console
 
-    flake8 . --statistics
+    ruff check .
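+Ruff can also automatically fix many of the simpler issues that it reports (the
+``--fix`` flag is optional and any changes it makes should be reviewed):
+
+.. code-block:: console
+
+    ruff check . --fix
+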
Testing ^^^^^^^ -If implementing a new feature, please add any necessary tests. To check that tests pass +If adding new code, please add any necessary tests. To check that tests pass locally, run the following command in the terminal while in the pybaselines directory: .. code-block:: console @@ -103,7 +106,7 @@ Documentation If submitting changes to the documentation or adding documentation for a new feature/algorithm, please ensure the documentation builds locally by running the following command while in the -docs directory: +``pybaselines/docs`` directory: .. code-block:: console diff --git a/docs/index.rst b/docs/index.rst index c9d3ae0..9c5d1cc 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -23,6 +23,7 @@ pybaselines is a library of algorithms for the baseline correction of experiment quickstart parameter_selection algorithms/index + algorithms_2d/index examples/index api/index contributing diff --git a/docs/installation.rst b/docs/installation.rst index 0d4573e..b2e7f2d 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -8,10 +8,10 @@ Installation Dependencies ~~~~~~~~~~~~ -pybaselines requires `Python `_ version 3.6 or later and the following libraries: +pybaselines requires `Python `_ version 3.8 or later and the following libraries: -* `NumPy `_ (>= 1.14) -* `SciPy `_ (>= 1.0) +* `NumPy `_ (>= 1.20) +* `SciPy `_ (>= 1.5) All of the required libraries should be automatically installed when @@ -22,16 +22,16 @@ Optional Dependencies pybaselines has the following optional dependencies: -* `numba `_ (>= 0.45): +* `numba `_ (>= 0.49): speeds up calculations used by the following functions: - * :meth:`.loess` - * :meth:`.dietrich` - * :meth:`.golotvin` - * :meth:`.std_distribution` - * :meth:`.fastchrom` - * :meth:`.beads` - * :meth:`.mpspline` + * :meth:`~Baseline.loess` + * :meth:`~Baseline.dietrich` + * :meth:`~Baseline.golotvin` + * :meth:`~Baseline.std_distribution` + * :meth:`~Baseline.fastchrom` + * :meth:`~Baseline.beads` + * :meth:`~Baseline.mpspline` * all functions in :mod:`pybaselines.spline` * `pentapy `_ (>= 1.0): @@ -39,9 +39,9 @@ pybaselines has the following optional dependencies: used by the following functions (when ``diff_order=2``): * all functions in :mod:`pybaselines.whittaker` - * :meth:`.mpls` - * :meth:`.jbcd` - * :meth:`.fabc` + * :meth:`~Baseline.mpls` + * :meth:`~Baseline.jbcd` + * :meth:`~Baseline.fabc` Stable Release @@ -76,7 +76,8 @@ Development Version The sources for pybaselines can be downloaded from the `GitHub repo `_. -To directly install the current version of pybaselines from GitHub, run: +To directly install the current version of pybaselines from GitHub, +ensure `git `_ is installed and then run: .. code-block:: console diff --git a/docs/make.bat b/docs/make.bat index ef0cc78..7d2f9a1 100644 --- a/docs/make.bat +++ b/docs/make.bat @@ -9,7 +9,6 @@ if "%SPHINXBUILD%" == "" ( ) set SOURCEDIR=. set BUILDDIR=_build -set SPHINXPROJ=mcetl if "%1" == "" goto help diff --git a/examples/classification/plot_classifier_masks.py b/examples/classification/plot_classifier_masks.py index f0d8a18..852bf5d 100644 --- a/examples/classification/plot_classifier_masks.py +++ b/examples/classification/plot_classifier_masks.py @@ -3,7 +3,7 @@ Classification masks -------------------- -The baseline algorithms in the :mod:`.classification` module estimate the baseline +The baseline algorithms in the :mod:`~pybaselines.classification` module estimate the baseline by classifying each point as belonging to either the baseline or the peaks. 
 When first using a function, the correct parameters may not be known. To make the effects
 of input parameters on the classification process more easily understood, all functions
diff --git a/examples/classification/plot_fastchrom_threshold.py b/examples/classification/plot_fastchrom_threshold.py
index ef321e4..a1fc853 100644
--- a/examples/classification/plot_fastchrom_threshold.py
+++ b/examples/classification/plot_fastchrom_threshold.py
@@ -3,7 +3,7 @@
 fastchrom threshold
 -------------------
 
-:meth:`.fastchrom` classifies baseline points based on their rolling standard
+:meth:`~.Baseline.fastchrom` classifies baseline points based on their rolling standard
 deviation value.
 
 The default threshold for fastchrom is set to the fifteenth percentile of the rolling
 standard deviation distribution. This default is rather conservative in assigning
diff --git a/examples/general/plot_algorithm_convergence.py b/examples/general/plot_algorithm_convergence.py
index 0fcd575..cd99290 100644
--- a/examples/general/plot_algorithm_convergence.py
+++ b/examples/general/plot_algorithm_convergence.py
@@ -9,8 +9,8 @@
 the measured tolerance value at each iteration. The `tol_history` parameter can be
 helpful for determining appropriate `max_iter` or `tol` values.
 
-In this example, the convergence of the :meth:`.asls` and :meth:`.aspls` functions
-will be compared. asls is a relatively simple calculation that sets its weighting
+In this example, the convergence of the :meth:`~.Baseline.asls` and :meth:`~.Baseline.aspls`
+functions will be compared. asls is a relatively simple calculation that sets its weighting
 each iteration based on whether the current baseline is above or below the input data at
 each point. aspls has a much more intricate weighting based on the logistic distribution
 of the residuals (data minus baseline); further, aspls also updates an additional
diff --git a/examples/general/plot_noisy_data.py b/examples/general/plot_noisy_data.py
index d2012de..7557e35 100644
--- a/examples/general/plot_noisy_data.py
+++ b/examples/general/plot_noisy_data.py
@@ -8,8 +8,8 @@
 This example will show how to reduce this issue by simply smoothing the data before
 performing baseline correction.
 
-Two algorithms will be compared: :meth:`.modpoly`, which is not suited for noisy
-data, and :meth:`.imodpoly`, which is a modification of the modpoly algorithm
+Two algorithms will be compared: :meth:`~.Baseline.modpoly`, which is not suited for noisy
+data, and :meth:`~.Baseline.imodpoly`, which is a modification of the modpoly algorithm
 created specifically to address noise.
 
 """
diff --git a/examples/misc/plot_beads_preprocessing.py b/examples/misc/plot_beads_preprocessing.py
index 7c63200..d5ade81 100644
--- a/examples/misc/plot_beads_preprocessing.py
+++ b/examples/misc/plot_beads_preprocessing.py
@@ -3,13 +3,13 @@
 Preprocessing for beads
 -----------------------
 
-The Baseline Estimation And Denoising with Sparsity (:meth:`.beads`) algorithm is a
+The Baseline Estimation And Denoising with Sparsity (:meth:`~.Baseline.beads`) algorithm is a
 robust method for both performing baseline subtraction and removing noise. One of
 the main drawbacks of the original algorithm is that it requires both ends of the data
 to be at zero. This example will explore the consequences of this as well as a
 preprocessing step proposed by `Navarro-Huerta, J.A., et al. Assisted baseline
 subtraction in complex chromatograms using the BEADS algorithm. Journal of Chromatography
-A, 2017, 1507, 1-10` that helps to address this issue.
+A, 2017, 1507, 1-10` implemented in pybaselines that helps to address this issue. """ # sphinx_gallery_thumbnail_number = 4 diff --git a/examples/morphological/plot_half_window_effects.py b/examples/morphological/plot_half_window_effects.py index 4702153..e301ace 100644 --- a/examples/morphological/plot_half_window_effects.py +++ b/examples/morphological/plot_half_window_effects.py @@ -6,7 +6,7 @@ This example shows the influence of the `half_window` parameter that is used when fitting any morphological algorithm. -For this example, the :meth:`.mor` algorithm will be used, which is a relatively +For this example, the :meth:`~.Baseline.mor` algorithm will be used, which is a relatively robust baseline algorithm. """ diff --git a/examples/spline/plot_lam_vs_num_knots.py b/examples/spline/plot_lam_vs_num_knots.py index fe89965..525ce57 100644 --- a/examples/spline/plot_lam_vs_num_knots.py +++ b/examples/spline/plot_lam_vs_num_knots.py @@ -5,7 +5,7 @@ This example will examine the effects of `lam` for fitting a penalized spline baseline while varying both the number of knots for the spline, `num_knots`, and the number of -data points. The function :meth:`.mixture_model` is used for all calculations. +data points. The function :meth:`~.Baseline.mixture_model` is used for all calculations. Note that the exact optimal `lam` values reported in this example are not of significant use since they depend on many other factors such as the baseline curvature, noise, peaks, diff --git a/examples/spline/plot_pspline_whittaker.py b/examples/spline/plot_pspline_whittaker.py index e1dd6b9..3c78c0d 100644 --- a/examples/spline/plot_pspline_whittaker.py +++ b/examples/spline/plot_pspline_whittaker.py @@ -8,8 +8,8 @@ for doing so was that P-splines offer additional user flexibility when choosing parameters for fitting and more easily work for unequally spaced data. This example will examine the relationship of `lam` versus the number of data points when fitting -a baseline with the :meth:`.arpls` function and its P-spline version, -:meth:`.pspline_arpls`. +a baseline with the :meth:`~.Baseline.arpls` function and its P-spline version, +:meth:`~.Baseline.pspline_arpls`. Note that the exact optimal `lam` values reported in this example are not of significant use since they depend on many other factors such as the baseline curvature, noise, peaks, diff --git a/examples/whittaker/plot_lam_effects.py b/examples/whittaker/plot_lam_effects.py index e19f92f..6a13d1b 100644 --- a/examples/whittaker/plot_lam_effects.py +++ b/examples/whittaker/plot_lam_effects.py @@ -8,7 +8,7 @@ exact `lam` values used in this example are unimportant, just the changes in their scale. -For this example, the :meth:`.arpls` algorithm will be used, which performs +For this example, the :meth:`~.Baseline.arpls` algorithm will be used, which performs well in the presence of noise. """ diff --git a/examples/whittaker/plot_lam_vs_data_size.py b/examples/whittaker/plot_lam_vs_data_size.py index 1ac2668..3d2eaba 100644 --- a/examples/whittaker/plot_lam_vs_data_size.py +++ b/examples/whittaker/plot_lam_vs_data_size.py @@ -9,7 +9,7 @@ Whittaker-smoothing-based algorithm is dependent on the number of data points. Thus, this can cause issues when adapting an algorithm to a new set of data since the published optimal `lam` value is not universal. This example shows an analysis of this dependence -for all available functions in the :mod:`.whittaker` module. +for all available functions in the :mod:`pybaselines.whittaker` module. 
 Note that the exact optimal `lam` values reported in this example are not of significant
 use since they depend on many other factors such as the baseline curvature, noise, peaks,
diff --git a/examples/whittaker/plot_whittaker_solvers.py b/examples/whittaker/plot_whittaker_solvers.py
index a732f06..191aad5 100644
--- a/examples/whittaker/plot_whittaker_solvers.py
+++ b/examples/whittaker/plot_whittaker_solvers.py
@@ -7,7 +7,7 @@
 the banded structure of the linear system to reduce the computation time.
 
 This example shows the difference in computation times of the asymmetric least squares
-(:meth:`.asls`) algorithm when using the banded solver from Scipy (solveh_banded)
+(:meth:`~.Baseline.asls`) algorithm when using the banded solver from SciPy (solveh_banded)
 and the banded solver from the optional dependency
 `pentapy `_. In addition, the time it takes when
 solving the system using sparse matrices rather than the banded matrices
@@ -17,6 +17,10 @@
 is ~50-70% faster and pentapy's banded solver is ~70-90% faster, ultimately
 reducing the computation time by about an order of magnitude.
 
+Note that the performance of solving the sparse system can be improved by using
+`CHOLMOD from SuiteSparse `_, which has
+Python bindings provided by `scikit-sparse `_.
+
 """
 
 import time
@@ -24,7 +28,6 @@
 import matplotlib.pyplot as plt
 import numpy as np
-from scipy.sparse import spdiags
 from scipy.sparse.linalg import spsolve
 
 from pybaselines import whittaker, _banded_utils
@@ -51,12 +54,11 @@ def sparse_asls(data, lam=1e6, p=1e-2, diff_order=2, max_iter=50, tol=1e-3, weig
     diff_matrix = difference_matrix(num_y, diff_order, 'csc')
     penalty_matrix = lam * (diff_matrix.T @ diff_matrix)
+    original_diag = penalty_matrix.diagonal()
     tol_history = np.empty(max_iter + 1)
     for i in range(max_iter + 1):
-        baseline = spsolve(
-            spdiags(weight_array, 0, num_y, num_y, 'csr') + penalty_matrix,
-            weight_array * y, 'NATURAL'
-        )
+        penalty_matrix.setdiag(weight_array + original_diag)
+        baseline = spsolve(penalty_matrix, weight_array * y, 'NATURAL')
         mask = y > baseline
         new_weights = p * mask + (1 - p) * (~mask)
         calc_difference = relative_difference(weight_array, new_weights)
@@ -111,7 +113,8 @@ def make_data(num_x):
 
 if not _banded_utils._HAS_PENTAPY:
     warnings.warn(
-        'pentapy is not installed so pentapy and scipy-banded timings will be identical'
+        'pentapy is not installed so pentapy and scipy-banded timings will be identical',
+        stacklevel=2
    )
 
 # equation obtained following similar procedure as `lam` vs data size example
diff --git a/pybaselines/__init__.py b/pybaselines/__init__.py
index 854aed3..66cb073 100644
--- a/pybaselines/__init__.py
+++ b/pybaselines/__init__.py
@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
 """
+=======================================================================================
 pybaselines - A library of algorithms for the baseline correction of experimental data.
======================================================================================= @@ -102,3 +103,4 @@ ) from .api import Baseline +from .two_d.api import Baseline2D diff --git a/pybaselines/_algorithm_setup.py b/pybaselines/_algorithm_setup.py index 8b25ca0..0f9f247 100644 --- a/pybaselines/_algorithm_setup.py +++ b/pybaselines/_algorithm_setup.py @@ -23,9 +23,12 @@ from ._banded_utils import PenalizedSystem from ._spline_utils import PSpline from ._validation import ( - _check_array, _check_half_window, _check_optional_array, _check_sized_array, _yx_arrays + _check_array, _check_half_window, _check_optional_array, _check_scalar_variable, + _check_sized_array, _yx_arrays +) +from .utils import ( + ParameterWarning, _determine_sorts, _inverted_sort, _sort_array, optimize_window, pad_edges ) -from .utils import ParameterWarning, _inverted_sort, optimize_window, pad_edges class _Algorithm: @@ -42,10 +45,11 @@ class _Algorithm: that no polynomial fitting has been performed. pspline : PSpline or None The PSpline object for setting up and solving penalized spline algorithms. Is None - if no penalized spline setup has been performed (typically done in :meth:`._setup_spline`). + if no penalized spline setup has been performed (typically done in + :meth:`~_Algorithm._setup_spline`). vandermonde : numpy.ndarray or None The Vandermonde matrix for solving polynomial equations. Is None if no polynomial - setup has been performed (typically done in :meth:`._setup_polynomial`). + setup has been performed (typically done in :meth:`~_Algorithm._setup_polynomial`). whittaker_system : PenalizedSystem or None The PenalizedSystem object for setting up and solving Whittaker-smoothing-based algorithms. Is None if no Whittaker setup has been performed (typically done in @@ -97,9 +101,9 @@ def __init__(self, x_data=None, check_finite=True, assume_sorted=False, self._sort_order = None self._inverted_order = None else: - self._sort_order = self.x.argsort(kind='mergesort') - self.x = self.x[self._sort_order] - self._inverted_order = _inverted_sort(self._sort_order) + self._sort_order, self._inverted_order = _determine_sorts(self.x) + if self._sort_order is not None: + self.x = self.x[self._sort_order] self.whittaker_system = None self.vandermonde = None @@ -138,7 +142,7 @@ def pentapy_solver(self, value): self.whittaker_system.pentapy_solver = value self._pentapy_solver = value - def _return_results(self, baseline, params, dtype, sort_keys=(), axis=-1, skip_sorting=False): + def _return_results(self, baseline, params, dtype, sort_keys=(), skip_sorting=False): """ Re-orders the input baseline and parameters based on the x ordering. @@ -150,13 +154,11 @@ def _return_results(self, baseline, params, dtype, sort_keys=(), axis=-1, skip_s The baseline output by the baseline function. params : dict The parameter dictionary output by the baseline function. - dtype : [type] + dtype : type or numpy.dtype, optional The desired output dtype for the baseline. sort_keys : Iterable, optional An iterable of keys corresponding to the values in `params` that need re-ordering. Default is (). - axis : int, optional - The axis of the input which defines each unique set of data. Default is -1. skip_sorting : bool, optional If True, will skip sorting the output baseline. The keys in `sort_keys` will still be sorted. Default is False. 
@@ -175,7 +177,7 @@ def _return_results(self, baseline, params, dtype, sort_keys=(), axis=-1, skip_s
             # assumes params are all just one dimensional arrays
             params[key] = params[key][self._inverted_order]
 
         if not skip_sorting:
-            baseline = _sort_array(baseline, sort_order=self._inverted_order, axis=axis)
+            baseline = _sort_array(baseline, sort_order=self._inverted_order)
 
         baseline = baseline.astype(dtype, copy=False)
 
@@ -183,7 +185,7 @@
     @classmethod
     def _register(cls, func=None, *, sort_keys=(), dtype=None, order=None, ensure_1d=True,
-                  axis=-1, skip_sorting=False):
+                  skip_sorting=False):
         """
         Wraps a baseline function to validate inputs and correct outputs.
 
@@ -206,11 +208,9 @@
         ensure_1d : bool, optional
             If True (default), will raise an error if the shape of `array` is not a one dimensional
             array with shape (N,) or a two dimensional array with shape (N, 1) or (1, N).
-        axis : int, optional
-            The axis of the input on which to check its length. Default is -1.
         skip_sorting : bool, optional
-            If True, will skip sorting the inputs and outputs, which is useful for algorithms that use
-            other algorithms so that sorting is already internally done. Default is False.
+            If True, will skip sorting the inputs and outputs, which is useful for algorithms that
+            use other algorithms so that sorting is already internally done. Default is False.
 
         Returns
         -------
@@ -223,7 +223,7 @@
         if func is None:
             return partial(
                 cls._register, sort_keys=sort_keys, dtype=dtype, order=order,
-                ensure_1d=ensure_1d, axis=axis, skip_sorting=skip_sorting
+                ensure_1d=ensure_1d, skip_sorting=skip_sorting
             )
 
         @wraps(func)
@@ -235,16 +235,16 @@ def inner(self, data=None, *args, **kwargs):
                 input_y = True
                 y, self.x = _yx_arrays(
                     data, check_finite=self._check_finite, dtype=dtype, order=order,
-                    ensure_1d=ensure_1d, axis=axis
+                    ensure_1d=ensure_1d
                 )
-                self._len = y.shape[axis]
+                self._len = y.shape[-1]
             else:
                 reset_x = True
                 if data is not None:
                     input_y = True
                     y = _check_sized_array(
                         data, self._len, check_finite=self._check_finite, dtype=dtype, order=order,
-                        ensure_1d=ensure_1d, axis=axis, name='data'
+                        ensure_1d=ensure_1d, name='data'
                     )
                 else:
                     y = data
@@ -256,7 +256,7 @@
             )
 
             if input_y and not skip_sorting:
-                y = _sort_array(y, sort_order=self._sort_order, axis=axis)
+                y = _sort_array(y, sort_order=self._sort_order)
 
             if input_y and self._dtype is None:
                 output_dtype = y.dtype
@@ -267,9 +267,7 @@
             if reset_x:
                 self.x = np.array(self.x, dtype=x_dtype, copy=False)
 
-            return self._return_results(
-                baseline, params, output_dtype, sort_keys, axis, skip_sorting
-            )
+            return self._return_results(baseline, params, output_dtype, sort_keys, skip_sorting)
 
         return inner
 
@@ -341,7 +339,7 @@ def _setup_whittaker(self, y, lam=1, diff_order=2, weights=None, copy_weights=Fa
         ----------
         y : numpy.ndarray, shape (N,)
             The y-values of the measured data, already converted to a numpy
-            array by :meth:`._register`.
+            array by :meth:`~_Algorithm._register`.
         lam : float, optional
            The smoothing parameter, lambda.
Typical values are between 10 and 1e8, but it strongly depends on the penalized least square method @@ -415,7 +413,7 @@ def _setup_polynomial(self, y, weights=None, poly_order=2, calc_vander=False, ---------- y : numpy.ndarray, shape (N,) The y-values of the measured data, already converted to a numpy - array by :meth:`._register`. + array by :meth:`~_Algorithm._register`. weights : array-like, shape (N,), optional The weighting array. If None (default), then will be an array with size equal to N and all values set to 1. @@ -458,6 +456,9 @@ def _setup_polynomial(self, y, weights=None, poly_order=2, calc_vander=False, ) if self._sort_order is not None and weights is not None: weight_array = weight_array[self._sort_order] + poly_order = _check_scalar_variable( + poly_order, allow_zero=True, variable_name='polynomial order', dtype=int + ) if calc_vander: if self.vandermonde is None or poly_order > self.poly_order: @@ -491,7 +492,7 @@ def _setup_spline(self, y, weights=None, spline_degree=3, num_knots=10, ---------- y : numpy.ndarray, shape (N,) The y-values of the measured data, already converted to a numpy - array by :meth:`._register`. + array by :meth:`~_Algorithm._register`. weights : array-like, shape (N,), optional The weighting array. If None (default), then will be an array with size equal to N and all values set to 1. @@ -574,7 +575,7 @@ def _setup_morphology(self, y, half_window=None, **window_kwargs): ---------- y : numpy.ndarray, shape (N,) The y-values of the measured data, already converted to a numpy - array by :meth:`._register`. + array by :meth:`~_Algorithm._register`. half_window : int, optional The half-window used for the morphology functions. If a value is input, then that value will be used. Default is None, which will optimize the @@ -629,7 +630,7 @@ def _setup_smooth(self, y, half_window=0, allow_zero=True, **pad_kwargs): ---------- y : numpy.ndarray, shape (N,) The y-values of the measured data, already converted to a numpy - array by :meth:`._register`. + array by :meth:`~_Algorithm._register`. half_window : int, optional The half-window used for the smoothing functions. Used to pad the left and right edges of the data to reduce edge @@ -658,7 +659,7 @@ def _setup_classification(self, y, weights=None): ---------- y : numpy.ndarray, shape (N,) The y-values of the measured data, already converted to a numpy - array by :meth:`._register`. + array by :meth:`~_Algorithm._register`. weights : array-like, shape (N,), optional The weighting array. If None (default), then will be an array with size equal to N and all values set to 1. @@ -745,7 +746,7 @@ def _setup_optimizer(self, y, method, modules, method_kwargs=None, copy_kwargs=T ---------- y : numpy.ndarray, shape (N,) The y-values of the measured data, already converted to a numpy - array by :meth:`._register`. + array by :meth:`~_Algorithm._register`. method : str The string name of the desired function, like 'asls'. Case does not matter. modules : Sequence(module, ...) @@ -797,7 +798,7 @@ def _setup_misc(self, y): ---------- y : numpy.ndarray, shape (N,) The y-values of the measured data, already converted to a numpy - array by :meth:`._register`. + array by :meth:`~_Algorithm._register`. Returns ------- @@ -813,47 +814,6 @@ def _setup_misc(self, y): return y -def _sort_array(array, sort_order=None, axis=-1): - """ - Sorts the input array only if given a non-None sorting order. - - Parameters - ---------- - array : numpy.ndarray - The array to sort. 
-    sort_order : numpy.ndarray, optional
-        The array defining the sort order for the input array. Default is None, which
-        will not sort the input.
-    axis : int, optional
-        The axis of the input which defines each unique set of data. Default is -1.
-
-    Returns
-    -------
-    output : numpy.ndarray
-        The input array after optionally sorting.
-
-    Raises
-    ------
-    ValueError
-        Raised if the input array has more than two dimensions.
-
-    """
-    if sort_order is None:
-        output = array
-    else:
-        n_dims = array.ndim
-        if n_dims == 1:
-            output = array[sort_order]
-        elif n_dims == 2:
-            axes = [..., ...]
-            axes[axis] = sort_order
-            output = array[tuple(axes)]
-        else:
-            raise ValueError('too many dimensions to sort the data')
-
-    return output
-
-
 def _class_wrapper(klass):
     """
     Wraps a function to call the corresponding class method instead.
diff --git a/pybaselines/_banded_utils.py b/pybaselines/_banded_utils.py
index d34f7df..f97d76b 100644
--- a/pybaselines/_banded_utils.py
+++ b/pybaselines/_banded_utils.py
@@ -8,9 +8,8 @@
 import numpy as np
 from scipy.linalg import solve_banded, solveh_banded
-from scipy.sparse import identity, diags
 
-from ._compat import _HAS_PENTAPY, _pentapy_solve
+from ._compat import _HAS_PENTAPY, _pentapy_solve, identity, diags, dia_object
 from ._validation import _check_lam
 
@@ -205,7 +204,7 @@ def difference_matrix(data_size, diff_order=2, diff_format=None):
     Returns
     -------
-    diff_matrix : scipy.sparse.base.spmatrix
+    diff_matrix : scipy.sparse.spmatrix or scipy.sparse._sparray
         The sparse difference matrix.
 
     Raises
@@ -484,6 +483,54 @@ def diff_penalty_diagonals(data_size, diff_order=2, lower_only=True, padding=0):
     return diagonals
 
 
+def diff_penalty_matrix(data_size, diff_order=2, diff_format='csr'):
+    """
+    Creates the finite difference penalty matrix.
+
+    If `D` is the finite difference matrix, then the finite difference penalty
+    matrix is defined as ``D.T @ D``.
+
+    Parameters
+    ----------
+    data_size : int
+        The number of data points.
+    diff_order : int, optional
+        The integer differential order; must be >= 0. Default is 2.
+    diff_format : str or None, optional
+        The sparse format to use for the difference matrix. Default is 'csr'.
+
+    Returns
+    -------
+    penalty_matrix : scipy.sparse.spmatrix or scipy.sparse._sparray
+        The sparse difference penalty matrix.
+
+    Raises
+    ------
+    ValueError
+        Raised if `diff_order` is greater than or equal to `data_size`.
+
+    Notes
+    -----
+    Equivalent to calling::
+
+        from pybaselines.utils import difference_matrix
+        diff_matrix = difference_matrix(data_size, diff_order)
+        penalty_matrix = diff_matrix.T @ diff_matrix
+
+    but should be faster since the bands within the penalty matrix can be gotten
+    without the matrix multiplication.
+
+    """
+    if data_size <= diff_order:
+        raise ValueError('data size must be greater than the difference order.')
+    penalty_bands = diff_penalty_diagonals(data_size, diff_order, lower_only=False)
+    penalty_matrix = dia_object(
+        (penalty_bands, np.arange(diff_order, -diff_order - 1, -1)), shape=(data_size, data_size),
+    ).asformat(diff_format)
+
+    return penalty_matrix
+
+
 def _pentapy_solver(ab, y, check_output=False, pentapy_solver=2):
     """
     Convenience function for calling pentapy's solver with defaults already set.
@@ -549,11 +596,12 @@ class PenalizedSystem:
     Maintained so that repeated computations with different `lam` values can be quickly
     set up. `original_diagonals` can be either the full or lower bands of the
     penalty, and may be reversed, depending on the setup.
diff --git a/pybaselines/_compat.py b/pybaselines/_compat.py
index cba7363..4243dda 100644
--- a/pybaselines/_compat.py
+++ b/pybaselines/_compat.py
@@ -6,9 +6,10 @@
 """
 
-from functools import wraps
+from functools import lru_cache, wraps
 
-from scipy import integrate
+import scipy
+from scipy import integrate, sparse
 
 try:
@@ -52,8 +53,150 @@ def wrapper(*args, **kwargs):
 # scipy.integrate.trapezoid was introduced in v1.6.0, while
 # scipy.integrate.trapz will be deprecated in v1.14.0.
 # Use scipy instead of numpy since numpy.trapz will be deprecated
-# in v2.0.0
+# in v2.0.0 -> the deprecation was stopped (delayed?), but rely
+# on scipy since there is no potential deprecation there
 if hasattr(integrate, 'trapezoid'):
     trapezoid = integrate.trapezoid
 else:
     trapezoid = integrate.trapz
+
+
+@lru_cache(maxsize=1)
+def _use_sparse_arrays():
+    """
+    Checks that the installed scipy version is new enough to use sparse arrays.
+
+    This check is wrapped into a function just in case it fails so that pybaselines
+    can still be imported without error. The result is cached so it only has to
+    be done once.
+
+    Returns
+    -------
+    bool
+        True if the installed scipy version is 1.12 or newer; False otherwise.
+
+    Notes
+    -----
+    Scipy introduced its sparse arrays in version 1.8, but the interface and helper
+    functions were not stable until version 1.12; a warning will be emitted in scipy
+    1.13 when using the matrix interface, so we want to use the sparse array interface
+    as early as possible.
+
+    """
+    try:
+        _scipy_version = [int(val) for val in scipy.__version__.lstrip('v').split('.')[:2]]
+    except Exception:
+        # in case in the far future scipy stops using semantic versioning; probably
+        # bigger problems than this check at that point so just return True
+        return True
+
+    return _scipy_version[0] > 1 or (_scipy_version[0] == 1 and _scipy_version[1] >= 12)
+
+
+def dia_object(*args, **kwargs):
+    """
+    Handles creation of a sparse diagonal object.
+
+    Parameters
+    ----------
+    *args
+        Any arguments to pass to the creation functions.
+    **kwargs
+        Additional keyword arguments to pass to the creation functions.
+
+    Returns
+    -------
+    scipy.sparse.dia_matrix or scipy.sparse.dia_array
+        A sparse diagonal matrix if the installed scipy version is older than 1.12,
+        otherwise a sparse diagonal array.
+
+    """
+    if _use_sparse_arrays():
+        return sparse.dia_array(*args, **kwargs)
+    else:
+        return sparse.dia_matrix(*args, **kwargs)
+
+
+def csr_object(*args, **kwargs):
+    """
+    Handles creation of a sparse csr object.
+
+    Parameters
+    ----------
+    *args
+        Any arguments to pass to the creation functions.
+    **kwargs
+        Additional keyword arguments to pass to the creation functions.
+
+    Returns
+    -------
+    scipy.sparse.csr_matrix or scipy.sparse.csr_array
+        A sparse csr matrix if the installed scipy version is older than 1.12,
+        otherwise a sparse csr array.
+
+    """
+    if _use_sparse_arrays():
+        return sparse.csr_array(*args, **kwargs)
+    else:
+        return sparse.csr_matrix(*args, **kwargs)
+
+
+def identity(size, format=None, **kwargs):
+    """
+    Handles creation of a sparse square identity matrix.
+
+    Parameters
+    ----------
+    size : int
+        The length of the rows and columns of the sparse matrix.
+    format : str, optional
+        The sparse format to use for the identity matrix. Default is None, which
+        will use the default of the underlying functions.
+    **kwargs
+        Additional keyword arguments to pass to the creation functions.
+
+    Returns
+    -------
+    scipy.sparse.spmatrix or scipy.sparse._sparray
+        The sparse identity matrix.
+
+    Notes
+    -----
+    This function will need to be updated in the future to prefer sparse.identity again
+    once the sparse matrices are removed.
+
+    """
+    if _use_sparse_arrays():
+        return sparse.eye_array(size, size, format=format, **kwargs)
+    else:
+        return sparse.identity(size, format=format, **kwargs)
+
+
+def diags(data, offsets=0, **kwargs):
+    """
+    Handles creation of a sparse diagonal matrix.
+
+    Parameters
+    ----------
+    data : array-like
+        The data to be put in the diagonals.
+    offsets : int or Sequence[int], optional
+        The offsets for `data`. Default is 0, which is the main diagonal.
+    **kwargs
+        Additional keyword arguments to pass to the creation functions.
+
+    Returns
+    -------
+    scipy.sparse.spmatrix or scipy.sparse._sparray
+        The sparse diagonal matrix.
+
+    Notes
+    -----
+    This function will need to be updated in the future to prefer sparse.diags again
+    once the sparse matrices are removed.
+
+    """
+    if _use_sparse_arrays():
+        return sparse.diags_array(data, offsets=offsets, **kwargs)
+    else:
+        return sparse.diags(data, offsets=offsets, **kwargs)
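These wrappers keep the rest of the codebase agnostic to whether the installed scipy provides the sparse array or the sparse matrix interface; a minimal sketch of the intended call patterns (the values are illustrative)::

    import numpy as np

    from pybaselines._compat import dia_object, diags, identity

    # each helper returns the sparse array flavor on scipy >= 1.12 and the
    # sparse matrix flavor on older scipy versions
    banded = dia_object((np.ones((3, 5)), [-1, 0, 1]), shape=(5, 5)).tocsr()
    eye = identity(5, format='csr')
    main_diagonal = diags(np.ones(5))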
diff --git a/pybaselines/_spline_utils.py b/pybaselines/_spline_utils.py
index 32a2efb..a83fba7 100644
--- a/pybaselines/_spline_utils.py
+++ b/pybaselines/_spline_utils.py
@@ -45,10 +45,9 @@
 import numpy as np
 from scipy.interpolate import BSpline, splev
 from scipy.linalg import solve_banded, solveh_banded
-from scipy.sparse import csc_matrix, csr_matrix, spdiags
 
 from ._banded_utils import _add_diagonals, _lower_to_full, PenalizedSystem
-from ._compat import _HAS_NUMBA, jit
+from ._compat import _HAS_NUMBA, csr_object, dia_object, jit
 from ._validation import _check_array
@@ -224,7 +223,7 @@ def _make_design_matrix(x, knots, spline_degree):
     """
     data, row_ind, col_ind = __make_design_matrix(x, knots, spline_degree)
-    return csr_matrix((data, (row_ind, col_ind)), (len(x), len(knots) - spline_degree - 1))
+    return csr_object((data, (row_ind, col_ind)), (len(x), len(knots) - spline_degree - 1))
 
 
 def _slow_design_matrix(x, knots, spline_degree):
@@ -273,7 +272,7 @@ def _slow_design_matrix(x, knots, spline_degree):
     basis[spline_degree, 0] = small_float
     basis[-(spline_degree + 1), -1] = small_float
 
-    return csc_matrix(basis).T
+    return csr_object(basis.T)
 
 
 def _spline_knots(x, num_knots=10, spline_degree=3, penalized=True):
@@ -575,7 +574,7 @@ def _solve_pspline(x, y, weights, basis, penalty, knots, spline_degree, rhs_extr
         # worst case scenario; have to convert weights to a sparse diagonal matrix,
         # do B.T @ W @ B, and convert back to lower banded
         len_y = len(y)
-        full_matrix = basis.T @ spdiags(weights, 0, len_y, len_y, 'csr') @ basis
+        full_matrix = basis.T @ dia_object((weights, 0), shape=(len_y, len_y)).tocsr() @ basis
         rhs = basis.T @ (weights * y)
         ab = full_matrix.todia().data[::-1]
         # take only the lower diagonals of the symmetric ab; cannot just do
@@ -646,7 +645,7 @@ class
PSpline(PenalizedSystem): in `x`, and `M` is the number of basis functions (equal to ``K - spline_degree - 1`` or equivalently ``num_knots + spline_degree - 1``). coef : None or numpy.ndarray, shape (M,) - The spline coefficients. Is None if :meth:`.solve_pspline` has not been called + The spline coefficients. Is None if :meth:`~PSpline.solve_pspline` has not been called at least once. knots : numpy.ndarray, shape (K,) The knots for the spline. Has a shape of `K`, which is equal to @@ -887,8 +886,11 @@ def solve_pspline(self, y, weights, penalty=None, rhs_extra=None): # worst case scenario; have to convert weights to a sparse diagonal matrix, # do B.T @ W @ B, and convert back to lower banded full_matrix = ( - self.basis.T @ spdiags(weights, 0, self._x_len, self._x_len, 'csr') @ self.basis + self.basis.T + @ dia_object((weights, 0), shape=(self._x_len, self._x_len)).tocsr() + @ self.basis ) + rhs = self.basis.T @ (weights * y) ab = full_matrix.todia().data[::-1] # take only the lower diagonals of the symmetric ab; cannot just do diff --git a/pybaselines/_validation.py b/pybaselines/_validation.py index bc45180..0ecb89e 100644 --- a/pybaselines/_validation.py +++ b/pybaselines/_validation.py @@ -71,25 +71,29 @@ def _check_scalar(data, desired_length, fill_scalar=False, coerce_0d=True, **asa return output, is_scalar -def _check_scalar_variable(value, allow_zero=False, variable_name='lam', **asarray_kwargs): +def _check_scalar_variable(value, allow_zero=False, variable_name='lam', two_d=False, + **asarray_kwargs): """ Ensures the input is a scalar value. Parameters ---------- - value : float or array-like + value : numpy.Number or array-like The value to check. allow_zero : bool, optional If False (default), only allows `value` > 0. If True, allows `value` >= 0. variable_name : str, optional The name displayed if an error occurs. Default is 'lam'. + two_d : bool, optional + If True, will output an array with two values. If False (default), will + return a single scalar value. **asarray_kwargs : dict Additional keyword arguments to pass to :func:`numpy.asarray`. Returns ------- - output : float - The verified scalar value. + output : numpy.Number or numpy.ndarray[numpy.Number, numpy.Number] + The verified scalar value(s). Raises ------ @@ -98,7 +102,13 @@ def _check_scalar_variable(value, allow_zero=False, variable_name='lam', **asarr less than 0 if `allow_zero` is True. """ - output = _check_scalar(value, 1, fill_scalar=False, **asarray_kwargs)[0] + if two_d: + desired_length = 2 + fill_scalar = True + else: + desired_length = 1 + fill_scalar = False + output = _check_scalar(value, desired_length, fill_scalar=fill_scalar, **asarray_kwargs)[0] if allow_zero: operation = np.less text = 'greater than or equal to' @@ -108,11 +118,11 @@ def _check_scalar_variable(value, allow_zero=False, variable_name='lam', **asarr if np.any(operation(output, 0)): raise ValueError(f'{variable_name} must be {text} 0') - # use an empty tuple to get the single scalar value return output -def _check_array(array, dtype=None, order=None, check_finite=False, ensure_1d=True): +def _check_array(array, dtype=None, order=None, check_finite=False, ensure_1d=True, + ensure_2d=False, two_d=False): """ Validates the shape and values of the input array and controls the output parameters. 
@@ -131,6 +141,12 @@ def _check_array(array, dtype=None, order=None, check_finite=False, ensure_1d=Tr
     ensure_1d : bool, optional
         If True (default), will raise an error if the shape of `array` is not a one dimensional
         array with shape (N,) or a two dimensional array with shape (N, 1) or (1, N).
+    ensure_2d : bool, optional
+        If True, will raise an error if `array` is not a two dimensional array or a three
+        dimensional array with shape (M, N, 1), (1, M, N), or (M, 1, N). Default is False.
+    two_d : bool, optional
+        If True, will raise an error if the shape of `array` is not a two dimensional array with
+        shape (M, N) where M or N must be greater than 1.
 
     Returns
     -------
@@ -146,7 +162,8 @@
     Notes
     -----
     If `ensure_1d` is True and `array` has a shape of (N, 1) or (1, N), it is reshaped to
-    (N,) for better compatibility for all functions.
+    (N,) for better compatibility for all functions. Likewise, `ensure_2d` will flatten to
+    (M, N).
 
     """
     if check_finite:
@@ -161,12 +178,29 @@
             output = output.reshape(-1)
         elif dimensions != 1:
             raise ValueError('must be a one dimensional array')
+    elif two_d:
+        output = np.array(output, copy=False, ndmin=2)
+        dimensions = output.ndim
+        if dimensions == 2 and 1 in output.shape:
+            raise ValueError(
+                'input data must be a two dimensional array with more than just one row or column'
+            )
+        if ensure_2d:
+            if dimensions == 3 and 1 in output.shape:
+                output_shape = np.array(output.shape)
+                flat_dims = ~np.equal(output_shape, 1)
+                output = output.reshape(output_shape[flat_dims])
+            elif dimensions != 2:
+                raise ValueError('must be a two dimensional array')
+    elif ensure_2d and not two_d:
+        raise ValueError('two_d must be True if using ensure_2d')
 
     return output
 
 
 def _check_sized_array(array, length, dtype=None, order=None, check_finite=False,
-                       ensure_1d=True, axis=-1, name='weights'):
+                       ensure_1d=True, axis=-1, name='weights', ensure_2d=False,
+                       two_d=False):
     """
     Validates the input array and ensures its length is correct.
 
@@ -204,9 +238,10 @@
     """
     output = _check_array(
-        array, dtype=dtype, order=order, check_finite=check_finite, ensure_1d=ensure_1d
+        array, dtype=dtype, order=order, check_finite=check_finite, ensure_1d=ensure_1d,
+        ensure_2d=ensure_2d, two_d=two_d
     )
-    if output.shape[axis] != length:
+    if not np.equal(output.shape[axis], length).all():
         raise ValueError(
             f'length mismatch for {name}; expected {length} but got {output.shape[axis]}'
         )
@@ -267,7 +302,73 @@ def _yx_arrays(data, x_data=None, check_finite=False, dtype=None, order=None, en
     return y, x
 
 
-def _check_lam(lam, allow_zero=False):
+def _yxz_arrays(data, x_data=None, z_data=None, check_finite=False, dtype=None, order=None,
+                ensure_2d=True, x_axis=-2, z_axis=-1):
+    """
+    Converts input data into numpy arrays and provides x and z data if none are given.
+
+    Parameters
+    ----------
+    data : array-like, shape (M, N)
+        The y-values of the measured data, with M x N data points.
+    x_data : array-like, shape (M,), optional
+        The x-values of the measured data. Default is None, which will create an
+        array from -1. to 1. with M points.
+    z_data : array-like, shape (N,), optional
+        The z-values of the measured data. Default is None, which will create an
+        array from -1. to 1. with N points.
+    check_finite : bool, optional
+        If True, will raise an error if any values of `array` are not finite. Default is False,
+        which skips the check.
+    dtype : type or numpy.dtype, optional
+        The dtype to cast the output array. Default is None, which uses the typing of `array`.
+    order : {None, 'C', 'F'}, optional
+        The order for the output array. Default is None, which will use the default array
+        ordering. Other valid options are 'C' for C ordering or 'F' for Fortran ordering.
+    ensure_2d : bool, optional
+        If True (default), will raise an error if the shape of `array` is not a two dimensional
+        array with shape (M, N) or a three dimensional array with shape (M, N, 1), (M, 1, N),
+        or (1, M, N).
+
+    Returns
+    -------
+    y : numpy.ndarray, shape (M, N)
+        A numpy array of the y-values of the measured data.
+    x : numpy.ndarray, shape (M,)
+        A numpy array of the x-values of the measured data, or a created array.
+    z : numpy.ndarray, shape (N,)
+        A numpy array of the z-values of the measured data, or a created array.
+
+    Notes
+    -----
+    Does not change the scale/domain of the input `x_data` or `z_data` if they
+    are given, only converts them to arrays.
+
+    """
+    y = _check_array(
+        data, dtype=dtype, order=order, check_finite=check_finite, ensure_1d=False,
+        ensure_2d=ensure_2d, two_d=True
+    )
+    x_len = y.shape[x_axis]
+    z_len = y.shape[z_axis]
+    if x_data is None:
+        x = np.linspace(-1, 1, x_len)
+    else:
+        x = _check_sized_array(
+            x_data, x_len, dtype=dtype, order=order, check_finite=check_finite,
+            ensure_1d=True, axis=0, name='x_data'
+        )
+    if z_data is None:
+        z = np.linspace(-1, 1, z_len)
+    else:
+        z = _check_sized_array(
+            z_data, z_len, dtype=dtype, order=order, check_finite=check_finite,
+            ensure_1d=True, axis=0, name='z_data'
+        )
+
+    return y, x, z
+
+
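Since ``_yxz_arrays`` builds default grids when ``x_data`` or ``z_data`` are omitted, its contract can be shown in a few lines; a sketch with illustrative shapes::

    import numpy as np

    from pybaselines._validation import _yxz_arrays

    y = np.ones((20, 30))
    y_out, x, z = _yxz_arrays(y)
    # x spans the rows (M) and z spans the columns (N) by default
    assert x.shape == (20,) and z.shape == (30,)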
+def _check_lam(lam, allow_zero=False, two_d=False, dtype=float):
     """
     Ensures the regularization parameter `lam` is a scalar greater than 0.
 
     Parameters
     ----------
@@ -278,11 +379,16 @@
         penalized splines.
     allow_zero : bool
         If False (default), only allows `lam` values > 0. If True, allows `lam` >= 0.
+    two_d : bool, optional
+        If True, will output an array with two values. If False (default), will
+        return a single scalar value.
+    dtype : type or numpy.dtype, optional
+        The dtype to cast the lam value. Default is float.
 
     Returns
     -------
-    float
-        The scalar `lam` value.
+    numpy.Number or numpy.ndarray[numpy.Number, numpy.Number]
+        The verified `lam` value(s).
 
     Raises
     ------
@@ -309,10 +415,10 @@
     ``(diags(lam) @ D.T @ D).todia().data[::-1]``.
 
     """
-    return _check_scalar_variable(lam, allow_zero)
+    return _check_scalar_variable(lam, allow_zero, two_d=two_d, variable_name='lam', dtype=dtype)
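With the new ``two_d`` option, a scalar input is broadcast to one value per dimension while a pair is validated as-is; a short sketch of the expected outputs (the values are illustrative)::

    from pybaselines._validation import _check_lam

    _check_lam(1e5)                     # scalar, matching the previous behavior
    _check_lam(1e5, two_d=True)         # array([100000., 100000.])
    _check_lam((1e3, 1e6), two_d=True)  # separate values for the two dimensions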
-def _check_half_window(half_window, allow_zero=False):
+def _check_half_window(half_window, allow_zero=False, two_d=False):
     """
     Ensures the half-window is an integer and has an appropriate value.
 
     Parameters
     ----------
@@ -325,11 +431,14 @@
     allow_zero : bool, optional
         If True, allows `half_window` to be 0; otherwise, `half_window`
         must be at least 1. Default is False.
+    two_d : bool, optional
+        If True, will output an array with two values. If False (default), will
+        return a single scalar value.
 
     Returns
     -------
-    output_half_window : int
-        The verified half-window value.
+    output_half_window : int or numpy.ndarray[int, int]
+        The verified half-window value(s).
 
     Raises
     ------
@@ -339,23 +448,23 @@
     """
     output_half_window = _check_scalar_variable(
-        half_window, allow_zero, 'half_window', dtype=np.intp
+        half_window, allow_zero, variable_name='half_window', two_d=two_d, dtype=np.intp
     )
-    if output_half_window != half_window:
+    if not two_d and output_half_window != half_window:
         raise TypeError('half_window must be an integer')
 
     return output_half_window
 
 
 def _check_optional_array(data_size, array=None, dtype=None, order=None, check_finite=False,
-                          copy_input=False, name='weights'):
+                          copy_input=False, name='weights', ensure_1d=True, axis=-1):
     """
     Validates the length of the input array or creates an array of ones if no input is given.
 
     Parameters
     ----------
-    data_size : int
-        The length that the input should have.
+    data_size : int or Container[int, int]
+        The shape that the input should have.
     array : array-like, shape (`data_size`), optional
         The array to validate. Default is None, which will create an array
         of ones with length equal to `data_size`.
@@ -371,6 +480,12 @@
         which skips the check.
     name : str, optional
         The name for the variable if an exception is raised. Default is 'weights'.
+    ensure_1d : bool, optional
+        If True (default), will raise an error if the shape of `array` is not a one dimensional
+        array with shape (N,) or a two dimensional array with shape (N, 1) or (1, N). If False,
+        will ignore the shape of `array`.
+    axis : int, optional
+        The axis of the input on which to check its length. Default is -1.
 
     Returns
     -------
@@ -383,9 +498,44 @@
     else:
         output_array = _check_sized_array(
             array, data_size, dtype=dtype, order=order, check_finite=check_finite,
-            ensure_1d=True, name=name
+            ensure_1d=ensure_1d, name=name, axis=axis
         )
     if copy_input:
         output_array = output_array.copy()
 
     return output_array
+
+
+def _get_row_col_values(value, **asarray_kwargs):
+    """
+    Determines the row and column values for an input that can be scalar or up to length 4.
+
+    Parameters
+    ----------
+    value : numpy.number or Sequence[numpy.number, ...]
+        The value(s) corresponding to the first row, last row, first column, and last
+        column.
+
+    Returns
+    -------
+    output : numpy.ndarray, shape (4,)
+        The array of length 4 with values first row, last row, first column, last column.
+
+    Raises
+    ------
+    ValueError
+        Raised if the input is a sequence whose length is not 1, 2, or 4.
+
+    """
+    # can either be len 1, 2, or 4
+    output, scalar_input = _check_scalar(value, None, **asarray_kwargs)
+    if scalar_input:
+        output = np.full(4, output)
+    else:
+        len_input = len(output)
+        if len_input not in (2, 4):
+            raise ValueError('input must either be a single value or an array with length 2 or 4')
+        elif len_input == 2:
+            output = np.array([output[0], output[0], output[1], output[1]])
+
+    return output
diff --git a/pybaselines/api.py b/pybaselines/api.py
index 025916f..1c5cdea 100644
--- a/pybaselines/api.py
+++ b/pybaselines/api.py
@@ -61,3 +61,32 @@ class Baseline(
         set to numpy.ndarray([-1, 1]).
 
     """
+
+    def _get_method(self, method):
+        """
+        A helper function to allow accessing methods by their string.
+
+        Parameters
+        ----------
+        method : str
+            The name of the desired method as a string. Capitalization is ignored. For
+            example, both 'asls' and 'AsLS' would return :meth:`~.Baseline.asls`.
+ + Returns + ------- + output : Callable + The callable method corresponding to the input string. + + Raises + ------ + AttributeError + Raised if the input method does not exist. + + """ + method_string = method.lower() + if hasattr(self, method_string): + output = getattr(self, method_string) + else: + raise AttributeError(f'unknown method "{method}"') + + return output diff --git a/pybaselines/classification.py b/pybaselines/classification.py index af04ce6..0e12312 100644 --- a/pybaselines/classification.py +++ b/pybaselines/classification.py @@ -225,7 +225,7 @@ def dietrich(self, data, smooth_half_window=None, num_std=3.0, interp_half_windo * 'coef': numpy.ndarray, shape (poly_order,) Only if `return_coef` is True and `max_iter` is greater than 0. The array of polynomial coefficients for the baseline, in increasing order. Can be - used to create a polynomial using numpy.polynomial.polynomial.Polynomial(). + used to create a polynomial using :class:`numpy.polynomial.polynomial.Polynomial`. * 'tol_history': numpy.ndarray Only if `max_iter` is greater than 1. An array containing the calculated tolerance values for each iteration. The length of the array is the number @@ -764,7 +764,7 @@ def fabc(self, data, lam=1e6, scale=None, num_std=3.0, diff_order=2, min_length= Notes ----- - The classification of baseline points is similar to :meth:`.dietrich`, except that + The classification of baseline points is similar to :meth:`~Baseline.dietrich`, except that this method approximates the first derivative using a continous wavelet transform with the Haar wavelet, which is more robust than the numerical derivative in Dietrich's method. @@ -1028,9 +1028,9 @@ def _averaged_interp(x, y, mask, interp_half_window=0): mask_sum = mask.sum() if not mask_sum: # all points belong to peaks # will just interpolate between first and last points - warnings.warn('there were no baseline points found', ParameterWarning) + warnings.warn('there were no baseline points found', ParameterWarning, stacklevel=2) elif mask_sum == mask.shape[0]: # all points belong to baseline - warnings.warn('there were no peak points found', ParameterWarning) + warnings.warn('there were no peak points found', ParameterWarning, stacklevel=2) return output peak_starts, peak_ends = _find_peak_segments(mask) @@ -1153,7 +1153,7 @@ def _iter_threshold(power, num_std=3.0): if masked_power.size < 2: # need at least 2 points for std calculation warnings.warn( 'not enough baseline points found; "num_std" is likely too low', - ParameterWarning + ParameterWarning, stacklevel=2 ) break mask = power < np.mean(masked_power) + num_std * np.std(masked_power, ddof=1) @@ -1230,7 +1230,7 @@ def dietrich(data, x_data=None, smooth_half_window=None, num_std=3.0, * 'coef': numpy.ndarray, shape (poly_order,) Only if `return_coef` is True and `max_iter` is greater than 0. The array of polynomial coefficients for the baseline, in increasing order. Can be - used to create a polynomial using numpy.polynomial.polynomial.Polynomial(). + used to create a polynomial using :class:`numpy.polynomial.polynomial.Polynomial`. * 'tol_history': numpy.ndarray Only if `max_iter` is greater than 1. An array containing the calculated tolerance values for each iteration. 
The length of the array is the number @@ -1809,7 +1809,7 @@ def fabc(data, lam=1e6, scale=None, num_std=3.0, diff_order=2, min_length=2, wei Notes ----- - The classification of baseline points is similar to :meth:`.dietrich`, except that + The classification of baseline points is similar to :meth:`~Baseline.dietrich`, except that this method approximates the first derivative using a continous wavelet transform with the Haar wavelet, which is more robust than the numerical derivative in Dietrich's method. diff --git a/pybaselines/misc.py b/pybaselines/misc.py index f8faeb9..5094cf6 100644 --- a/pybaselines/misc.py +++ b/pybaselines/misc.py @@ -68,11 +68,10 @@ from scipy.interpolate import interp1d from scipy.linalg import get_blas_funcs, solve_banded, solveh_banded from scipy.ndimage import uniform_filter1d -from scipy.sparse import spdiags from scipy.sparse.linalg import splu, spsolve from ._algorithm_setup import _Algorithm, _class_wrapper -from ._compat import _HAS_NUMBA, jit +from ._compat import _HAS_NUMBA, dia_object, jit from ._validation import _check_array, _check_lam from .utils import _MIN_FLOAT, relative_difference @@ -640,8 +639,8 @@ def _high_pass_filter(data_size, freq_cutoff=0.005, filter_type=1, full_matrix=F b_diags = np.repeat(b.reshape(1, -1), data_size, axis=0).T if full_matrix: offsets = np.arange(-filter_type, filter_type + 1) - A = spdiags(a_diags, offsets, data_size, data_size, 'csr') - B = spdiags(b_diags, offsets, data_size, data_size, 'csr') + A = dia_object((a_diags, offsets), shape=(data_size, data_size)).tocsr() + B = dia_object((b_diags, offsets), shape=(data_size, data_size)).tocsr() else: # add zeros on edges to create the actual banded structure; # creates same structure as diags(a[b]_diags, offsets).todia().data[::-1] @@ -915,7 +914,7 @@ def _sparse_beads(y, freq_cutoff=0.005, lam_0=1.0, lam_1=1.0, lam_2=1.0, asymmet # factorize A since A is unchanged in the function and its factorization # is used repeatedly; much faster than calling spsolve each time A_factor = splu(A.tocsc(), permc_spec='NATURAL') - BTB = B * B + BTB = B @ B x = y d1_x, d2_x = _abs_diff(x, smooth_half_window) @@ -929,7 +928,7 @@ def _sparse_beads(y, freq_cutoff=0.005, lam_0=1.0, lam_1=1.0, lam_2=1.0, asymmet tol_history = np.empty(max_iter + 1) for i in range(max_iter + 1): # calculate line 6 of Table 3 in beads paper using banded matrices rather - # than sparse matrices since it is much faster; Gamma + D.T * Lambda * D + # than sparse matrices since it is much faster; Gamma + D.T @ Lambda @ D # row 1 and 3 instead of 0 and 2 to account for zeros on top and bottom d1_diags[1][1:] = d1_diags[3][:-1] = -_beads_weighting(d1_x, use_v2_loss, eps_1) @@ -945,12 +944,14 @@ def _sparse_beads(y, freq_cutoff=0.005, lam_0=1.0, lam_1=1.0, lam_2=1.0, asymmet gamma[big_x] = gamma_factor / abs_x[big_x] d_diags[2] += gamma + # TODO check that 'NATURAL' is the appropriate permutation scheme for this x = A.dot( spsolve( - BTB + A.dot(spdiags(d_diags, offsets, num_y, num_y, 'csr').dot(A)), + BTB + A.dot(dia_object((d_diags, offsets), shape=(num_y, num_y)).tocsr()).dot(A), d, 'NATURAL' ) ) + h = B.dot(A_factor.solve(y - x)) d1_x, d2_x = _abs_diff(x, smooth_half_window) abs_x, big_x, theta = _beads_theta(x, asymmetry, eps_0) @@ -1063,11 +1064,11 @@ def _banded_beads(y, freq_cutoff=0.005, lam_0=1.0, lam_1=1.0, lam_2=1.0, asymmet A, B = _high_pass_filter(num_y, freq_cutoff, filter_type, False) # the number of lower and upper diagonals for both A and B ab_lu = (filter_type, filter_type) - # the shape of A and 
B, and D.T*D matrices in their full forms rather than banded forms + # the shape of A and B, and D.T @ D matrices in their full forms rather than banded forms full_shape = (num_y, num_y) A_lower = A[filter_type:] BTB = _banded_dot_banded(B, B, ab_lu, ab_lu, full_shape, full_shape, True) - # number of lower and upper diagonals of A.T * (D.T * D) * A + # number of lower and upper diagonals of A.T @ (D.T @ D) @ A num_diags = (2 * filter_type + 2, 2 * filter_type + 2) # line 2 of Table 3 in beads paper @@ -1091,7 +1092,7 @@ def _banded_beads(y, freq_cutoff=0.005, lam_0=1.0, lam_1=1.0, lam_2=1.0, asymmet tol_history = np.empty(max_iter + 1) for i in range(max_iter + 1): # calculate line 6 of Table 3 in beads paper using banded matrices rather - # than sparse matrices since it is much faster; Gamma + D.T * Lambda * D + # than sparse matrices since it is much faster; Gamma + D.T @ Lambda @ D # row 1 and 3 instead of 0 and 2 to account for zeros on top and bottom d1_diags[1][1:] = d1_diags[3][:-1] = -_beads_weighting(d1_x, use_v2_loss, eps_1) @@ -1163,7 +1164,7 @@ def beads(data, freq_cutoff=0.005, lam_0=1.0, lam_1=1.0, lam_2=1.0, asymmetry=6. Decomposes the input data into baseline and pure, noise-free signal by modeling the baseline as a low pass filter and by considering the signal and its derivatives - as sparse [1]_. + as sparse [4]_. Parameters ---------- @@ -1207,14 +1208,14 @@ def beads(data, freq_cutoff=0.005, lam_0=1.0, lam_1=1.0, lam_2=1.0, asymmetry=6. derivatives are close to zero. Default is 1e-6. fit_parabola : bool, optional If True (default), will fit a parabola to the data and subtract it before - performing the beads fit as suggested in [2]_. This ensures the endpoints of + performing the beads fit as suggested in [5]_. This ensures the endpoints of the fit data are close to 0, which is required by beads. If the data is already close to 0 on both endpoints, set `fit_parabola` to False. smooth_half_window : int, optional The half-window to use for smoothing the derivatives of the data with a moving average and full window size of `2 * smooth_half_window + 1`. Smoothing can improve the convergence of the calculation, and make the calculation less sensitive - to small changes in `lam_1` and `lam_2`, as noted in the pybeads package [3]_. + to small changes in `lam_1` and `lam_2`, as noted in the pybeads package [6]_. Default is None, which will not perform any smoothing. x_data : array-like, optional The x-values. Not used by this function, but input is allowed for consistency @@ -1243,7 +1244,7 @@ def beads(data, freq_cutoff=0.005, lam_0=1.0, lam_1=1.0, lam_2=1.0, asymmetry=6. When finding the best parameters for fitting, it is usually best to find the optimal `freq_cutoff` for the noise in the data before adjusting any other parameters since - it has the largest effect [2]_. + it has the largest effect [5]_. Raises ------ @@ -1252,10 +1253,10 @@ def beads(data, freq_cutoff=0.005, lam_0=1.0, lam_1=1.0, lam_2=1.0, asymmetry=6. References ---------- - .. [1] Ning, X., et al. Chromatogram baseline estimation and denoising using sparsity + .. [4] Ning, X., et al. Chromatogram baseline estimation and denoising using sparsity (BEADS). Chemometrics and Intelligent Laboratory Systems, 2014, 139, 156-167. - .. [2] Navarro-Huerta, J.A., et al. Assisted baseline subtraction in complex chromatograms + .. [5] Navarro-Huerta, J.A., et al. Assisted baseline subtraction in complex chromatograms using the BEADS algorithm. Journal of Chromatography A, 2017, 1507, 1-10. - .. 
[3] https://github.com/skotaro/pybeads. + .. [6] https://github.com/skotaro/pybeads. """ diff --git a/pybaselines/morphological.py b/pybaselines/morphological.py index fc2a6ab..baea0a9 100644 --- a/pybaselines/morphological.py +++ b/pybaselines/morphological.py @@ -9,10 +9,10 @@ import numpy as np from scipy.ndimage import grey_closing, grey_dilation, grey_erosion, grey_opening, uniform_filter1d -from ._algorithm_setup import _Algorithm, _class_wrapper, _sort_array +from ._algorithm_setup import _Algorithm, _class_wrapper from ._validation import _check_lam from .utils import ( - _mollifier_kernel, pad_edges, padded_convolve, relative_difference + _mollifier_kernel, _sort_array, pad_edges, padded_convolve, relative_difference ) @@ -110,13 +110,13 @@ def mpls(self, data, half_window=None, lam=1e6, p=0.0, diff_order=2, tol=1e-3, m indices = np.flatnonzero( ((diff[1:] == 0) | (diff[:-1] == 0)) & ((diff[1:] != 0) | (diff[:-1] != 0)) ) - w = np.full(y.shape[0], p) + w = np.full(self._len, p) # find the index of min(y) in the region between flat regions for previous_segment, next_segment in zip(indices[1::2], indices[2::2]): index = np.argmin(y[previous_segment:next_segment + 1]) + previous_segment w[index] = 1 - p - # have to invert the weight ordering the matching the original input y ordering + # have to invert the weight ordering to match the original input y ordering # since it will be sorted within _setup_whittaker w = _sort_array(w, self._inverted_order) @@ -486,15 +486,8 @@ def rolling_ball(self, data, half_window=None, smooth_half_window=None, dict A dictionary with the following items: - * 'half_window': int or numpy.ndarray(int) - The half window or array of half windows used for the - morphological calculations. - - Notes - ----- - To use a changing window size for either the morphological or smoothing - operations, the half windows must be arrays. Otherwise, the size of the - rolling ball is assumed to be constant. + * 'half_window': int + The half window used for the morphological calculations. References ---------- @@ -830,7 +823,7 @@ def jbcd(self, data, half_window=None, alpha=0.1, beta=1e1, gamma=1., beta_mult= robust_opening : bool, optional If True (default), the opening used to represent the initial baseline is the element-wise minimum between the morphological opening and the average of the - morphological erosion and dilation of the opening, similar to :meth:`.mor`. If + morphological erosion and dilation of the opening, similar to :meth:`~Baseline.mor`. If False, the opening is just the morphological opening, as used in the reference. The robust opening typically represents the baseline better. **window_kwargs @@ -1336,15 +1329,8 @@ def rolling_ball(data, half_window=None, smooth_half_window=None, pad_kwargs=Non dict A dictionary with the following items: - * 'half_window': int or numpy.ndarray(int) - The half window or array of half windows used for the - morphological calculations. - - Notes - ----- - To use a changing window size for either the morphological or smoothing - operations, the half windows must be arrays. Otherwise, the size of the - rolling ball is assumed to be constant. + * 'half_window': int + The half window used for the morphological calculations. 
References ---------- @@ -1626,7 +1612,7 @@ def jbcd(data, half_window=None, alpha=0.1, beta=1e1, gamma=1., beta_mult=1.1, g robust_opening : bool, optional If True (default), the opening used to represent the initial baseline is the element-wise minimum between the morphological opening and the average of the - morphological erosion and dilation of the opening, similar to :meth:`.mor`. If + morphological erosion and dilation of the opening, similar to :meth:`~Baseline.mor`. If False, the opening is just the morphological opening, as used in the reference. The robust opening typically represents the baseline better. x_data : array-like, optional diff --git a/pybaselines/optimizers.py b/pybaselines/optimizers.py index d5e3c5c..4afb93e 100644 --- a/pybaselines/optimizers.py +++ b/pybaselines/optimizers.py @@ -14,9 +14,9 @@ import numpy as np from . import classification, misc, morphological, polynomial, smooth, spline, whittaker -from ._algorithm_setup import _Algorithm, _class_wrapper, _sort_array +from ._algorithm_setup import _Algorithm, _class_wrapper from ._validation import _check_optional_array -from .utils import _check_scalar, _get_edges, gaussian +from .utils import _check_scalar, _get_edges, _sort_array, gaussian class _Optimizers(_Algorithm): @@ -58,8 +58,7 @@ def collab_pls(self, data, average_dataset=True, method='asls', method_kwargs=No * 'average_alpha': numpy.ndarray, shape (N,) Only returned if `method` is 'aspls' or 'pspline_aspls'. The `alpha` array used to fit all of the baselines for the - :meth:`~pybaselines.whittaker.Whittaker.aspls` or - :meth:`~pybaselines.spline.Spline.pspline_aspls` methods. + :meth:`~Baseline.aspls` or :meth:`~Baseline.pspline_aspls` methods. Additional items depend on the output of the selected method. Every other key will have a list of values, with each item corresponding to a @@ -400,8 +399,7 @@ def adaptive_minmax(self, data, poly_order=None, method='modpoly', weights=None, Default is 2. method_kwargs : dict, optional Additional keyword arguments to pass to - :meth:`~pybaselines.polynomial.Polynomial.modpoly` or - :meth:`~pybaselines.polynomial.Polynomial.imodpoly`. These include + :meth:`~Baseline.modpoly` or :meth:`~Baseline.imodpoly`. These include `tol`, `max_iter`, `use_original`, `mask_initial_peaks`, and `num_std`. Returns @@ -677,7 +675,7 @@ def collab_pls(data, average_dataset=True, method='asls', method_kwargs=None, x_ * 'average_alpha': numpy.ndarray, shape (N,) Only returned if `method` is 'aspls' or 'pspline_aspls'. The `alpha` array used to fit all of the baselines for the - :meth:`.aspls` or :meth:`.pspline_aspls` methods. + :meth:`~Baseline.aspls` or :meth:`~Baseline.pspline_aspls` methods. Additional items depend on the output of the selected method. Every other key will have a list of values, with each item corresponding to a @@ -897,8 +895,8 @@ def adaptive_minmax(data, x_data=None, poly_order=None, method='modpoly', to select the appropriate polynomial orders if `poly_order` is None. Default is 2. method_kwargs : dict, optional - Additional keyword arguments to pass to :meth:`.modpoly` or - :meth:`.imodpoly`. These include `tol`, `max_iter`, `use_original`, + Additional keyword arguments to pass to :meth:`~Baseline.modpoly` or + :meth:`~Baseline.imodpoly`. These include `tol`, `max_iter`, `use_original`, `mask_initial_peaks`, and `num_std`. 
Returns diff --git a/pybaselines/polynomial.py b/pybaselines/polynomial.py index eb33bab..81df97d 100644 --- a/pybaselines/polynomial.py +++ b/pybaselines/polynomial.py @@ -120,7 +120,7 @@ def poly(self, data, poly_order=2, weights=None, return_coef=False): * 'coef': numpy.ndarray, shape (poly_order,) Only if `return_coef` is True. The array of polynomial parameters for the baseline, in increasing order. Can be used to create a - polynomial using numpy.polynomial.polynomial.Polynomial(). + polynomial using :class:`numpy.polynomial.polynomial.Polynomial`. Notes ----- @@ -192,7 +192,7 @@ def modpoly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, * 'coef': numpy.ndarray, shape (poly_order + 1,) Only if `return_coef` is True. The array of polynomial parameters for the baseline, in increasing order. Can be used to create a - polynomial using numpy.polynomial.polynomial.Polynomial(). + polynomial using :class:`numpy.polynomial.polynomial.Polynomial`. Notes ----- @@ -245,7 +245,7 @@ def modpoly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, @_Algorithm._register(sort_keys=('weights',)) def imodpoly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, - use_original=False, mask_initial_peaks=True, return_coef=False, num_std=1): + use_original=False, mask_initial_peaks=True, return_coef=False, num_std=1.): """ The improved modofied polynomial (IModPoly) baseline algorithm. @@ -294,7 +294,7 @@ def imodpoly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, * 'coef': numpy.ndarray, shape (poly_order + 1,) Only if `return_coef` is True. The array of polynomial parameters for the baseline, in increasing order. Can be used to create a - polynomial using numpy.polynomial.polynomial.Polynomial(). + polynomial using :class:`numpy.polynomial.polynomial.Polynomial`. Raises ------ @@ -427,7 +427,7 @@ def penalized_poly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=Non * 'coef': numpy.ndarray, shape (poly_order + 1,) Only if `return_coef` is True. The array of polynomial parameters for the baseline, in increasing order. Can be used to create a - polynomial using numpy.polynomial.polynomial.Polynomial(). + polynomial using :class:`numpy.polynomial.polynomial.Polynomial`. Raises ------ @@ -580,8 +580,8 @@ def loess(self, data, fraction=0.2, total_points=None, poly_order=1, scale=3.0, * 'coef': numpy.ndarray, shape (N, poly_order + 1) Only if `return_coef` is True. The array of polynomial parameters for the baseline, in increasing order. Can be used to create a polynomial - using numpy.polynomial.polynomial.Polynomial(). If `delta` is > 0, the - coefficients for any skipped x-value will all be 0. + using :class:`numpy.polynomial.polynomial.Polynomial`. If `delta` is > 0, + the coefficients for any skipped x-value will all be 0. Raises ------ @@ -754,7 +754,7 @@ def quant_reg(self, data, poly_order=2, quantile=0.05, tol=1e-6, max_iter=250, * 'coef': numpy.ndarray, shape (poly_order + 1,) Only if `return_coef` is True. The array of polynomial parameters for the baseline, in increasing order. Can be used to create a - polynomial using numpy.polynomial.polynomial.Polynomial(). + polynomial using :class:`numpy.polynomial.polynomial.Polynomial`. Raises ------ @@ -884,7 +884,7 @@ def goldindec(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, * 'coef': numpy.ndarray, shape (poly_order + 1,) Only if `return_coef` is True. The array of polynomial parameters for the baseline, in increasing order. 
Can be used to create a - polynomial using numpy.polynomial.polynomial.Polynomial(). + polynomial using :class:`numpy.polynomial.polynomial.Polynomial`. Raises ------ @@ -1034,7 +1034,7 @@ def poly(data, x_data=None, poly_order=2, weights=None, return_coef=False): * 'coef': numpy.ndarray, shape (poly_order,) Only if `return_coef` is True. The array of polynomial parameters for the baseline, in increasing order. Can be used to create a - polynomial using numpy.polynomial.polynomial.Polynomial(). + polynomial using :class:`numpy.polynomial.polynomial.Polynomial`. Notes ----- @@ -1095,7 +1095,7 @@ def modpoly(data, x_data=None, poly_order=2, tol=1e-3, max_iter=250, weights=Non * 'coef': numpy.ndarray, shape (poly_order + 1,) Only if `return_coef` is True. The array of polynomial parameters for the baseline, in increasing order. Can be used to create a - polynomial using numpy.polynomial.polynomial.Polynomial(). + polynomial using :class:`numpy.polynomial.polynomial.Polynomial`. Notes ----- @@ -1170,7 +1170,7 @@ def imodpoly(data, x_data=None, poly_order=2, tol=1e-3, max_iter=250, weights=No * 'coef': numpy.ndarray, shape (poly_order + 1,) Only if `return_coef` is True. The array of polynomial parameters for the baseline, in increasing order. Can be used to create a - polynomial using numpy.polynomial.polynomial.Polynomial(). + polynomial using :class:`numpy.polynomial.polynomial.Polynomial`. Notes ----- @@ -1468,7 +1468,7 @@ def penalized_poly(data, x_data=None, poly_order=2, tol=1e-3, max_iter=250, * 'coef': numpy.ndarray, shape (poly_order + 1,) Only if `return_coef` is True. The array of polynomial parameters for the baseline, in increasing order. Can be used to create a - polynomial using numpy.polynomial.polynomial.Polynomial(). + polynomial using :class:`numpy.polynomial.polynomial.Polynomial`. Raises ------ @@ -2029,8 +2029,8 @@ def loess(data, x_data=None, fraction=0.2, total_points=None, poly_order=1, scal * 'coef': numpy.ndarray, shape (N, poly_order + 1) Only if `return_coef` is True. The array of polynomial parameters for the baseline, in increasing order. Can be used to create a polynomial - using numpy.polynomial.polynomial.Polynomial(). If `delta` is > 0, the - coefficients for any skipped x-value will all be 0. + using :class:`numpy.polynomial.polynomial.Polynomial`. If `delta` is > 0, + the coefficients for any skipped x-value will all be 0. Raises ------ @@ -2124,7 +2124,7 @@ def quant_reg(data, x_data=None, poly_order=2, quantile=0.05, tol=1e-6, max_iter * 'coef': numpy.ndarray, shape (poly_order + 1,) Only if `return_coef` is True. The array of polynomial parameters for the baseline, in increasing order. Can be used to create a - polynomial using numpy.polynomial.polynomial.Polynomial(). + polynomial using :class:`numpy.polynomial.polynomial.Polynomial`. Raises ------ @@ -2230,7 +2230,7 @@ def goldindec(data, x_data=None, poly_order=2, tol=1e-3, max_iter=250, weights=N * 'coef': numpy.ndarray, shape (poly_order + 1,) Only if `return_coef` is True. The array of polynomial parameters for the baseline, in increasing order. Can be used to create a - polynomial using numpy.polynomial.polynomial.Polynomial(). + polynomial using :class:`numpy.polynomial.polynomial.Polynomial`. 
Raises ------ diff --git a/pybaselines/smooth.py b/pybaselines/smooth.py index 51738eb..f7af183 100644 --- a/pybaselines/smooth.py +++ b/pybaselines/smooth.py @@ -178,7 +178,7 @@ def snip(self, data, max_half_window=None, decreasing=False, smooth_half_window= if half_window > (self._len - 1) // 2: warnings.warn( 'max_half_window values greater than (len(data) - 1) / 2 have no effect.', - ParameterWarning + ParameterWarning, stacklevel=2 ) half_windows[i] = (self._len - 1) // 2 diff --git a/pybaselines/spline.py b/pybaselines/spline.py index c24b029..7313f89 100644 --- a/pybaselines/spline.py +++ b/pybaselines/spline.py @@ -13,17 +13,16 @@ import numpy as np from scipy.ndimage import grey_opening from scipy.optimize import curve_fit -from scipy.sparse import spdiags from . import _weighting -from ._algorithm_setup import _Algorithm, _class_wrapper, _sort_array +from ._algorithm_setup import _Algorithm, _class_wrapper from ._banded_utils import _add_diagonals, _shift_rows, diff_penalty_diagonals -from ._compat import _HAS_NUMBA, jit, trapezoid +from ._compat import _HAS_NUMBA, dia_object, jit, trapezoid from ._spline_utils import _basis_midpoints from ._validation import _check_lam, _check_optional_array from .utils import ( - _MIN_FLOAT, _mollifier_kernel, ParameterWarning, gaussian, pad_edges, padded_convolve, - relative_difference + _MIN_FLOAT, _mollifier_kernel, _sort_array, ParameterWarning, gaussian, pad_edges, + padded_convolve, relative_difference ) @@ -402,9 +401,6 @@ def pspline_asls(self, data, lam=1e3, p=1e-2, num_knots=100, spline_degree=3, di weights : array-like, shape (N,), optional The weighting array. If None (default), then the initial weights will be an array with size equal to N and all values set to 1. - x_data : array-like, shape (N,), optional - The x-values of the measured data. Default is None, which will create an - array from -1 to 1 with N points. 
Returns ------- @@ -428,7 +424,7 @@ def pspline_asls(self, data, lam=1e3, p=1e-2, num_knots=100, spline_degree=3, di See Also -------- - pybaselines.whittaker.asls + Baseline.asls References ---------- @@ -518,7 +514,7 @@ def pspline_iasls(self, data, lam=1e1, p=1e-2, lam_1=1e-4, num_knots=100, See Also -------- - pybaselines.whittaker.iasls + Baseline.iasls References ---------- @@ -549,7 +545,7 @@ def pspline_iasls(self, data, lam=1e1, p=1e-2, lam_1=1e-4, num_knots=100, d1_penalty = _check_lam(lam_1) * diff_penalty_diagonals(self._len, 1, lower_only=False) d1_penalty = ( self.pspline.basis.T - @ spdiags(d1_penalty, np.array([1, 0, -1]), self._len, self._len, 'csr') + @ dia_object((d1_penalty, np.array([1, 0, -1])), shape=(self._len, self._len)).tocsr() ) partial_rhs = d1_penalty @ y # now change d1_penalty back to banded array @@ -618,7 +614,7 @@ def pspline_airpls(self, data, lam=1e3, num_knots=100, spline_degree=3, See Also -------- - pybaselines.whittaker.airpls + Baseline.airpls References ---------- @@ -642,7 +638,7 @@ def pspline_airpls(self, data, lam=1e3, num_knots=100, spline_degree=3, warnings.warn( ('error occurred during fitting, indicating that "tol"' ' is too low, "max_iter" is too high, or "lam" is too high'), - ParameterWarning + ParameterWarning, stacklevel=2 ) i -= 1 # reduce i so that output tol_history indexing is correct break @@ -657,7 +653,7 @@ def pspline_airpls(self, data, lam=1e3, num_knots=100, spline_degree=3, # point would get a weight of 0, which fails the solver warnings.warn( ('almost all baseline points are below the data, indicating that "tol"' - ' is too low and/or "max_iter" is too high'), ParameterWarning + ' is too low and/or "max_iter" is too high'), ParameterWarning, stacklevel=2 ) i -= 1 # reduce i so that output tol_history indexing is correct break @@ -722,7 +718,7 @@ def pspline_arpls(self, data, lam=1e3, num_knots=100, spline_degree=3, diff_orde See Also -------- - pybaselines.whittaker.arpls + Baseline.arpls References ---------- @@ -805,7 +801,7 @@ def pspline_drpls(self, data, lam=1e3, eta=0.5, num_knots=100, spline_degree=3, See Also -------- - pybaselines.whittaker.drpls + Baseline.drpls References ---------- @@ -855,7 +851,8 @@ def pspline_drpls(self, data, lam=1e3, eta=0.5, num_knots=100, spline_degree=3, # checking a scalar is faster; cannot use np.errstate since it is not 100% reliable warnings.warn( ('nan and/or +/- inf occurred in weighting calculation, likely meaning ' - '"tol" is too low and/or "max_iter" is too high'), ParameterWarning + '"tol" is too low and/or "max_iter" is too high'), ParameterWarning, + stacklevel=2 ) break elif calc_difference < tol: @@ -894,9 +891,6 @@ def pspline_iarpls(self, data, lam=1e3, num_knots=100, spline_degree=3, diff_ord weights : array-like, shape (N,), optional The weighting array. If None (default), then the initial weights will be an array with size equal to N and all values set to 1. - x_data : array-like, shape (N,), optional - The x-values of the measured data. Default is None, which will create an - array from -1 to 1 with N points. 
Returns ------- @@ -915,7 +909,7 @@ def pspline_iarpls(self, data, lam=1e3, num_knots=100, spline_degree=3, diff_ord See Also -------- - pybaselines.whittaker.iarpls + Baseline.iarpls References ---------- @@ -945,7 +939,8 @@ def pspline_iarpls(self, data, lam=1e3, num_knots=100, spline_degree=3, diff_ord # checking a scalar is faster; cannot use np.errstate since it is not 100% reliable warnings.warn( ('nan and/or +/- inf occurred in weighting calculation, likely meaning ' - '"tol" is too low and/or "max_iter" is too high'), ParameterWarning + '"tol" is too low and/or "max_iter" is too high'), ParameterWarning, + stacklevel=2 ) break elif calc_difference < tol: @@ -978,7 +973,7 @@ def pspline_aspls(self, data, lam=1e4, num_knots=100, spline_degree=3, diff_orde The order of the differential matrix. Must be greater than 0. Default is 2 (second order differential matrix). Typical values are 2 or 1. max_iter : int, optional - The max number of fit iterations. Default is 50. + The max number of fit iterations. Default is 100. tol : float, optional The exit criteria. Default is 1e-3. weights : array-like, shape (N,), optional @@ -1008,7 +1003,7 @@ def pspline_aspls(self, data, lam=1e4, num_knots=100, spline_degree=3, diff_orde See Also -------- - pybaselines.whittaker.aspls + Baseline.aspls Notes ----- @@ -1084,7 +1079,7 @@ def pspline_psalsa(self, data, lam=1e3, p=0.5, k=None, num_knots=100, spline_deg values greater than the data. Should be approximately the height at which a value could be considered a peak. Default is None, which sets `k` to one-tenth of the standard deviation of the input data. A large k value - will produce similar results to :meth:`.asls`. + will produce similar results to :meth:`~Baseline.asls`. num_knots : int, optional The number of knots for the spline. Default is 100. spline_degree : int, optional @@ -1122,7 +1117,7 @@ def pspline_psalsa(self, data, lam=1e3, p=0.5, k=None, num_knots=100, spline_deg See Also -------- - pybaselines.whittaker.psalsa + Baseline.psalsa References ---------- @@ -1180,7 +1175,7 @@ def pspline_derpsalsa(self, data, lam=1e2, p=1e-2, k=None, num_knots=100, spline values greater than the data. Should be approximately the height at which a value could be considered a peak. Default is None, which sets `k` to one-tenth of the standard deviation of the input data. A large k value - will produce similar results to :meth:`.asls`. + will produce similar results to :meth:`~Baseline.asls`. num_knots : int, optional The number of knots for the spline. Default is 100. spline_degree : int, optional @@ -1227,7 +1222,7 @@ def pspline_derpsalsa(self, data, lam=1e2, p=1e-2, k=None, num_knots=100, spline See Also -------- - pybaselines.whittaker.derpsalsa + Baseline.derpsalsa References ---------- @@ -1352,6 +1347,10 @@ def pspline_mpls(self, data, half_window=None, lam=1e3, p=0.0, num_knots=100, sp ValueError Raised if p is not between 0 and 1. + See Also + -------- + Baseline.mpls + References ---------- .. [32] Li, Zhong, et al. Morphological weighted penalized least squares for @@ -2307,7 +2306,7 @@ def pspline_aspls(data, lam=1e4, num_knots=100, spline_degree=3, diff_order=2, The order of the differential matrix. Must be greater than 0. Default is 2 (second order differential matrix). Typical values are 2 or 1. max_iter : int, optional - The max number of fit iterations. Default is 50. + The max number of fit iterations. Default is 100. tol : float, optional The exit criteria. Default is 1e-3. 
weights : array-like, shape (N,), optional
@@ -2384,7 +2383,7 @@ def pspline_psalsa(data, lam=1e3, p=0.5, k=None, num_knots=100, spline_degree=3,
         values greater than the data. Should be approximately the height at which
         a value could be considered a peak. Default is None, which sets `k` to
         one-tenth of the standard deviation of the input data. A large k value
-        will produce similar results to :meth:`.asls`.
+        will produce similar results to :meth:`~Baseline.asls`.
     num_knots : int, optional
         The number of knots for the spline. Default is 100.
     spline_degree : int, optional
@@ -2463,7 +2462,7 @@ def pspline_derpsalsa(data, lam=1e2, p=1e-2, k=None, num_knots=100, spline_degre
         values greater than the data. Should be approximately the height at which
         a value could be considered a peak. Default is None, which sets `k` to
         one-tenth of the standard deviation of the input data. A large k value
-        will produce similar results to :meth:`.asls`.
+        will produce similar results to :meth:`~Baseline.asls`.
     num_knots : int, optional
         The number of knots for the spline. Default is 100.
     spline_degree : int, optional
diff --git a/pybaselines/two_d/__init__.py b/pybaselines/two_d/__init__.py
new file mode 100644
index 0000000..d2f0c51
--- /dev/null
+++ b/pybaselines/two_d/__init__.py
@@ -0,0 +1,61 @@
+# -*- coding: utf-8 -*-
+"""
+=============================================
+Baseline Correction for Two Dimensional Data.
+=============================================
+
+:mod:`pybaselines.two_d` provides the following algorithms for baseline correcting 2D data.
+
+* Polynomial methods (:mod:`pybaselines.two_d.polynomial`)
+
+    * poly (Regular Polynomial)
+    * modpoly (Modified Polynomial)
+    * imodpoly (Improved Modified Polynomial)
+    * penalized_poly (Penalized Polynomial)
+    * quant_reg (Quantile Regression)
+
+* Whittaker-smoothing-based methods (:mod:`pybaselines.two_d.whittaker`)
+
+    * asls (Asymmetric Least Squares)
+    * iasls (Improved Asymmetric Least Squares)
+    * airpls (Adaptive Iteratively Reweighted Penalized Least Squares)
+    * arpls (Asymmetrically Reweighted Penalized Least Squares)
+    * drpls (Doubly Reweighted Penalized Least Squares)
+    * iarpls (Improved Asymmetrically Reweighted Penalized Least Squares)
+    * aspls (Adaptive Smoothness Penalized Least Squares)
+    * psalsa (Peaked Signal's Asymmetric Least Squares Algorithm)
+
+* Morphological methods (:mod:`pybaselines.two_d.morphological`)
+
+    * mor (Morphological)
+    * imor (Improved Morphological)
+    * rolling_ball (Rolling Ball Baseline)
+    * tophat (Top-hat Transformation)
+
+* Spline methods (:mod:`pybaselines.two_d.spline`)
+
+    * mixture_model (Mixture Model)
+    * irsqr (Iterative Reweighted Spline Quantile Regression)
+    * pspline_asls (Penalized Spline Version of asls)
+    * pspline_iasls (Penalized Spline Version of iasls)
+    * pspline_airpls (Penalized Spline Version of airpls)
+    * pspline_arpls (Penalized Spline Version of arpls)
+    * pspline_iarpls (Penalized Spline Version of iarpls)
+    * pspline_psalsa (Penalized Spline Version of psalsa)
+
+* Smoothing-based methods (:mod:`pybaselines.two_d.smooth`)
+
+    * noise_median (Noise Median method)
+
+* Optimizers (:mod:`pybaselines.two_d.optimizers`)
+
+    * collab_pls (Collaborative Penalized Least Squares)
+    * adaptive_minmax (Adaptive MinMax)
+
+
+@author: Donald Erb
+Created on January 15, 2024
+
+"""
+
+from .api import Baseline2D
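Since this is the new public entry point for 2D baseline correction, a minimal usage sketch may help; the method and parameter values are illustrative, and any algorithm listed above should follow the same pattern::

    import numpy as np

    from pybaselines.two_d import Baseline2D

    x = np.linspace(-20, 20, 80)
    z = np.linspace(-20, 20, 60)
    measured_data = np.ones((80, 60))  # 2D data with shape (len(x), len(z))

    baseline_fitter = Baseline2D(x_data=x, z_data=z)
    baseline, params = baseline_fitter.asls(measured_data, lam=1e5)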
diff --git a/pybaselines/two_d/_algorithm_setup.py b/pybaselines/two_d/_algorithm_setup.py
new file mode 100644
index 0000000..baa07c5
--- /dev/null
+++ b/pybaselines/two_d/_algorithm_setup.py
@@ -0,0 +1,927 @@
+# -*- coding: utf-8 -*-
+"""Setup code for the various algorithm types in pybaselines.
+
+Created on April 8, 2023
+@author: Donald Erb
+
+"""
+
+from contextlib import contextmanager
+from functools import partial, wraps
+import itertools
+import warnings
+
+import numpy as np
+
+from ..utils import (
+    ParameterWarning, _determine_sorts, _inverted_sort, _sort_array2d, optimize_window,
+    pad_edges2d
+)
+from ._spline_utils import PSpline2D
+from .._validation import (
+    _check_array, _check_half_window, _check_optional_array, _check_scalar_variable,
+    _check_sized_array, _yxz_arrays
+)
+from ._whittaker_utils import WhittakerSystem2D
+
+
+class _Algorithm2D:
+    """
+    A base class for all 2D algorithm types.
+
+    Contains setup methods for all algorithm types to make more complex algorithms
+    easier to set up.
+
+    Attributes
+    ----------
+    poly_order : Sequence[int, int]
+        The last polynomial order used for a polynomial algorithm. Initially is -1, denoting
+        that no polynomial fitting has been performed.
+    pspline : PSpline2D or None
+        The PSpline2D object for setting up and solving penalized spline algorithms. Is None
+        if no penalized spline setup has been performed (typically done in
+        :meth:`~_Algorithm2D._setup_spline`).
+    vandermonde : numpy.ndarray or None
+        The Vandermonde matrix for solving polynomial equations. Is None if no polynomial
+        setup has been performed (typically done in :meth:`~_Algorithm2D._setup_polynomial`).
+    whittaker_system : PenalizedSystem2D or None
+        The PenalizedSystem2D object for setting up and solving Whittaker-smoothing-based
+        algorithms. Is None if no Whittaker setup has been performed (typically done in
+        :meth:`~_Algorithm2D._setup_whittaker`).
+    x : numpy.ndarray or None
+        The x-values for the object. If initialized with None, then `x` is initialized during
+        the first function call to have the same size as the input `data.shape[-2]` and has
+        min and max values of -1 and 1, respectively.
+    x_domain : numpy.ndarray
+        The minimum and maximum values of `x`. If `x_data` is None during initialization, then
+        set to numpy.ndarray([-1, 1]).
+    z : numpy.ndarray or None
+        The z-values for the object. If initialized with None, then `z` is initialized during
+        the first function call to have the same size as the input `data.shape[-1]` and has
+        min and max values of -1 and 1, respectively.
+    z_domain : numpy.ndarray
+        The minimum and maximum values of `z`. If `z_data` is None during initialization, then
+        set to numpy.ndarray([-1, 1]).
+
+    """
+
+    def __init__(self, x_data=None, z_data=None, check_finite=True, assume_sorted=False,
+                 output_dtype=None):
+        """
+        Initializes the algorithm object.
+
+        Parameters
+        ----------
+        x_data : array-like, shape (M,), optional
+            The x-values of the measured data. Default is None, which will create an
+            array from -1 to 1 during the first function call with length equal to the
+            input data length.
+        z_data : array-like, shape (N,), optional
+            The z-values of the measured data. Default is None, which will create an
+            array from -1 to 1 during the first function call with length equal to the
+            input data length.
+        check_finite : bool, optional
+            If True (default), will raise an error if any values in input data are not finite.
+            Setting to False will skip the check. Note that errors may occur if
+            `check_finite` is False and the input data contains non-finite values.
+        assume_sorted : bool, optional
+            If False (default), will sort the input `x_data` and `z_data` values.
Otherwise, + the input is assumed to be sorted. Note that some functions may raise an error + if `x_data` and `z_data` are not sorted. + output_dtype : type or numpy.dtype, optional + The dtype to cast the output array. Default is None, which uses the typing + of the input data. + + """ + self._len = [None, None] + x_sort_order = None + z_sort_order = None + if x_data is None: + self.x = None + self.x_domain = np.array([-1., 1.]) + else: + self.x = _check_array(x_data, check_finite=check_finite) + self._len[0] = len(self.x) + self.x_domain = np.polynomial.polyutils.getdomain(self.x) + if not assume_sorted: + x_sort_order, x_inverted_order = _determine_sorts(self.x) + if x_sort_order is not None: + self.x = self.x[x_sort_order] + + if z_data is None: + self.z = None + self.z_domain = np.array([-1., 1.]) + else: + self.z = _check_array(z_data, check_finite=check_finite) + self._len[1] = len(self.z) + self.z_domain = np.polynomial.polyutils.getdomain(self.z) + if not assume_sorted: + z_sort_order, z_inverted_order = _determine_sorts(self.z) + if z_sort_order is not None: + self.z = self.z[z_sort_order] + + if x_sort_order is None and z_sort_order is None: + self._sort_order = None + self._inverted_order = None + elif z_sort_order is None: + self._sort_order = x_sort_order + self._inverted_order = x_inverted_order + elif x_sort_order is None: + self._sort_order = (..., z_sort_order) + self._inverted_order = (..., z_inverted_order) + else: + self._sort_order = (x_sort_order[:, None], z_sort_order[None, :]) + self._inverted_order = (x_inverted_order[:, None], z_inverted_order[None, :]) + + self.whittaker_system = None + self.vandermonde = None + self.poly_order = -1 + self.pspline = None + self._check_finite = check_finite + self._dtype = output_dtype + + def _return_results(self, baseline, params, dtype, sort_keys=(), ensure_2d=False, + reshape_baseline=False, reshape_keys=(), skip_sorting=False): + """ + Re-orders the input baseline and parameters based on the x ordering. + + If `self._sort_order` is None, then no reordering is performed. + + Parameters + ---------- + baseline : numpy.ndarray, shape (M, N) + The baseline output by the baseline function. + params : dict + The parameter dictionary output by the baseline function. + dtype : type or numpy.dtype, optional + The desired output dtype for the baseline. + sort_keys : Iterable, optional + An iterable of keys corresponding to the values in `params` that need + re-ordering. Default is (). + ensure_2d : bool, optional + If True (default), will raise an error if the shape of `array` is not a two dimensional + array with shape (M, N) or a three dimensional array with shape (M, N, 1), (M, 1, N), + or (1, M, N). + reshape_baseline : bool, optional + If True, will reshape the output baseline back into the shape of the input data. If + False (default), will not modify the output baseline shape. + reshape_keys : tuple, optional + The keys within the output parameter dictionary that will need reshaped to match the + shape of the data. For example, used to convert weights for polynomials from 1D back + into the original shape. Default is (). + skip_sorting : bool, optional + If True, will skip sorting the output baseline. The keys in `sort_keys` will + still be sorted. Default is False. + + Returns + ------- + baseline : numpy.ndarray, shape (M, N) + The input `baseline` after re-ordering and setting to the desired dtype. + params : dict + The input `params` after re-ordering the values for `sort_keys`. 
+ + """ + if reshape_baseline: + if ensure_2d: + baseline = baseline.reshape(self._len) + else: + baseline = baseline.reshape(-1, *self._len) + for key in reshape_keys: + if key in params: + # TODO can any params be non-2d that need reshaped? + params[key] = params[key].reshape(self._len) + + if self._sort_order is not None: + for key in sort_keys: + if key in params: # some parameters are conditionally output + # assumes params all all two dimensional arrays + params[key] = params[key][self._inverted_order] + + if not skip_sorting: + baseline = _sort_array2d(baseline, sort_order=self._inverted_order) + baseline = baseline.astype(dtype, copy=False) + + return baseline, params + + @classmethod + def _register(cls, func=None, *, sort_keys=(), dtype=None, order=None, ensure_2d=True, + reshape_baseline=False, reshape_keys=(), skip_sorting=False): + """ + Wraps a baseline function to validate inputs and correct outputs. + + The input data is converted to a numpy array, validated to ensure the length is + consistent, and ordered to match the input x ordering. The outputs are corrected + to ensure proper inverted sort ordering and dtype. + + Parameters + ---------- + func : Callable, optional + The function that is being decorated. Default is None, which returns a partial function. + sort_keys : tuple, optional + The keys within the output parameter dictionary that will need sorting to match the + sort order of :attr:`.x`. Default is (). + dtype : type or numpy.dtype, optional + The dtype to cast the output array. Default is None, which uses the typing of `array`. + order : {None, 'C', 'F'}, optional + The order for the output array. Default is None, which will use the default array + ordering. Other valid options are 'C' for C ordering or 'F' for Fortran ordering. + ensure_2d : bool, optional + If True (default), will raise an error if the shape of `array` is not a two dimensional + array with shape (M, N) or a three dimensional array with shape (M, N, 1), (M, 1, N), + or (1, M, N). + reshape_baseline : bool, optional + If True, will reshape the output baseline back into the shape of the input data. If + False (default), will not modify the output baseline shape. + reshape_keys : tuple, optional + The keys within the output parameter dictionary that will need reshaped to match the + shape of the data. For example, used to convert weights for polynomials from 1D back + into the original shape. Default is (). + skip_sorting : bool, optional + If True, will skip sorting the output baseline. The keys in `sort_keys` will + still be sorted. Default is False. + + Returns + ------- + numpy.ndarray + The calculated baseline. + dict + A dictionary of parameters output by the baseline function. 
+ + """ + if func is None: + return partial( + cls._register, sort_keys=sort_keys, dtype=dtype, order=order, ensure_2d=ensure_2d, + reshape_baseline=reshape_baseline, reshape_keys=reshape_keys, + skip_sorting=skip_sorting + ) + + @wraps(func) + def inner(self, data=None, *args, **kwargs): + if data is None: + # not implementing interp_pts for 2D, so data can never + # be None in 2D + raise TypeError('"data" cannot be None') + + reset_x = self.x is not None + reset_z = self.z is not None + if reset_x or reset_z: + if reset_x and reset_z: + expected_shape = self._len + axis = slice(-2, None) + elif reset_x: + expected_shape = self._len[0] + axis = -2 + else: + expected_shape = self._len[1] + axis = -1 + y = _check_sized_array( + data, expected_shape, check_finite=self._check_finite, dtype=dtype, + order=order, ensure_1d=False, axis=axis, name='data', ensure_2d=ensure_2d, + two_d=True + ) + else: + y, self.x, self.z = _yxz_arrays( + data, self.x, self.z, check_finite=self._check_finite, dtype=dtype, + order=order, ensure_2d=ensure_2d + ) + + # update self.x and/or self.z just to ensure dtype and order are correct + if reset_x: + x_dtype = self.x.dtype + self.x = _check_array( + self.x, dtype=dtype, order=order, check_finite=False, ensure_1d=False + ) + else: + self._len[0] = y.shape[-2] + self.x = np.linspace(-1, 1, self._len[0]) + if reset_z: + z_dtype = self.z.dtype + self.z = _check_array( + self.z, dtype=dtype, order=order, check_finite=False, ensure_1d=False + ) + else: + self._len[1] = y.shape[-1] + self.z = np.linspace(-1, 1, self._len[1]) + + if not skip_sorting: + y = _sort_array2d(y, sort_order=self._sort_order) + if self._dtype is None: + output_dtype = y.dtype + else: + output_dtype = self._dtype + + baseline, params = func(self, y, *args, **kwargs) + if reset_x: + self.x = np.array(self.x, dtype=x_dtype, copy=False) + if reset_z: + self.z = np.array(self.z, dtype=z_dtype, copy=False) + + return self._return_results( + baseline, params, dtype=output_dtype, sort_keys=sort_keys, ensure_2d=ensure_2d, + reshape_baseline=reshape_baseline, reshape_keys=reshape_keys, + skip_sorting=skip_sorting + ) + + return inner + + @contextmanager + def _override_x(self, new_x, new_sort_order=None): + """ + Temporarily sets the x-values for the object to a different array. + + Useful when fitting extensions of the x attribute. + + Parameters + ---------- + new_x : numpy.ndarray + The x values to temporarily use. + new_sort_order : [type], optional + The sort order for the new x values. Default is None, which will not sort. + + Yields + ------ + pybaselines._algorithm_setup._Algorithm + The _Algorithm object with the new x attribute. 
+ + """ + raise NotImplementedError + + old_x = self.x + old_len = self._len + old_x_domain = self.x_domain + old_sort_order = self._sort_order + old_inverted_order = self._inverted_order + # also have to reset any sized attributes to force recalculation for new x + old_poly_order = self.poly_order + old_vandermonde = self.vandermonde + old_whittaker_system = self.whittaker_system + old_pspline = self.pspline + + try: + self.x = _check_array(new_x, check_finite=self._check_finite) + self._len = len(self.x) + self.x_domain = np.polynomial.polyutils.getdomain(self.x) + self._sort_order = new_sort_order + if self._sort_order is not None: + self._inverted_order = _inverted_sort(self._sort_order) + else: + self._inverted_order = None + + self.vandermonde = None + self.poly_order = -1 + self.whittaker_system = None + self.pspline = None + + yield self + + finally: + self.x = old_x + self._len = old_len + self.x_domain = old_x_domain + self._sort_order = old_sort_order + self._inverted_order = old_inverted_order + self.vandermonde = old_vandermonde + self.poly_order = old_poly_order + self.whittaker_system = old_whittaker_system + self.pspline = old_pspline + + def _setup_whittaker(self, y, lam=1, diff_order=2, weights=None, copy_weights=False, + eigenvalues=None): + """ + Sets the starting parameters for doing penalized least squares. + + Parameters + ---------- + y : numpy.ndarray, shape (M ,N) + The y-values of the measured data, already converted to a numpy + array by :meth:`~_Algorithm2D._register`. + lam : float or Sequence[float, float], optional + The smoothing parameter, lambda. Typical values are between 10 and + 1e8, but it strongly depends on the penalized least square method + and the differential order. Default is 1. + diff_order : int or Sequence[int, int], optional + The integer differential order; must be greater than 0. Default is 2. + weights : array-like, shape (M, N), optional + The weighting array. If None (default), then will be an array with + shape (M, N) and all values set to 1. + copy_weights : boolean, optional + If True, will copy the array of input weights. Only needed if the + algorithm changes the weights in-place. Default is False. + use_lower : boolean, optional + If True (default), will allow using only the lower non-zero diagonals of + the squared difference matrix. If False, will include all non-zero diagonals. + use_banded : bool, optional + If True, will setup the penalized system using banded matrices. If False, + will use sparse matrices. + + Returns + ------- + y : numpy.ndarray, shape (``M * N``) + The y-values of the measured data after flattening. + weight_array : numpy.ndarray, shape (``M * N``) + The weight array after flattening. + + Raises + ------ + ValueError + Raised is `diff_order` is less than 1. + + Warns + ----- + ParameterWarning + Raised if `diff_order` is greater than 3. 
+ + """ + diff_order = _check_scalar_variable( + diff_order, allow_zero=False, variable_name='difference order', two_d=True, dtype=int + ) + if (diff_order > 3).any(): + warnings.warn( + ('difference orders greater than 3 can have numerical issues;' + ' consider using a difference order of 2 or 1 instead'), + ParameterWarning, stacklevel=2 + ) + weight_array = _check_optional_array( + self._len, weights, copy_input=copy_weights, check_finite=self._check_finite, + ensure_1d=False, axis=slice(None) + ) + if self._sort_order is not None and weights is not None: + weight_array = weight_array[self._sort_order] + + if ( + self.whittaker_system is not None + and self.whittaker_system.same_basis(diff_order, eigenvalues) + ): + self.whittaker_system.update_penalty(lam) + else: + self.whittaker_system = WhittakerSystem2D( + self._len, lam, diff_order, eigenvalues + ) + if not self.whittaker_system._using_svd: + y = y.ravel() + weight_array = weight_array.ravel() + + return y, weight_array + + def _setup_polynomial(self, y, weights=None, poly_order=2, calc_vander=False, + calc_pinv=False, copy_weights=False, max_cross=None): + """ + Sets the starting parameters for doing polynomial fitting. + + Parameters + ---------- + y : numpy.ndarray, shape (M, N) + The y-values of the measured data, already converted to a numpy + array by :meth:`~_Algorithm2D._register`. + weights : array-like, shape (M, N), optional + The weighting array. If None (default), then will be an array with + shape equal to (M, N) and all values set to 1. + poly_order : int or Sequence[int, int], optional + The polynomial orders for the rows and columns. Default is 2. + calc_vander : bool, optional + If True, will calculate and the Vandermonde matrix. Default is False. + calc_pinv : bool, optional + If True, and if `return_vander` is True, will calculate and return the + pseudo-inverse of the Vandermonde matrix. Default is False. + copy_weights : boolean, optional + If True, will copy the array of input weights. Only needed if the + algorithm changes the weights in-place. Default is False. + max_cross: int, optional + The maximum degree for the cross terms. For example, if `max_cross` is 1, then + `x z**2`, `x**2 z`, and `x**2 z**2` would all be set to 0. Default is None, which + does not limit the cross terms. + + Returns + ------- + y : numpy.ndarray, shape (``M * N``) + The y-values of the measured data after flattening. + weight_array : numpy.ndarray, shape (``M * N``) + The weight array for fitting a polynomial to the data after flattening. + pseudo_inverse : numpy.ndarray + Only returned if `calc_pinv` is True. The pseudo-inverse of the + Vandermonde matrix, calculated with singular value decomposition (SVD). + + Raises + ------ + ValueError + Raised if `calc_pinv` is True and `calc_vander` is False. 
+ + Notes + ----- + Implementation note: the polynomial coefficients, `c`, from solving 2D polynomials + using ``Ac=b`` where `A` is the flattened Vandermonde and `b` is the flattened data + correspond to the matrix below: + + np.array([ + [x^0*z^0, x^0*z^1, ..., x^0*z^n], + [x^1*z^0, x^1*z^1, ..., x^1*z^n], + [...], + [x^m*z^0, x^m*z^1, ..., x^m*z^n] + ]).flatten() + + """ + weight_array = _check_optional_array( + self._len, weights, copy_input=copy_weights, check_finite=self._check_finite, + ensure_1d=False, axis=slice(None) + ) + if self._sort_order is not None and weights is not None: + weight_array = weight_array[self._sort_order] + weight_array = weight_array.ravel() + poly_orders = _check_scalar_variable( + poly_order, allow_zero=True, variable_name='polynomial order', two_d=True, dtype=int + ) + if max_cross is not None: + max_cross = _check_scalar_variable( + max_cross, allow_zero=True, variable_name='max_cross', dtype=int + ) + if calc_vander: + if ( + self.vandermonde is None or self._max_cross != max_cross + or np.any(self.poly_order != poly_order) + ): + mapped_x = np.polynomial.polyutils.mapdomain( + self.x, self.x_domain, np.array([-1., 1.]) + ) + mapped_z = np.polynomial.polyutils.mapdomain( + self.z, self.z_domain, np.array([-1., 1.]) + ) + # rearrange the vandermonde such that it matches the typical A c = b where b + # is the flattened version of y and c are the coefficients + self.vandermonde = np.polynomial.polynomial.polyvander2d( + *np.meshgrid(mapped_x, mapped_z, indexing='ij'), + [poly_orders[0], poly_orders[1]] + ).reshape((-1, (poly_orders[0] + 1) * (poly_orders[1] + 1))) + + if max_cross is not None: + # lists out (x_0, z_0), (x_0, z_1), etc. + for idx, val in enumerate( + itertools.product(range(poly_orders[0] + 1), range(poly_orders[1] + 1)) + ): + # 0 designates pure z or x terms + if 0 not in val and any(v > max_cross for v in val): + self.vandermonde[:, idx] = 0 + + self.poly_order = poly_orders + self._max_cross = max_cross + y = y.ravel() + if not calc_pinv: + return y, weight_array + elif not calc_vander: + raise ValueError('if calc_pinv is True, then calc_vander must also be True') + + if weights is None: + pseudo_inverse = np.linalg.pinv(self.vandermonde) + else: + pseudo_inverse = np.linalg.pinv(np.sqrt(weight_array)[:, None] * self.vandermonde) + + return y, weight_array, pseudo_inverse + + def _setup_spline(self, y, weights=None, spline_degree=3, num_knots=10, + penalized=True, diff_order=3, lam=1, make_basis=True, allow_lower=True, + reverse_diags=False, copy_weights=False): + """ + Sets the starting parameters for doing spline fitting. + + Parameters + ---------- + y : numpy.ndarray, shape (M, N) + The y-values of the measured data, already converted to a numpy + array by :meth:`~_Algorithm2D._register`. + weights : array-like, shape (M, N), optional + The weighting array. If None (default), then will be an array with + shape equal to (M, N) and all values set to 1. + spline_degree : int or Sequence[int, int], optional + The degree of the spline. Default is 3, which is a cubic spline. + num_knots : int or Sequence[int, int], optional + The number of interior knots for the splines. Default is 10. + penalized : bool, optional + Whether the basis matrix should be for a penalized spline or a regular + B-spline. Default is True, which creates the basis for a penalized spline. + diff_order : int or Sequence[int, int], optional + The integer differential order for the spline penalty; must be greater than 0. + Default is 3.
Only used if `penalized` is True. + lam : float or Sequence[float, float], optional + The smoothing parameter, lambda. Typical values are between 10 and + 1e8, but it strongly depends on the number of knots and the difference order. + Default is 1. + make_basis : bool, optional + If True (default), will create the matrix containing the spline basis functions. + allow_lower : boolean, optional + If True (default), will include only the lower non-zero diagonals of + the squared difference matrix. If False, will include all non-zero diagonals. + reverse_diags : boolean, optional + If True, will reverse the order of the diagonals of the penalty matrix. + Default is False. + copy_weights : boolean, optional + If True, will copy the array of input weights. Only needed if the + algorithm changes the weights in-place. Default is False. + + Returns + ------- + y : numpy.ndarray, shape (M, N) + The y-values of the measured data. + weight_array : numpy.ndarray, shape (M, N) + The weight array for fitting the spline to the data. + + Warns + ----- + ParameterWarning + Raised if `diff_order` is greater than 4. + + Notes + ----- + `degree` is used instead of `order` like for polynomials since the order of a spline + is defined by convention as ``degree + 1``. + + """ + weight_array = _check_optional_array( + self._len, weights, copy_input=copy_weights, check_finite=self._check_finite, + ensure_1d=False, axis=slice(None) + ) + if self._sort_order is not None and weights is not None: + weight_array = weight_array[self._sort_order] + diff_order = _check_scalar_variable( + diff_order, allow_zero=False, variable_name='difference order', two_d=True, dtype=int + ) + if make_basis: + if (diff_order > 4).any(): + warnings.warn( + ('differential orders greater than 4 can have numerical issues;' + ' consider using a differential order of 2 or 3 instead'), + ParameterWarning, stacklevel=2 + ) + + if self.pspline is None or not self.pspline.same_basis(num_knots, spline_degree): + self.pspline = PSpline2D( + self.x, self.z, num_knots, spline_degree, self._check_finite, lam, diff_order + ) + else: + self.pspline.reset_penalty(lam, diff_order) + + return y, weight_array + + def _setup_morphology(self, y, half_window=None, **window_kwargs): + """ + Sets the starting parameters for morphology-based methods. + + Parameters + ---------- + y : numpy.ndarray, shape (M, N) + The y-values of the measured data, already converted to a numpy + array by :meth:`~_Algorithm2D._register`. + half_window : int or Sequence[int, int], optional + The half-window used for the morphology functions. If a value is input, + then that value will be used. Default is None, which will optimize the + half-window size using pybaselines.morphological.optimize_window. + **window_kwargs + Keyword arguments to pass to :func:`.optimize_window`. + Possible items are: + + * 'increment': int + The step size for iterating half windows. Default is 1. + * 'max_hits': int + The number of consecutive half windows that must produce the same + morphological opening before accepting the half window as the + optimum value. Default is 3. + * 'window_tol': float + The tolerance value for considering two morphological openings as + equivalent. Default is 1e-6. + * 'max_half_window': int + The maximum allowable half-window size. If None (default), will be + set to (len(data) - 1) / 2. + * 'min_half_window': int + The minimum half-window size. If None (default), will be set to 1. + + Returns + ------- + y : numpy.ndarray, shape (M, N) + The y-values of the measured data. 
+ output_half_window : np.ndarray[int, int] + The accepted half windows. + + Notes + ----- + Ensures that window size is odd since morphological operations operate in + the range [-output_half_window, ..., output_half_window]. + + Half windows are dealt with rather than full window sizes to clarify their + usage. SciPy morphology operations deal with full window sizes. + + """ + if half_window is not None: + output_half_window = _check_half_window(half_window, two_d=True) + else: + output_half_window = optimize_window(y, **window_kwargs) + + return y, output_half_window + + def _setup_smooth(self, y, half_window=0, allow_zero=True, hw_multiplier=2, **pad_kwargs): + """ + Sets the starting parameters for doing smoothing-based algorithms. + + Parameters + ---------- + y : numpy.ndarray, shape (M, N) + The y-values of the measured data, already converted to a numpy + array by :meth:`~_Algorithm2D._register`. + half_window : int or Sequence[int, int], optional + The half-window used for the smoothing functions. Used + to pad the edges of the data to reduce edge + effects. Default is 0, which provides no padding. + allow_zero : bool, optional + If True (default), allows `half_window` to be 0; otherwise, `half_window` + must be at least 1. + hw_multiplier : int, optional + The value to multiply the output of :func:`.optimize_window` if `half_window` + is None. + **pad_kwargs + Additional keyword arguments to pass to :func:`.pad_edges` for padding + the edges of the data to prevent edge effects from smoothing. + + Returns + ------- + numpy.ndarray, shape (``M + 2 * half_window[0]``, ``N + 2 * half_window[1]``) + The padded array of data. + output_hw : np.ndarray[int, int] + The accepted half windows. + + """ + if half_window is not None: + output_hw = _check_half_window(half_window, allow_zero, two_d=True) + else: + output_hw = hw_multiplier * optimize_window(y) + + return pad_edges2d(y, output_hw, **pad_kwargs), output_hw + + def _setup_classification(self, y, weights=None): + """ + Sets the starting parameters for doing classification algorithms. + + Parameters + ---------- + y : numpy.ndarray, shape (M, N) + The y-values of the measured data, already converted to a numpy + array by :meth:`~_Algorithm2D._register`. + weights : array-like, shape (M, N), optional + The weighting array. If None (default), then will be an array with + shape equal to (M, N) and all values set to 1. + + Returns + ------- + y : numpy.ndarray, shape (M, N) + The y-values of the measured data. + weight_array : numpy.ndarray, shape (M, N) + The weight array for the data, with boolean dtype. + + """ + weight_array = _check_optional_array( + self._len, weights, check_finite=self._check_finite, dtype=bool, + ensure_1d=False, axis=slice(None) + ) + if self._sort_order is not None and weights is not None: + weight_array = weight_array[self._sort_order] + weight_array = weight_array + + return y, weight_array + + def _get_function(self, method, modules): + """ + Tries to retrieve the indicated function from a list of modules. + + Parameters + ---------- + method : str + The string name of the desired function. Case does not matter. + modules : Sequence + A sequence of modules in which to look for the method. + + Returns + ------- + func : Callable + The corresponding function. + func_module : str + The module that `func` belongs to. + class_object : pybaselines.two_d._algorithm_setup._Algorithm2D + The `_Algorithm2D` object which will be used for fitting.
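The odd-window convention noted above means a half-window of ``hw`` corresponds to a full morphological window of ``2 * hw + 1``. An illustrative scipy.ndimage sketch of that convention (a plain morphological opening, not any specific pybaselines algorithm):

import numpy as np
from scipy.ndimage import grey_opening

y = np.random.default_rng(1).normal(size=(64, 64)) + 5.0  # example 2D data
half_window = (4, 6)
full_window = tuple(2 * hw + 1 for hw in half_window)  # always odd
baseline_estimate = grey_opening(y, size=full_window)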
+ + Raises + ------ + AttributeError + Raised if no matching function is found within the modules. + + """ + function_string = method.lower() + for module in modules: + func_module = module.__name__.split('.')[-1] + module_class = getattr(module, '_' + func_module.capitalize()) + if hasattr(module_class, function_string): + # if self is a Baseline2D class, can just use its method + if hasattr(self, function_string): + func = getattr(self, function_string) + class_object = self + else: + # have to reset x and z ordering so that all outputs and parameters are + # correctly sorted + if self._sort_order is None: + x = self.x + z = self.z + assume_sorted = True + else: + assume_sorted = False + if isinstance(self._sort_order, tuple): + if self._sort_order[0] is Ellipsis: + x = self.x + z = self.z[self._inverted_order[1]] + else: + x = self.x[self._inverted_order[0][:, 0]] + z = self.z[self._inverted_order[1][0]] + else: + x = self.x[self._inverted_order] + z = self.z + + class_object = module_class( + x, z, check_finite=self._check_finite, assume_sorted=assume_sorted, + output_dtype=self._dtype + ) + func = getattr(class_object, function_string) + break + else: # in case no break + mod_names = [module.__name__ for module in modules] + raise AttributeError(( + f'unknown method "{method}" or method is not within the allowed ' + f'modules: {mod_names}' + )) + + return func, func_module, class_object + + def _setup_optimizer(self, y, method, modules, method_kwargs=None, copy_kwargs=True): + """ + Sets the starting parameters for doing optimizer algorithms. + + Parameters + ---------- + y : numpy.ndarray + The y-values of the measured data, already converted to a numpy + array by :meth:`~_Algorithm2D._register`. + method : str + The string name of the desired function, like 'asls'. Case does not matter. + modules : Sequence[module, ...] + The modules to search for the indicated `method` function. + method_kwargs : dict, optional + A dictionary of keyword arguments to pass to the fitting function. Default + is None, which uses an empty dictionary. + copy_kwargs : bool, optional + If True (default), will copy the input `method_kwargs` so that the input + dictionary is not modified within the function. + + Returns + ------- + y : numpy.ndarray + The y-values of the measured data. + baseline_func : Callable + The function for fitting the baseline. + func_module : str + The string name of the module that contained `baseline_func`. + method_kws : dict + A dictionary of keyword arguments to pass to `baseline_func`. + class_object : pybaselines.two_d._algorithm_setup._Algorithm2D + The `_Algorithm2D` object which will be used for fitting. + + """ + baseline_func, func_module, class_object = self._get_function(method, modules) + if method_kwargs is None: + method_kws = {} + elif copy_kwargs: + method_kws = method_kwargs.copy() + else: + method_kws = method_kwargs + + return y, baseline_func, func_module, method_kws, class_object + + def _setup_misc(self, y): + """ + Sets the starting parameters for doing miscellaneous algorithms. + + Parameters + ---------- + y : numpy.ndarray, shape (M, N) + The y-values of the measured data, already converted to a numpy + array by :meth:`~_Algorithm2D._register`. + + Returns + ------- + y : numpy.ndarray, shape (M, N) + The y-values of the measured data. + + Notes + ----- + Since the miscellaneous functions are not related, the only use of this + function is for aliasing the input `data` to `y`.
+ + """ + return y diff --git a/pybaselines/two_d/_spline_utils.py b/pybaselines/two_d/_spline_utils.py new file mode 100644 index 0000000..fb10bfe --- /dev/null +++ b/pybaselines/two_d/_spline_utils.py @@ -0,0 +1,289 @@ +# -*- coding: utf-8 -*- +"""Helper functions for using splines. + +Created on April 25, 2023 +@author: Donald Erb + +""" + +import numpy as np +from scipy.sparse import kron +from scipy.sparse.linalg import spsolve + +from .._compat import csr_object +from .._spline_utils import _spline_basis, _spline_knots +from .._validation import _check_array, _check_scalar_variable +from ._whittaker_utils import PenalizedSystem2D, _face_splitting + + +class PSpline2D(PenalizedSystem2D): + """ + A Penalized Spline, which penalizes the difference of the spline coefficients. + + Penalized splines (P-Splines) are solved with the following equation + ``(B.T @ W @ B + P) c = B.T @ W @ y`` where `c` is the spline coefficients, `B` is the + spline basis, the weights are the diagonal of `W`, the penalty is `P`, and `y` is the + fit data. The penalty `P` is usually in the form ``lam * D.T @ D``, where `lam` is a + penalty factor and `D` is the matrix version of the finite difference operator. + + Attributes + ---------- + basis_r : scipy.sparse.csr.csr_matrix, shape (N, P) + The spline basis for the rows. Has a shape of (`N,` `P`), where `N` is the number of + points in `x`, and `P` is the number of basis functions (equal to ``K - spline_degree - 1`` + or equivalently ``num_knots[0] + spline_degree[0] - 1``). + basis_c : scipy.sparse.csr.csr_matrix, shape (M, Q) + The spline basis for the columns. Has a shape of (`M,` `Q`), where `M` is the number of + points in `z`, and `Q` is the number of basis functions (equal to ``K - spline_degree - 1`` + or equivalently ``num_knots[1] + spline_degree[1] - 1``). + coef : None or numpy.ndarray, shape (M,) + The spline coefficients. Is None if :meth:`~PSpline2D.solve_pspline` has not been called + at least once. + knots_r : numpy.ndarray, shape (K,) + The knots for the spline along the rows. Has a shape of `K`, which is equal to + ``num_knots[0] + 2 * spline_degree[0]``. + knots_c : numpy.ndarray, shape (L,) + The knots for the spline along the columns. Has a shape of `L`, which is equal to + ``num_knots[1] + 2 * spline_degree[2]``. + num_knots : numpy.ndarray([int, int]) + The number of internal knots (including the endpoints) for x and z. The total number of + knots for the spline, `K`, is equal to ``num_knots + 2 * spline_degree``. + spline_degree : numpy.ndarray([int, int]) + The degree of the spline (eg. a cubic spline would have a `spline_degree` of 3) for + x and z. + x : numpy.ndarray, shape (N,) + The x-values for the spline. + z : numpy.ndarray, shape (M,) + The z-values for the spline. + + Notes + ----- + If the penalty is symmetric, the sparse system could be solved much faster using + CHOLMOD from SuiteSparse (https://github.com/DrTimothyAldenDavis/SuiteSparse) through + the python bindings provided by scikit-sparse (https://github.com/scikit-sparse/scikit-sparse), + but it is not worth implementing here since this code will rarely be used. + + References + ---------- + Eilers, P., et al. Fast and compact smoothing on large multidimensional grids. Computational + Statistics and Data Analysis, 2006, 50(1), 61-76. + + """ + + def __init__(self, x, z, num_knots=100, spline_degree=3, check_finite=False, lam=1, + diff_order=2): + """ + Initializes the penalized spline by calculating the basis and penalty. 
+ + Parameters + ---------- + x : array-like, shape (N,) + The x-values for the spline. + z : array-like, shape (M,) + The z-values for the spline. + num_knots : int or Sequence[int, int], optional + The number of internal knots for the spline, including the endpoints. + Default is 100. + spline_degree : int or Sequence[int, int], optional + The degree of the spline. Default is 3, which is a cubic spline. + check_finite : bool, optional + If True, will raise an error if any values in `x` or `z` are not finite. Default + is False, which skips the check. + lam : float or Sequence[float, float], optional + The penalty factor applied to the difference matrix. Larger values produce + smoother results. Must be greater than 0. Default is 1. + diff_order : int or Sequence[int, int], optional + The difference order of the penalty. Default is 2 (second order difference). + + Raises + ------ + ValueError + Raised if `spline_degree` is less than 0 or if `diff_order` is less than 1 + or greater than or equal to the number of spline basis functions + (``num_knots + spline_degree - 1``). + + """ + self.coef = None + self._basis = None + + self.x = _check_array(x, dtype=float, check_finite=check_finite, ensure_1d=True) + self.z = _check_array(z, dtype=float, check_finite=check_finite, ensure_1d=True) + + self.num_knots = _check_scalar_variable( + num_knots, allow_zero=False, variable_name='number of knots', two_d=True, dtype=int + ) + self.spline_degree = _check_scalar_variable( + spline_degree, allow_zero=True, variable_name='spline degree', two_d=True, dtype=int + ) + + self.knots_r = _spline_knots(self.x, self.num_knots[0], self.spline_degree[0], True) + self.basis_r = _spline_basis(self.x, self.knots_r, self.spline_degree[0]) + + self.knots_c = _spline_knots(self.z, self.num_knots[1], self.spline_degree[1], True) + self.basis_c = _spline_basis(self.z, self.knots_c, self.spline_degree[1]) + + super().__init__((self.basis_r.shape[1], self.basis_c.shape[1]), lam, diff_order) + + if (self.diff_order >= self._num_bases).any(): + raise ValueError(( + 'the difference order must be less than the number of basis ' + 'functions, which is the number of knots + spline degree - 1' + )) + + self._G_r = _face_splitting(self.basis_r) + self._G_c = _face_splitting(self.basis_c) + + def same_basis(self, num_knots=100, spline_degree=3): + """ + Sees if the current basis is equivalent to the input number of knots and spline degree. + + Parameters + ---------- + num_knots : int or Sequence[int, int], optional + The number of knots for the new spline. Default is 100. + spline_degree : int or Sequence[int, int], optional + The degree of the new spline. Default is 3. + + Returns + ------- + bool + True if the input number of knots and spline degree are equivalent to the current + spline basis of the object. + + """ + # TODO should give a way to update only one of the basis functions, which + # would also need to update the penalty + num_knots = _check_scalar_variable( + num_knots, allow_zero=False, variable_name='number of knots', two_d=True, dtype=int + ) + spline_degree = _check_scalar_variable( + spline_degree, allow_zero=True, variable_name='spline degree', two_d=True, dtype=int + ) + + return ( + np.array_equal(num_knots, self.num_knots) + and np.array_equal(spline_degree, self.spline_degree) + ) + + def reset_penalty(self, lam=1, diff_order=2): + """ + Resets the penalty of the system and all of the attributes. + + Useful for reusing the penalty diagonals without having to recalculate the spline basis.
+ + Parameters + ---------- + lam : float or Sequence[float, float], optional + The penalty factor applied to the difference matrix. Larger values produce + smoother results. Must be greater than 0. Default is 1. + diff_order : int or Sequence[int, int], optional + The difference order of the penalty. Default is 2 (second order difference). + + """ + self.reset_diagonals(lam, diff_order) + + def solve(self, y, weights, penalty=None, rhs_extra=None): + """ + Solves the coefficients for a weighted penalized spline. + + Solves the linear equation ``(B.T @ W @ B + P) c = B.T @ W @ y`` for the spline + coefficients, `c`, given the spline basis, `B`, the weights (diagonal of `W`), the + penalty `P`, and `y`, and returns the resulting spline, ``B @ c``. Uses the + face-splitting product to efficiently calculate ``B.T @ W @ B`` and ``B.T @ W @ y``, + which speeds up the calculation. + + Parameters + ---------- + y : numpy.ndarray, shape (M, N) + The y-values for fitting the spline. + weights : numpy.ndarray, shape (M, N) + The weights for each y-value. + penalty : scipy.sparse matrix, shape (``P * Q``, ``P * Q``), optional + The sparse finite difference penalty matrix. Default is None, which uses + the object's penalty. + rhs_extra : float or numpy.ndarray, shape (``M * N``,), optional + If supplied, `rhs_extra` will be added to the right hand side (``B.T @ W @ y``) + of the equation before solving. Default is None, which adds nothing. + + Returns + ------- + numpy.ndarray, shape (M, N) + The spline, corresponding to ``B @ c``, where `c` are the solved spline + coefficients and `B` is the spline basis. + + Notes + ----- + Uses the more efficient algorithm from Eilers's paper, although the memory usage + is higher than the straightforward method when the number of knots is high; however, + it is significantly faster and more memory efficient when the number of knots is lower, + which will be the more typical use case. + + """ + # do not save intermediate results since they are memory intensive for high number of knots + F = csr_object( + np.transpose( + (self._G_r.T @ weights @ self._G_c).reshape( + (self._num_bases[0], self._num_bases[0], self._num_bases[1], self._num_bases[1]) + ), + [0, 2, 1, 3] + ).reshape( + (self._num_bases[0] * self._num_bases[1], self._num_bases[0] * self._num_bases[1]) + ) + ) + if penalty is None: + penalty = self.penalty + + rhs = (self.basis_r.T @ (weights * y) @ self.basis_c).ravel() + if rhs_extra is not None: + rhs = rhs + rhs_extra + + self.coef = spsolve(F + penalty, rhs) + output = self.basis_r @ self.coef.reshape(self._num_bases) @ self.basis_c.T + + return output + + @property + def basis(self): + """ + The full spline basis matrix. + + This is a lazy implementation since the full basis is typically not needed for + computations. + + """ + if self._basis is None: + self._basis = kron(self.basis_r, self.basis_c) + return self._basis + + @property + def tck(self): + """ + The knots, spline coefficients, and spline degree to reconstruct the spline. + + Convenience function for easily reconstructing the last solved spline with outside + modules, such as with Scipy's `NdBSpline`, to allow for other usages such as evaluating + with different x- and z-values. + + Raises + ------ + ValueError + Raised if `solve` has not been called yet, meaning that the spline has not + yet been constructed.
+ + Notes + ----- + To use with :class:`scipy.interpolate.NdBSpline`, the setup would look like: + + from scipy.interpolate import NdBSpline + pspline = PSpline2D(x, z, ...) + pspline_fit = pspline.solve(...) + XZ = np.array(np.meshgrid(x, z)).T # same as zipping the meshgrid and rearranging + fit = NdBSpline(*pspline.tck)(XZ) # fit == pspline_fit + + """ + if self.coef is None: + raise ValueError('No spline coefficients, need to call "solve" first.') + return ( + (self.knots_r, self.knots_c), self.coef.reshape(self._num_bases), self.spline_degree + ) diff --git a/pybaselines/two_d/_whittaker_utils.py b/pybaselines/two_d/_whittaker_utils.py new file mode 100644 index 0000000..5052e71 --- /dev/null +++ b/pybaselines/two_d/_whittaker_utils.py @@ -0,0 +1,622 @@ +# -*- coding: utf-8 -*- +"""Helper functions for working with penalized linear systems. + +Created on April 30, 2023 +@author: Donald Erb + +""" + +import warnings + +import numpy as np +from scipy.linalg import eig_banded, eigh_tridiagonal, solve +from scipy.sparse import kron +from scipy.sparse.linalg import spsolve + +from .._banded_utils import diff_penalty_diagonals, diff_penalty_matrix +from .._compat import identity +from .._validation import _check_lam, _check_scalar, _check_scalar_variable +from ..utils import ParameterWarning + + +def _face_splitting(basis): + """ + Performs the face-splitting product on the input two dimensional basis matrix. + + Parameters + ---------- + basis : numpy.ndarray or scipy.sparse.spmatrix or scipy.sparse._sparray + The two dimensional dense or sparse matrix, with shape (`M`, `N`). + + Returns + ------- + scipy.sparse.spmatrix or scipy.sparse._sparray + The face-splitting product of the input basis matrix with itself, with + shape (`M`, `N**2`). + + References + ---------- + Eilers, P., et al. Fast and compact smoothing on large multidimensional grids. Computational + Statistics and Data Analysis, 2006, 50(1), 61-76. + + https://en.wikipedia.org/wiki/Khatri%E2%80%93Rao_product#Face-splitting_product + + """ + ones = np.ones((1, basis.shape[1])) + return kron(basis, ones).multiply(kron(ones, basis)) + + +class PenalizedSystem2D: + """ + An object for setting up and solving penalized least squares linear systems. + + Attributes + ---------- + diff_order : numpy.ndarray[int, int] + The difference order of the penalty. + main_diagonal : numpy.ndarray + The values along the main diagonal of the penalty matrix. + penalty : scipy.sparse.base.spmatrix + The current penalty. Originally is `original_diagonals` after multiplying by `lam` + and applying padding, but can also be changed by calling + :meth:`~PenalizedSystem2D.add_penalty`. Reset by calling + :meth:`~PenalizedSystem2D.reset_diagonals`. + + Notes + ----- + If the penalty is symmetric, the sparse system could be solved much faster using + CHOLMOD from SuiteSparse (https://github.com/DrTimothyAldenDavis/SuiteSparse) through + the python bindings provided by scikit-sparse (https://github.com/scikit-sparse/scikit-sparse), + but it is not worth implementing here since this code will rarely be used. + + References + ---------- + Eilers, P., et al. Fast and compact smoothing on large multidimensional grids. Computational + Statistics and Data Analysis, 2006, 50(1), 61-76. + + """ + + def __init__(self, data_size, lam=1, diff_order=2): + """ + Initializes the penalized system. + + Parameters + ---------- + data_size : Sequence[int, int] + The number of data points for the system.
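A quick numpy check of the face-splitting identity used by `_face_splitting`: interleaving with a row of ones makes row ``i`` of the product equal ``np.kron(B[i], B[i])``:

import numpy as np

B = np.arange(6.0).reshape(3, 2)
ones = np.ones((1, B.shape[1]))
face = np.kron(B, ones) * np.kron(ones, B)  # dense analogue of _face_splitting
expected = np.stack([np.kron(row, row) for row in B])
assert np.allclose(face, expected)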
+ lam : float or Sequence[int, int], optional + The penalty factor applied to the difference matrix for the rows and columns, + respectively. If a single value is given, both will use the same value. Larger + values produce smoother results. Must be greater than 0. Default is 1. + diff_order : int or Sequence[int, int], optional + The difference order of the penalty for the rows and columns, respectively. If + a single value is given, both will use the same value. + Default is 2 (second order difference). + + """ + self._num_bases = data_size + self.reset_diagonals(lam, diff_order) + + def add_penalty(self, penalty): + """ + Updates `self.penalty` with an additional penalty and updates the bands. + + Parameters + ---------- + penalty : array-like + The additional penalty to add to `self.penalty`. + + Returns + ------- + numpy.ndarray + The updated `self.penalty`. + + """ + self.penalty = self.penalty + penalty + self._update_bands() + + return self.penalty + + def _update_bands(self): + """ + Updates the number of bands and the index of the main diagonal in `self.penalty`. + + Only relevant if setup as a banded matrix. + + """ + self.main_diagonal = self.penalty.diagonal() + + def reset_diagonals(self, lam=1, diff_order=2): + """ + Resets the diagonals of the system and all of the attributes. + + Useful for reusing the penalized system for a different `lam` value. + + Parameters + ---------- + lam : float or Sequence[int, int], optional + The penalty factor applied to the difference matrix for the rows and columns, + respectively. If a single value is given, both will use the same value. Larger + values produce smoother results. Must be greater than 0. Default is 1. + diff_order : int or Sequence[int, int], optional + The difference order of the penalty for the rows and columns, respectively. If + a single value is given, both will use the same value. + Default is 2 (second order difference). + + """ + self.diff_order = _check_scalar_variable( + diff_order, allow_zero=False, variable_name='difference order', two_d=True, dtype=int + ) + self.lam = _check_lam(lam, two_d=True) + + penalty_rows = diff_penalty_matrix(self._num_bases[0], self.diff_order[0]) + penalty_columns = diff_penalty_matrix(self._num_bases[1], self.diff_order[1]) + + # multiplying lam by the Kronecker product is the same as multiplying just D.T @ D with lam + P_rows = kron(self.lam[0] * penalty_rows, identity(self._num_bases[1])) + P_columns = kron(identity(self._num_bases[0]), self.lam[1] * penalty_columns) + self.penalty = P_rows + P_columns + + self._update_bands() + + def solve(self, y, weights, penalty=None, rhs_extra=None): + """ + Solves the equation ``A @ x = b``. + + Parameters + ---------- + y : numpy.ndarray + The y-values for fitting the spline. + weights : numpy.ndarray + The weights for each y-value. Will also be added to the diagonal of the + penalty. + penalty : numpy.ndarray + The penalty to use for solving. Default is None which uses the object's + penalty. + rhs_extra : float or numpy.ndarray, optional + If supplied, `rhs_extra` will be added to the right hand side + of the equation before solving. Default is None, which adds nothing. + + Returns + ------- + numpy.ndarray, shape (N,) + The solution to the linear system, `x`. 
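The Kronecker construction in `reset_diagonals` above is equivalent to penalizing differences along each axis of the unflattened data. An illustrative check with plain scipy (second-order differences assumed, unit penalty factors for brevity):

import numpy as np
from scipy import sparse

def diff_matrix(n, d=2):
    D = sparse.identity(n, format='csr')
    for _ in range(d):
        D = D[1:] - D[:-1]
    return D

M, N = 6, 7
Pr = diff_matrix(M).T @ diff_matrix(M)  # row penalty D.T @ D
Pc = diff_matrix(N).T @ diff_matrix(N)  # column penalty D.T @ D
P = sparse.kron(Pr, sparse.identity(N)) + sparse.kron(sparse.identity(M), Pc)
Y = np.random.default_rng(2).normal(size=(M, N))
# applying P to the flattened data == applying each 1D penalty along its own axis
left = (P @ Y.ravel()).reshape(M, N)
right = Pr @ Y + (Pc @ Y.T).T
assert np.allclose(left, right)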
+ + """ + if penalty is None: + lhs = self.add_diagonal(weights) + else: + penalty.setdiag(penalty.diagonal() + weights) + lhs = penalty + rhs = weights * y + if rhs_extra is not None: + rhs = rhs + rhs_extra + + return self.direct_solve(lhs, rhs) + + def direct_solve(self, lhs, rhs): + return spsolve(lhs, rhs) + + def add_diagonal(self, value): + """ + Adds a diagonal array to the original penalty matrix. + + Parameters + ---------- + value : float or numpy.ndarray + The diagonal array to add to the penalty matrix. + + Returns + ------- + scipy.sparse.base.spmatrix + The penalty matrix with the main diagonal updated. + + """ + self.penalty.setdiag(self.main_diagonal + value) + return self.penalty + + def reset_diagonal(self): + """Sets the main diagonal of the penalty matrix back to its original value.""" + self.penalty.setdiag(self.main_diagonal) + + +class WhittakerSystem2D(PenalizedSystem2D): + """ + Sets up and solves Whittaker smoothing using the analytical solution or eigendecomposition. + + Attributes + ---------- + basis_r : scipy.sparse.csr.csr_matrix, shape (N, P) + The spline basis for the rows. Has a shape of (`N,` `P`), where `N` is the number of + points in `x`, and `P` is the number of basis functions (equal to ``K - spline_degree - 1`` + or equivalently ``num_knots[0] + spline_degree[0] - 1``). + basis_c : scipy.sparse.csr.csr_matrix, shape (M, Q) + The spline basis for the columns. Has a shape of (`M,` `Q`), where `M` is the number of + points in `z`, and `Q` is the number of basis functions (equal to ``K - spline_degree - 1`` + or equivalently ``num_knots[1] + spline_degree[1] - 1``). + coef : None or numpy.ndarray, shape (M,) + The spline coefficients. Is None if :meth:`~PSpline2D.solve_pspline` has not been called + at least once. + + References + ---------- + Eilers, P., et al. Fast and compact smoothing on large multidimensional grids. Computational + Statistics and Data Analysis, 2006, 50(1), 61-76. + + Biessy, G. Revisiting Whittaker-Henderson Smoothing. https://hal.science/hal-04124043 + (Preprint), 2023. + + """ + + def __init__(self, data_size, lam=1, diff_order=2, max_eigens=None): + """ + Initializes the penalized spline by calculating the basis and penalty. + + Parameters + ---------- + data_size : Sequence[int, int] + The number of data points for the system. + lam : float or Sequence[int, int], optional + The penalty factor applied to the difference matrix for the rows and columns, + respectively. If a single value is given, both will use the same value. Larger + values produce smoother results. Must be greater than 0. Default is 1. + diff_order : int or Sequence[int, int], optional + The difference order of the penalty for the rows and columns, respectively. If + a single value is given, both will use the same value. + Default is 2 (second order difference). + max_eigens : int or Sequence[int, int] or None + The maximum number of eigenvalues for the rows and columns, respectively, to use + for the eigendecomposition. If None, will solve the linear system using the full + analytical solution, which is typically much slower. 
+ + """ + # TODO should figure out a way to better merge PenalizedSystem2D, PSpline2D, and this class + self.coef = None + self._basis = None + self._num_points = data_size + max_eigens = _check_scalar(max_eigens, 2, fill_scalar=True)[0] + if (max_eigens == np.array([None, None])).all(): + self._num_bases = data_size + self._using_svd = False + elif None in max_eigens: + raise ValueError('eigenvalues must be None or non-None integers') + else: + self._num_bases = _check_scalar_variable( + max_eigens, allow_zero=False, variable_name='eigenvalues', two_d=True, dtype=int + ) + self._using_svd = True + self.reset_diagonals(lam, diff_order) + + def reset_diagonals(self, lam=1, diff_order=2): + """ + Resets the diagonals of the system and all of the attributes. + + Useful for reusing the penalized system for a different `lam` value. + + Parameters + ---------- + lam : float or Sequence[int, int], optional + The penalty factor applied to the difference matrix for the rows and columns, + respectively. If a single value is given, both will use the same value. Larger + values produce smoother results. Must be greater than 0. Default is 1. + diff_order : int or Sequence[int, int], optional + The difference order of the penalty for the rows and columns, respectively. If + a single value is given, both will use the same value. + Default is 2 (second order difference). + + """ + if not self._using_svd: + super().reset_diagonals(lam, diff_order) + return + + self.diff_order = _check_scalar_variable( + diff_order, allow_zero=False, variable_name='difference order', two_d=True, dtype=int + ) + self.lam = _check_lam(lam, two_d=True) + + # initially need num_bases to point to the data shape; maybe set a second + # attribute insteaad + values_rows, vectors_rows = self._calc_eigenvalues( + self._num_points[0], self.diff_order[0], self._num_bases[0] + ) + # TODO if all else matches, just calc the max eigens and use indexing for the lower one + if ( + self.diff_order[0] == self.diff_order[1] + and self._num_points[0] == self._num_points[1] + and self._num_bases[0] == self._num_bases[1] + ): + values_columns, vectors_columns = values_rows, vectors_rows + else: + values_columns, vectors_columns = self._calc_eigenvalues( + self._num_points[1], self.diff_order[1], self._num_bases[1] + ) + # the eigenvalues are a diagonal matrix, so can simplify since + # kron(diagonal, identity(N)) == np.repeat(diagonal, N) and + # kron(identity(M), diaonal2) == np.tile(diagonal2, M) + self.penalty_rows = np.repeat(self.lam[0] * values_rows, self._num_bases[1]) + self.penalty_columns = np.tile(self.lam[1] * values_columns, self._num_bases[0]) + # penalty is a (_num_bases[0] * _num_bases[1],) array + self.penalty = self.penalty_rows + self.penalty_columns + + self.basis_r = vectors_rows + self.basis_c = vectors_columns + + self._G_r = _face_splitting(self.basis_r) + self._G_c = _face_splitting(self.basis_c) + + def _calc_eigenvalues(self, data_points, diff_order, num_eigens): + """ + Calculate the eigenvalues and eigenvectors for the corresponding penalty matrix. + + Parameters + ---------- + data_points : int + The number of rows and columns of the square penalty matrix. + diff_order : int + The difference order of the penalty. + num_eigens : int + The number of smallest eigenvalues that will be used to represent the penalty matrix. + + Returns + ------- + eigenvalues : np.ndarray, shape (`num_eigens`,) + The eigenvalues of the penalty matrix for the corresponding difference order. 
+ eigenvectors : np.ndarray, shape (`data_points`, `num_eigens`) + The eigenvectors for the penalty matrix. + + Raises + ------ + ValueError + Raised if the number of eigenvalues is greater than the number of data + points. + + Warns + ----- + ParameterWarning + If `num_eigens` is less than or equal to `diff_order`, a warning is issued since + the diagonals of the resulting matrix will no longer be guaranteed to be + positive-definite. Is also emitted if `num_eigens` is greater than 50 since + for 2D baseline correction, fewer than 20 eigenvalues are typically required. + + Notes + ----- + The lowest `diff_order` eigenvalues are supposed to be zero while they end up + being ~ +- 1e-15, so their values are set to 0. + + The penalty matrix has a matrix rank (number of nonzero eigenvalues) of + ``data_points - diff_order``. The lowest `diff_order` eigenvalues are all + zero, so the system is not guaranteed to be positive definite when solving the + penalized least squares fit unless all weights are >~ 1e-5 (just a guess, but + the meaning is that weights must be some magnitude greater than zero), which is + not guaranteed for all Whittaker-smoothing-based algorithms. Thus, a clear + warning needs to be issued since otherwise this detail can be hidden. + + References + ---------- + Biessy, G. Revisiting Whittaker-Henderson Smoothing. https://hal.science/hal-04124043 + (Preprint), 2023. + + """ + penalty_bands = diff_penalty_diagonals(data_points, diff_order, lower_only=True) + if num_eigens > data_points: + raise ValueError(( + 'The maximum number of eigenvalues cannot be greater ' + 'than the number of data points.' + )) + elif num_eigens <= diff_order: + warnings.warn( + ('The number of eigenvalues should be greater than the difference order ' + 'to avoid numerical instability'), ParameterWarning, stacklevel=2 + ) + elif num_eigens > 50: + warnings.warn( + ('For 2D baseline correction, typically only 5-20 eigenvalues are required to ' + 'fully approximate the baseline, and higher values will cause significant ' + 'slowdown'), ParameterWarning, stacklevel=2 + ) + + if diff_order == 1: + eigenvalues, eigenvectors = eigh_tridiagonal( + penalty_bands[0], penalty_bands[1, :-1], select='i', + select_range=(0, num_eigens - 1) + ) + else: + eigenvalues, eigenvectors = eig_banded( + penalty_bands, lower=True, select='i', + select_range=(0, num_eigens - 1), overwrite_a_band=True + ) + + # TODO do the corresponding eigenvectors in eigenvectors[:, :diff_order] need updated + # too to match the resetting of the eigenvalues? + eigenvalues[:diff_order] = 0 + + return eigenvalues, eigenvectors + + def update_penalty(self, lam): + if not self._using_svd: + raise ValueError('Must call reset_diagonals if not using eigendecomposition') + lam = _check_lam(lam, two_d=True) + self.penalty_rows = (lam[0] / self.lam[0]) * self.penalty_rows + self.penalty_columns = (lam[1] / self.lam[1]) * self.penalty_columns + + self.lam = lam + self.penalty = self.penalty_rows + self.penalty_columns + + def same_basis(self, diff_order=2, max_eigens=None): + """ + Sees if the current basis is equivalent to the input number of eigenvalues and diff order. + + Always returns False if the previous setup did not use eigendecomposition or if + the input maximum number of eigenvalues is None. + + Parameters + ---------- + diff_order : int or Sequence[int, int], optional + The difference order of the penalty for the rows and columns, respectively. If + a single value is given, both will use the same value.
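A standalone sketch of extracting the smallest eigenpairs of a difference penalty with `eig_banded`, analogous to `_calc_eigenvalues` above, but building the banded storage manually from a dense penalty for clarity:

import numpy as np
from scipy.linalg import eig_banded

n, diff_order, num_eigens = 100, 2, 10
D = np.diff(np.eye(n), diff_order, axis=0)  # dense difference matrix
P = D.T @ D                                 # penalty matrix
# LAPACK lower banded storage: row k holds the k-th subdiagonal, padded on the right
bands = np.zeros((diff_order + 1, n))
for k in range(diff_order + 1):
    bands[k, :n - k] = np.diagonal(P, -k)
vals, vecs = eig_banded(bands, lower=True, select='i', select_range=(0, num_eigens - 1))
vals[:diff_order] = 0  # the lowest diff_order eigenvalues are zero up to roundoff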
+ Default is 2 (second order difference). + max_eigens : int or Sequence[int, int] or None + The maximum number of eigenvalues for the rows and columns, respectively, to use + for the eigendecomposition. If None, will solve the linear system using the full + analytical solution, which is typically much slower. + + Returns + ------- + bool + True if the input number of eigenvalues and difference order are equivalent to the + current setup for the object. + + """ + # TODO should give a way to update only one of the basis functions, which + # would also need to update the penalty + max_eigens = _check_scalar(max_eigens, 2, fill_scalar=True)[0] + if (max_eigens == np.array([None, None])).all() or not self._using_svd: + return False + + diff_order = _check_scalar_variable( + diff_order, allow_zero=False, variable_name='difference order', two_d=True, dtype=int + ) + + max_eigens = _check_scalar_variable( + max_eigens, allow_zero=False, variable_name='eigenvalues', two_d=True, dtype=int + ) + return ( + np.array_equal(diff_order, self.diff_order) + and np.array_equal(max_eigens, self._num_bases) + ) + + def reset_penalty(self, lam=1, diff_order=2): + """ + Resets the penalty of the system and all of the attributes. + + Useful for reusing the penalty diagonals without having to recalculate the spline basis. + + Parameters + ---------- + lam : float or Sequence[float, float], optional + The penalty factor applied to the difference matrix. Larger values produce + smoother results. Must be greater than 0. Default is 1. + diff_order : int or Sequence[int, int], optional + The difference order of the penalty. Default is 2 (second order difference). + + """ + # TODO is this even needed? + self.reset_diagonals(lam, diff_order) + + def _make_btwb(self, weights): + """Computes ``Basis.T @ Weights @ Basis`` using a more efficient method. + + References + ---------- + Eilers, P., et al. Fast and compact smoothing on large multidimensional grids. Computational + Statistics and Data Analysis, 2006, 50(1), 61-76. + + """ + # do not save intermediate results since they are memory intensive for high number of bases + # note to self: F is not sparse when the basis functions are eigenvectors since the + # eigenvector matrices are fully dense; it is however symmetric and positive definite + F = np.transpose( + (self._G_r.T @ weights @ self._G_c).reshape( + (self._num_bases[0], self._num_bases[0], self._num_bases[1], self._num_bases[1]) + ), + [0, 2, 1, 3] + ).reshape( + (self._num_bases[0] * self._num_bases[1], self._num_bases[0] * self._num_bases[1]) + ) + + return F + + def solve(self, y, weights, penalty=None, rhs_extra=None, assume_a='pos'): + """ + Solves the coefficients for a weighted penalized spline. + + Solves the linear equation ``(B.T @ W @ B + P) c = B.T @ W @ y`` for the spline + coefficients, `c`, given the spline basis, `B`, the weights (diagonal of `W`), the + penalty `P`, and `y`, and returns the resulting spline, ``B @ c``. Attempts to + calculate ``B.T @ W @ B`` and ``B.T @ W @ y`` as a banded system to speed up + the calculation. + + Parameters + ---------- + y : numpy.ndarray, shape (M, N) + The y-values for fitting the spline. + weights : numpy.ndarray, shape (M, N) + The weights for each y-value. + penalty : numpy.ndarray, shape (``M * N``, ``M * N``) + The finite difference penalty matrix, in LAPACK's lower banded format (see + :func:`scipy.linalg.solveh_banded`) if `lower_only` is True or the full banded + format (see :func:`scipy.linalg.solve_banded`) if `lower_only` is False. 
+
+    def solve(self, y, weights, penalty=None, rhs_extra=None, assume_a='pos'):
+        """
+        Solves the coefficients for a weighted penalized spline.
+
+        Solves the linear equation ``(B.T @ W @ B + P) c = B.T @ W @ y`` for the spline
+        coefficients, `c`, given the spline basis, `B`, the weights (diagonal of `W`), the
+        penalty `P`, and `y`, and returns the resulting spline, ``B @ c``. Attempts to
+        calculate ``B.T @ W @ B`` and ``B.T @ W @ y`` as a banded system to speed up
+        the calculation.
+
+        Parameters
+        ----------
+        y : numpy.ndarray, shape (M, N)
+            The y-values for fitting the spline.
+        weights : numpy.ndarray, shape (M, N)
+            The weights for each y-value.
+        penalty : numpy.ndarray, shape (``M * N``, ``M * N``)
+            The finite difference penalty matrix, in LAPACK's lower banded format (see
+            :func:`scipy.linalg.solveh_banded`) if `lower_only` is True or the full banded
+            format (see :func:`scipy.linalg.solve_banded`) if `lower_only` is False.
+        rhs_extra : float or numpy.ndarray, shape (``M * N``,), optional
+            If supplied, `rhs_extra` will be added to the right hand side (``B.T @ W @ y``)
+            of the equation before solving. Default is None, which adds nothing.
+
+        Returns
+        -------
+        numpy.ndarray, shape (M, N)
+            The spline, corresponding to ``B @ c``, where `c` are the solved spline
+            coefficients and `B` is the spline basis.
+
+        Notes
+        -----
+        Uses the more efficient algorithm from Eilers's paper. The memory usage is higher
+        than the straightforward method when the number of knots is high, but it is
+        significantly faster and more memory efficient when the number of knots is low,
+        which will be the more typical use case.
+
+        """
+        if not self._using_svd:
+            return super().solve(y, weights, penalty, rhs_extra)
+
+        rhs = (self.basis_r.T @ (weights * y) @ self.basis_c).ravel()
+        if rhs_extra is not None:
+            rhs = rhs + rhs_extra
+
+        if penalty is None:
+            penalty = self.penalty
+
+        lhs = self._make_btwb(weights)
+        # TODO could use cho_factor and save the factorization to reuse within _calc_dof to
+        # save time since it would only be used after the weights are finalized
+        np.fill_diagonal(lhs, lhs.diagonal() + penalty)
+        self.coef = solve(
+            lhs, rhs, lower=True, overwrite_a=True, overwrite_b=True, check_finite=False,
+            assume_a=assume_a
+        )
+
+        output = self.basis_r @ self.coef.reshape(self._num_bases) @ self.basis_c.T
+
+        return output
+
+    @property
+    def basis(self):
+        """
+        The full spline basis matrix.
+
+        This is a lazy implementation since the full basis is typically not needed for
+        computations.
+
+        """
+        if not self._using_svd:
+            # Could maybe just make a basis using identities? But this should not be called
+            # from outside so no reason to implement
+            raise ValueError('No basis matrix when not using eigendecomposition')
+
+        if self._basis is None:
+            self._basis = kron(self.basis_r, self.basis_c)
+        return self._basis
+
+    def _calc_dof(self, weights, assume_a='pos'):
+        if not self._using_svd:
+            # Could maybe just output a matrix of ones?
+            raise ValueError(
+                'Cannot calculate degrees of freedom when not using eigendecomposition'
+            )
+        lhs = self._make_btwb(weights)
+        rhs = lhs.copy()
+        np.fill_diagonal(lhs, lhs.diagonal() + self.penalty)
+        dof = solve(
+            lhs, rhs, lower=True, overwrite_a=True, overwrite_b=True, check_finite=False,
+            assume_a=assume_a
+        )
+
+        return dof.diagonal().reshape(self._num_bases)
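For context on `_calc_dof`: the diagonal it returns is that of the hat matrix ``H = (B.T @ W @ B + P)^-1 (B.T @ W @ B)``, whose trace is the effective degrees of freedom of the penalized fit. A minimal sketch of this idea, using a random stand-in for ``B.T @ W @ B`` and an identity-scaled stand-in for the penalty (both illustrative choices, not the library's actual matrices):

    import numpy as np
    from scipy.linalg import solve

    rng = np.random.default_rng(1)
    num_bases = 8
    A = rng.normal(size=(50, num_bases))
    btwb = A.T @ A                      # stands in for B.T @ W @ B
    penalty = 10.0 * np.eye(num_bases)  # stands in for the difference penalty

    hat = solve(btwb + penalty, btwb, assume_a='pos')
    effective_dof = hat.diagonal()      # per-basis-function degrees of freedom
    print(effective_dof.sum())          # total model complexity, between 0 and num_bases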
diff --git a/pybaselines/two_d/api.py b/pybaselines/two_d/api.py
new file mode 100644
index 0000000..a48354e
--- /dev/null
+++ b/pybaselines/two_d/api.py
@@ -0,0 +1,102 @@
+# -*- coding: utf-8 -*-
+"""The main entry point for using the object oriented api of pybaselines.
+
+Created on April 8, 2023
+@author: Donald Erb
+
+"""
+
+from .morphological import _Morphological
+from .optimizers import _Optimizers
+from .polynomial import _Polynomial
+from .smooth import _Smooth
+from .spline import _Spline
+from .whittaker import _Whittaker
+
+
+class Baseline2D(
+    _Morphological, _Optimizers, _Polynomial, _Smooth, _Spline, _Whittaker
+):
+    """
+    A class for all 2D baseline correction algorithms.
+
+    Contains all available 2D baseline correction algorithms in pybaselines as methods to
+    allow a single interface for easier usage.
+
+    Parameters
+    ----------
+    x_data : array-like, shape (M,), optional
+        The x-values of the measured data. Default is None, which will create an
+        array from -1 to 1 during the first function call with length equal to
+        `data.shape[-2]`.
+    z_data : array-like, shape (N,), optional
+        The z-values of the measured data. Default is None, which will create an
+        array from -1 to 1 during the first function call with length equal to
+        `data.shape[-1]`.
+    check_finite : bool, optional
+        If True (default), will raise an error if any values in input data are not finite.
+        Setting to False will skip the check. Note that errors may occur if
+        `check_finite` is False and the input data contains non-finite values.
+    output_dtype : type or numpy.dtype, optional
+        The dtype to cast the output array. Default is None, which uses the typing
+        of the input data.
+
+    Attributes
+    ----------
+    poly_order : Sequence[int, int]
+        The last polynomial order used for a polynomial algorithm. Initially is -1, denoting
+        that no polynomial fitting has been performed.
+    pspline : pybaselines.two_d._spline_utils.PSpline2D or None
+        The PSpline object for setting up and solving penalized spline algorithms. Is None
+        if no penalized spline setup has been performed.
+    vandermonde : numpy.ndarray or None
+        The Vandermonde matrix for solving polynomial equations. Is None if no polynomial
+        setup has been performed.
+    whittaker_system : pybaselines.two_d._banded_utils.PenalizedSystem2D or None
+        The PenalizedSystem object for setting up and solving Whittaker-smoothing-based
+        algorithms. Is None if no Whittaker setup has been performed.
+    x : numpy.ndarray or None
+        The x-values for the object. If initialized with None, then `x` is initialized during
+        the first function call to have the same size as the input `data.shape[-2]` and has
+        min and max values of -1 and 1, respectively.
+    x_domain : numpy.ndarray
+        The minimum and maximum values of `x`. If `x_data` is None during initialization, then
+        set to ``numpy.array([-1, 1])``.
+    z : numpy.ndarray or None
+        The z-values for the object. If initialized with None, then `z` is initialized during
+        the first function call to have the same size as the input `data.shape[-1]` and has
+        min and max values of -1 and 1, respectively.
+    z_domain : numpy.ndarray
+        The minimum and maximum values of `z`. If `z_data` is None during initialization, then
+        set to ``numpy.array([-1, 1])``.
+
+    """
+
+    def _get_method(self, method):
+        """
+        A helper function to allow accessing methods by their string.
+
+        Parameters
+        ----------
+        method : str
+            The name of the desired method as a string. Capitalization is ignored. For
+            example, both 'asls' and 'AsLS' would return :meth:`~.Baseline2D.asls`.
+
+        Returns
+        -------
+        output : Callable
+            The callable method corresponding to the input string.
+
+        Raises
+        ------
+        AttributeError
+            Raised if the input method does not exist.
+
+        """
+        method_string = method.lower()
+        if hasattr(self, method_string):
+            output = getattr(self, method_string)
+        else:
+            raise AttributeError(f'unknown method "{method}"')
+
+        return output
diff --git a/pybaselines/two_d/morphological.py b/pybaselines/two_d/morphological.py
new file mode 100644
index 0000000..7deed79
--- /dev/null
+++ b/pybaselines/two_d/morphological.py
@@ -0,0 +1,317 @@
+# -*- coding: utf-8 -*-
+"""Morphological techniques for fitting baselines to experimental data.
+ +Created on April 8, 2023 +@author: Donald Erb + +""" + +import numpy as np +from scipy.ndimage import grey_dilation, grey_erosion, grey_opening, uniform_filter + +from ._algorithm_setup import _Algorithm2D +from ..utils import relative_difference +from .._validation import _check_half_window + + +class _Morphological(_Algorithm2D): + """A base class for all morphological algorithms.""" + + @_Algorithm2D._register + def mor(self, data, half_window=None, **window_kwargs): + """ + A Morphological based (Mor) baseline algorithm. + + Parameters + ---------- + data : array-like, shape (M, N) + The y-values of the measured data. + half_window : int or Sequence[int, int], optional + The half-window used for the rows and columns, respectively, for the morphology + functions. If a single value is given, rows and columns will use the same value. + Default is None, which will optimize the half-window size using + :func:`.optimize_window` and `window_kwargs`. + **window_kwargs + Values for setting the half window used for the morphology operations. + Items include: + + * 'increment': int + The step size for iterating half windows. Default is 1. + * 'max_hits': int + The number of consecutive half windows that must produce the same + morphological opening before accepting the half window as the + optimum value. Default is 1. + * 'window_tol': float + The tolerance value for considering two morphological openings as + equivalent. Default is 1e-6. + * 'max_half_window': int + The maximum allowable window size. If None (default), will be set + to (len(data) - 1) / 2. + * 'min_half_window': int + The minimum half-window size. If None (default), will be set to 1. + + Returns + ------- + baseline : numpy.ndarray, shape (M, N) + The calculated baseline. + dict + A dictionary with the following items: + + * 'half_window': np.ndarray[int, int] + The half windows used for the morphological calculations. + + References + ---------- + Perez-Pueyo, R., et al. Morphology-Based Automated Baseline Removal for + Raman Spectra of Artistic Pigments. Applied Spectroscopy, 2010, 64, 595-600. + + """ + y, half_wind = self._setup_morphology(data, half_window, **window_kwargs) + opening = grey_opening(y, 2 * half_wind + 1) + baseline = np.minimum(opening, _avg_opening(y, half_wind, opening)) + + return baseline, {'half_window': half_wind} + + @_Algorithm2D._register + def imor(self, data, half_window=None, tol=1e-3, max_iter=200, **window_kwargs): + """ + An Improved Morphological based (IMor) baseline algorithm. + + Parameters + ---------- + data : array-like, shape (M, N) + The y-values of the measured data. + half_window : int or Sequence[int, int], optional + The half-window used for the rows and columns, respectively, for the morphology + functions. If a single value is given, rows and columns will use the same value. + Default is None, which will optimize the half-window size using + :func:`.optimize_window` and `window_kwargs`. + tol : float, optional + The exit criteria. Default is 1e-3. + max_iter : int, optional + The maximum number of iterations. Default is 200. + **window_kwargs + Values for setting the half window used for the morphology operations. + Items include: + + * 'increment': int + The step size for iterating half windows. Default is 1. + * 'max_hits': int + The number of consecutive half windows that must produce the same + morphological opening before accepting the half window as the + optimum value. Default is 1. 
+ * 'window_tol': float + The tolerance value for considering two morphological openings as + equivalent. Default is 1e-6. + * 'max_half_window': int + The maximum allowable window size. If None (default), will be set + to (len(data) - 1) / 2. + * 'min_half_window': int + The minimum half-window size. If None (default), will be set to 1. + + Returns + ------- + baseline : numpy.ndarray, shape (M, N) + The calculated baseline. + dict + A dictionary with the following items: + + * 'half_window': np.ndarray[int, int] + The half windows used for the morphological calculations. + * 'tol_history': numpy.ndarray + An array containing the calculated tolerance values for + each iteration. The length of the array is the number of iterations + completed. If the last value in the array is greater than the input + `tol` value, then the function did not converge. + + References + ---------- + Dai, L., et al. An Automated Baseline Correction Method Based on Iterative + Morphological Operations. Applied Spectroscopy, 2018, 72(5), 731-739. + + """ + y, half_wind = self._setup_morphology(data, half_window, **window_kwargs) + baseline = y + tol_history = np.empty(max_iter + 1) + for i in range(max_iter + 1): + baseline_new = np.minimum(y, _avg_opening(baseline, half_wind)) + calc_difference = relative_difference(baseline, baseline_new) + tol_history[i] = calc_difference + if calc_difference < tol: + break + baseline = baseline_new + + params = {'half_window': half_wind, 'tol_history': tol_history[:i + 1]} + return baseline, params + + @_Algorithm2D._register + def rolling_ball(self, data, half_window=None, smooth_half_window=None, + pad_kwargs=None, **window_kwargs): + """ + The rolling ball baseline algorithm. + + Applies a minimum and then maximum moving window, and subsequently smooths the + result, giving a baseline that resembles rolling a ball across the data. + + Parameters + ---------- + data : array-like, shape (M, N) + The y-values of the measured data. + half_window : int or Sequence[int, int], optional + The half-window used for the rows and columns, respectively, for the morphology + functions. If a single value is given, rows and columns will use the same value. + Default is None, which will optimize the half-window size using + :func:`.optimize_window` and `window_kwargs`. + smooth_half_window : int, optional + The half-window to use for smoothing the data after performing the + morphological operation. Default is None, which will use the same + value as used for the morphological operation. + pad_kwargs : dict, optional + A dictionary of keyword arguments to pass to :func:`.pad_edges` for + padding the edges of the data to prevent edge effects from the moving average. + **window_kwargs + Values for setting the half window used for the morphology operations. + Items include: + + * 'increment': int + The step size for iterating half windows. Default is 1. + * 'max_hits': int + The number of consecutive half windows that must produce the same + morphological opening before accepting the half window as the + optimum value. Default is 1. + * 'window_tol': float + The tolerance value for considering two morphological openings as + equivalent. Default is 1e-6. + * 'max_half_window': int + The maximum allowable window size. If None (default), will be set + to (len(data) - 1) / 2. + * 'min_half_window': int + The minimum half-window size. If None (default), will be set to 1. + + Returns + ------- + baseline : numpy.ndarray, shape (M, N) + The calculated baseline. 
+ dict + A dictionary with the following items: + + * 'half_window': np.ndarray[int, int] + The half windows used for the morphological calculations. + + References + ---------- + Kneen, M.A., et al. Algorithm for fitting XRF, SEM and PIXE X-ray spectra + backgrounds. Nuclear Instruments and Methods in Physics Research B, 1996, + 109, 209-213. + + Liland, K., et al. Optimal Choice of Baseline Correction for Multivariate + Calibration of Spectra. Applied Spectroscopy, 2010, 64(9), 1007-1016. + + """ + y, half_wind = self._setup_morphology(data, half_window, **window_kwargs) + if smooth_half_window is None: + smooth_half_window = half_wind + else: + smooth_half_window = _check_half_window(smooth_half_window, allow_zero=True, two_d=True) + + rough_baseline = grey_opening(y, 2 * half_wind + 1) + baseline = uniform_filter( + rough_baseline, 2 * smooth_half_window + 1 + ) + + return baseline, {'half_window': half_wind} + + @_Algorithm2D._register + def tophat(self, data, half_window=None, **window_kwargs): + """ + Estimates the baseline using a top-hat transformation (morphological opening). + + Parameters + ---------- + data : array-like, shape (M, N) + The y-values of the measured data. + half_window : int or Sequence[int, int], optional + The half-window used for the rows and columns, respectively, for the morphology + functions. If a single value is given, rows and columns will use the same value. + Default is None, which will optimize the half-window size using + :func:`.optimize_window` and `window_kwargs`. + **window_kwargs + Values for setting the half window used for the morphology operations. + Items include: + + * 'increment': int + The step size for iterating half windows. Default is 1. + * 'max_hits': int + The number of consecutive half windows that must produce the same + morphological opening before accepting the half window as the + optimum value. Default is 1. + * 'window_tol': float + The tolerance value for considering two morphological openings as + equivalent. Default is 1e-6. + * 'max_half_window': int + The maximum allowable window size. If None (default), will be set + to (len(data) - 1) / 2. + * 'min_half_window': int + The minimum half-window size. If None (default), will be set to 1. + + Returns + ------- + baseline : numpy.ndarray, shape (M, N) + The calculated baseline. + dict + A dictionary with the following items: + + * 'half_window': np.ndarray[int, int] + The half windows used for the morphological calculations. + + Notes + ----- + The actual top-hat transformation is defined as `data - opening(data)`, where + `opening` is the morphological opening operation. This function, however, returns + `opening(data)`, since that is technically the baseline defined by the operation. + + References + ---------- + Perez-Pueyo, R., et al. Morphology-Based Automated Baseline Removal for + Raman Spectra of Artistic Pigments. Applied Spectroscopy, 2010, 64, 595-600. + + """ + y, half_wind = self._setup_morphology(data, half_window, **window_kwargs) + baseline = grey_opening(y, 2 * half_wind + 1) + + return baseline, {'half_window': half_wind} + + +def _avg_opening(y, half_window, opening=None): + """ + Averages the dilation and erosion of a morphological opening on data. + + Parameters + ---------- + y : numpy.ndarray, shape (M, N) + The array of the measured data. + half_window : numpy.ndarray([int, int]), optional + The half window size for the rows and columns, respectively, to use for the operations. 
+ opening : numpy.ndarray, optional + The output of scipy.ndimage.grey_opening(y, window_size). Default is + None, which will compute the value. + + Returns + ------- + numpy.ndarray, shape (M, N) + The average of the dilation and erosion of the opening. + + References + ---------- + Perez-Pueyo, R., et al. Morphology-Based Automated Baseline Removal for + Raman Spectra of Artistic Pigments. Applied Spectroscopy, 2010, 64 595-600. + + """ + # TODO should find a way to merge this with its 1D counterpart + window_size = 2 * half_window + 1 + if opening is None: + opening = grey_opening(y, window_size) + return 0.5 * ( + grey_dilation(opening, window_size) + + grey_erosion(opening, window_size) + ) diff --git a/pybaselines/two_d/optimizers.py b/pybaselines/two_d/optimizers.py new file mode 100644 index 0000000..3404ca2 --- /dev/null +++ b/pybaselines/two_d/optimizers.py @@ -0,0 +1,317 @@ +# -*- coding: utf-8 -*- +"""High level functions for making better use of baseline algorithms. + +Functions in this module make use of other baseline algorithms in +pybaselines to provide better results or optimize parameters. + +Created on January 14, 2024 +@author: Donald Erb + +""" + +from math import ceil + +import numpy as np + +from . import morphological, polynomial, spline, whittaker +from ._algorithm_setup import _Algorithm2D +from .._validation import _check_optional_array, _get_row_col_values +from ..utils import _check_scalar, _sort_array2d + + +class _Optimizers(_Algorithm2D): + """A base class for all optimizer algorithms.""" + + @_Algorithm2D._register(ensure_2d=False, skip_sorting=True) + def collab_pls(self, data, average_dataset=True, method='asls', method_kwargs=None): + """ + Collaborative Penalized Least Squares (collab-PLS). + + Averages the data or the fit weights for an entire dataset to get more + optimal results. Uses any Whittaker-smoothing-based or weighted spline algorithm. + + Parameters + ---------- + data : array-like, shape (L, M, N) + An array with shape (L, M, N) where L is the number of entries in + the dataset and (M, N) is the shape of each data entry. + average_dataset : bool, optional + If True (default) will average the dataset before fitting to get the + weighting. If False, will fit each individual entry in the dataset and + then average the weights to get the weighting for the dataset. + method : str, optional + A string indicating the Whittaker-smoothing-based or weighted spline method to + use for fitting the baseline. Default is 'asls'. + method_kwargs : dict, optional + A dictionary of keyword arguments to pass to the selected `method` function. + Default is None, which will use an empty dictionary. + + Returns + ------- + baselines : np.ndarray, shape (L, M, N) + An array of all of the baselines. + params : dict + A dictionary with the following items: + + * 'average_weights': numpy.ndarray, shape (M, N) + The weight array used to fit all of the baselines. + * 'average_alpha': numpy.ndarray, shape (M, N) + Only returned if `method` is 'aspls'. The + `alpha` array used to fit all of the baselines for the + :meth:`~Baseline2D.aspls`. + + Additional items depend on the output of the selected method. Every + other key will have a list of values, with each item corresponding to a + fit. + + Notes + ----- + If `method` is 'aspls', `collab_pls` will also calculate + the `alpha` array for the entire dataset in the same manner as the weights. + + References + ---------- + Chen, L., et al. 
Collaborative Penalized Least Squares for Background + Correction of Multiple Raman Spectra. Journal of Analytical Methods + in Chemistry, 2018, 2018. + + """ + dataset, baseline_func, _, method_kws, _ = self._setup_optimizer( + data, method, (whittaker, morphological, spline), method_kwargs, + True + ) + data_shape = dataset.shape + if len(data_shape) != 3: + raise ValueError(( + 'the input data must have a shape of (number of measurements, number of x points,' + f' number of y points), but instead has a shape of {data_shape}' + )) + method = method.lower() + # if using aspls or pspline_aspls, also need to calculate the alpha array + # for the entire dataset + calc_alpha = method in ('aspls', 'pspline_aspls') + + # step 1: calculate weights for the entire dataset + if average_dataset: + _, fit_params = baseline_func(np.mean(dataset, axis=0), **method_kws) + method_kws['weights'] = fit_params['weights'] + if calc_alpha: + method_kws['alpha'] = fit_params['alpha'] + else: + weights = np.empty(data_shape) + if calc_alpha: + alpha = np.empty(data_shape) + for i, entry in enumerate(dataset): + _, fit_params = baseline_func(entry, **method_kws) + weights[i] = fit_params['weights'] + if calc_alpha: + alpha[i] = fit_params['alpha'] + method_kws['weights'] = np.mean(weights, axis=0) + if calc_alpha: + method_kws['alpha'] = np.mean(alpha, axis=0) + + # step 2: use the dataset weights from step 1 (stored in method_kws['weights']) + # to fit each individual data entry; set tol to infinity so that only one + # iteration is done and new weights are not calculated + method_kws['tol'] = np.inf + baselines = np.empty(data_shape) + params = {'average_weights': method_kws['weights']} + if calc_alpha: + params['average_alpha'] = method_kws['alpha'] + if method == 'fabc': + # set weights as mask so it just fits the data + method_kws['weights_as_mask'] = True + + for i, entry in enumerate(dataset): + baselines[i], param = baseline_func(entry, **method_kws) + for key, value in param.items(): + if key in params: + params[key].append(value) + else: + params[key] = [value] + + return baselines, params + + @_Algorithm2D._register(skip_sorting=True) + def adaptive_minmax(self, data, poly_order=None, method='modpoly', weights=None, + constrained_fraction=0.01, constrained_weight=1e5, + estimation_poly_order=2, method_kwargs=None): + """ + Fits polynomials of different orders and uses the maximum values as the baseline. + + Each polynomial order fit is done both unconstrained and constrained at the + endpoints. + + Parameters + ---------- + data : array-like, shape (M, N) + The y-values of the measured data. + poly_order : int or Sequence[int, int] or None, optional + The two polynomial orders to use for fitting. If a single integer is given, + then will use the input value and one plus the input value. Default is None, + which will do a preliminary fit using a polynomial of order `estimation_poly_order` + and then select the appropriate polynomial orders according to [32]_. + method : {'modpoly', 'imodpoly'}, optional + The method to use for fitting each polynomial. Default is 'modpoly'. + weights : array-like, shape (M, N), optional + The weighting array. If None (default), then will be an array with + shape equal to (M, N) and all values set to 1. + constrained_fraction : float or Sequence[float, float], optional + The fraction of points at the left and right edges to use for the + constrained fit. Default is 0.01. 
If `constrained_fraction` is a sequence, + the first item is the fraction for the left edge and the second is the + fraction for the right edge. + constrained_weight : float or Sequence[float, float], optional + The weighting to give to the endpoints. Higher values ensure that the + end points are fit, but can cause large fluctuations in the other sections + of the polynomial. Default is 1e5. If `constrained_weight` is a sequence, + the first item is the weight for the left edge and the second is the + weight for the right edge. + estimation_poly_order : int, optional + The polynomial order used for estimating the baseline-to-signal ratio + to select the appropriate polynomial orders if `poly_order` is None. + Default is 2. + method_kwargs : dict, optional + Additional keyword arguments to pass to + :meth:`~Baseline.modpoly` or :meth:`~Baseline.imodpoly`. These include + `tol`, `max_iter`, `use_original`, `mask_initial_peaks`, and `num_std`. + + Returns + ------- + numpy.ndarray, shape (M, N) + The calculated baseline. + params : dict + A dictionary with the following items: + + * 'weights': numpy.ndarray, shape (M, N) + The weight array used for fitting the data. + * 'constrained_weights': numpy.ndarray, shape (M, N) + The weight array used for the endpoint-constrained fits. + * 'poly_order': numpy.ndarray, shape (2,) + An array of the two polynomial orders used for the fitting. + + References + ---------- + .. [32] Cao, A., et al. A robust method for automated background subtraction + of tissue fluorescence. Journal of Raman Spectroscopy, 2007, 38, + 1199-1205. + + """ + y, baseline_func, _, method_kws, _ = self._setup_optimizer( + data, method, [polynomial], method_kwargs, False + ) + sort_weights = weights is not None + weight_array = _check_optional_array( + self._len, weights, check_finite=self._check_finite, ensure_1d=False, axis=slice(None) + ) + if poly_order is None: + poly_orders = _determine_polyorders( + y, estimation_poly_order, weight_array, baseline_func, **method_kws + ) + else: + poly_orders, scalar_poly_order = _check_scalar(poly_order, 2, True, dtype=int) + if scalar_poly_order: + poly_orders[1] += 1 # add 1 since they are initially equal if scalar input + + # use high weighting rather than Lagrange multipliers to constrain the points + # to better work with noisy data + weightings = _get_row_col_values(constrained_weight) + constrained_fractions = _get_row_col_values(constrained_fraction) + if np.any(constrained_fractions < 0) or np.any(constrained_fractions > 1): + raise ValueError('constrained_fraction must be between 0 and 1') + + # have to temporarily sort weights to match x- and y-ordering so that left and right edges + # are correct + if sort_weights: + weight_array = _sort_array2d(weight_array, self._sort_order) + + constrained_weights = weight_array.copy() + constrained_weights[:ceil(self._len[0] * constrained_fractions[0])] = weightings[0] + constrained_weights[:, :ceil(self._len[1] * constrained_fractions[2])] = weightings[2] + constrained_weights[ + self._len[0] - ceil(self._len[0] * constrained_fractions[1]): + ] = weightings[1] + constrained_weights[ + :, self._len[1] - ceil(self._len[1] * constrained_fractions[3]): + ] = weightings[3] + # and now change back to original ordering + if sort_weights: + weight_array = _sort_array2d(weight_array, self._inverted_order) + constrained_weights = _sort_array2d(constrained_weights, self._inverted_order) + + # TODO should make parameters available; a list with an item for each fit like collab_pls + # TODO could maybe 
just use itertools.permutations, but would want to know the order in + # which the parameters are used + baselines = np.empty((4, *self._len)) + baselines[0] = baseline_func( + data=y, poly_order=poly_orders[0], weights=weight_array, **method_kws + )[0] + baselines[1] = baseline_func( + data=y, poly_order=poly_orders[0], weights=constrained_weights, **method_kws + )[0] + baselines[2] = baseline_func( + data=y, poly_order=poly_orders[1], weights=weight_array, **method_kws + )[0] + baselines[3] = baseline_func( + data=y, poly_order=poly_orders[1], weights=constrained_weights, **method_kws + )[0] + + # TODO should the coefficients also be made available? Would need to get them from + # each of the fits + params = { + 'weights': weight_array, 'constrained_weights': constrained_weights, + 'poly_order': poly_orders + } + + return np.maximum.reduce(baselines), params + + +def _determine_polyorders(y, poly_order, weights, fit_function, **fit_kwargs): + """ + Selects the appropriate polynomial orders based on the baseline-to-signal ratio. + + Parameters + ---------- + y : numpy.ndarray + The array of y-values. + poly_order : int + The polynomial order for fitting. + weights : numpy.ndarray + The weight array for fitting. + fit_function : Callable + The function to use for the polynomial fit. + **fit_kwargs + Additional keyword arguments to pass to `fit_function`. + + Returns + ------- + orders : numpy.ndarray, shape (2,) + The two polynomial orders to use based on the baseline to signal + ratio according to the reference. + + References + ---------- + Cao, A., et al. A robust method for automated background subtraction + of tissue fluorescence. Journal of Raman Spectroscopy, 2007, 38, 1199-1205. + + """ + baseline = fit_function(y, poly_order=poly_order, weights=weights, **fit_kwargs)[0] + signal = y - baseline + baseline_to_signal = (baseline.max() - baseline.min()) / (signal.max() - signal.min()) + # Table 2 in reference # TODO in 2D does this need changed? + if baseline_to_signal < 0.2: + orders = (1, 2) + elif baseline_to_signal < 0.75: + orders = (2, 3) + elif baseline_to_signal < 8.5: + orders = (3, 4) + elif baseline_to_signal < 55: + orders = (4, 5) + elif baseline_to_signal < 240: + orders = (5, 6) + elif baseline_to_signal < 517: + orders = (6, 7) + else: + orders = (6, 8) # not a typo, use 6 and 8 rather than 7 and 8 + + return np.array(orders) diff --git a/pybaselines/two_d/polynomial.py b/pybaselines/two_d/polynomial.py new file mode 100644 index 0000000..2303426 --- /dev/null +++ b/pybaselines/two_d/polynomial.py @@ -0,0 +1,801 @@ +# -*- coding: utf-8 -*- +"""Polynomial techniques for fitting baselines to experimental data. + +Created on April 16, 2023 +@author: Donald Erb + + +The function penalized_poly was adapted from MATLAB code from +https://www.mathworks.com/matlabcentral/fileexchange/27429-background-correction +(accessed March 18, 2021), which was licensed under the BSD-2-clause below. + +License: 2-clause BSD + +Copyright (c) 2012, Vincent Mazet +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the distribution + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +""" + +import numpy as np + +from .. import _weighting +from ._algorithm_setup import _Algorithm2D +from ..utils import _MIN_FLOAT, _convert_coef2d, relative_difference + + +class _Polynomial(_Algorithm2D): + """A base class for all polynomial algorithms.""" + + @_Algorithm2D._register( + sort_keys=('weights',), reshape_baseline=True, reshape_keys=('weights',) + ) + def poly(self, data, poly_order=2, weights=None, return_coef=False, max_cross=None): + """ + Computes a polynomial that fits the baseline of the data. + + Parameters + ---------- + data : array-like, shape (M, N) + The y-values of the measured data. + poly_order : int or Sequence[int, int], optional + The polynomial orders for x and z. If a single value, will use that for both x and + z. Default is 2. + weights : array-like, shape (M, N), optional + The weighting array. If None (default), then will be an array with + shape equal to (M, N) and all values set to 1. + return_coef : bool, optional + If True, will convert the polynomial coefficients for the fit baseline to + a form that fits the x and z values and return them in the params dictionary. + Default is False, since the conversion takes time. + max_cross: int, optional + The maximum degree for the cross terms. For example, if `max_cross` is 1, then + `x z**2`, `x**2 z`, and `x**2 z**2` would all be set to 0. Default is None, which + does not limit the cross terms. + + Returns + ------- + baseline : numpy.ndarray, shape (M, N) + The calculated baseline. + params : dict + A dictionary with the following items: + + * 'weights': numpy.ndarray, shape (M, N) + The weight array used for fitting the data. + * 'coef': numpy.ndarray, shape (``poly_order[0] + 1``, ``poly_order[1] + 1``) + Only if `return_coef` is True. The array of polynomial parameters + for the baseline, in increasing order. Can be used to create a + polynomial using :func:`numpy.polynomial.polynomial.polyval2d`. + + Notes + ----- + To only fit regions without peaks, supply a weight array with zero values + at the indices where peaks are located. 
+ + """ + y, weight_array, pseudo_inverse = self._setup_polynomial( + data, weights, poly_order, calc_vander=True, calc_pinv=True, max_cross=max_cross + ) + sqrt_w = np.sqrt(weight_array) + + coef = pseudo_inverse @ (sqrt_w * y) + baseline = self.vandermonde @ coef + params = {'weights': weight_array} + if return_coef: + params['coef'] = _convert_coef2d( + coef, self.poly_order[0], self.poly_order[1], self.x_domain, self.z_domain + ) + + return baseline, params + + @_Algorithm2D._register( + sort_keys=('weights',), reshape_baseline=True, reshape_keys=('weights',) + ) + def modpoly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, + use_original=False, mask_initial_peaks=False, return_coef=False, max_cross=None): + """ + The modified polynomial (ModPoly) baseline algorithm. + + Parameters + ---------- + data : array-like, shape (M, N) + The y-values of the measured data. + x_data : array-like, shape (N,), optional + The x-values of the measured data. Default is None, which will create an + array from -1 to 1 with N points. + poly_order : int or Sequence[int, int], optional + The polynomial orders for x and z. If a single value, will use that for both x and + z. Default is 2. + tol : float, optional + The exit criteria. Default is 1e-3. + max_iter : int, optional + The maximum number of iterations. Default is 250. + weights : array-like, shape (M, N), optional + The weighting array. If None (default), then will be an array with + shape equal to (M, N) and all values set to 1. + use_original : bool, optional + If False (default), will compare the baseline of each iteration with + the y-values of that iteration [33]_ when choosing minimum values. If True, + will compare the baseline with the original y-values given by `data` [34]_. + mask_initial_peaks : bool, optional + If True, will mask any data where the initial baseline fit + the standard + deviation of the residual is less than measured data [35]_. Default is False. + return_coef : bool, optional + If True, will convert the polynomial coefficients for the fit baseline to + a form that fits the x and z values and return them in the params dictionary. + Default is False, since the conversion takes time. + max_cross: int, optional + The maximum degree for the cross terms. For example, if `max_cross` is 1, then + `x z**2`, `x**2 z`, and `x**2 z**2` would all be set to 0. Default is None, which + does not limit the cross terms. + + Returns + ------- + baseline : numpy.ndarray, shape (M, N) + The calculated baseline. + params : dict + A dictionary with the following items: + + * 'weights': numpy.ndarray, shape (M, N) + The weight array used for fitting the data. + * 'tol_history': numpy.ndarray + An array containing the calculated tolerance values for + each iteration. The length of the array is the number of iterations + completed. If the last value in the array is greater than the input + `tol` value, then the function did not converge. + * 'coef': numpy.ndarray, shape (``poly_order[0] + 1``, ``poly_order[1] + 1``) + Only if `return_coef` is True. The array of polynomial parameters + for the baseline, in increasing order. Can be used to create a + polynomial using :func:`numpy.polynomial.polynomial.polyval2d`. + + Notes + ----- + Algorithm originally developed in [34]_ and then slightly modified in [33]_. + + References + ---------- + .. [33] Gan, F., et al. Baseline correction by improved iterative polynomial + fitting with automatic threshold. Chemometrics and Intelligent + Laboratory Systems, 2006, 82, 59-65. + .. 
+
+    @_Algorithm2D._register(
+        sort_keys=('weights',), reshape_baseline=True, reshape_keys=('weights',)
+    )
+    def modpoly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None,
+                use_original=False, mask_initial_peaks=False, return_coef=False, max_cross=None):
+        """
+        The modified polynomial (ModPoly) baseline algorithm.
+
+        Parameters
+        ----------
+        data : array-like, shape (M, N)
+            The y-values of the measured data.
+        poly_order : int or Sequence[int, int], optional
+            The polynomial orders for x and z. If a single value, will use that for both x and
+            z. Default is 2.
+        tol : float, optional
+            The exit criteria. Default is 1e-3.
+        max_iter : int, optional
+            The maximum number of iterations. Default is 250.
+        weights : array-like, shape (M, N), optional
+            The weighting array. If None (default), then will be an array with
+            shape equal to (M, N) and all values set to 1.
+        use_original : bool, optional
+            If False (default), will compare the baseline of each iteration with
+            the y-values of that iteration [33]_ when choosing minimum values. If True,
+            will compare the baseline with the original y-values given by `data` [34]_.
+        mask_initial_peaks : bool, optional
+            If True, will mask any data where the initial baseline fit + the standard
+            deviation of the residual is less than measured data [35]_. Default is False.
+        return_coef : bool, optional
+            If True, will convert the polynomial coefficients for the fit baseline to
+            a form that fits the x and z values and return them in the params dictionary.
+            Default is False, since the conversion takes time.
+        max_cross: int, optional
+            The maximum degree for the cross terms. For example, if `max_cross` is 1, then
+            `x z**2`, `x**2 z`, and `x**2 z**2` would all be set to 0. Default is None, which
+            does not limit the cross terms.
+
+        Returns
+        -------
+        baseline : numpy.ndarray, shape (M, N)
+            The calculated baseline.
+        params : dict
+            A dictionary with the following items:
+
+            * 'weights': numpy.ndarray, shape (M, N)
+                The weight array used for fitting the data.
+            * 'tol_history': numpy.ndarray
+                An array containing the calculated tolerance values for
+                each iteration. The length of the array is the number of iterations
+                completed. If the last value in the array is greater than the input
+                `tol` value, then the function did not converge.
+            * 'coef': numpy.ndarray, shape (``poly_order[0] + 1``, ``poly_order[1] + 1``)
+                Only if `return_coef` is True. The array of polynomial parameters
+                for the baseline, in increasing order. Can be used to create a
+                polynomial using :func:`numpy.polynomial.polynomial.polyval2d`.
+
+        Notes
+        -----
+        Algorithm originally developed in [34]_ and then slightly modified in [33]_.
+
+        References
+        ----------
+        .. [33] Gan, F., et al. Baseline correction by improved iterative polynomial
+                fitting with automatic threshold. Chemometrics and Intelligent
+                Laboratory Systems, 2006, 82, 59-65.
+        .. [34] Lieber, C., et al. Automated method for subtraction of fluorescence
+                from biological raman spectra. Applied Spectroscopy, 2003, 57(11),
+                1363-1367.
+        .. [35] Zhao, J., et al. Automated Autofluorescence Background Subtraction
+                Algorithm for Biomedical Raman Spectroscopy, Applied Spectroscopy,
+                2007, 61(11), 1225-1232.
+
+        """
+        y, weight_array, pseudo_inverse = self._setup_polynomial(
+            data, weights, poly_order, calc_vander=True, calc_pinv=True, copy_weights=True,
+            max_cross=max_cross
+        )
+        sqrt_w = np.sqrt(weight_array)
+        if use_original:
+            y0 = y
+
+        coef = pseudo_inverse @ (sqrt_w * y)
+        baseline = self.vandermonde @ coef
+        if mask_initial_peaks:
+            # use baseline + deviation since without deviation, half of y should be above baseline
+            weight_array[baseline + np.std(y - baseline) < y] = 0
+            sqrt_w = np.sqrt(weight_array)
+            pseudo_inverse = np.linalg.pinv(sqrt_w[:, None] * self.vandermonde)
+
+        tol_history = np.empty(max_iter)
+        for i in range(max_iter):
+            baseline_old = baseline
+            y = np.minimum(y0 if use_original else y, baseline)
+            coef = pseudo_inverse @ (sqrt_w * y)
+            baseline = self.vandermonde @ coef
+            calc_difference = relative_difference(baseline_old, baseline)
+            tol_history[i] = calc_difference
+            if calc_difference < tol:
+                break
+
+        params = {'weights': weight_array, 'tol_history': tol_history[:i + 1]}
+        if return_coef:
+            params['coef'] = _convert_coef2d(
+                coef, self.poly_order[0], self.poly_order[1], self.x_domain, self.z_domain
+            )
+
+        return baseline, params
+
+    @_Algorithm2D._register(
+        sort_keys=('weights',), reshape_baseline=True, reshape_keys=('weights',)
+    )
+    def imodpoly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None,
+                 use_original=False, mask_initial_peaks=True, return_coef=False,
+                 num_std=1., max_cross=None):
+        """
+        The improved modified polynomial (IModPoly) baseline algorithm.
+
+        Parameters
+        ----------
+        data : array-like, shape (M, N)
+            The y-values of the measured data.
+        poly_order : int or Sequence[int, int], optional
+            The polynomial orders for x and z. If a single value, will use that for both x and
+            z. Default is 2.
+        tol : float, optional
+            The exit criteria. Default is 1e-3.
+        max_iter : int, optional
+            The maximum number of iterations. Default is 250.
+        weights : array-like, shape (M, N), optional
+            The weighting array. If None (default), then will be an array with
+            shape equal to (M, N) and all values set to 1.
+        use_original : bool, optional
+            If False (default), will compare the baseline of each iteration with
+            the y-values of that iteration [36]_ when choosing minimum values. If True,
+            will compare the baseline with the original y-values given by `data` [37]_.
+        mask_initial_peaks : bool, optional
+            If True (default), will mask any data where the initial baseline fit +
+            the standard deviation of the residual is less than measured data [38]_.
+        return_coef : bool, optional
+            If True, will convert the polynomial coefficients for the fit baseline to
+            a form that fits the x and z values and return them in the params dictionary.
+            Default is False, since the conversion takes time.
+        num_std : float, optional
+            The number of standard deviations to include when thresholding. Default
+            is 1. Must be greater or equal to 0.
+        max_cross: int, optional
+            The maximum degree for the cross terms. For example, if `max_cross` is 1, then
+            `x z**2`, `x**2 z`, and `x**2 z**2` would all be set to 0. Default is None, which
+            does not limit the cross terms.
+ + Returns + ------- + baseline : numpy.ndarray, shape (M, N) + The calculated baseline. + params : dict + A dictionary with the following items: + + * 'weights': numpy.ndarray, shape (M, N) + The weight array used for fitting the data. + * 'tol_history': numpy.ndarray + An array containing the calculated tolerance values for + each iteration. The length of the array is the number of iterations + completed. If the last value in the array is greater than the input + `tol` value, then the function did not converge. + * 'coef': numpy.ndarray, shape (``poly_order[0] + 1``, ``poly_order[1] + 1``) + Only if `return_coef` is True. The array of polynomial parameters + for the baseline, in increasing order. Can be used to create a + polynomial using :func:`numpy.polynomial.polynomial.polyval2d`. + + Raises + ------ + ValueError + Raised if `num_std` is less than 0. + + Notes + ----- + Algorithm originally developed in [38]_. + + References + ---------- + .. [36] Gan, F., et al. Baseline correction by improved iterative polynomial + fitting with automatic threshold. Chemometrics and Intelligent + Laboratory Systems, 2006, 82, 59-65. + .. [37] Lieber, C., et al. Automated method for subtraction of fluorescence + from biological raman spectra. Applied Spectroscopy, 2003, 57(11), + 1363-1367. + .. [38] Zhao, J., et al. Automated Autofluorescence Background Subtraction + Algorithm for Biomedical Raman Spectroscopy, Applied Spectroscopy, + 2007, 61(11), 1225-1232. + + """ + if num_std < 0: + raise ValueError('num_std must be greater than or equal to 0') + + y, weight_array, pseudo_inverse = self._setup_polynomial( + data, weights, poly_order, calc_vander=True, calc_pinv=True, + copy_weights=True, max_cross=max_cross + ) + sqrt_w = np.sqrt(weight_array) + if use_original: + y0 = y + + coef = pseudo_inverse @ (sqrt_w * y) + baseline = self.vandermonde @ coef + deviation = np.std(y - baseline) + if mask_initial_peaks: + weight_array[baseline + deviation < y] = 0 + sqrt_w = np.sqrt(weight_array) + pseudo_inverse = np.linalg.pinv(sqrt_w[:, None] * self.vandermonde) + + tol_history = np.empty(max_iter) + for i in range(max_iter): + y = np.minimum(y0 if use_original else y, baseline + num_std * deviation) + coef = pseudo_inverse @ (sqrt_w * y) + baseline = self.vandermonde @ coef + new_deviation = np.std(y - baseline) + # use new_deviation as dividing term in relative difference + calc_difference = relative_difference(new_deviation, deviation) + tol_history[i] = calc_difference + if calc_difference < tol: + break + deviation = new_deviation + + params = {'weights': weight_array, 'tol_history': tol_history[:i + 1]} + if return_coef: + params['coef'] = _convert_coef2d( + coef, self.poly_order[0], self.poly_order[1], self.x_domain, self.z_domain + ) + + return baseline, params + + # adapted from + # https://www.mathworks.com/matlabcentral/fileexchange/27429-background-correction; + # see license above + @_Algorithm2D._register( + sort_keys=('weights',), reshape_baseline=True, reshape_keys=('weights',) + ) + def penalized_poly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, + cost_function='asymmetric_truncated_quadratic', threshold=None, + alpha_factor=0.99, return_coef=False, max_cross=None): + """ + Fits a polynomial baseline using a non-quadratic cost function. + + The non-quadratic cost functions penalize residuals with larger values, + giving a more robust fit compared to normal least-squares. + + Parameters + ---------- + data : array-like, shape (M, N) + The y-values of the measured data. 
+ poly_order : int or Sequence[int, int], optional + The polynomial orders for x and z. If a single value, will use that for both x and + z. Default is 2. + tol : float, optional + The exit criteria. Default is 1e-3. + max_iter : int, optional + The maximum number of iterations. Default is 250. + weights : array-like, shape (M, N), optional + The weighting array. If None (default), then will be an array with + shape equal to (M, N) and all values set to 1. + cost_function : str, optional + The non-quadratic cost function to minimize. Must indicate symmetry of the + method by appending 'a' or 'asymmetric' for asymmetric loss, and 's' or + 'symmetric' for symmetric loss. Default is 'asymmetric_truncated_quadratic'. + Available methods, and their associated reference, are: + + * 'asymmetric_truncated_quadratic'[39]_ + * 'symmetric_truncated_quadratic'[39]_ + * 'asymmetric_huber'[39]_ + * 'symmetric_huber'[39]_ + * 'asymmetric_indec'[40]_ + * 'symmetric_indec'[40]_ + + threshold : float, optional + The threshold value for the loss method, where the function goes from + quadratic loss (such as used for least squares) to non-quadratic. For + symmetric loss methods, residual values with absolute value less than + threshold will have quadratic loss. For asymmetric loss methods, residual + values less than the threshold will have quadratic loss. Default is None, + which sets `threshold` to one-tenth of the standard deviation of the input + data. + alpha_factor : float, optional + A value between 0 and 1 that controls the value of the penalty. Default is + 0.99. Typically should not need to change this value. + return_coef : bool, optional + If True, will convert the polynomial coefficients for the fit baseline to + a form that fits the x and z values and return them in the params dictionary. + Default is False, since the conversion takes time. + max_cross: int, optional + The maximum degree for the cross terms. For example, if `max_cross` is 1, then + `x z**2`, `x**2 z`, and `x**2 z**2` would all be set to 0. Default is None, which + does not limit the cross terms. + + Returns + ------- + baseline : numpy.ndarray, shape (M, N) + The calculated baseline. + params : dict + A dictionary with the following items: + + * 'weights': numpy.ndarray, shape (M, N) + The weight array used for fitting the data. + * 'tol_history': numpy.ndarray + An array containing the calculated tolerance values for + each iteration. The length of the array is the number of iterations + completed. If the last value in the array is greater than the input + `tol` value, then the function did not converge. + * 'coef': numpy.ndarray, shape (``poly_order[0] + 1``, ``poly_order[1] + 1``) + Only if `return_coef` is True. The array of polynomial parameters + for the baseline, in increasing order. Can be used to create a + polynomial using :func:`numpy.polynomial.polynomial.polyval2d`. + + Raises + ------ + ValueError + Raised if `alpha_factor` is not between 0 and 1. + + Notes + ----- + In baseline literature, this procedure is sometimes called "backcor". + + References + ---------- + .. [39] Mazet, V., et al. Background removal from spectra by designing and + minimising a non-quadratic cost function. Chemometrics and Intelligent + Laboratory Systems, 2005, 76(2), 121-133. + .. [40] Liu, J., et al. Goldindec: A Novel Algorithm for Raman Spectrum Baseline + Correction. Applied Spectroscopy, 2015, 69(7), 834-842. 
+ + """ + if not 0 < alpha_factor <= 1: + raise ValueError('alpha_factor must be between 0 and 1') + symmetric_loss, method = _identify_loss_method(cost_function) + loss_function = { + 'huber': _huber_loss, + 'truncated_quadratic': _truncated_quadratic_loss, + 'indec': _indec_loss + }[method] + + y, weight_array, pseudo_inverse = self._setup_polynomial( + data, weights, poly_order, calc_vander=True, calc_pinv=True, max_cross=max_cross + ) + if threshold is None: + threshold = np.std(y) / 10 + loss_kwargs = { + 'threshold': threshold, 'alpha_factor': alpha_factor, 'symmetric': symmetric_loss + } + + sqrt_w = np.sqrt(weight_array) + y = sqrt_w * y + + coef = pseudo_inverse @ y + baseline = self.vandermonde @ coef + tol_history = np.empty(max_iter) + for i in range(max_iter): + baseline_old = baseline + coef = pseudo_inverse @ (y + loss_function(y - sqrt_w * baseline, **loss_kwargs)) + baseline = self.vandermonde @ coef + calc_difference = relative_difference(baseline_old, baseline) + tol_history[i] = calc_difference + if calc_difference < tol: + break + + params = {'weights': weight_array, 'tol_history': tol_history[:i + 1]} + if return_coef: + params['coef'] = _convert_coef2d( + coef, self.poly_order[0], self.poly_order[1], self.x_domain, self.z_domain + ) + + return baseline, params + + @_Algorithm2D._register( + sort_keys=('weights',), reshape_baseline=True, reshape_keys=('weights',) + ) + def quant_reg(self, data, poly_order=2, quantile=0.05, tol=1e-6, max_iter=250, + weights=None, eps=None, return_coef=False, max_cross=None): + """ + Approximates the baseline of the data using quantile regression. + + Parameters + ---------- + data : array-like, shape (M, N) + The y-values of the measured data. + poly_order : int or Sequence[int, int], optional + The polynomial orders for x and z. If a single value, will use that for both x and + z. Default is 2. + quantile : float, optional + The quantile at which to fit the baseline. Default is 0.05. + tol : float, optional + The exit criteria. Default is 1e-6. For extreme quantiles (`quantile` < 0.01 + or `quantile` > 0.99), may need to use a lower value to get a good fit. + max_iter : int, optional + The maximum number of iterations. Default is 250. For extreme quantiles + (`quantile` < 0.01 or `quantile` > 0.99), may need to use a higher value to + ensure convergence. + weights : array-like, shape (M, N), optional + The weighting array. If None (default), then will be an array with + shape equal to (M, N) and all values set to 1. + eps : float, optional + A small value added to the square of the residual to prevent dividing by 0. + Default is None, which uses the square of the maximum-absolute-value of the + fit each iteration multiplied by 1e-6. + return_coef : bool, optional + If True, will convert the polynomial coefficients for the fit baseline to + a form that fits the x and z values and return them in the params dictionary. + Default is False, since the conversion takes time. + max_cross: int, optional + The maximum degree for the cross terms. For example, if `max_cross` is 1, then + `x z**2`, `x**2 z`, and `x**2 z**2` would all be set to 0. Default is None, which + does not limit the cross terms. + + Returns + ------- + baseline : numpy.ndarray, shape (M, N) + The calculated baseline. + params : dict + A dictionary with the following items: + + * 'weights': numpy.ndarray, shape (M, N) + The weight array used for fitting the data. + * 'tol_history': numpy.ndarray + An array containing the calculated tolerance values for + each iteration. 
The length of the array is the number of iterations
+                completed. If the last value in the array is greater than the input
+                `tol` value, then the function did not converge.
+            * 'coef': numpy.ndarray, shape (``poly_order[0] + 1``, ``poly_order[1] + 1``)
+                Only if `return_coef` is True. The array of polynomial parameters
+                for the baseline, in increasing order. Can be used to create a
+                polynomial using :func:`numpy.polynomial.polynomial.polyval2d`.
+
+        Raises
+        ------
+        ValueError
+            Raised if `quantile` is not between 0 and 1.
+
+        Notes
+        -----
+        Application of quantile regression for baseline fitting is described in [41]_.
+
+        Performs quantile regression using iteratively reweighted least squares (IRLS)
+        as described in [42]_.
+
+        References
+        ----------
+        .. [41] Komsta, Ł. Comparison of Several Methods of Chromatographic
+                Baseline Removal with a New Approach Based on Quantile Regression.
+                Chromatographia, 2011, 73, 721-731.
+        .. [42] Schnabel, S., et al. Simultaneous estimation of quantile curves using
+                quantile sheets. AStA Advances in Statistical Analysis, 2013, 97, 77-87.
+
+        """
+        # TODO provide a way to estimate best poly_order based on AIC like in Komsta? could be
+        # useful for all polynomial methods; maybe could be an optimizer function
+        if not 0 < quantile < 1:
+            raise ValueError('quantile must be between 0 and 1.')
+
+        y, weight_array = self._setup_polynomial(
+            data, weights, poly_order, calc_vander=True, max_cross=max_cross
+        )
+        # estimate first iteration using least squares
+        sqrt_w = np.sqrt(weight_array)
+        coef = np.linalg.lstsq(self.vandermonde * sqrt_w[:, None], y * sqrt_w, None)[0]
+        baseline = self.vandermonde @ coef
+        tol_history = np.empty(max_iter)
+        for i in range(max_iter):
+            baseline_old = baseline
+            sqrt_w = np.sqrt(_weighting._quantile(y, baseline, quantile, eps))
+            coef = np.linalg.lstsq(self.vandermonde * sqrt_w[:, None], y * sqrt_w, None)[0]
+            baseline = self.vandermonde @ coef
+            # relative_difference(baseline_old, baseline, 1) gives nearly same result and
+            # the l2 norm is faster to calculate, so use that instead of l1 norm
+            calc_difference = relative_difference(baseline_old, baseline)
+            tol_history[i] = calc_difference
+            if calc_difference < tol:
+                break
+
+        params = {'weights': sqrt_w**2, 'tol_history': tol_history[:i + 1]}
+        if return_coef:
+            params['coef'] = _convert_coef2d(
+                coef, self.poly_order[0], self.poly_order[1], self.x_domain, self.z_domain
+            )
+
+        return baseline, params
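For reference, the per-iteration quantile weighting computed by `_weighting._quantile` follows the scheme sketched below. This is an illustration written from the docstring's description of `eps`, not the library's exact implementation: residuals above the baseline get weight proportional to `quantile`, those below get `1 - quantile`, and the division is smoothed by `eps` to avoid dividing by zero.

    import numpy as np

    def quantile_weights(y, baseline, quantile=0.05, eps=None):
        residual = y - baseline
        if eps is None:
            # square of the maximum-absolute-value of the fit, scaled by 1e-6
            eps = (np.abs(baseline).max() * 1e-6)**2
        # asymmetric approximation of the quantile check-function derivative
        numerator = np.where(residual > 0, quantile, 1 - quantile)
        return numerator / np.sqrt(residual**2 + eps)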
+
+
+# adapted from (https://www.mathworks.com/matlabcentral/fileexchange/27429-background-correction);
+# see license above
+def _huber_loss(residual, threshold=1.0, alpha_factor=0.99, symmetric=True):
+    """
+    The Huber non-quadratic cost function.
+
+    Parameters
+    ----------
+    residual : numpy.ndarray, shape (N,)
+        The residual array.
+    threshold : float, optional
+        Any residual values below the threshold are given quadratic loss.
+        Default is 1.0.
+    alpha_factor : float, optional
+        The scale between 0 and 1 to multiply the cost function's alpha_max
+        value (see Notes below). Default is 0.99.
+    symmetric : bool, optional
+        If True (default), the cost function is symmetric and applies the same
+        weighting for positive and negative values. If False, will apply weights
+        asymmetrically so that only positive weights are given the non-quadratic
+        weighting and negative weights have normal, quadratic weighting.
+
+    Returns
+    -------
+    weights : numpy.ndarray, shape (N,)
+        The weight array.
+
+    Notes
+    -----
+    The returned result is
+
+        -residual + alpha_factor * alpha_max * phi'(residual)
+
+    where phi'(x) is the derivative of the huber loss function, phi(x).
+
+    References
+    ----------
+    Mazet, V., et al. Background removal from spectra by designing and
+    minimising a non-quadratic cost function. Chemometrics and Intelligent
+    Laboratory Systems, 2005, 76(2), 121-133.
+
+    """
+    alpha = alpha_factor * 0.5  # alpha_max for huber is 0.5
+    if symmetric:
+        mask = (np.abs(residual) < threshold)
+        weights = (
+            mask * residual * (2 * alpha - 1)
+            + (~mask) * 2 * alpha * threshold * np.sign(residual)
+        )
+    else:
+        mask = (residual < threshold)
+        weights = (
+            mask * residual * (2 * alpha - 1)
+            + (~mask) * (2 * alpha * threshold - residual)
+        )
+    return weights
+
+
+# adapted from (https://www.mathworks.com/matlabcentral/fileexchange/27429-background-correction);
+# see license above
+def _truncated_quadratic_loss(residual, threshold=1.0, alpha_factor=0.99, symmetric=True):
+    """
+    The Truncated-Quadratic non-quadratic cost function.
+
+    Parameters
+    ----------
+    residual : numpy.ndarray, shape (N,)
+        The residual array.
+    threshold : float, optional
+        Any residual values below the threshold are given quadratic loss.
+        Default is 1.0.
+    alpha_factor : float, optional
+        The scale between 0 and 1 to multiply the cost function's alpha_max
+        value (see Notes below). Default is 0.99.
+    symmetric : bool, optional
+        If True (default), the cost function is symmetric and applies the same
+        weighting for positive and negative values. If False, will apply weights
+        asymmetrically so that only positive weights are given the non-quadratic
+        weighting and negative weights have normal, quadratic weighting.
+
+    Returns
+    -------
+    weights : numpy.ndarray, shape (N,)
+        The weight array.
+
+    Notes
+    -----
+    The returned result is
+
+        -residual + alpha_factor * alpha_max * phi'(residual)
+
+    where phi'(x) is the derivative of the truncated quadratic function, phi(x).
+
+    References
+    ----------
+    Mazet, V., et al. Background removal from spectra by designing and
+    minimising a non-quadratic cost function. Chemometrics and Intelligent
+    Laboratory Systems, 2005, 76(2), 121-133.
+
+    """
+    alpha = alpha_factor * 0.5  # alpha_max for truncated quadratic is 0.5
+    if symmetric:
+        mask = (np.abs(residual) < threshold)
+    else:
+        mask = (residual < threshold)
+    return mask * residual * (2 * alpha - 1) - (~mask) * residual
+
+
+def _indec_loss(residual, threshold=1.0, alpha_factor=0.99, symmetric=True):
+    """
+    The Indec non-quadratic cost function.
+
+    Parameters
+    ----------
+    residual : numpy.ndarray, shape (N,)
+        The residual array.
+    threshold : float, optional
+        Any residual values below the threshold are given quadratic loss.
+        Default is 1.0.
+    alpha_factor : float, optional
+        The scale between 0 and 1 to multiply the cost function's alpha_max
+        value (see Notes below). Default is 0.99.
+    symmetric : bool, optional
+        If True (default), the cost function is symmetric and applies the same
+        weighting for positive and negative values. If False, will apply weights
+        asymmetrically so that only positive weights are given the non-quadratic
+        weighting and negative weights have normal, quadratic weighting.
+
+    Returns
+    -------
+    weights : numpy.ndarray, shape (N,)
+        The weight array.
+
+    Notes
+    -----
+    The returned result is
+
+        -residual + alpha_factor * alpha_max * phi'(residual)
+
+    where phi'(x) is the derivative of the Indec function, phi(x).
+
+    References
+    ----------
+    Liu, J., et al.
Goldindec: A Novel Algorithm for Raman Spectrum Baseline + Correction. Applied Spectroscopy, 2015, 69(7), 834-842. + + Mazet, V., et al. Background removal from spectra by designing and + minimising a non-quadratic cost function. Chemometrics and Intelligent + Laboratory Systems, 2005, 76(2), 121-133. + + """ + alpha = alpha_factor * 0.5 # alpha_max for indec is 0.5 + if symmetric: + mask = (np.abs(residual) < threshold) + multiple = np.sign(residual) + else: + mask = (residual < threshold) + # multiple=1 is same as sign(residual) since residual is always > 0 + # for asymmetric case, but this allows not doing the sign calculation + multiple = 1 + weights = ( + mask * residual * (2 * alpha - 1) + - (~mask) * ( + residual + alpha * multiple * threshold**3 / np.maximum(2 * residual**2, _MIN_FLOAT) + ) + ) + return weights + + +def _identify_loss_method(loss_method): + """ + Identifies the symmetry for the given loss method. + + Parameters + ---------- + loss_method : str + The loss method to use. Should have the symmetry identifier as + the prefix. + + Returns + ------- + symmetric : bool + True if `loss_method` had 's_' or 'symmetric_' as the prefix, else False. + str + The input `loss_method` value without the first section that indicated + the symmetry. + + Raises + ------ + ValueError + Raised if the loss method does not have the correct form. + + """ + prefix, *split_method = loss_method.lower().split('_') + if prefix not in ('a', 's', 'asymmetric', 'symmetric') or not split_method: + raise ValueError('must specify loss function symmetry by appending "a_" or "s_"') + if prefix in ('a', 'asymmetric'): + symmetric = False + else: + symmetric = True + return symmetric, '_'.join(split_method) diff --git a/pybaselines/two_d/smooth.py b/pybaselines/two_d/smooth.py new file mode 100644 index 0000000..190a59f --- /dev/null +++ b/pybaselines/two_d/smooth.py @@ -0,0 +1,72 @@ +# -*- coding: utf-8 -*- +"""Smoothing-based techniques for fitting baselines to experimental data. + +Created on April 8, 2023 +@author: Donald Erb + +""" + +import numpy as np +from scipy.ndimage import gaussian_filter, median_filter + +from ._algorithm_setup import _Algorithm2D + + +class _Smooth(_Algorithm2D): + """A base class for all smoothing algorithms.""" + + @_Algorithm2D._register + def noise_median(self, data, half_window=None, smooth_half_window=None, sigma=None, + **pad_kwargs): + """ + The noise-median method for baseline identification. + + Assumes the baseline can be considered as the median value within a moving + window, and the resulting baseline is then smoothed with a Gaussian kernel. + + Parameters + ---------- + data : array-like, shape (M, N) + The y-values of the measured data. + half_window : int or Sequence[int, int], optional + The index-based size to use for the median window on the rows and columns, + respectively. The total window size in each dimension will range from + [-half_window, ..., half_window] with size 2 * half_window + 1. Default is + None, which will use twice the output from :func:`.optimize_window`, + which is an okay starting value. + smooth_half_window : int, optional + The half window to use for smoothing. Default is None, which will use + the average of the values in `half_window`. + sigma : float, optional + The standard deviation of the smoothing Gaussian kernel. Default is None, + which will use (2 * `smooth_half_window` + 1) / 6. 
+        **pad_kwargs
+            Additional keyword arguments to pass to :func:`.pad_edges2d` for padding
+            the edges of the data to prevent edge effects from convolution.
+
+        Returns
+        -------
+        baseline : numpy.ndarray, shape (M, N)
+            The calculated and smoothed baseline.
+        dict
+            An empty dictionary, just to match the output of all other algorithms.
+
+        References
+        ----------
+        Friedrichs, M., A model-free algorithm for the removal of baseline
+        artifacts. J. Biomolecular NMR, 1995, 5, 147-153.
+
+        """
+        y, half_window = self._setup_smooth(data, half_window, False, 2, **pad_kwargs)
+        window_size = 2 * half_window + 1
+        median = median_filter(y, window_size, mode='nearest')
+        if smooth_half_window is None:
+            smooth_window = np.mean(window_size)  # truncate can only be a single value
+        else:
+            smooth_window = 2 * smooth_half_window + 1
+        if sigma is None:
+            # the Gaussian kernel will include +- 3 sigma
+            sigma = smooth_window / 6
+
+        baseline = gaussian_filter(median, sigma, truncate=smooth_window)  # TODO check truncate value
+        return baseline[half_window[0]:-half_window[0], half_window[1]:-half_window[1]], {}
diff --git a/pybaselines/two_d/spline.py b/pybaselines/two_d/spline.py
new file mode 100644
index 0000000..76b63a1
--- /dev/null
+++ b/pybaselines/two_d/spline.py
@@ -0,0 +1,1008 @@
+# -*- coding: utf-8 -*-
+"""Functions for fitting baselines using splines.
+
+Created on April 25, 2023
+@author: Donald Erb
+
+"""
+
+from functools import partial
+from math import ceil
+import warnings
+
+import numpy as np
+from scipy.optimize import curve_fit
+
+from .. import _weighting
+from ..utils import ParameterWarning, gaussian, relative_difference, _MIN_FLOAT
+from ._algorithm_setup import _Algorithm2D
+from ._whittaker_utils import PenalizedSystem2D
+from .._compat import _HAS_NUMBA, jit
+
+
+class _Spline(_Algorithm2D):
+    """A base class for all spline algorithms."""
+
+    @_Algorithm2D._register(sort_keys=('weights',))
+    def mixture_model(self, data, lam=1e3, p=1e-2, num_knots=25, spline_degree=3, diff_order=3,
+                      max_iter=50, tol=1e-3, weights=None, symmetric=False, num_bins=None):
+        """
+        Considers the data as a mixture model composed of noise and peaks.
+
+        Weights are iteratively assigned by calculating the probability that each value
+        in the residual belongs to a normal distribution representing the noise.
+
+        Parameters
+        ----------
+        data : array-like, shape (M, N)
+            The y-values of the measured data. Must not contain missing data (NaN) or Inf.
+        lam : float or Sequence[float, float], optional
+            The smoothing parameter for the rows and columns, respectively. If a single
+            value is given, both will use the same value. Larger values will create smoother
+            baselines. Default is 1e3.
+        p : float, optional
+            The penalizing weighting factor. Must be between 0 and 1. Values greater
+            than the baseline will be given `p` weight, and values less than the baseline
+            will be given `p - 1` weight. Used to set the initial weights before performing
+            expectation-maximization. Default is 1e-2.
+        num_knots : int or Sequence[int, int], optional
+            The number of knots for the splines along the rows and columns, respectively. If a
+            single value is given, both will use the same value. Default is 25.
+        spline_degree : int or Sequence[int, int], optional
+            The degree of the splines along the rows and columns, respectively. If a single
+            value is given, both will use the same value. Default is 3, which is a cubic spline.
+        diff_order : int or Sequence[int, int], optional
+            The order of the differential matrix for the rows and columns, respectively. If
+            a single value is given, both will use the same value. Must be greater than 0.
+            Default is 3 (third order differential matrix). Typical values are 2 or 3.
+        max_iter : int, optional
+            The max number of fit iterations. Default is 50.
+        tol : float, optional
+            The exit criteria. Default is 1e-3.
+        weights : array-like, shape (M, N), optional
+            The weighting array. If None (default), then the initial weights
+            will be an array with shape equal to (M, N) and all values set to 1, and then
+            two iterations of reweighted least-squares are performed to provide starting
+            weights for the expectation-maximization of the mixture model.
+        symmetric : bool, optional
+            If False (default), the total mixture model will be composed of one normal
+            distribution for the noise and one uniform distribution for positive non-noise
+            residuals. If True, an additional uniform distribution will be added to the
+            mixture model for negative non-noise residuals. `symmetric` only needs to be
+            set to True when peaks are both positive and negative.
+        num_bins : int, optional
+            The number of bins to use when transforming the residuals into a probability
+            density distribution. Default is None, which uses ``ceil(sqrt(M * N))``.
+
+        Returns
+        -------
+        baseline : numpy.ndarray, shape (M, N)
+            The calculated baseline.
+        params : dict
+            A dictionary with the following items:
+
+            * 'weights': numpy.ndarray, shape (M, N)
+                The weight array used for fitting the data.
+            * 'tol_history': numpy.ndarray
+                An array containing the calculated tolerance values for
+                each iteration. The length of the array is the number of iterations
+                completed. If the last value in the array is greater than the input
+                `tol` value, then the function did not converge.
+
+        Raises
+        ------
+        ValueError
+            Raised if `p` is not between 0 and 1.
+
+        References
+        ----------
+        de Rooi, J., et al. Mixture models for baseline estimation. Chemometrics and
+        Intelligent Laboratory Systems, 2012, 117, 56-60.
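+
+        Examples
+        --------
+        A minimal usage sketch; the random test grid below is purely illustrative and
+        assumes ``Baseline2D`` is importable from the top-level package:
+
+        >>> import numpy as np
+        >>> from pybaselines import Baseline2D
+        >>> x = np.linspace(-20, 20, 80)
+        >>> z = np.linspace(-20, 20, 60)
+        >>> X, Z = np.meshgrid(x, z, indexing='ij')
+        >>> y = 0.01 * X**2 + 0.1 * Z + np.random.default_rng(0).normal(0, 0.1, X.shape)
+        >>> baseline, params = Baseline2D(x, z).mixture_model(y, lam=1e3, p=1e-2)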
+
+        """
+        if not 0 < p < 1:
+            raise ValueError('p must be between 0 and 1')
+
+        y, weight_array = self._setup_spline(
+            data, weights, spline_degree, num_knots, True, diff_order, lam
+        )
+        # scale y between -1 and 1 so that the residual fit is more numerically stable
+        y_domain = np.polynomial.polyutils.getdomain(y.ravel())
+        y = np.polynomial.polyutils.mapdomain(y, y_domain, np.array([-1., 1.]))
+
+        if weights is not None:
+            baseline = self.pspline.solve(y, weight_array)
+        else:
+            # perform 2 iterations: first is a least-squares fit and second is initial
+            # reweighted fit; 2 fits are needed to get weights to have a decent starting
+            # distribution for the expectation-maximization
+            if symmetric and not 0.2 < p < 0.8:
+                # p values far away from 0.5 with symmetric=True give bad initial weights
+                # for the expectation maximization
+                warnings.warn(
+                    'should use a p value closer to 0.5 when symmetric is True',
+                    ParameterWarning, stacklevel=2
+                )
+            for _ in range(2):
+                baseline = self.pspline.solve(y, weight_array)
+                weight_array = _weighting._asls(y, baseline, p)
+
+        # now perform the expectation-maximization
+        # TODO not sure if there is a better way to do this than transforming the
+        # residual into a histogram, fitting the histogram, and then assigning weights
+        # based on the bins; actual expectation-maximization uses log(probability),
+        # directly estimates sigma from that, and then calculates the percentages;
+        # maybe that would be faster/more stable?
+        if num_bins is None:
+            num_bins = ceil(np.sqrt(self._len[0] * self._len[1]))
+
+        # uniform probability density distribution for positive residuals, constant
+        # from 0 to max(residual), and 0 for residuals < 0
+        pos_uniform_pdf = np.empty(num_bins)
+        tol_history = np.empty(max_iter + 1)
+        residual = y - baseline
+
+        # 0.2 * std(residual) is a reasonable starting sigma estimate
+        fit_params = [0.5, np.log10(0.2 * np.std(residual))]
+        bounds = [[0, -np.inf], [1, np.inf]]
+        if symmetric:
+            fit_params.append(0.25)
+            bounds[0].append(0)
+            bounds[1].append(1)
+            # create a second uniform pdf for negative residual values
+            neg_uniform_pdf = np.empty(num_bins)
+        else:
+            neg_uniform_pdf = None
+
+        # convert bounds to numpy array since curve_fit will use np.asarray each iteration
+        bounds = np.array(bounds)
+        for i in range(max_iter + 1):
+            residual_hist, bin_edges, bin_mapping = _mapped_histogram(residual, num_bins)
+            # average bin edges to get better x-values for fitting
+            bins = 0.5 * (bin_edges[:-1] + bin_edges[1:])
+            pos_uniform_mask = bins < 0
+            pos_uniform_pdf[~pos_uniform_mask] = 1 / max(abs(residual.max()), 1e-6)
+            pos_uniform_pdf[pos_uniform_mask] = 0
+            if symmetric:
+                neg_uniform_mask = bins > 0
+                neg_uniform_pdf[~neg_uniform_mask] = 1 / max(abs(residual.min()), 1e-6)
+                neg_uniform_pdf[neg_uniform_mask] = 0
+
+            fit_func = partial(
+                _mixture_pdf, pos_uniform=pos_uniform_pdf, neg_uniform=neg_uniform_pdf
+            )
+            # use dogbox method since trf gives RuntimeWarnings from nans appearing
+            # somehow during optimization; trf is also prone to failure when symmetric=True
+            fit_params = curve_fit(
+                fit_func, bins, residual_hist, p0=fit_params, bounds=bounds,
+                check_finite=False, method='dogbox'
+            )[0]
+            sigma = 10**fit_params[1]
+            gaus_pdf = fit_params[0] * gaussian(bins, 1 / (sigma * np.sqrt(2 * np.pi)), 0, sigma)
+            posterior_prob = gaus_pdf / np.maximum(fit_func(bins, *fit_params), _MIN_FLOAT)
+            # need to clip since a bad initial start can erroneously set the sum of the
+            # fractions of each distribution to > 1
+            np.clip(posterior_prob, 0, 1, out=posterior_prob)
+            new_weights = posterior_prob[bin_mapping].reshape(self._len)
+
+            calc_difference = relative_difference(weight_array, new_weights)
+            tol_history[i] = calc_difference
+            if calc_difference < tol:
+                break
+
+            weight_array = new_weights
+            baseline = self.pspline.solve(y, weight_array)
+            residual = y - baseline
+
+        # TODO could potentially return a BSpline object from scipy.interpolate
+        # using knots, spline degree, and coef, but would need to allow user to
+        # input the x-values for it to be useful
+        params = {'weights': weight_array, 'tol_history': tol_history[:i + 1]}
+
+        baseline = np.polynomial.polyutils.mapdomain(baseline, np.array([-1., 1.]), y_domain)
+
+        return baseline, params
+
+    @_Algorithm2D._register(sort_keys=('weights',))
+    def irsqr(self, data, lam=1e3, quantile=0.05, num_knots=25, spline_degree=3,
+              diff_order=3, max_iter=100, tol=1e-6, weights=None, eps=None):
+        """
+        Iterative Reweighted Spline Quantile Regression (IRSQR).
+
+        Fits the baseline using quantile regression with penalized splines.
+
+        Parameters
+        ----------
+        data : array-like, shape (M, N)
+            The y-values of the measured data. Must not contain missing data (NaN) or Inf.
+        lam : float or Sequence[float, float], optional
+            The smoothing parameter for the rows and columns, respectively. If a single
+            value is given, both will use the same value. Larger values will create smoother
+            baselines. Default is 1e3.
+        quantile : float, optional
+            The quantile at which to fit the baseline. Default is 0.05.
+        num_knots : int or Sequence[int, int], optional
+            The number of knots for the splines along the rows and columns, respectively. If a
+            single value is given, both will use the same value. Default is 25.
+        spline_degree : int or Sequence[int, int], optional
+            The degree of the splines along the rows and columns, respectively. If a single
+            value is given, both will use the same value. Default is 3, which is a cubic spline.
+        diff_order : int or Sequence[int, int], optional
+            The order of the differential matrix for the rows and columns, respectively. If
+            a single value is given, both will use the same value. Must be greater than 0.
+            Default is 3 (third order differential matrix). Typical values are 2 or 3.
+        max_iter : int, optional
+            The max number of fit iterations. Default is 100.
+        tol : float, optional
+            The exit criteria. Default is 1e-6.
+        weights : array-like, shape (M, N), optional
+            The weighting array. If None (default), then the initial weights
+            will be an array with shape equal to (M, N) and all values set to 1.
+        eps : float, optional
+            A small value added to the square of the residual to prevent dividing by 0.
+            Default is None, which uses the square of the maximum-absolute-value of the
+            fit each iteration multiplied by 1e-6.
+
+        Returns
+        -------
+        baseline : numpy.ndarray, shape (M, N)
+            The calculated baseline.
+        params : dict
+            A dictionary with the following items:
+
+            * 'weights': numpy.ndarray, shape (M, N)
+                The weight array used for fitting the data.
+            * 'tol_history': numpy.ndarray
+                An array containing the calculated tolerance values for
+                each iteration. The length of the array is the number of iterations
+                completed. If the last value in the array is greater than the input
+                `tol` value, then the function did not converge.
+
+        Raises
+        ------
+        ValueError
+            Raised if `quantile` is not between 0 and 1.
+
+        References
+        ----------
+        Han, Q., et al. Iterative Reweighted Quantile Regression Using Augmented Lagrangian
+        Optimization for Baseline Correction. 2018 5th International Conference on Information
+        Science and Control Engineering (ICISCE), 2018, 280-284.
+
+        """
+        if not 0 < quantile < 1:
+            raise ValueError('quantile must be between 0 and 1')
+
+        y, weight_array = self._setup_spline(
+            data, weights, spline_degree, num_knots, True, diff_order, lam
+        )
+        old_coef = np.zeros(self.pspline._num_bases[0] * self.pspline._num_bases[1])
+        tol_history = np.empty(max_iter + 1)
+        for i in range(max_iter + 1):
+            baseline = self.pspline.solve(y, weight_array)
+            calc_difference = relative_difference(old_coef, self.pspline.coef)
+            tol_history[i] = calc_difference
+            if calc_difference < tol:
+                break
+            old_coef = self.pspline.coef
+            weight_array = _weighting._quantile(y, baseline, quantile, eps)
+
+        params = {'weights': weight_array, 'tol_history': tol_history[:i + 1]}
+
+        return baseline, params
+
+    @_Algorithm2D._register(sort_keys=('weights',))
+    def pspline_asls(self, data, lam=1e3, p=1e-2, num_knots=25, spline_degree=3, diff_order=2,
+                     max_iter=50, tol=1e-3, weights=None):
+        """
+        A penalized spline version of the asymmetric least squares (AsLS) algorithm.
+
+        Parameters
+        ----------
+        data : array-like, shape (M, N)
+            The y-values of the measured data. Must not contain missing data (NaN) or Inf.
+        lam : float or Sequence[float, float], optional
+            The smoothing parameter for the rows and columns, respectively. If a single
+            value is given, both will use the same value. Larger values will create smoother
+            baselines. Default is 1e3.
+        p : float, optional
+            The penalizing weighting factor. Must be between 0 and 1. Values greater
+            than the baseline will be given `p` weight, and values less than the baseline
+            will be given `p - 1` weight. Default is 1e-2.
+        num_knots : int or Sequence[int, int], optional
+            The number of knots for the splines along the rows and columns, respectively. If a
+            single value is given, both will use the same value. Default is 25.
+        spline_degree : int or Sequence[int, int], optional
+            The degree of the splines along the rows and columns, respectively. If a single
+            value is given, both will use the same value. Default is 3, which is a cubic spline.
+        diff_order : int or Sequence[int, int], optional
+            The order of the differential matrix for the rows and columns, respectively. If
+            a single value is given, both will use the same value. Must be greater than 0.
+            Default is 2 (second order differential matrix). Typical values are 1 or 2.
+        max_iter : int, optional
+            The max number of fit iterations. Default is 50.
+        tol : float, optional
+            The exit criteria. Default is 1e-3.
+        weights : array-like, shape (M, N), optional
+            The weighting array. If None (default), then the initial weights
+            will be an array with shape equal to (M, N) and all values set to 1.
+
+        Returns
+        -------
+        baseline : numpy.ndarray, shape (M, N)
+            The calculated baseline.
+        params : dict
+            A dictionary with the following items:
+
+            * 'weights': numpy.ndarray, shape (M, N)
+                The weight array used for fitting the data.
+            * 'tol_history': numpy.ndarray
+                An array containing the calculated tolerance values for
+                each iteration. The length of the array is the number of iterations
+                completed. If the last value in the array is greater than the input
+                `tol` value, then the function did not converge.
+
+        Raises
+        ------
+        ValueError
+            Raised if `p` is not between 0 and 1.
+
+        See Also
+        --------
+        Baseline2D.asls
+
+        References
+        ----------
+        Eilers, P. A Perfect Smoother. Analytical Chemistry, 2003, 75(14), 3631-3636.
+
+        Eilers, P., et al. Baseline correction with asymmetric least squares smoothing.
+        Leiden University Medical Centre Report, 2005, 1(1).
+
+        Eilers, P., et al. Splines, knots, and penalties. Wiley Interdisciplinary
+        Reviews: Computational Statistics, 2010, 2(6), 637-653.
+
+        """
+        if not 0 < p < 1:
+            raise ValueError('p must be between 0 and 1')
+
+        y, weight_array = self._setup_spline(
+            data, weights, spline_degree, num_knots, True, diff_order, lam
+        )
+        tol_history = np.empty(max_iter + 1)
+        for i in range(max_iter + 1):
+            baseline = self.pspline.solve(y, weight_array)
+            new_weights = _weighting._asls(y, baseline, p)
+            calc_difference = relative_difference(weight_array, new_weights)
+            tol_history[i] = calc_difference
+            if calc_difference < tol:
+                break
+            weight_array = new_weights
+
+        params = {'weights': weight_array, 'tol_history': tol_history[:i + 1]}
+
+        return baseline, params
+
+    @_Algorithm2D._register(sort_keys=('weights',))
+    def pspline_iasls(self, data, lam=1e3, p=1e-2, lam_1=1e-4, num_knots=25,
+                      spline_degree=3, max_iter=50, tol=1e-3, weights=None, diff_order=2):
+        """
+        A penalized spline version of the IAsLS algorithm.
+
+        Parameters
+        ----------
+        data : array-like, shape (M, N)
+            The y-values of the measured data. Must not contain missing data (NaN) or Inf.
+        lam : float or Sequence[float, float], optional
+            The smoothing parameter for the rows and columns, respectively. If a single
+            value is given, both will use the same value. Larger values will create smoother
+            baselines. Default is 1e3.
+        p : float, optional
+            The penalizing weighting factor. Must be between 0 and 1. Values greater
+            than the baseline will be given `p` weight, and values less than the baseline
+            will be given `p - 1` weight. Default is 1e-2.
+        lam_1 : float, optional
+            The smoothing parameter for the first derivative of the residual. Default is 1e-4.
+        num_knots : int or Sequence[int, int], optional
+            The number of knots for the splines along the rows and columns, respectively. If a
+            single value is given, both will use the same value. Default is 25.
+        spline_degree : int or Sequence[int, int], optional
+            The degree of the splines along the rows and columns, respectively. If a single
+            value is given, both will use the same value. Default is 3, which is a cubic spline.
+        max_iter : int, optional
+            The max number of fit iterations. Default is 50.
+        tol : float, optional
+            The exit criteria. Default is 1e-3.
+        weights : array-like, shape (M, N), optional
+            The weighting array. If None (default), then the initial weights
+            will be set by fitting the data with a second order polynomial.
+        diff_order : int or Sequence[int, int], optional
+            The order of the differential matrix for the rows and columns, respectively. If
+            a single value is given, both will use the same value. Must be greater than 1.
+            Default is 2 (second order differential matrix). Typical values are 2 or 3.
+
+        Returns
+        -------
+        baseline : numpy.ndarray, shape (M, N)
+            The calculated baseline.
+        params : dict
+            A dictionary with the following items:
+
+            * 'weights': numpy.ndarray, shape (M, N)
+                The weight array used for fitting the data.
+            * 'tol_history': numpy.ndarray
+                An array containing the calculated tolerance values for
+                each iteration. The length of the array is the number of iterations
+                completed. If the last value in the array is greater than the input
+                `tol` value, then the function did not converge.
+
+        Raises
+        ------
+        ValueError
+            Raised if `p` is not between 0 and 1 or if `diff_order` is less than 2.
+
+        See Also
+        --------
+        Baseline2D.iasls
+
+        References
+        ----------
+        He, S., et al. Baseline correction for raman spectra using an improved
+        asymmetric least squares method, Analytical Methods, 2014, 6(12), 4402-4407.
+
+        Eilers, P., et al. Splines, knots, and penalties. Wiley Interdisciplinary
+        Reviews: Computational Statistics, 2010, 2(6), 637-653.
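+
+        Examples
+        --------
+        A brief, hypothetical usage sketch (illustrative data only; assumes
+        ``Baseline2D`` is importable from the top-level package):
+
+        >>> import numpy as np
+        >>> from pybaselines import Baseline2D
+        >>> x = np.linspace(-20, 20, 80)
+        >>> z = np.linspace(-20, 20, 60)
+        >>> X, Z = np.meshgrid(x, z, indexing='ij')
+        >>> y = 0.01 * X**2 + 0.1 * Z + np.random.default_rng(1).normal(0, 0.1, X.shape)
+        >>> baseline, params = Baseline2D(x, z).pspline_iasls(y, lam=1e3, p=1e-2)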
+
+        """
+        if not 0 < p < 1:
+            raise ValueError('p must be between 0 and 1')
+        elif np.less(diff_order, 2).any():
+            raise ValueError('diff_order must be 2 or greater')
+
+        if weights is None:
+            _, _, pseudo_inverse = self._setup_polynomial(
+                data, weights=None, poly_order=2, calc_vander=True, calc_pinv=True
+            )
+            baseline = self.vandermonde @ (pseudo_inverse @ data.ravel())
+            weights = _weighting._asls(data, baseline.reshape(self._len), p)
+
+        y, weight_array = self._setup_spline(
+            data, weights, spline_degree, num_knots, True, diff_order, lam
+        )
+
+        # B.T @ P_1 @ B and B.T @ P_1 @ y
+        penalized_system_1 = PenalizedSystem2D(self._len, lam_1, diff_order=1)
+        p1_partial_penalty = self.pspline.basis.T @ penalized_system_1.penalty
+
+        partial_rhs = p1_partial_penalty @ y.ravel()
+        self.pspline.add_penalty(p1_partial_penalty @ self.pspline.basis)
+
+        tol_history = np.empty(max_iter + 1)
+        for i in range(max_iter + 1):
+            baseline = self.pspline.solve(y, weight_array**2, rhs_extra=partial_rhs)
+            new_weights = _weighting._asls(y, baseline, p)
+            calc_difference = relative_difference(weight_array, new_weights)
+            tol_history[i] = calc_difference
+            if calc_difference < tol:
+                break
+            weight_array = new_weights
+
+        params = {'weights': weight_array, 'tol_history': tol_history[:i + 1]}
+
+        return baseline, params
+
+    @_Algorithm2D._register(sort_keys=('weights',))
+    def pspline_airpls(self, data, lam=1e3, num_knots=25, spline_degree=3,
+                       diff_order=2, max_iter=50, tol=1e-3, weights=None):
+        """
+        A penalized spline version of the airPLS algorithm.
+
+        Parameters
+        ----------
+        data : array-like, shape (M, N)
+            The y-values of the measured data. Must not contain missing data (NaN) or Inf.
+        lam : float or Sequence[float, float], optional
+            The smoothing parameter for the rows and columns, respectively. If a single
+            value is given, both will use the same value. Larger values will create smoother
+            baselines. Default is 1e3.
+        num_knots : int or Sequence[int, int], optional
+            The number of knots for the splines along the rows and columns, respectively. If a
+            single value is given, both will use the same value. Default is 25.
+        spline_degree : int or Sequence[int, int], optional
+            The degree of the splines along the rows and columns, respectively. If a single
+            value is given, both will use the same value. Default is 3, which is a cubic spline.
+        diff_order : int or Sequence[int, int], optional
+            The order of the differential matrix for the rows and columns, respectively. If
+            a single value is given, both will use the same value. Must be greater than 0.
+            Default is 2 (second order differential matrix). Typical values are 2 or 1.
+        max_iter : int, optional
+            The max number of fit iterations. Default is 50.
+        tol : float, optional
+            The exit criteria. Default is 1e-3.
+        weights : array-like, shape (M, N), optional
+            The weighting array. If None (default), then the initial weights
+            will be an array with shape equal to (M, N) and all values set to 1.
+
+        Returns
+        -------
+        baseline : numpy.ndarray, shape (M, N)
+            The calculated baseline.
+        params : dict
+            A dictionary with the following items:
+
+            * 'weights': numpy.ndarray, shape (M, N)
+                The weight array used for fitting the data.
+            * 'tol_history': numpy.ndarray
+                An array containing the calculated tolerance values for
+                each iteration. The length of the array is the number of iterations
+                completed. If the last value in the array is greater than the input
+                `tol` value, then the function did not converge.
+
+        See Also
+        --------
+        Baseline2D.airpls
+
+        References
+        ----------
+        Zhang, Z.M., et al. Baseline correction using adaptive iteratively
+        reweighted penalized least squares. Analyst, 2010, 135(5), 1138-1146.
+
+        Eilers, P., et al. Splines, knots, and penalties. Wiley Interdisciplinary
+        Reviews: Computational Statistics, 2010, 2(6), 637-653.
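+
+        Examples
+        --------
+        A minimal sketch of the expected call pattern, assuming ``Baseline2D`` is
+        exposed at the package level (the data here is purely illustrative):
+
+        >>> import numpy as np
+        >>> from pybaselines import Baseline2D
+        >>> x = np.linspace(-20, 20, 80)
+        >>> z = np.linspace(-20, 20, 60)
+        >>> X, Z = np.meshgrid(x, z, indexing='ij')
+        >>> y = 0.01 * X**2 + 0.1 * Z + np.random.default_rng(2).normal(0, 0.1, X.shape)
+        >>> baseline, params = Baseline2D(x, z).pspline_airpls(y, lam=1e3)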
+
+        """
+        y, weight_array = self._setup_spline(
+            data, weights, spline_degree, num_knots, True, diff_order, lam, copy_weights=True
+        )
+
+        y_l1_norm = np.abs(y).sum()
+        tol_history = np.empty(max_iter + 1)
+        for i in range(1, max_iter + 2):
+            try:
+                output = self.pspline.solve(y, weight_array)
+            except np.linalg.LinAlgError:
+                warnings.warn(
+                    ('error occurred during fitting, indicating that "tol"'
+                     ' is too low, "max_iter" is too high, or "lam" is too high'),
+                    ParameterWarning, stacklevel=2
+                )
+                i -= 1  # reduce i so that output tol_history indexing is correct
+                break
+            else:
+                baseline = output
+
+            residual = y - baseline
+            neg_mask = residual < 0
+            neg_residual = residual[neg_mask]
+            if len(neg_residual) < 2:
+                # exit if there are < 2 negative residuals since all points or all but one
+                # point would get a weight of 0, which fails the solver
+                warnings.warn(
+                    ('almost all baseline points are below the data, indicating that "tol"'
+                     ' is too low and/or "max_iter" is too high'), ParameterWarning,
+                    stacklevel=2
+                )
+                i -= 1  # reduce i so that output tol_history indexing is correct
+                break
+
+            residual_l1_norm = abs(neg_residual.sum())
+            calc_difference = residual_l1_norm / y_l1_norm
+            tol_history[i - 1] = calc_difference
+            if calc_difference < tol:
+                break
+            # only use the negative residuals in exp to avoid exponential overflow warnings
+            # and accidentally creating a weight of nan (inf * 0 = nan)
+            weight_array[neg_mask] = np.exp(i * neg_residual / residual_l1_norm)
+            weight_array[~neg_mask] = 0
+
+        params = {'weights': weight_array, 'tol_history': tol_history[:i]}
+
+        return baseline, params
+
+    @_Algorithm2D._register(sort_keys=('weights',))
+    def pspline_arpls(self, data, lam=1e3, num_knots=25, spline_degree=3, diff_order=2,
+                      max_iter=50, tol=1e-3, weights=None):
+        """
+        A penalized spline version of the arPLS algorithm.
+
+        Parameters
+        ----------
+        data : array-like, shape (M, N)
+            The y-values of the measured data. Must not contain missing data (NaN) or Inf.
+        lam : float or Sequence[float, float], optional
+            The smoothing parameter for the rows and columns, respectively. If a single
+            value is given, both will use the same value. Larger values will create smoother
+            baselines. Default is 1e3.
+        num_knots : int or Sequence[int, int], optional
+            The number of knots for the splines along the rows and columns, respectively. If a
+            single value is given, both will use the same value. Default is 25.
+        spline_degree : int or Sequence[int, int], optional
+            The degree of the splines along the rows and columns, respectively. If a single
+            value is given, both will use the same value. Default is 3, which is a cubic spline.
+        diff_order : int or Sequence[int, int], optional
+            The order of the differential matrix for the rows and columns, respectively. If
+            a single value is given, both will use the same value. Must be greater than 0.
+            Default is 2 (second order differential matrix). Typical values are 2 or 1.
+        max_iter : int, optional
+            The max number of fit iterations. Default is 50.
+        tol : float, optional
+            The exit criteria. Default is 1e-3.
+        weights : array-like, shape (M, N), optional
+            The weighting array. If None (default), then the initial weights
+            will be an array with shape equal to (M, N) and all values set to 1.
+
+        Returns
+        -------
+        baseline : numpy.ndarray, shape (M, N)
+            The calculated baseline.
+        params : dict
+            A dictionary with the following items:
+
+            * 'weights': numpy.ndarray, shape (M, N)
+                The weight array used for fitting the data.
+            * 'tol_history': numpy.ndarray
+                An array containing the calculated tolerance values for
+                each iteration. The length of the array is the number of iterations
+                completed. If the last value in the array is greater than the input
+                `tol` value, then the function did not converge.
+
+        See Also
+        --------
+        Baseline2D.arpls
+
+        References
+        ----------
+        Baek, S.J., et al. Baseline correction using asymmetrically reweighted
+        penalized least squares smoothing. Analyst, 2015, 140, 250-257.
+
+        Eilers, P., et al. Splines, knots, and penalties. Wiley Interdisciplinary
+        Reviews: Computational Statistics, 2010, 2(6), 637-653.
+
+        """
+        y, weight_array = self._setup_spline(
+            data, weights, spline_degree, num_knots, True, diff_order, lam
+        )
+        tol_history = np.empty(max_iter + 1)
+        for i in range(max_iter + 1):
+            baseline = self.pspline.solve(y, weight_array)
+            new_weights = _weighting._arpls(y, baseline)
+            calc_difference = relative_difference(weight_array, new_weights)
+            tol_history[i] = calc_difference
+            if calc_difference < tol:
+                break
+            weight_array = new_weights
+
+        params = {'weights': weight_array, 'tol_history': tol_history[:i + 1]}
+
+        return baseline, params
+
+    @_Algorithm2D._register(sort_keys=('weights',))
+    def pspline_iarpls(self, data, lam=1e3, num_knots=25, spline_degree=3, diff_order=2,
+                       max_iter=50, tol=1e-3, weights=None):
+        """
+        A penalized spline version of the IarPLS algorithm.
+
+        Parameters
+        ----------
+        data : array-like, shape (M, N)
+            The y-values of the measured data. Must not contain missing data (NaN) or Inf.
+        lam : float or Sequence[float, float], optional
+            The smoothing parameter for the rows and columns, respectively. If a single
+            value is given, both will use the same value. Larger values will create smoother
+            baselines. Default is 1e3.
+        num_knots : int or Sequence[int, int], optional
+            The number of knots for the splines along the rows and columns, respectively. If a
+            single value is given, both will use the same value. Default is 25.
+        spline_degree : int or Sequence[int, int], optional
+            The degree of the splines along the rows and columns, respectively. If a single
+            value is given, both will use the same value. Default is 3, which is a cubic spline.
+        diff_order : int or Sequence[int, int], optional
+            The order of the differential matrix for the rows and columns, respectively. If
+            a single value is given, both will use the same value. Must be greater than 0.
+            Default is 2 (second order differential matrix). Typical values are 2 or 1.
+        max_iter : int, optional
+            The max number of fit iterations. Default is 50.
+        tol : float, optional
+            The exit criteria. Default is 1e-3.
+        weights : array-like, shape (M, N), optional
+            The weighting array. If None (default), then the initial weights
+            will be an array with shape equal to (M, N) and all values set to 1.
+
+        Returns
+        -------
+        baseline : numpy.ndarray, shape (M, N)
+            The calculated baseline.
+        params : dict
+            A dictionary with the following items:
+
+            * 'weights': numpy.ndarray, shape (M, N)
+                The weight array used for fitting the data.
+            * 'tol_history': numpy.ndarray
+                An array containing the calculated tolerance values for
+                each iteration. The length of the array is the number of iterations
+                completed. If the last value in the array is greater than the input
+                `tol` value, then the function did not converge.
+
+        See Also
+        --------
+        Baseline2D.iarpls
+
+        References
+        ----------
+        Ye, J., et al. Baseline correction method based on improved asymmetrically
+        reweighted penalized least squares for Raman spectrum. Applied Optics, 2020,
+        59, 10933-10943.
+
+        Eilers, P., et al. Splines, knots, and penalties. Wiley Interdisciplinary
+        Reviews: Computational Statistics, 2010, 2(6), 637-653.
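+
+        Examples
+        --------
+        A hypothetical usage sketch (illustrative data only; assumes ``Baseline2D``
+        is importable from the top-level package):
+
+        >>> import numpy as np
+        >>> from pybaselines import Baseline2D
+        >>> x = np.linspace(-20, 20, 80)
+        >>> z = np.linspace(-20, 20, 60)
+        >>> X, Z = np.meshgrid(x, z, indexing='ij')
+        >>> y = 0.01 * X**2 + 0.1 * Z + np.random.default_rng(3).normal(0, 0.1, X.shape)
+        >>> baseline, params = Baseline2D(x, z).pspline_iarpls(y, lam=1e3)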
+
+        """
+        y, weight_array = self._setup_spline(
+            data, weights, spline_degree, num_knots, True, diff_order, lam
+        )
+        tol_history = np.empty(max_iter + 1)
+        for i in range(1, max_iter + 2):
+            baseline = self.pspline.solve(y, weight_array)
+            new_weights = _weighting._iarpls(y, baseline, i)
+            calc_difference = relative_difference(weight_array, new_weights)
+            tol_history[i - 1] = calc_difference
+            if not np.isfinite(calc_difference):
+                # catches nan, inf and -inf due to exp(i) being too high or if there
+                # are too few negative residuals; no way to catch both conditions before
+                # the new_weights calculation since it is hard to estimate if
+                # (exp(i) / std) * residual will overflow; check calc_difference rather
+                # than checking new_weights since non-finite values rarely occur and
+                # checking a scalar is faster; cannot use np.errstate since it is not 100% reliable
+                warnings.warn(
+                    ('nan and/or +/- inf occurred in weighting calculation, likely meaning '
+                     '"tol" is too low and/or "max_iter" is too high'), ParameterWarning,
+                    stacklevel=2
+                )
+                break
+            elif calc_difference < tol:
+                break
+            weight_array = new_weights
+
+        params = {'weights': weight_array, 'tol_history': tol_history[:i]}
+
+        return baseline, params
+
+    @_Algorithm2D._register(sort_keys=('weights',))
+    def pspline_psalsa(self, data, lam=1e3, p=0.5, k=None, num_knots=25, spline_degree=3,
+                       diff_order=2, max_iter=50, tol=1e-3, weights=None):
+        """
+        A penalized spline version of the psalsa algorithm.
+
+        Parameters
+        ----------
+        data : array-like, shape (M, N)
+            The y-values of the measured data. Must not contain missing data (NaN) or Inf.
+        lam : float or Sequence[float, float], optional
+            The smoothing parameter for the rows and columns, respectively. If a single
+            value is given, both will use the same value. Larger values will create smoother
+            baselines. Default is 1e3.
+        p : float, optional
+            The penalizing weighting factor. Must be between 0 and 1. Values greater
+            than the baseline will be given `p` weight, and values less than the baseline
+            will be given `p - 1` weight. Default is 0.5.
+        k : float, optional
+            A factor that controls the exponential decay of the weights for baseline
+            values greater than the data. Should be approximately the height at which
+            a value could be considered a peak. Default is None, which sets `k` to
+            one-tenth of the standard deviation of the input data. A large k value
+            will produce similar results to :meth:`~Baseline2D.asls`.
+        num_knots : int or Sequence[int, int], optional
+            The number of knots for the splines along the rows and columns, respectively. If a
+            single value is given, both will use the same value. Default is 25.
+        spline_degree : int or Sequence[int, int], optional
+            The degree of the splines along the rows and columns, respectively. If a single
+            value is given, both will use the same value. Default is 3, which is a cubic spline.
+        diff_order : int or Sequence[int, int], optional
+            The order of the differential matrix for the rows and columns, respectively. If
+            a single value is given, both will use the same value. Must be greater than 0.
+            Default is 2 (second order differential matrix). Typical values are 2 or 1.
+        max_iter : int, optional
+            The max number of fit iterations. Default is 50.
+        tol : float, optional
+            The exit criteria. Default is 1e-3.
+        weights : array-like, shape (M, N), optional
+            The weighting array. If None (default), then the initial weights
+            will be an array with shape equal to (M, N) and all values set to 1.
+
+        Returns
+        -------
+        baseline : numpy.ndarray, shape (M, N)
+            The calculated baseline.
+        params : dict
+            A dictionary with the following items:
+
+            * 'weights': numpy.ndarray, shape (M, N)
+                The weight array used for fitting the data.
+            * 'tol_history': numpy.ndarray
+                An array containing the calculated tolerance values for
+                each iteration. The length of the array is the number of iterations
+                completed. If the last value in the array is greater than the input
+                `tol` value, then the function did not converge.
+
+        Raises
+        ------
+        ValueError
+            Raised if `p` is not between 0 and 1.
+
+        See Also
+        --------
+        Baseline2D.psalsa
+
+        References
+        ----------
+        Oller-Moreno, S., et al. Adaptive Asymmetric Least Squares baseline estimation
+        for analytical instruments. 2014 IEEE 11th International Multi-Conference on
+        Systems, Signals, and Devices, 2014, 1-5.
+
+        Eilers, P., et al. Splines, knots, and penalties. Wiley Interdisciplinary
+        Reviews: Computational Statistics, 2010, 2(6), 637-653.
+
+        """
+        if not 0 < p < 1:
+            raise ValueError('p must be between 0 and 1')
+
+        y, weight_array = self._setup_spline(
+            data, weights, spline_degree, num_knots, True, diff_order, lam
+        )
+        if k is None:
+            k = np.std(y) / 10
+        tol_history = np.empty(max_iter + 1)
+        for i in range(max_iter + 1):
+            baseline = self.pspline.solve(y, weight_array)
+            new_weights = _weighting._psalsa(y, baseline, p, k, self._len)
+            calc_difference = relative_difference(weight_array, new_weights)
+            tol_history[i] = calc_difference
+            if calc_difference < tol:
+                break
+            weight_array = new_weights
+
+        params = {'weights': weight_array, 'tol_history': tol_history[:i + 1]}
+
+        return baseline, params
+
+
+@jit(nopython=True, cache=True)
+def _numba_mapped_histogram(data, num_bins, histogram):
+    """
+    Creates a normalized histogram of the data and a mapping of the indices, using one pass.
+
+    Parameters
+    ----------
+    data : numpy.ndarray, shape (N,)
+        The data to be made into a histogram.
+    num_bins : int
+        The number of bins for the histogram.
+    histogram : numpy.ndarray
+        An array of zeros that will be modified inplace into the histogram.
+
+    Returns
+    -------
+    bins : numpy.ndarray, shape (`num_bins` + 1)
+        The bin edges for the histogram. Follows numpy's implementation such that
+        each bin is inclusive on the left edge and exclusive on the right edge, except
+        for the last bin which is inclusive on both edges.
+    bin_mapping : numpy.ndarray, shape (N,)
+        An array of integers that maps each item in `data` to its index within `histogram`.
+
+    Notes
+    -----
+    `histogram` is modified inplace and converted to a probability density function
+    (total area = 1) after the counting.
+
+    """
+    num_data = data.shape[0]
+    bins = np.linspace(data.min(), data.max(), num_bins + 1)
+    bin_mapping = np.empty(num_data, dtype=np.intp)
+    bin_frequency = num_bins / (bins[-1] - bins[0])
+    bin_0 = bins[0]
+    last_index = num_bins - 1
+    # TODO this seems like it would work in parallel, but it instead slows down
+    for i in range(num_data):
+        index = int((data[i] - bin_0) * bin_frequency)
+        if index == num_bins:
+            histogram[last_index] += 1
+            bin_mapping[i] = last_index
+        else:
+            histogram[index] += 1
+            bin_mapping[i] = index
+
+    # normalize histogram such that area=1 so that it is a probability density function
+    histogram /= (num_data * (bins[1] - bins[0]))
+
+    return bins, bin_mapping
+
+
+def _mapped_histogram(data, num_bins):
+    """
+    Creates a histogram of the data and a mapping of the indices.
+
+    Parameters
+    ----------
+    data : numpy.ndarray, shape (N,)
+        The data to be made into a histogram.
+    num_bins : int
+        The number of bins for the histogram.
+
+    Returns
+    -------
+    histogram : numpy.ndarray, shape (`num_bins`)
+        The histogram of the data, normalized so that its area is 1.
+    bins : numpy.ndarray, shape (`num_bins` + 1)
+        The bin edges for the histogram. Follows numpy's implementation such that
+        each bin is inclusive on the left edge and exclusive on the right edge, except
+        for the last bin which is inclusive on both edges.
+    bin_mapping : numpy.ndarray, shape (N,)
+        An array of integers that maps each item in `data` to its index within `histogram`.
+
+    Notes
+    -----
+    If numba is installed, the histogram and bin mapping can both be created in
+    one pass, which is faster.
+
+    """
+    if _HAS_NUMBA:
+        # create the zeros array outside of the numba function since numba's implementation
+        # of np.zeros is much slower than numpy's (https://github.com/numba/numba/issues/7259)
+        histogram = np.zeros(num_bins)
+        bins, bin_mapping = _numba_mapped_histogram(data.ravel(), num_bins, histogram)
+    else:
+        histogram, bins = np.histogram(data, num_bins, density=True)
+        # leave out the last bin edge to account for the extra index; leave out the first
+        # bin edge since np.searchsorted finds indices where bin[i-1] <= val < bin[i]
+        # while the desired indices are bin[i] <= val < bin[i + 1]
+        bin_mapping = np.searchsorted(bins[1:-1], data, 'right')
+
+    return histogram, bins, bin_mapping
+
+
+def _mixture_pdf(x, n, sigma, n_2=0, pos_uniform=None, neg_uniform=None):
+    """
+    The probability density function of a Gaussian and one or two uniform distributions.
+
+    Parameters
+    ----------
+    x : numpy.ndarray, shape (N,)
+        The x-values of the distribution.
+    n : float
+        The fraction of the distribution belonging to the Gaussian.
+    sigma : float
+        Log10 of the standard deviation of the Gaussian distribution.
+    n_2 : float, optional
+        If `neg_uniform` or `pos_uniform` is None, then `n_2` is just an unused input.
+        Otherwise, it is the fraction of the distribution belonging to the positive
+        uniform distribution. Default is 0.
+    pos_uniform : numpy.ndarray, shape (N,), optional
+        The array of the positive uniform distribution. Default is None.
+    neg_uniform : numpy.ndarray, shape (N,), optional
+        The array of the negative uniform distribution. Default is None.
+
+    Returns
+    -------
+    numpy.ndarray
+        The total probability density function for the mixture model.
+
+    Notes
+    -----
+    Defining `sigma` as ``log10(actual sigma)`` allows not bounding `sigma` during
+    optimization and allows it to more easily fit different scales.
+
+    References
+    ----------
+    de Rooi, J., et al. Mixture models for baseline estimation. Chemometrics and
+    Intelligent Laboratory Systems, 2012, 117, 56-60.
+
+    """
+    # no error handling for if both pos_uniform and neg_uniform are None since this
+    # is an internal function
+    if neg_uniform is None:
+        n1 = n
+        n2 = 1 - n
+        n3 = 0
+        neg_uniform = 0
+    elif pos_uniform is None:  # never actually used, but nice to have for the future
+        n1 = n
+        n2 = 0
+        n3 = 1 - n
+        pos_uniform = 0
+    else:
+        n1 = n
+        n2 = n_2
+        n3 = 1 - n - n_2
+
+    actual_sigma = 10**sigma
+    # the gaussian should be area-normalized, so set the height accordingly
+    height = 1 / max(actual_sigma * np.sqrt(2 * np.pi), _MIN_FLOAT)
+
+    return n1 * gaussian(x, height, 0, actual_sigma) + n2 * pos_uniform + n3 * neg_uniform
diff --git a/pybaselines/two_d/whittaker.py b/pybaselines/two_d/whittaker.py
new file mode 100644
index 0000000..9239624
--- /dev/null
+++ b/pybaselines/two_d/whittaker.py
@@ -0,0 +1,791 @@
+# -*- coding: utf-8 -*-
+"""Whittaker-smoothing-based techniques for fitting baselines to experimental data.
+
+Created on April 30, 2023
+@author: Donald Erb
+
+"""
+
+import warnings
+
+import numpy as np
+
+from .. import _weighting
+from .._compat import diags
+from ._algorithm_setup import _Algorithm2D
+from ._whittaker_utils import PenalizedSystem2D
+from ..utils import _MIN_FLOAT, ParameterWarning, relative_difference
+from .._validation import _check_optional_array
+
+
+class _Whittaker(_Algorithm2D):
+    """A base class for all Whittaker-smoothing-based algorithms."""
+
+    @_Algorithm2D._register(sort_keys=('weights',))
+    def asls(self, data, lam=1e6, p=1e-2, diff_order=2, max_iter=50, tol=1e-3, weights=None,
+             eigenvalues=(10, 10)):
+        """
+        Fits the baseline using asymmetric least squares (AsLS) fitting.
+
+        Parameters
+        ----------
+        data : array-like, shape (M, N)
+            The y-values of the measured data. Must not contain missing data (NaN) or Inf.
+        lam : float or Sequence[float, float], optional
+            The smoothing parameter for the rows and columns, respectively. If a single
+            value is given, both will use the same value. Larger values will create smoother
+            baselines. Default is 1e6.
+        p : float, optional
+            The penalizing weighting factor. Must be between 0 and 1. Values greater
+            than the baseline will be given `p` weight, and values less than the baseline
+            will be given `p - 1` weight. Default is 1e-2.
+        diff_order : int or Sequence[int, int], optional
+            The order of the differential matrix for the rows and columns, respectively. If
+            a single value is given, both will use the same value. Must be greater than 0.
+            Default is 2 (second order differential matrix). Typical values are 2 or 1.
+        max_iter : int, optional
+            The max number of fit iterations. Default is 50.
+        tol : float, optional
+            The exit criteria. Default is 1e-3.
+        weights : array-like, shape (M, N), optional
+            The weighting array. If None (default), then the initial weights
+            will be an array with shape equal to (M, N) and all values set to 1.
+        eigenvalues : int or Sequence[int, int] or None
+            The maximum number of eigenvalues for the rows and columns, respectively, to use
+            for eigendecomposition. Typical values are between 5 and 30, with higher values
+            needed for baselines with more curvature. If None, will solve the linear system
+            using the full analytical solution, which is typically much slower.
+            Default is (10, 10).
+
+        Returns
+        -------
+        baseline : numpy.ndarray, shape (M, N)
+            The calculated baseline.
+        params : dict
+            A dictionary with the following items:
+
+            * 'weights': numpy.ndarray, shape (M, N)
+                The weight array used for fitting the data.
+            * 'tol_history': numpy.ndarray
+                An array containing the calculated tolerance values for
+                each iteration. The length of the array is the number of iterations
+                completed. If the last value in the array is greater than the input
+                `tol` value, then the function did not converge.
+
+        Raises
+        ------
+        ValueError
+            Raised if `p` is not between 0 and 1.
+
+        References
+        ----------
+        Eilers, P. A Perfect Smoother. Analytical Chemistry, 2003, 75(14), 3631-3636.
+
+        Eilers, P., et al. Baseline correction with asymmetric least squares smoothing.
+        Leiden University Medical Centre Report, 2005, 1(1).
+
+        Biessy, G. Revisiting Whittaker-Henderson Smoothing. https://hal.science/hal-04124043
+        (Preprint), 2023.
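+
+        Examples
+        --------
+        A minimal usage sketch (the random data is purely illustrative and assumes
+        ``Baseline2D`` is importable from the top-level package):
+
+        >>> import numpy as np
+        >>> from pybaselines import Baseline2D
+        >>> x = np.linspace(-20, 20, 80)
+        >>> z = np.linspace(-20, 20, 60)
+        >>> X, Z = np.meshgrid(x, z, indexing='ij')
+        >>> y = 0.01 * X**2 + 0.1 * Z + np.random.default_rng(4).normal(0, 0.1, X.shape)
+        >>> baseline, params = Baseline2D(x, z).asls(y, lam=1e6, p=1e-2)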
+
+        """
+        if not 0 < p < 1:
+            raise ValueError('p must be between 0 and 1')
+        y, weight_array = self._setup_whittaker(
+            data, lam, diff_order, weights, eigenvalues=eigenvalues
+        )
+        tol_history = np.empty(max_iter + 1)
+        for i in range(max_iter + 1):
+            baseline = self.whittaker_system.solve(y, weight_array)
+            new_weights = _weighting._asls(y, baseline, p)
+            calc_difference = relative_difference(weight_array, new_weights)
+            tol_history[i] = calc_difference
+            if calc_difference < tol:
+                break
+            weight_array = new_weights
+
+        if not self.whittaker_system._using_svd:
+            baseline = baseline.reshape(self._len)
+            weight_array = weight_array.reshape(self._len)
+
+        params = {'weights': weight_array, 'tol_history': tol_history[:i + 1]}
+
+        return baseline, params
+
+    @_Algorithm2D._register(
+        sort_keys=('weights',), reshape_keys=('weights',), reshape_baseline=True
+    )
+    def iasls(self, data, lam=1e6, p=1e-2, lam_1=1e-4, max_iter=50, tol=1e-3,
+              weights=None, diff_order=2):
+        """
+        Fits the baseline using the improved asymmetric least squares (IAsLS) algorithm.
+
+        The algorithm considers both the first and second derivatives of the residual.
+
+        Parameters
+        ----------
+        data : array-like, shape (M, N)
+            The y-values of the measured data. Must not contain missing data (NaN) or Inf.
+        lam : float or Sequence[float, float], optional
+            The smoothing parameter for the rows and columns, respectively. If a single
+            value is given, both will use the same value. Larger values will create smoother
+            baselines. Default is 1e6.
+        p : float, optional
+            The penalizing weighting factor. Must be between 0 and 1. Values greater
+            than the baseline will be given `p` weight, and values less than the baseline
+            will be given `p - 1` weight. Default is 1e-2.
+        lam_1 : float or Sequence[float, float], optional
+            The smoothing parameter for the rows and columns, respectively, of the first
+            derivative of the residual. Default is 1e-4.
+        max_iter : int, optional
+            The max number of fit iterations. Default is 50.
+        tol : float, optional
+            The exit criteria. Default is 1e-3.
+        weights : array-like, shape (M, N), optional
+            The weighting array. If None (default), then the initial weights
+            will be set by fitting the data with a second order polynomial.
+        diff_order : int or Sequence[int, int], optional
+            The order of the differential matrix for the rows and columns, respectively. If
+            a single value is given, both will use the same value. Must be greater than 1.
+            Default is 2 (second order differential matrix). Typical values are 2 or 3.
+
+        Returns
+        -------
+        baseline : numpy.ndarray, shape (M, N)
+            The calculated baseline.
+        params : dict
+            A dictionary with the following items:
+
+            * 'weights': numpy.ndarray, shape (M, N)
+                The weight array used for fitting the data.
+            * 'tol_history': numpy.ndarray
+                An array containing the calculated tolerance values for
+                each iteration. The length of the array is the number of iterations
+                completed. If the last value in the array is greater than the input
+                `tol` value, then the function did not converge.
+
+        Raises
+        ------
+        ValueError
+            Raised if `p` is not between 0 and 1 or if `diff_order` is less than 2.
+
+        References
+        ----------
+        He, S., et al. Baseline correction for raman spectra using an improved
+        asymmetric least squares method, Analytical Methods, 2014, 6(12), 4402-4407.
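+
+        Examples
+        --------
+        A brief, hypothetical usage sketch (illustrative data only; assumes
+        ``Baseline2D`` is importable from the top-level package):
+
+        >>> import numpy as np
+        >>> from pybaselines import Baseline2D
+        >>> x = np.linspace(-20, 20, 80)
+        >>> z = np.linspace(-20, 20, 60)
+        >>> X, Z = np.meshgrid(x, z, indexing='ij')
+        >>> y = 0.01 * X**2 + 0.1 * Z + np.random.default_rng(5).normal(0, 0.1, X.shape)
+        >>> baseline, params = Baseline2D(x, z).iasls(y, lam=1e6, p=1e-2)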
+
+        """
+        if not 0 < p < 1:
+            raise ValueError('p must be between 0 and 1')
+        elif np.less(diff_order, 2).any():
+            raise ValueError('diff_order must be 2 or greater')
+
+        if weights is None:
+            _, _, pseudo_inverse = self._setup_polynomial(
+                data, weights=None, poly_order=2, calc_vander=True, calc_pinv=True
+            )
+            baseline = self.vandermonde @ (pseudo_inverse @ data.ravel())
+            weights = _weighting._asls(data, baseline.reshape(self._len), p)
+
+        y, weight_array = self._setup_whittaker(data, lam, diff_order, weights)
+        penalized_system_1 = PenalizedSystem2D(self._len, lam_1, diff_order=1)
+
+        # (W.T @ W + P_1) @ y -> P_1 @ y + W.T @ W @ y
+        self.whittaker_system.add_penalty(penalized_system_1.penalty)
+        p1_y = penalized_system_1.penalty @ y
+        tol_history = np.empty(max_iter + 1)
+        for i in range(max_iter + 1):
+            baseline = self.whittaker_system.solve(y, weight_array**2, rhs_extra=p1_y)
+            new_weights = _weighting._asls(y, baseline, p)
+            calc_difference = relative_difference(weight_array, new_weights)
+            tol_history[i] = calc_difference
+            if calc_difference < tol:
+                break
+            weight_array = new_weights
+
+        params = {'weights': weight_array, 'tol_history': tol_history[:i + 1]}
+
+        return baseline, params
+
+    @_Algorithm2D._register(sort_keys=('weights',))
+    def airpls(self, data, lam=1e6, diff_order=2, max_iter=50, tol=1e-3, weights=None,
+               eigenvalues=(10, 10)):
+        """
+        Adaptive iteratively reweighted penalized least squares (airPLS) baseline.
+
+        Parameters
+        ----------
+        data : array-like, shape (M, N)
+            The y-values of the measured data. Must not contain missing data (NaN) or Inf.
+        lam : float or Sequence[float, float], optional
+            The smoothing parameter for the rows and columns, respectively. If a single
+            value is given, both will use the same value. Larger values will create smoother
+            baselines. Default is 1e6.
+        diff_order : int or Sequence[int, int], optional
+            The order of the differential matrix for the rows and columns, respectively. If
+            a single value is given, both will use the same value. Must be greater than 0.
+            Default is 2 (second order differential matrix). Typical values are 2 or 1.
+        max_iter : int, optional
+            The max number of fit iterations. Default is 50.
+        tol : float, optional
+            The exit criteria. Default is 1e-3.
+        weights : array-like, shape (M, N), optional
+            The weighting array. If None (default), then the initial weights
+            will be an array with shape equal to (M, N) and all values set to 1.
+        eigenvalues : int or Sequence[int, int] or None
+            The maximum number of eigenvalues for the rows and columns, respectively, to use
+            for eigendecomposition. Typical values are between 5 and 30, with higher values
+            needed for baselines with more curvature. If None, will solve the linear system
+            using the full analytical solution, which is typically much slower.
+            Default is (10, 10).
+
+        Returns
+        -------
+        baseline : numpy.ndarray, shape (M, N)
+            The calculated baseline.
+        params : dict
+            A dictionary with the following items:
+
+            * 'weights': numpy.ndarray, shape (M, N)
+                The weight array used for fitting the data.
+            * 'tol_history': numpy.ndarray
+                An array containing the calculated tolerance values for
+                each iteration. The length of the array is the number of iterations
+                completed. If the last value in the array is greater than the input
+                `tol` value, then the function did not converge.
+
+        References
+        ----------
+        Zhang, Z.M., et al. Baseline correction using adaptive iteratively
+        reweighted penalized least squares. Analyst, 2010, 135(5), 1138-1146.
+
+        Biessy, G. Revisiting Whittaker-Henderson Smoothing. https://hal.science/hal-04124043
+        (Preprint), 2023.
+
+        """
+        y, weight_array = self._setup_whittaker(
+            data, lam, diff_order, weights, copy_weights=True, eigenvalues=eigenvalues
+        )
+        y_l1_norm = np.abs(y).sum()
+        tol_history = np.empty(max_iter + 1)
+        # Have to have extensive error handling since the weights can all become
+        # very small due to the exp(i) term if too many iterations are performed;
+        # checking the negative residual length usually prevents any errors, but
+        # sometimes not, so also have to catch any errors from the solvers
+        for i in range(1, max_iter + 2):
+            try:
+                output = self.whittaker_system.solve(y, weight_array)
+            except np.linalg.LinAlgError:
+                warnings.warn(
+                    ('error occurred during fitting, indicating that "tol"'
+                     ' is too low, "max_iter" is too high, or "lam" is too high'),
+                    ParameterWarning, stacklevel=2
+                )
+                i -= 1  # reduce i so that output tol_history indexing is correct
+                break
+            else:
+                baseline = output
+            residual = y - baseline
+            neg_mask = residual < 0
+            neg_residual = residual[neg_mask]
+            if neg_residual.size < 2:
+                # exit if there are < 2 negative residuals since all points or all but one
+                # point would get a weight of 0, which fails the solver
+                warnings.warn(
+                    ('almost all baseline points are below the data, indicating that "tol"'
+                     ' is too low and/or "max_iter" is too high'), ParameterWarning, stacklevel=2
+                )
+                i -= 1  # reduce i so that output tol_history indexing is correct
+                break
+
+            residual_l1_norm = abs(neg_residual.sum())
+            calc_difference = residual_l1_norm / y_l1_norm
+            tol_history[i - 1] = calc_difference
+            if calc_difference < tol:
+                break
+            # only use the negative residuals in exp to avoid exponential overflow warnings
+            # and accidentally creating a weight of nan (inf * 0 = nan)
+            weight_array[neg_mask] = np.exp(i * neg_residual / residual_l1_norm)
+            weight_array[~neg_mask] = 0
+
+        if not self.whittaker_system._using_svd:
+            baseline = baseline.reshape(self._len)
+            weight_array = weight_array.reshape(self._len)
+
+        params = {'weights': weight_array, 'tol_history': tol_history[:i]}
+
+        return baseline, params
+
+    @_Algorithm2D._register(sort_keys=('weights',))
+    def arpls(self, data, lam=1e3, diff_order=2, max_iter=50, tol=1e-3, weights=None,
+              eigenvalues=(10, 10)):
+        """
+        Asymmetrically reweighted penalized least squares smoothing (arPLS).
+
+        Parameters
+        ----------
+        data : array-like, shape (M, N)
+            The y-values of the measured data. Must not contain missing data (NaN) or Inf.
+        lam : float or Sequence[float, float], optional
+            The smoothing parameter for the rows and columns, respectively. If a single
+            value is given, both will use the same value. Larger values will create smoother
+            baselines. Default is 1e3.
+        diff_order : int or Sequence[int, int], optional
+            The order of the differential matrix for the rows and columns, respectively. If
+            a single value is given, both will use the same value. Must be greater than 0.
+            Default is 2 (second order differential matrix). Typical values are 2 or 1.
+        max_iter : int, optional
+            The max number of fit iterations. Default is 50.
+        tol : float, optional
+            The exit criteria. Default is 1e-3.
+        weights : array-like, shape (M, N), optional
+            The weighting array. If None (default), then the initial weights
+            will be an array with shape equal to (M, N) and all values set to 1.
+        eigenvalues : int or Sequence[int, int] or None
+            The maximum number of eigenvalues for the rows and columns, respectively, to use
+            for eigendecomposition. Typical values are between 5 and 30, with higher values
+            needed for baselines with more curvature. If None, will solve the linear system
+            using the full analytical solution, which is typically much slower.
+            Default is (10, 10).
+
+        Returns
+        -------
+        baseline : numpy.ndarray, shape (M, N)
+            The calculated baseline.
+        params : dict
+            A dictionary with the following items:
+
+            * 'weights': numpy.ndarray, shape (M, N)
+                The weight array used for fitting the data.
+            * 'tol_history': numpy.ndarray
+                An array containing the calculated tolerance values for
+                each iteration. The length of the array is the number of iterations
+                completed. If the last value in the array is greater than the input
+                `tol` value, then the function did not converge.
+
+        References
+        ----------
+        Baek, S.J., et al. Baseline correction using asymmetrically reweighted
+        penalized least squares smoothing. Analyst, 2015, 140, 250-257.
+
+        Biessy, G. Revisiting Whittaker-Henderson Smoothing. https://hal.science/hal-04124043
+        (Preprint), 2023.
+
+        """
+        y, weight_array = self._setup_whittaker(
+            data, lam, diff_order, weights, eigenvalues=eigenvalues
+        )
+        tol_history = np.empty(max_iter + 1)
+        for i in range(max_iter + 1):
+            baseline = self.whittaker_system.solve(y, weight_array)
+            new_weights = _weighting._arpls(y, baseline)
+            calc_difference = relative_difference(weight_array, new_weights)
+            tol_history[i] = calc_difference
+            if calc_difference < tol:
+                break
+            weight_array = new_weights
+
+        if not self.whittaker_system._using_svd:
+            baseline = baseline.reshape(self._len)
+            weight_array = weight_array.reshape(self._len)
+
+        params = {'weights': weight_array, 'tol_history': tol_history[:i + 1]}
+
+        return baseline, params
+
+    @_Algorithm2D._register(
+        sort_keys=('weights',), reshape_keys=('weights',), reshape_baseline=True
+    )
+    def drpls(self, data, lam=1e5, eta=0.5, max_iter=50, tol=1e-3, weights=None, diff_order=2):
+        """
+        Doubly reweighted penalized least squares (drPLS) baseline.
+
+        Parameters
+        ----------
+        data : array-like, shape (M, N)
+            The y-values of the measured data. Must not contain missing data (NaN) or Inf.
+        lam : float or Sequence[float, float], optional
+            The smoothing parameter for the rows and columns, respectively. If a single
+            value is given, both will use the same value. Larger values will create smoother
+            baselines. Default is 1e5.
+        eta : float, optional
+            A term for controlling the value of `lam`; should be between 0 and 1.
+            Low values will produce smoother baselines, while higher values will
+            more aggressively fit peaks. Default is 0.5.
+        max_iter : int, optional
+            The max number of fit iterations. Default is 50.
+        tol : float, optional
+            The exit criteria. Default is 1e-3.
+        weights : array-like, shape (M, N), optional
+            The weighting array. If None (default), then the initial weights
+            will be an array with shape equal to (M, N) and all values set to 1.
+        diff_order : int or Sequence[int, int], optional
+            The order of the differential matrix for the rows and columns, respectively. If
+            a single value is given, both will use the same value. Must be greater than 1.
+            Default is 2 (second order differential matrix). Typical values are 2 or 3.
+
+        Returns
+        -------
+        baseline : numpy.ndarray, shape (M, N)
+            The calculated baseline.
+        params : dict
+            A dictionary with the following items:
+
+            * 'weights': numpy.ndarray, shape (M, N)
+                The weight array used for fitting the data.
+            * 'tol_history': numpy.ndarray
+                An array containing the calculated tolerance values for
+                each iteration. The length of the array is the number of iterations
+                completed. If the last value in the array is greater than the input
+                `tol` value, then the function did not converge.
+
+        Raises
+        ------
+        ValueError
+            Raised if `eta` is not between 0 and 1 or if `diff_order` is less than 2.
+
+        References
+        ----------
+        Xu, D., et al. Baseline correction method based on doubly reweighted
+        penalized least squares, Applied Optics, 2019, 58, 3913-3920.
+
+        """
+        if not 0 <= eta <= 1:
+            raise ValueError('eta must be between 0 and 1')
+        elif np.less(diff_order, 2).any():
+            raise ValueError('diff_order must be 2 or greater')
+
+        y, weight_array = self._setup_whittaker(data, lam, diff_order, weights)
+        penalized_system_1 = PenalizedSystem2D(self._len, 1, diff_order=1)
+        # W + P_1 + (I - eta * W) @ P_n -> P_1 + P_n + W @ (I - eta * P_n)
+        partial_penalty = self.whittaker_system.penalty + penalized_system_1.penalty
+        partial_penalty_2 = -eta * self.whittaker_system.penalty
+        partial_penalty_2.setdiag(partial_penalty_2.diagonal() + 1)
+        weight_matrix = diags(weight_array, format='csr')
+        tol_history = np.empty(max_iter + 1)
+        for i in range(1, max_iter + 2):
+            baseline = self.whittaker_system.direct_solve(
+                partial_penalty + weight_matrix @ partial_penalty_2, weight_array * y
+            )
+            new_weights = _weighting._drpls(y, baseline, i)
+            calc_difference = relative_difference(weight_array, new_weights)
+            tol_history[i - 1] = calc_difference
+            if not np.isfinite(calc_difference):
+                # catches nan, inf and -inf due to exp(i) being too high or if there
+                # are too few negative residuals; no way to catch both conditions before
+                # the new_weights calculation since it is hard to estimate if
+                # (exp(i) / std) * residual will overflow; check calc_difference rather
+                # than checking new_weights since non-finite values rarely occur and
+                # checking a scalar is faster; cannot use np.errstate since it is not 100% reliable
+                warnings.warn(
+                    ('nan and/or +/- inf occurred in weighting calculation, likely meaning '
+                     '"tol" is too low and/or "max_iter" is too high'), ParameterWarning,
+                    stacklevel=2
+                )
+                break
+            elif calc_difference < tol:
+                break
+            weight_array = new_weights
+            weight_matrix.setdiag(weight_array)
+
+        params = {'weights': weight_array, 'tol_history': tol_history[:i]}
+
+        return baseline, params
+
+    @_Algorithm2D._register(sort_keys=('weights',))
+    def iarpls(self, data, lam=1e5, diff_order=2, max_iter=50, tol=1e-3, weights=None,
+               eigenvalues=(10, 10)):
+        """
+        Improved asymmetrically reweighted penalized least squares smoothing (IarPLS).
+
+        Parameters
+        ----------
+        data : array-like, shape (M, N)
+            The y-values of the measured data. Must not contain missing data (NaN) or Inf.
+        lam : float or Sequence[float, float], optional
+            The smoothing parameter for the rows and columns, respectively. If a single
+            value is given, both will use the same value. Larger values will create smoother
+            baselines. Default is 1e5.
+        diff_order : int or Sequence[int, int], optional
+            The order of the differential matrix for the rows and columns, respectively. If
+            a single value is given, both will use the same value. Must be greater than 0.
+            Default is 2 (second order differential matrix). Typical values are 2 or 1.
+        max_iter : int, optional
+            The max number of fit iterations. Default is 50.
+        tol : float, optional
+            The exit criteria. Default is 1e-3.
+        weights : array-like, shape (M, N), optional
+            The weighting array. If None (default), then the initial weights
+            will be an array with shape equal to (M, N) and all values set to 1.
+        eigenvalues : int or Sequence[int, int] or None
+            The maximum number of eigenvalues for the rows and columns, respectively, to use
+            for eigendecomposition. Typical values are between 5 and 30, with higher values
+            needed for baselines with more curvature. If None, will solve the linear system
+            using the full analytical solution, which is typically much slower.
+            Default is (10, 10).
+
+        Returns
+        -------
+        baseline : numpy.ndarray, shape (M, N)
+            The calculated baseline.
+        params : dict
+            A dictionary with the following items:
+
+            * 'weights': numpy.ndarray, shape (M, N)
+                The weight array used for fitting the data.
+            * 'tol_history': numpy.ndarray
+                An array containing the calculated tolerance values for
+                each iteration. The length of the array is the number of iterations
+                completed. If the last value in the array is greater than the input
+                `tol` value, then the function did not converge.
+
+        References
+        ----------
+        Ye, J., et al. Baseline correction method based on improved asymmetrically
+        reweighted penalized least squares for Raman spectrum. Applied Optics, 2020,
+        59, 10933-10943.
+
+        Biessy, G. Revisiting Whittaker-Henderson Smoothing. https://hal.science/hal-04124043
+        (Preprint), 2023.
+
+        """
+        y, weight_array = self._setup_whittaker(
+            data, lam, diff_order, weights, eigenvalues=eigenvalues
+        )
+        tol_history = np.empty(max_iter + 1)
+        for i in range(1, max_iter + 2):
+            baseline = self.whittaker_system.solve(y, weight_array)
+            new_weights = _weighting._iarpls(y, baseline, i)
+            calc_difference = relative_difference(weight_array, new_weights)
+            tol_history[i - 1] = calc_difference
+            if not np.isfinite(calc_difference):
+                # catches nan, inf and -inf due to exp(i) being too high or if there
+                # are too few negative residuals; no way to catch both conditions before
+                # the new_weights calculation since it is hard to estimate if
+                # (exp(i) / std) * residual will overflow; check calc_difference rather
+                # than checking new_weights since non-finite values rarely occur and
+                # checking a scalar is faster; cannot use np.errstate since it is not 100% reliable
+                warnings.warn(
+                    ('nan and/or +/- inf occurred in weighting calculation, likely meaning '
+                     '"tol" is too low and/or "max_iter" is too high'), ParameterWarning,
+                    stacklevel=2
+                )
+                break
+            elif calc_difference < tol:
+                break
+            weight_array = new_weights
+
+        if not self.whittaker_system._using_svd:
+            baseline = baseline.reshape(self._len)
+            weight_array = weight_array.reshape(self._len)
+
+        params = {'weights': weight_array, 'tol_history': tol_history[:i]}
+
+        return baseline, params
+
+    @_Algorithm2D._register(
+        sort_keys=('weights', 'alpha'), reshape_keys=('weights', 'alpha'), reshape_baseline=True
+    )
+    def aspls(self, data, lam=1e5, diff_order=2, max_iter=100, tol=1e-3,
+              weights=None, alpha=None):
+        """
+        Adaptive smoothness penalized least squares smoothing (asPLS).
+
+        Parameters
+        ----------
+        data : array-like, shape (M, N)
+            The y-values of the measured data. Must not contain missing data (NaN) or Inf.
+        lam : float or Sequence[float, float], optional
+            The smoothing parameter for the rows and columns, respectively. If a single
+            value is given, both will use the same value. Larger values will create smoother
+            baselines. Default is 1e5.
+        diff_order : int or Sequence[int, int], optional
+            The order of the differential matrix for the rows and columns, respectively.
If
+            a single value is given, both will use the same value. Must be greater than 0.
+            Default is 2 (second order differential matrix). Typical values are 2 or 1.
+        max_iter : int, optional
+            The max number of fit iterations. Default is 100.
+        tol : float, optional
+            The exit criteria. Default is 1e-3.
+        weights : array-like, shape (M, N), optional
+            The weighting array. If None (default), then the initial weights
+            will be an array with shape equal to (M, N) and all values set to 1.
+        alpha : array-like, shape (M, N), optional
+            An array of values that control the local value of `lam` to better
+            fit peak and non-peak regions. If None (default), then the initial values
+            will be an array with shape equal to (M, N) and all values set to 1.
+
+        Returns
+        -------
+        baseline : numpy.ndarray, shape (M, N)
+            The calculated baseline.
+        params : dict
+            A dictionary with the following items:
+
+            * 'weights': numpy.ndarray, shape (M, N)
+                The weight array used for fitting the data.
+            * 'alpha': numpy.ndarray, shape (M, N)
+                The array of alpha values used for fitting the data in the final iteration.
+            * 'tol_history': numpy.ndarray
+                An array containing the calculated tolerance values for
+                each iteration. The length of the array is the number of iterations
+                completed. If the last value in the array is greater than the input
+                `tol` value, then the function did not converge.
+
+        Notes
+        -----
+        The weighting uses an asymmetric coefficient (`k` in the asPLS paper) of 0.5 instead
+        of the 2 listed in the asPLS paper. pybaselines uses the factor of 0.5 since it
+        matches the results in Table 2 and Figure 5 of the asPLS paper more closely than the
+        factor of 2 and fits noisy data much better.
+
+        References
+        ----------
+        Zhang, F., et al. Baseline correction for infrared spectra using
+        adaptive smoothness parameter penalized least squares method.
+        Spectroscopy Letters, 2020, 53(3), 222-233.
+ + """ + y, weight_array = self._setup_whittaker(data, lam, diff_order, weights) + alpha_array = _check_optional_array( + self._len, alpha, check_finite=self._check_finite, name='alpha', + ensure_1d=False, axis=slice(None) + ) + if self._sort_order is not None and alpha is not None: + alpha_array = alpha_array[self._sort_order] + + # use a sparse matrix to maintain sparsity after multiplication; implementation note: + # could skip making an alpha matrix and just use alpha_array[:, None] * penalty once + # the scipy sparse_arrays become standard -> will have to check if timing is affected + alpha_matrix = diags(alpha_array.ravel(), format='csr') + tol_history = np.empty(max_iter + 1) + for i in range(max_iter + 1): + penalty = alpha_matrix @ self.whittaker_system.penalty + baseline = self.whittaker_system.solve(y, weight_array, penalty=penalty) + new_weights, residual = _weighting._aspls(y, baseline) + calc_difference = relative_difference(weight_array, new_weights) + tol_history[i] = calc_difference + if calc_difference < tol: + break + weight_array = new_weights + # add _MIN_FLOAT so that no values are 0; otherwise, the sparsity of alpha @ penalty + # can change, which is inefficient + abs_d = np.abs(residual) + _MIN_FLOAT + alpha_array = abs_d / abs_d.max() + alpha_matrix.setdiag(alpha_array) + + params = { + 'weights': weight_array, 'alpha': alpha_array, 'tol_history': tol_history[:i + 1] + } + + return baseline, params + + @_Algorithm2D._register(sort_keys=('weights',)) + def psalsa(self, data, lam=1e5, p=0.5, k=None, diff_order=2, max_iter=50, tol=1e-3, + weights=None, eigenvalues=(10, 10)): + """ + Peaked Signal's Asymmetric Least Squares Algorithm (psalsa). + + Similar to the asymmetric least squares (AsLS) algorithm, but applies an + exponential decay weighting to values greater than the baseline to allow + using a higher `p` value to better fit noisy data. + + Parameters + ---------- + data : array-like, shape (M, N) + The y-values of the measured data. Must not contain missing data (NaN) or Inf. + lam : float or Sequence[float, float], optional + The smoothing parameter for the rows and columns, respectively. If a single + value is given, both will use the same value. Larger values will create smoother + baselines. Default is 1e5. + p : float, optional + The penalizing weighting factor. Must be between 0 and 1. Values greater + than the baseline will be given `p` weight, and values less than the baseline + will be given `p - 1` weight. Default is 0.5. + k : float, optional + A factor that controls the exponential decay of the weights for baseline + values greater than the data. Should be approximately the height at which + a value could be considered a peak. Default is None, which sets `k` to + one-tenth of the standard deviation of the input data. A large k value + will produce similar results to :meth:`~Baseline2D.asls`. + diff_order : int or Sequence[int, int], optional + The order of the differential matrix for the rows and columns, respectively. If + a single value is given, both will use the same value. Must be greater than 0. + Default is 2 (second order differential matrix). Typical values are 2 or 1. + max_iter : int, optional + The max number of fit iterations. Default is 50. + tol : float, optional + The exit criteria. Default is 1e-3. + weights : array-like, shape (M, N), optional + The weighting array. If None (default), then the initial weights + will be an array with shape equal to (M, N) and all values set to 1. 
+ eigenvalues : int or Sequence[int, int] or None + The maximum number of eigenvalues for the rows and columns, respectively, to use + for eigendecomposition. Typical values are between 5 and 30, with higher values + needed for baselines with more curvature. If None, will solve the linear system + using the full analytical solution, which is typically much slower. + Default is (10, 10). + + Returns + ------- + baseline : numpy.ndarray, shape (M, N) + The calculated baseline. + params : dict + A dictionary with the following items: + + * 'weights': numpy.ndarray, shape (M, N) + The weight array used for fitting the data. + * 'tol_history': numpy.ndarray + An array containing the calculated tolerance values for + each iteration. The length of the array is the number of iterations + completed. If the last value in the array is greater than the input + `tol` value, then the function did not converge. + + Raises + ------ + ValueError + Raised if `p` is not between 0 and 1. + + Notes + ----- + The exit criteria for the original algorithm was to check whether the signs + of the residuals do not change between two iterations, but the comparison of + the l2 norms of the weight arrays between iterations is used instead to be + more comparable to other Whittaker-smoothing-based algorithms. + + References + ---------- + Oller-Moreno, S., et al. Adaptive Asymmetric Least Squares baseline estimation + for analytical instruments. 2014 IEEE 11th International Multi-Conference on + Systems, Signals, and Devices, 2014, 1-5. + + Biessy, G. Revisiting Whittaker-Henderson Smoothing. https://hal.science/hal-04124043 + (Preprint), 2023. + + """ + if not 0 < p < 1: + raise ValueError('p must be between 0 and 1') + y, weight_array = self._setup_whittaker( + data, lam, diff_order, weights, eigenvalues=eigenvalues + ) + if k is None: + k = np.std(y) / 10 + + shape = self._len if self.whittaker_system._using_svd else np.prod(self._len) + tol_history = np.empty(max_iter + 1) + for i in range(max_iter + 1): + baseline = self.whittaker_system.solve(y, weight_array) + new_weights = _weighting._psalsa(y, baseline, p, k, shape) + calc_difference = relative_difference(weight_array, new_weights) + tol_history[i] = calc_difference + if calc_difference < tol: + break + weight_array = new_weights + + if not self.whittaker_system._using_svd: + baseline = baseline.reshape(self._len) + weight_array = weight_array.reshape(self._len) + + params = {'weights': weight_array, 'tol_history': tol_history[:i + 1]} + + return baseline, params diff --git a/pybaselines/utils.py b/pybaselines/utils.py index 17fd2e7..cf944b9 100644 --- a/pybaselines/utils.py +++ b/pybaselines/utils.py @@ -16,7 +16,9 @@ from ._banded_utils import PenalizedSystem, difference_matrix as _difference_matrix from ._compat import jit from ._spline_utils import PSpline -from ._validation import _check_array, _check_scalar, _check_optional_array, _yx_arrays +from ._validation import ( + _check_array, _check_scalar, _check_optional_array, _get_row_col_values, _yx_arrays +) # the minimum positive float values such that a + _MIN_FLOAT != a @@ -83,8 +85,57 @@ def gaussian(x, height=1.0, center=0.0, sigma=1.0): numpy.ndarray The Gaussian distribution evaluated with x. + Raises + ------ + ValueError + Raised if `sigma` is not greater than 0. 
+ + """ + if sigma <= 0: + raise ValueError('sigma must be greater than 0') + return height * np.exp(-0.5 * ((x - center)**2) / sigma**2) + + +def gaussian2d(x, z, height=1.0, center_x=0.0, center_z=0.0, sigma_x=1.0, sigma_z=1.0): + """ + Generates a Gaussian distribution based on height, center, and sigma. + + Parameters + ---------- + x : numpy.ndarray, shape (M, N) + The x-values at which to evaluate the distribution. + z : numpy.ndarray, shape (M, N) + The z-values at which to evaluate the distribution. + height : float, optional + The maximum height of the distribution. Default is 1.0. + center_x : float, optional + The center of the distribution in the x-axis. Default is 0.0. + sigma_x : float, optional + The standard deviation of the distribution in the x-axis. Default is 1.0. + center_z : float, optional + The center of the distribution in the z-axis. Default is 0.0. + sigma_z : float, optional + The standard deviation of the distribution in the z-axis. Default is 1.0. + + Returns + ------- + numpy.ndarray, shape (M, N) + The Gaussian distribution evaluated with x and z. + + Raises + ------ + ValueError + Raised if the input `x` or `z` are not two dimensional. + + Notes + ----- + The input `x` and `z` should be two dimensional arrays, which can be gotten + from their one dimensional counterparts by using :func:`numpy.meshgrid`. + """ - return height * np.exp(-0.5 * ((x - center)**2) / max(sigma, _MIN_FLOAT)**2) + if x.ndim != 2 or z.ndim != 2: + raise ValueError('x and z should be two dimensional') + return height * gaussian(x, 1, center_x, sigma_x) * gaussian(z, 1, center_z, sigma_z) def gaussian_kernel(window_size, sigma=1.0): @@ -278,6 +329,196 @@ def pad_edges(data, pad_length, mode='extrapolate', return padded_data +def _extrapolate2d(y, total_padding, extrapolate_window=None): + """ + Extrapolates each edge of two dimensional data. + + Corners are calculated by averaging linear fits of the extended data. + + Parameters + ---------- + y : numpy.ndarray + _description_ + total_padding : Sequence[int, int, int, int] + The padding for the top, bottom, left, and right. The padding of top and + bottom are assumed to be equal, as are the left and right. + extrapolate_window : int or Sequence[int, int] or Sequence[int, int, int, int], optional + The number of values to use for linear fitting on the top, bottom, left, and right + edges. Default is None, which will set the extrapolate window size equal + to `total_padding`. + + Returns + ------- + output : numpy.ndarray + The data with padding + + Raises + ------ + NotImplementedError + Raised if any value in `total_padding` is zero. + ValueError + Raised if any extrapolation window is less than 1. + + Notes + ----- + Uses the Moore-Penrose pseudo-inverse to speed up the calculation of the linear fits + for each edge. Using the Vandermonde with `numpy.linalg.lstsq` would also work but is + a little slower. 
+ + """ + if np.equal(total_padding, 0).any(): + raise NotImplementedError('pad length of 0 is not supported in 2D') + elif np.less(total_padding, 0).any(): + raise ValueError('pad length must be greater or equal to 0') + + if extrapolate_window is None: + extrapolate_windows = total_padding + else: + extrapolate_windows = _get_row_col_values(extrapolate_window).reshape((2, 2)) + + if np.less_equal(extrapolate_windows, 0).any(): + raise ValueError('extrapolate_window must be greater than 0') + # pad length for left and right or top and bottom should be equal, so ignore the repeats + total_padding = [total_padding[0][0], total_padding[1][0]] + + output = np.empty( + (y.shape[0] + total_padding[0] * 2, y.shape[1] + total_padding[1] * 2) + ) + output[total_padding[0]:-total_padding[0], total_padding[1]:-total_padding[1]] = y + + x = np.arange(y.shape[0] + 2 * total_padding[0]) + z = np.arange(y.shape[1] + 2 * total_padding[1]) + + vander_x = np.polynomial.polynomial.polyvander(x, 1) + vander_z = np.polynomial.polynomial.polyvander(z, 1) + pinv_top = np.linalg.pinv( + vander_x[total_padding[0]:-total_padding[0]][:extrapolate_windows[0][0]] + ) + pinv_bottom = np.linalg.pinv( + vander_x[total_padding[0]:-total_padding[0]][-extrapolate_windows[0][1]:] + ) + pinv_left = np.linalg.pinv( + vander_z[total_padding[1]:-total_padding[1]][:extrapolate_windows[1][0]] + ) + pinv_right = np.linalg.pinv( + vander_z[total_padding[1]:-total_padding[1]][-extrapolate_windows[1][1]:] + ) + + top = vander_x[:total_padding[0]] @ (pinv_top @ y[:extrapolate_windows[0][0]]) + bottom = vander_x[-total_padding[0]:] @ (pinv_bottom @ y[-extrapolate_windows[0][1]:]) + + output[:total_padding[0], total_padding[1]:-total_padding[1]] = top + output[-total_padding[0]:, total_padding[1]:-total_padding[1]] = bottom + + left = vander_z[:total_padding[1]] @ (pinv_left @ y[:, :extrapolate_windows[1][0]].T) + right = vander_z[-total_padding[1]:] @ (pinv_right @ y[:, -extrapolate_windows[1][1]:].T) + + output[total_padding[0]:-total_padding[0], :total_padding[1]] = left.T + output[total_padding[0]:-total_padding[0], -total_padding[1]:] = right.T + + # now fill the corners by averaging the extensions of the corners + top_left = vander_z[:total_padding[1]] @ ( + pinv_left @ output[ + :total_padding[0], total_padding[1]:-total_padding[1] + ][:, :extrapolate_windows[1][0]].T + ) + top_right = vander_z[-total_padding[1]:] @ ( + pinv_right @ output[ + :total_padding[0], total_padding[1]:-total_padding[1] + ][:, -extrapolate_windows[1][1]:].T + ) + + bottom_left = vander_z[:total_padding[1]] @ ( + pinv_left @ output[ + -total_padding[0]:, total_padding[1]:-total_padding[1] + ][:, :extrapolate_windows[1][0]].T + ) + bottom_right = vander_z[-total_padding[1]:] @ ( + pinv_right @ output[ + -total_padding[0]:, total_padding[1]:-total_padding[1] + ][:, -extrapolate_windows[1][1]:].T + ) + + left_top = vander_x[:total_padding[0]] @ ( + pinv_top @ output[ + total_padding[0]:-total_padding[0], :total_padding[1] + ][:extrapolate_windows[0][0]] + ) + left_bottom = vander_x[-total_padding[0]:] @ ( + pinv_bottom @ output[ + total_padding[0]:-total_padding[0], :total_padding[1]: + ][-extrapolate_windows[0][1]:] + ) + + right_top = vander_x[:total_padding[0]] @ ( + pinv_top @ output[ + total_padding[0]:-total_padding[0], -total_padding[1]: + ][:extrapolate_windows[0][0]] + ) + right_bottom = vander_x[-total_padding[0]:] @ ( + pinv_bottom @ output[ + total_padding[0]:-total_padding[0], -total_padding[1]: + ][-extrapolate_windows[0][1]:] + ) + + 
+    output[:total_padding[0], :total_padding[1]] = 0.5 * (top_left.T + left_top)
+    output[:total_padding[0], -total_padding[1]:] = 0.5 * (top_right.T + right_top)
+    output[-total_padding[0]:, :total_padding[1]] = 0.5 * (bottom_left.T + left_bottom)
+    output[-total_padding[0]:, -total_padding[1]:] = 0.5 * (bottom_right.T + right_bottom)
+
+    return output
+
+
+def pad_edges2d(data, pad_length, mode='edge', extrapolate_window=None, **pad_kwargs):
+    """
+    Adds left, right, top, and bottom edges to the data.
+
+    Parameters
+    ----------
+    data : array-like, shape (M, N)
+        The 2D array of the data.
+    pad_length : int or Sequence[int, int]
+        The number of points to add to the top, bottom, left, and right edges. If a single
+        value is given, all edges have the same padding. If a sequence of two values is
+        given, the first value will be the padding on the top and bottom (rows), and the second
+        value will pad the left and right (columns).
+    mode : str or Callable, optional
+        The method for padding. Default is 'edge'. Any method other than
+        'extrapolate' will use :func:`numpy.pad`.
+    extrapolate_window : int or Sequence[int, int] or Sequence[int, int, int, int], optional
+        The number of values to use for linear fitting on the top, bottom, left, and right
+        edges. Default is None, which will set the extrapolate window size equal
+        to `pad_length`.
+    **pad_kwargs
+        Any keyword arguments to pass to :func:`numpy.pad`, which will be used if `mode`
+        is not 'extrapolate'.
+
+    Returns
+    -------
+    padded_data : numpy.ndarray
+        The data with padding on the top, bottom, left, and right edges.
+
+    Raises
+    ------
+    ValueError
+        Raised if the input data is not two dimensional.
+
+    Notes
+    -----
+    If mode is 'extrapolate', then each edge will be extended by linear fits along each
+    row and column, and the corners are calculated by averaging the linear sections.
+
+    """
+    y = np.asarray(data)
+    if y.ndim != 2:
+        raise ValueError('input data must be two dimensional')
+    total_padding = _get_row_col_values(pad_length).reshape((2, 2))
+
+    if isinstance(mode, str):
+        mode = mode.lower()
+    if mode == 'extrapolate':
+        output = _extrapolate2d(y, total_padding, extrapolate_window)
+    else:
+        output = np.pad(data, total_padding, mode=mode, **pad_kwargs)
+
+    return output
+
+
 def padded_convolve(data, kernel, mode='reflect', **pad_kwargs):
     """
     Pads data before convolving to reduce edge effects.
@@ -337,6 +578,48 @@ def _interp_inplace(x, y, y_start, y_end):
     return y
 
 
+def _poly_transform_matrix(num_coefficients, original_domain):
+    """
+    Creates the matrix that transforms polynomial coefficients from one domain to another.
+
+    The polynomial coefficient array `d` computed with `v` can be transformed to the
+    coefficient array `c` computed with `x` where ``v = scale * x + offset`` by applying
+    ``c = T @ d``, where `T` is the transformation matrix.
+
+    Parameters
+    ----------
+    num_coefficients : int
+        The number of polynomial coefficients, i.e., the polynomial degree + 1.
+    original_domain : Sequence[float, float]
+        The domain, [min(x), max(x)], of the original data used for fitting.
+
+    Returns
+    -------
+    transformation : numpy.ndarray, shape (`num_coefficients`, `num_coefficients`)
+        The transformation matrix to convert domains.
+
+    Notes
+    -----
+    The calculation of the transformation matrix is based on the math from
+    https://stackoverflow.com/questions/141422/how-can-a-transform-a-polynomial-to-another-coordinate-system#comment57358951_142436.
+
+    This function assumes the original coefficients were computed with the domain [-1, 1].
+ + """ + offset, scale = np.polynomial.polyutils.mapparms(np.array([-1., 1.]), original_domain) + transformation = np.zeros((num_coefficients, num_coefficients)) + skip_offset = np.equal(offset, 0) # 0 raised to negative powers causes nan + for i in range(num_coefficients): + for j in range(num_coefficients): + if skip_offset: + if j == i: + transformation[i, j] = binom(j, i) * (scale)**(-j) + else: + transformation[i, j] = binom(j, i) * (scale)**(-j) * (-offset)**(j - i) + + return transformation + + def _convert_coef(coef, original_domain): """ Scales the polynomial coefficients back to the original domain of the data. @@ -348,41 +631,71 @@ def _convert_coef(coef, original_domain): Parameters ---------- - coef : array-like + coef : numpy.ndarray, shape (a,) The array of coefficients for the polynomial. Should increase in order, for example (c0, c1, c2) from `y = c0 + c1 * x + c2 * x**2`. - original_domain : array-like, shape (2,) + original_domain : Sequence[float, float] The domain, [min(x), max(x)], of the original data used for fitting. Returns ------- - numpy.ndarray + numpy.ndarray, shape (a,) The array of coefficients scaled for the original domain. Notes ----- - Based on https://stackoverflow.com/questions/141422/how-can-a-transform-a-polynomial-to-another-coordinate-system#comment57358951_142436. - Could slightly reduce computation time by computing offset and scale once within the _Algorithm object, but doing it this way with `original_domain` is backwards compatible and this function is probably not called enough to justify the change. """ - offset, scale = np.polynomial.polyutils.mapparms(np.array([-1, 1]), original_domain) - num_coefficients = len(coef) - transformation = np.zeros((num_coefficients, num_coefficients)) - skip_offset = np.equal(offset, 0) # 0 raised to negative powers causes nan - for i in range(num_coefficients): - for j in range(num_coefficients): - if skip_offset: - if j == i: - transformation[i, j] = binom(j, i) * (scale)**(-j) - else: - transformation[i, j] = binom(j, i) * (scale)**(-j) * (-offset)**(j - i) - + transformation = _poly_transform_matrix(coef.shape[0], original_domain) return transformation @ coef +def _convert_coef2d(coef, poly_degree_x, poly_degree_z, original_x_domain, original_z_domain): + """ + Scales the polynomial coefficients back to the original domain of the data. + + For fitting, the x-values and z-values are scaled from their original domain, + [min(x), max(x)] and [min(z), max(z)], to [-1, 1] in order to improve the numerical + stability of fitting. This function rescales the retrieved polynomial coefficients + for the fit x-values and z-values back to their original domains. + + Parameters + ---------- + coef : numpy.ndarray, shape (``a * b``,) + The 1d array of coefficients for the polynomial. Should increase in + order. The shape should be (``a * b``,), where `a` is the polynomial degree + 1 for + the x-values and `b` is the polynomial degree + 1 for the z-values. + poly_degree_x : int + The polynomial degree for the x-values + poly_degree_z : int + The polynomial degree for the z-values + original_x_domain : Sequence[float, float] + The domain, [min(x), max(x)], of the original x-values used for fitting. + original_z_domain : Sequence[float, float] + The domain, [min(z), max(z)], of the original z-values used for fitting. + + Returns + ------- + numpy.ndarray, shape (a, b) + The 2D array of coefficients scaled for the original domains. 
+ + Notes + ----- + Reshapes the coefficient array into the correct shape for use with + :func:`numpy.polynomial.polynomial.polyval2d`. + + """ + x_order = poly_degree_x + 1 + z_order = poly_degree_z + 1 + transformation_x = _poly_transform_matrix(x_order, original_x_domain) + transformation_z = _poly_transform_matrix(z_order, original_z_domain) + + return transformation_x @ coef.reshape((x_order, z_order)) @ transformation_z.T + + def difference_matrix(data_size, diff_order=2, diff_format=None): """ Creates an n-order finite-difference matrix. @@ -399,7 +712,7 @@ def difference_matrix(data_size, diff_order=2, diff_format=None): Returns ------- - diff_matrix : scipy.sparse.base.spmatrix + diff_matrix : scipy.sparse.spmatrix or scipy.sparse._sparray The sparse difference matrix. Raises @@ -434,8 +747,8 @@ def optimize_window(data, increment=1, max_hits=3, window_tol=1e-6, Parameters ---------- - data : array-like, shape (N,) - The measured data values. + data : array-like + The measured data values. Can be one or two dimensional. increment : int, optional The step size for iterating half windows. Default is 1. max_hits : int, optional @@ -453,8 +766,10 @@ def optimize_window(data, increment=1, max_hits=3, window_tol=1e-6, Returns ------- - half_window : int - The optimized half window size. + half_window : int or numpy.ndarray[int, int] + The optimized half window size(s). If `data` is one dimensional, the + output is a single integer, and if `data` is two dimensional, the output + is an array of two integers. Notes ----- @@ -469,16 +784,18 @@ def optimize_window(data, increment=1, max_hits=3, window_tol=1e-6, """ y = np.asarray(data) if max_half_window is None: - max_half_window = (y.shape[0] - 1) // 2 + max_half_window = (y.shape[-1] - 1) // 2 if min_half_window is None: min_half_window = 1 + y_dims = y.ndim # TODO would it be better to allow padding the data? - opening = grey_opening(y, [2 * min_half_window + 1]) + opening = grey_opening(y, [2 * min_half_window + 1] * y_dims) hits = 0 + half_window = 1 # in case min_half_window is set incorrectly best_half_window = min_half_window for half_window in range(min_half_window + increment, max_half_window, increment): - new_opening = grey_opening(y, [half_window * 2 + 1]) + new_opening = grey_opening(y, [half_window * 2 + 1] * y_dims) if relative_difference(opening, new_opening) < window_tol: if hits == 0: # keep just the first window that fits tolerance @@ -491,7 +808,11 @@ def optimize_window(data, increment=1, max_hits=3, window_tol=1e-6, hits = 0 opening = new_opening - return max(half_window, 1) # ensure half window is at least 1 + if y_dims == 2: + output = np.maximum([half_window, half_window], [1, 1]) + else: + output = max(half_window, 1) # ensure half window is at least 1 + return output def _inverted_sort(sort_order): @@ -528,6 +849,121 @@ def _inverted_sort(sort_order): return inverted_order +def _determine_sorts(data): + """ + Provides the arrays for sorting and inverting sorting, if needed. + + Parameters + ---------- + data : numpy.ndarray, shape (N,) + The array to potentially sort. + + Returns + ------- + output : tuple(numpy.ndarray, numpy.ndarray) or tuple(None, None) + A tuple of the index array for sorting the input array and the array + that inverts that sorting. If the input array is already sorted, then + the output will be (None, None). 
+ + """ + sort_order = data.argsort(kind='mergesort') + skip_sorting = (sort_order[1:] > sort_order[:-1]).all() + if skip_sorting: + output = (None, None) + else: + output = (sort_order, _inverted_sort(sort_order)) + + return output + + +def _sort_array2d(array, sort_order=None): + """ + Sorts the input 2D array only if given a non-None sorting order. + + Parameters + ---------- + array : numpy.ndarray + The array to sort. Must be two or three dimensional. + sort_order : numpy.ndarray, optional + The array(s) defining the sort order for the input array. Default is None, which + will not sort the input. + + Returns + ------- + output : numpy.ndarray + The input array after optionally sorting. + + Notes + ----- + For all inputs, assumes the last 2 axes correspond to the data that needs sorted. + + Raises + ------ + ValueError + Raised if the input array is not two or three dimensional. + + """ + if sort_order is None: + output = array + else: + n_dims = array.ndim + if n_dims == 2: + output = array[sort_order] + elif n_dims == 3: + if isinstance(sort_order, tuple): + if sort_order[0] is Ellipsis: + output = array[sort_order] + else: + output = array[:, sort_order[0], sort_order[1]] + else: + output = array[:, sort_order, :] + else: + raise ValueError('too many dimensions to sort the data') + + return output + + +def _sort_array(array, sort_order=None): + """ + Sorts the input array only if given a non-None sorting order. + + Parameters + ---------- + array : numpy.ndarray + The array to sort. + sort_order : numpy.ndarray, optional + The array defining the sort order for the input array. Default is None, which + will not sort the input. + + Returns + ------- + output : numpy.ndarray + The input array after optionally sorting. + + Notes + ----- + For all inputs, assumes the last axis corresponds to the data that needs sorted. + + Raises + ------ + ValueError + Raised if the input array has more than two dimensions. + + """ + if sort_order is None: + output = array + else: + n_dims = array.ndim + if n_dims == 1: + output = array[sort_order] + elif n_dims == 2: + output = array[:, sort_order] + else: + raise ValueError('too many dimensions to sort the data') + + return output + + def whittaker_smooth(data, lam=1e6, diff_order=2, weights=None, check_finite=True): """ Smooths the input data using Whittaker smoothing. 
diff --git a/pybaselines/whittaker.py b/pybaselines/whittaker.py index 438afbb..b5fa7d2 100644 --- a/pybaselines/whittaker.py +++ b/pybaselines/whittaker.py @@ -184,7 +184,7 @@ def iasls(self, data, lam=1e6, p=1e-2, lam_1=1e-4, max_iter=50, tol=1e-3, d1_y = lambda_1 * d1_y tol_history = np.empty(max_iter + 1) for i in range(max_iter + 1): - weight_squared = weight_array * weight_array + weight_squared = weight_array**2 baseline = self.whittaker_system.solve( self.whittaker_system.add_diagonal(weight_squared), weight_squared * y + d1_y, overwrite_b=True @@ -264,7 +264,7 @@ def airpls(self, data, lam=1e6, diff_order=2, max_iter=50, tol=1e-3, weights=Non warnings.warn( ('error occurred during fitting, indicating that "tol"' ' is too low, "max_iter" is too high, or "lam" is too high'), - ParameterWarning + ParameterWarning, stacklevel=2 ) i -= 1 # reduce i so that output tol_history indexing is correct break @@ -278,7 +278,8 @@ def airpls(self, data, lam=1e6, diff_order=2, max_iter=50, tol=1e-3, weights=Non # point would get a weight of 0, which fails the solver warnings.warn( ('almost all baseline points are below the data, indicating that "tol"' - ' is too low and/or "max_iter" is too high'), ParameterWarning + ' is too low and/or "max_iter" is too high'), ParameterWarning, + stacklevel=2 ) i -= 1 # reduce i so that output tol_history indexing is correct break @@ -454,7 +455,8 @@ def drpls(self, data, lam=1e5, eta=0.5, max_iter=50, tol=1e-3, weights=None, dif # checking a scalar is faster; cannot use np.errstate since it is not 100% reliable warnings.warn( ('nan and/or +/- inf occurred in weighting calculation, likely meaning ' - '"tol" is too low and/or "max_iter" is too high'), ParameterWarning + '"tol" is too low and/or "max_iter" is too high'), ParameterWarning, + stacklevel=2 ) break elif calc_difference < tol: @@ -530,7 +532,8 @@ def iarpls(self, data, lam=1e5, diff_order=2, max_iter=50, tol=1e-3, weights=Non # checking a scalar is faster; cannot use np.errstate since it is not 100% reliable warnings.warn( ('nan and/or +/- inf occurred in weighting calculation, likely meaning ' - '"tol" is too low and/or "max_iter" is too high'), ParameterWarning + '"tol" is too low and/or "max_iter" is too high'), ParameterWarning, + stacklevel=2 ) break elif calc_difference < tol: @@ -559,7 +562,7 @@ def aspls(self, data, lam=1e5, diff_order=2, max_iter=100, tol=1e-3, The order of the differential matrix. Must be greater than 0. Default is 2 (second order differential matrix). Typical values are 2 or 1. max_iter : int, optional - The max number of fit iterations. Default is 50. + The max number of fit iterations. Default is 100. tol : float, optional The exit criteria. Default is 1e-3. weights : array-like, shape (N,), optional @@ -654,7 +657,7 @@ def psalsa(self, data, lam=1e5, p=0.5, k=None, diff_order=2, max_iter=50, tol=1e contain missing data (NaN) or Inf. lam : float, optional The smoothing parameter. Larger values will create smoother baselines. - Default is 1e6. + Default is 1e5. p : float, optional The penalizing weighting factor. Must be between 0 and 1. Values greater than the baseline will be given `p` weight, and values less than the baseline @@ -664,7 +667,7 @@ def psalsa(self, data, lam=1e5, p=0.5, k=None, diff_order=2, max_iter=50, tol=1e values greater than the data. Should be approximately the height at which a value could be considered a peak. Default is None, which sets `k` to one-tenth of the standard deviation of the input data. 
A large k value - will produce similar results to :meth:`.asls`. + will produce similar results to :meth:`~Baseline.asls`. diff_order : int, optional The order of the differential matrix. Must be greater than 0. Default is 2 (second order differential matrix). Typical values are 2 or 1. @@ -755,7 +758,7 @@ def derpsalsa(self, data, lam=1e6, p=0.01, k=None, diff_order=2, max_iter=50, to values greater than the data. Should be approximately the height at which a value could be considered a peak. Default is None, which sets `k` to one-tenth of the standard deviation of the input data. A large k value - will produce similar results to :meth:`.asls`. + will produce similar results to :meth:`~Baseline.asls`. diff_order : int, optional The order of the differential matrix. Must be greater than 0. Default is 2 (second order differential matrix). Typical values are 2 or 1. @@ -1203,7 +1206,7 @@ def aspls(data, lam=1e5, diff_order=2, max_iter=100, tol=1e-3, weights=None, The order of the differential matrix. Must be greater than 0. Default is 2 (second order differential matrix). Typical values are 2 or 1. max_iter : int, optional - The max number of fit iterations. Default is 50. + The max number of fit iterations. Default is 100. tol : float, optional The exit criteria. Default is 1e-3. weights : array-like, shape (N,), optional @@ -1272,7 +1275,7 @@ def psalsa(data, lam=1e5, p=0.5, k=None, diff_order=2, max_iter=50, tol=1e-3, contain missing data (NaN) or Inf. lam : float, optional The smoothing parameter. Larger values will create smoother baselines. - Default is 1e6. + Default is 1e5. p : float, optional The penalizing weighting factor. Must be between 0 and 1. Values greater than the baseline will be given `p` weight, and values less than the baseline @@ -1282,7 +1285,7 @@ def psalsa(data, lam=1e5, p=0.5, k=None, diff_order=2, max_iter=50, tol=1e-3, values greater than the data. Should be approximately the height at which a value could be considered a peak. Default is None, which sets `k` to one-tenth of the standard deviation of the input data. A large k value - will produce similar results to :meth:`.asls`. + will produce similar results to :meth:`~Baseline.asls`. diff_order : int, optional The order of the differential matrix. Must be greater than 0. Default is 2 (second order differential matrix). Typical values are 2 or 1. @@ -1356,7 +1359,7 @@ def derpsalsa(data, lam=1e6, p=0.01, k=None, diff_order=2, max_iter=50, tol=1e-3 values greater than the data. Should be approximately the height at which a value could be considered a peak. Default is None, which sets `k` to one-tenth of the standard deviation of the input data. A large k value - will produce similar results to :meth:`.asls`. + will produce similar results to :meth:`~Baseline.asls`. diff_order : int, optional The order of the differential matrix. Must be greater than 0. Default is 2 (second order differential matrix). Typical values are 2 or 1. diff --git a/pyproject.toml b/pyproject.toml index 7b4d62a..64601ac 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,10 +1,89 @@ [build-system] -# setuptools v42.0.0 was first version to allow multiple license files -# (the license_files metadata field in setup.cfg). Also covers pep-517 -# and pep-518 since support was added to setuptools in v40.8.0. 
-requires = ["setuptools>=42", "wheel"] +# setuptools v64.0.0 was first version to allow creating editable installs with only pyproject.toml +requires = ["setuptools>=64"] build-backend = "setuptools.build_meta" +[project] +name = "pybaselines" +version = "1.0.0" +authors = [ + {name = "Donald Erb", email = "donnie.erb@gmail.com"}, +] +description = "A library of algorithms for the baseline correction of experimental data." +readme = "README.rst" +license = {file = "LICENSE.txt"} +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Science/Research", + "Intended Audience :: Developers", + "License :: OSI Approved :: BSD License", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Topic :: Scientific/Engineering", + "Topic :: Scientific/Engineering :: Chemistry", + "Topic :: Scientific/Engineering :: Information Analysis", + "Topic :: Scientific/Engineering :: Physics", +] +keywords = [ + "materials characterization", + "materials science", + "baseline", + "background", + "baseline correction", + "baseline subtraction", + "chemistry", + "spectroscopy", + "raman", +] +requires-python = ">=3.8" +dependencies = [ + "numpy>=1.20", # lowest version to allow dtype for np.concatenate + "scipy>=1.5", # lowest versions supported for python 3.8 +] + +[project.urls] +Homepage = "https://github.com/derb12/pybaselines" +Documentation = "https://pybaselines.readthedocs.io" + +[project.optional-dependencies] +full = [ + "pentapy>=1.0", # first version with PTRANS-II solver and MIT license + "numba>=0.49", # first to allow usage with python 3.8 +] +test = [ + "pytest", + "ruff", +] +docs = [ + "sphinx", + "sphinx-rtd-theme", + "sphinx-autoapi", + "sphinx-gallery", + "matplotlib", + "numpydoc", +] +release = [ + "build", + "bump-my-version", + "twine", +] +dev = ["pybaselines[full, docs, test, release]"] + +[tool.setuptools] +# TODO license-files usage may change in the future once PEP 639 is accepted +license-files = [ + "LICENSE.txt", + "LICENSES_bundled.txt", +] + +[tool.setuptools.packages.find] +include = ["pybaselines", "pybaselines.*"] + [tool.isort] skip = "pybaselines/__init__.py" skip_glob = ["docs/*"] @@ -15,3 +94,77 @@ multi_line_output = 5 src_paths = ["pybaselines", "tests"] # example_helpers are locally used in doc examples known_local_folder = ["example_helpers"] + +[tool.ruff] +exclude = ["docs/*"] +line-length = 100 +fix = false + +[tool.ruff.lint] +preview = true # for using experimental rules +select = [ + "B", # flake8-bugbear + "D", + "E", # pycodestyle errors + "F", # pyflakes + #"I", # isort + "W", # pycodestyle warnings +] +ignore = [ + "D401", # D401 first line should be in imperative mood; try rephrasing + "E731", # E731 do not assign a lambda expression, use a def +] +task-tags = ["TODO"] + +[tool.ruff.lint.pycodestyle] +ignore-overlong-task-comments = true + +[tool.ruff.lint.pydocstyle] +convention = "numpy" + +[tool.ruff.lint.per-file-ignores] +"__init__.py" = [ + "F401", # F401: module imported but unused + "D205", # D205: 1 blank line required between summary line and description + +] +"examples/*" = [ + "B007", # B007: Loop control variable `name` not used within loop body; want to be explicit in examples + "D205", # D205: 1 blank line required between summary line and description + "D400", # 
D400: first line should end with a period +] +"tests/*" = [ + "F841", # F841: Local variable 'name' is assigned to but never used; want to be explicit within tests +] + + +[tool.bumpversion] +current_version = "1.0.0" +commit = false +tag = false +message = "Bump version: {current_version} -> {new_version}" + +[[tool.bumpversion.files]] +filename = "pyproject.toml" +search = "version = \"{current_version}\"" +replace = "version = \"{new_version}\"" + +[[tool.bumpversion.files]] +filename = "pybaselines/__init__.py" +search = "__version__ = '{current_version}'" +replace = "__version__ = '{new_version}'" + +[[tool.bumpversion.files]] +filename = "docs/conf.py" +search = "version = '{current_version}'" +replace = "version = '{new_version}'" + +[[tool.bumpversion.files]] +filename = "CITATION.cff" +search = "version: {current_version}" +replace = "version: {new_version}" + +[[tool.bumpversion.files]] +filename = "docs/citing.rst" +search = "version = {{{current_version}}}" +replace = "version = {{{new_version}}}" diff --git a/requirements/README.rst b/requirements/README.rst new file mode 100644 index 0000000..350a77c --- /dev/null +++ b/requirements/README.rst @@ -0,0 +1,19 @@ +=================== +Pinned Requirements +=================== + +The requirements in this folder are pinned to specific versions to allow recreating +a specific build. This is useful in cases such as building documentation on readthedocs +or for debugging since this specific build is known to work on both Windows and Linux +with python 3.11. + +If you only want to install all of the development dependencies for pybaselines, it is +recommended to instead use: + +.. code-block:: console + + pip install pybaselines[dev] + +or clone the repository by following the +`installation guide `_ +in the documentation. 
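+
+For reference, the pinned set itself can be installed with
+``pip install -r requirements/requirements-development.txt`` from the repository
+root, which recreates this known-working build exactly, whereas
+``pip install pybaselines[dev]`` resolves the newest versions allowed by
+pyproject.toml instead.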
diff --git a/requirements/requirements-development.txt b/requirements/requirements-development.txt index 982c9e1..083d85f 100644 --- a/requirements/requirements-development.txt +++ b/requirements/requirements-development.txt @@ -2,14 +2,12 @@ -r requirements-documentation.txt # for linting -flake8==6.0.0 -flake8-comprehensions==3.7.0 -flake8-docstrings==1.6.0 +ruff==0.2.1 # for testing -pytest==6.2.5 +pytest==8.0.0 # for creating releases -bump2version==1.0.1 -twine==3.6.0 -wheel==0.37.0 +bump-my-version==0.17.4 +twine==5.0.0 +build==1.0.3 diff --git a/requirements/requirements-documentation.txt b/requirements/requirements-documentation.txt index a7df4af..950fe61 100644 --- a/requirements/requirements-documentation.txt +++ b/requirements/requirements-documentation.txt @@ -1,11 +1,10 @@ -r requirements.txt -sphinx==4.3.1 -sphinx-rtd-theme==1.0.0 -# pin docutils to v0.17.1 since v0.18 not yet compatible with Sphinx -docutils==0.17.1 -sphinx-autoapi==1.8.4 -sphinx-gallery==0.10.1 -matplotlib==3.3.3 -pentapy==1.1.2 -numba==0.54.1 +sphinx==7.2.6 +sphinx-rtd-theme==2.0.0 +sphinx-autoapi==3.0.0 +sphinx-gallery==0.15.0 +matplotlib==3.8.2 +pentapy==1.2.0 +numba==0.59.0 +numpydoc==1.6.0 \ No newline at end of file diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 6b9bd11..ec114b4 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -1,2 +1,2 @@ -numpy==1.20.3 -scipy==1.7.3 +numpy==1.26.4 +scipy==1.12.0 diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index b9f72cd..0000000 --- a/setup.cfg +++ /dev/null @@ -1,81 +0,0 @@ -[metadata] -name = pybaselines -version = 1.0.0 -author = Donald Erb -author_email = donnie.erb@gmail.com -description = A library of algorithms for the baseline correction of experimental data. 
-long_description = file: README.rst -long_description_content_type = text/x-rst -license = BSD-3-Clause -license_files = - LICENSE.txt - LICENSES_bundled.txt -classifiers = - Development Status :: 5 - Production/Stable - Intended Audience :: Science/Research - Intended Audience :: Developers - License :: OSI Approved :: BSD License - Operating System :: OS Independent - Programming Language :: Python :: 3 - Programming Language :: Python :: 3.6 - Programming Language :: Python :: 3.7 - Programming Language :: Python :: 3.8 - Programming Language :: Python :: 3.9 - Programming Language :: Python :: 3.10 - Programming Language :: Python :: 3.11 - Programming Language :: Python :: 3.12 - Topic :: Scientific/Engineering - Topic :: Scientific/Engineering :: Chemistry - Topic :: Scientific/Engineering :: Information Analysis - Topic :: Scientific/Engineering :: Physics -keywords = - materials characterization - materials science - baseline - background - baseline correction - baseline subtraction - chemistry - spectroscopy -url = https://github.com/derb12/pybaselines -project_urls = - Source Code = https://github.com/derb12/pybaselines - Documentation = https://pybaselines.readthedocs.io - -[options] -packages = find: -include_package_data = True -python_requires = >=3.6 -install_requires = - numpy>=1.14 - scipy>=1.0 -zip_safe = False - -[options.extras_require] -full = - pentapy>=1.0 - numba>=0.45 - -[options.packages.find] -include = pybaselines, pybaselines.* - -[flake8] -max-line-length = 100 -docstring-convention = numpy -exclude = - docs/* -ignore = - # E731 do not assign a lambda expression, use a def - # W503 line break before binary operator - # W504 line break after binary operator - # D401 first line should be in imperative mood; try rephrasing - E731, - W503, - W504, - D401 -per-file-ignores = - # F401: module imported but unused - # D205: 1 blank line required between summary line and description - __init__.py: F401, D205 - # D400: first line should end with a period - examples/*: D205, D400 diff --git a/setup.py b/setup.py deleted file mode 100644 index e586bf5..0000000 --- a/setup.py +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -"""The setup script. - -All metadata now exists in setup.cfg. setup.py is now only needed to allow -for editable installs when using older versions of pip. - - -Notes on minimum required versions for dependencies: - -numpy: >= 1.14 in order to use rcond=None with numpy.linalg.lstsq -scipy: >= 1.0 to use the blas function gbmv for banded matrix-vector dot product -pentapy: >= 1.0 to use solver #2 -numba: >= 0.45 in order to cache jit-ed functions with parallel=True - -""" - -from setuptools import setup - - -if __name__ == '__main__': - - setup() diff --git a/tests/conftest.py b/tests/conftest.py index 210e912..73a95dc 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -54,6 +54,42 @@ def gaussian(x, height=1.0, center=0.0, sigma=1.0): return height * np.exp(-0.5 * ((x - center)**2) / sigma**2) +def gaussian2d(x, z, height=1.0, center_x=0.0, center_z=0.0, sigma_x=1.0, sigma_z=1.0): + """ + Generates a Gaussian distribution based on height, center, and sigma. + + Parameters + ---------- + x : numpy.ndarray, shape (M, N) + The x-values at which to evaluate the distribution. + z : numpy.ndarray, shape (M, N) + The z-values at which to evaluate the distribution. + height : float, optional + The maximum height of the distribution. Default is 1.0. 
+    center_x : float, optional
+        The center of the distribution in the x-axis. Default is 0.0.
+    center_z : float, optional
+        The center of the distribution in the z-axis. Default is 0.0.
+    sigma_x : float, optional
+        The standard deviation of the distribution in the x-axis. Default is 1.0.
+    sigma_z : float, optional
+        The standard deviation of the distribution in the z-axis. Default is 1.0.
+
+    Returns
+    -------
+    numpy.ndarray
+        The Gaussian distribution evaluated with x and z.
+
+    Notes
+    -----
+    This is the same code as in pybaselines.utils.gaussian2d, but
+    this removes the dependence on pybaselines so that if an error
+    with pybaselines occurs, this will be unaffected.
+
+    """
+    return height * gaussian(x, 1, center_x, sigma_x) * gaussian(z, 1, center_z, sigma_z)
+
+
 def get_data(include_noise=True, num_points=1000):
     """Creates x- and y-data for testing.
 
@@ -87,24 +123,117 @@ def get_data(include_noise=True, num_points=1000):
     return x_data, y_data
 
 
+def get_data2d(include_noise=True, num_points=(30, 41)):
+    """Creates x-, z-, and y-data for testing.
+
+    Parameters
+    ----------
+    include_noise : bool, optional
+        If True (default), will include noise with the y-data.
+    num_points : Container(int, int), optional
+        The number of data points to use for x and z, respectively. Default
+        is (30, 41), which uses different numbers so that any issues caused
+        by not having a square matrix will be seen.
+
+    Returns
+    -------
+    x_data : numpy.ndarray
+        The x-values.
+    z_data : numpy.ndarray
+        The z-values.
+    y_data : numpy.ndarray
+        The y-values.
+
+    """
+    # TODO use np.random.default_rng(0) once minimum numpy version is >= 1.17
+    np.random.seed(0)
+    x_num_points, z_num_points = num_points
+    x_data = np.linspace(1, 100, x_num_points)
+    z_data = np.linspace(1, 120, z_num_points)
+    X, Z = np.meshgrid(x_data, z_data, indexing='ij')
+    y_data = (
+        500  # constant baseline
+        + gaussian2d(X, Z, 10, 25, 25)
+        + gaussian2d(X, Z, 20, 50, 50)
+        + gaussian2d(X, Z, 10, 75, 75)
+    )
+    if include_noise:
+        y_data += np.random.normal(0, 0.5, y_data.shape)
+
+    return x_data, z_data, y_data
+
+
+def get_2dspline_inputs(num_knots=5, spline_degree=3, lam=1, diff_order=2):
+    """Helper function to handle array-like values for simple cases in testing."""
+    if isinstance(num_knots, int):
+        num_knots_x = num_knots
+        num_knots_z = num_knots
+    else:
+        num_knots_x, num_knots_z = num_knots
+    if isinstance(spline_degree, int):
+        spline_degree_x = spline_degree
+        spline_degree_z = spline_degree
+    else:
+        spline_degree_x, spline_degree_z = spline_degree
+    if isinstance(lam, (int, float)):
+        lam_x = lam
+        lam_z = lam
+    else:
+        lam_x, lam_z = lam
+    if isinstance(diff_order, int):
+        diff_order_x = diff_order
+        diff_order_z = diff_order
+    else:
+        diff_order_x, diff_order_z = diff_order
+
+    return (
+        num_knots_x, num_knots_z, spline_degree_x, spline_degree_z,
+        lam_x, lam_z, diff_order_x, diff_order_z
+    )
+
+
 @pytest.fixture
 def small_data():
     """A small array of data for testing."""
     return np.arange(10, dtype=float)
 
 
+@pytest.fixture
+def small_data2d():
+    """A small array of data for testing."""
+    return np.arange(60, dtype=float).reshape(6, 10)
+
+
 @pytest.fixture()
 def data_fixture():
     """Test fixture for creating x- and y-data for testing."""
     return get_data()
 
 
+@pytest.fixture()
+def data_fixture2d():
+    """Test fixture for creating x-, z-, and y-data for testing."""
+    return get_data2d()
+
+
 @pytest.fixture()
 def no_noise_data_fixture():
     """Test fixture that creates x- and y-data without noise for testing."""
     return
get_data(include_noise=False) +@pytest.fixture() +def no_noise_data_fixture2d(): + """ + Test fixture that creates x-, z-, and y-data without noise for testing. + + Reduces the number of data points since this is used for testing that numerical + issues are avoided for large iterations in spline and Whittaker functions, which + can otherwise be time consuming. + """ + return get_data2d(include_noise=False, num_points=(20, 31)) + + def dummy_wrapper(func): """A dummy wrapper to simulate using the _Algorithm._register wrapper function.""" @wraps(func) @@ -312,10 +441,10 @@ def test_output(self, additional_keys=None, **kwargs): # check all entries in output param dictionary for key in total_keys: if key not in output[1]: - assert False, f'key "{key}" missing from param dictionary' + raise AssertionError(f'key "{key}" missing from param dictionary') output[1].pop(key) if output[1]: - assert False, f'unchecked keys in param dictionary: {output[1]}' + raise AssertionError(f'unchecked keys in param dictionary: {output[1]}') def test_x_ordering(self, assertion_kwargs=None, **kwargs): """Ensures arrays are correctly sorted within the function.""" @@ -369,33 +498,30 @@ def test_output_coefs(self): class InputWeightsMixin: - """A mixin for BaseTester for ensuring input weights are correctly sorted.""" + """A mixin for BaseTester and BaseTester2D for ensuring input weights are correctly sorted.""" weight_keys = ('weights',) def test_input_weights(self, assertion_kwargs=None, **kwargs): - """ - Ensures arrays are correctly sorted within the function. - - Returns the output for further testing. - - """ + """Ensures input weights are correctly sorted within the function.""" # TODO replace with np.random.default_rng when min numpy version is >= 1.17 - weights = np.random.RandomState(0).normal(0.8, 0.05, len(self.x)) + weights = np.random.RandomState(0).normal(0.8, 0.05, self.y.size) weights = np.clip(weights, 0, 1).astype(float, copy=False) - reverse_fitter = self.algorithm_base(self.x[::-1], assume_sorted=False) + if hasattr(self, 'two_d'): # BaseTester + reverse_fitter = self.algorithm_base(self.x[::-1], assume_sorted=False) + else: # BaseTester2D + reverse_fitter = self.algorithm_base(self.x[::-1], self.z[::-1], assume_sorted=False) + weights = weights.reshape(self.y.shape) regular_output, regular_output_params = self.class_func( data=self.y, weights=weights, **self.kwargs, **kwargs ) reverse_output, reverse_output_params = getattr(reverse_fitter, self.func_name)( - data=self.reverse_array(self.y), weights=weights[::-1], **self.kwargs, **kwargs + data=self.reverse_array(self.y), weights=self.reverse_array(weights), + **self.kwargs, **kwargs ) - # sanity check, x should always be sorted correctly - assert_allclose(reverse_fitter.x, self.x, rtol=1e-14, atol=1e-14) - if assertion_kwargs is None: assertion_kwargs = {} if 'rtol' not in assertion_kwargs: @@ -404,10 +530,236 @@ def test_input_weights(self, assertion_kwargs=None, **kwargs): assertion_kwargs['atol'] = 1e-14 for key in self.weight_keys: + assert key in regular_output_params + assert key in reverse_output_params + assert_allclose( - regular_output_params[key], reverse_output_params[key][::-1], + regular_output_params[key], self.reverse_array(reverse_output_params[key]), **assertion_kwargs ) assert_allclose( regular_output, self.reverse_array(reverse_output), **assertion_kwargs ) + + +class BaseTester2D: + """ + A base class for testing all 2D algorithms. 
+ + Attributes + ---------- + kwargs : dict + The keyword arguments that will be used as inputs for all default test cases. + + """ + + module = DummyModule + algorithm_base = DummyAlgorithm + func_name = 'func' + checked_keys = None + required_kwargs = None + three_d = False + + @classmethod + def setup_class(cls): + """Sets up the class for testing.""" + cls.x, cls.z, cls.y = get_data2d() + if cls.three_d: + cls.y = np.array((cls.y, cls.y)) + cls.algorithm = cls.algorithm_base(cls.x, cls.z, check_finite=False, assume_sorted=True) + cls.class_func = getattr(cls.algorithm, cls.func_name) + cls.kwargs = cls.required_kwargs if cls.required_kwargs is not None else {} + cls.param_keys = cls.checked_keys if cls.checked_keys is not None else [] + + @classmethod + def teardown_class(cls): + """ + Resets class attributes after testing. + + Probably not needed, but done anyway to catch changes in how pytest works. + + """ + cls.x = None + cls.z = None + cls.y = None + cls.algorithm = None + cls.class_func = None + cls.kwargs = None + cls.param_keys = None + + def test_ensure_wrapped(self): + """Ensures the class method was wrapped using _Algorithm._register to control inputs.""" + assert hasattr(self.class_func, '__wrapped__') + + @pytest.mark.parametrize('new_instance', (True, False)) + def test_unchanged_data(self, new_instance, **kwargs): + """Ensures that input data is unchanged by the function.""" + x, z, y = get_data2d() + x2, z2, y2 = get_data2d() + if self.three_d: + y = np.array((y, y)) + y2 = np.array((y2, y2)) + + if new_instance: + getattr(self.algorithm_base(x_data=x, z_data=z), self.func_name)( + data=y, **self.kwargs, **kwargs + ) + compared_x = x + compared_z = z + else: + self.class_func(data=y, **self.kwargs, **kwargs) + compared_x = self.x + compared_z = self.z + + assert_array_equal(y2, y, err_msg='the y-data was changed by the algorithm') + assert_array_equal(x2, compared_x, err_msg='the x-data was changed by the algorithm') + assert_array_equal(z2, compared_z, err_msg='the z-data was changed by the algorithm') + + def test_repeated_fits(self): + """Ensures the setup is properly reset when using class api.""" + first_output = self.class_func(data=self.y, **self.kwargs) + second_output = self.class_func(data=self.y, **self.kwargs) + + assert_allclose(first_output[0], second_output[0], 1e-14) + + def test_list_input(self, **assertion_kwargs): + """Ensures that function works the same for both array and list inputs.""" + output_array = self.class_func(data=self.y, **self.kwargs) + output_list = self.class_func(data=self.y.tolist(), **self.kwargs) + + assert_allclose( + output_array[0], output_list[0], + err_msg='algorithm output is different for arrays vs lists', **assertion_kwargs + ) + for key in output_array[1]: + assert key in output_list[1] + + @pytest.mark.parametrize('has_x', (True, False)) + @pytest.mark.parametrize('has_z', (True, False)) + def test_no_xz(self, has_x, has_z, **assertion_kwargs): + """ + Ensures that function output is the same when no x and/or z is input. + + Usually only valid for evenly spaced data, such as used for testing. 
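+
+        As a sketch, the comparison below is roughly equivalent to::
+
+            fit = getattr(self.algorithm_base(x, z), self.func_name)
+            fit_no_xz = getattr(self.algorithm_base(None, None), self.func_name)
+            # the two baselines should agree for evenly spaced x and z
+            assert_allclose(fit(data=y)[0], fit_no_xz(data=y)[0])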
+ + """ + if has_x and has_z: + return # the one test case that would not produce any difference so skip to save time + output_with = self.class_func(data=self.y, **self.kwargs) + + input_x = self.x if has_x else None + input_z = self.z if has_z else None + output_without = getattr( + self.algorithm_base(x_data=input_x, z_data=input_z), self.func_name + )(data=self.y, **self.kwargs) + + assert_allclose( + output_with[0], output_without[0], + err_msg='algorithm output is different with no x-values and/or z-values', + **assertion_kwargs + ) + + def test_output(self, additional_keys=None, **kwargs): + """ + Ensures that the output has the desired format. + + Ensures that output has two elements, a numpy array and a param dictionary, + and that the output baseline is the same shape as the input y-data. + + Parameters + ---------- + additional_keys : Iterable(str, ...), optional + Additional keys to check for in the output parameter dictionary. Default is None. + **kwargs + Additional keyword arguments to pass to the function. + + """ + output = self.class_func(data=self.y, **self.kwargs, **kwargs) + + assert len(output) == 2, 'algorithm output should have two items' + assert isinstance(output[0], np.ndarray), 'output[0] should be a numpy ndarray' + assert isinstance(output[1], dict), 'output[1] should be a dictionary' + assert self.y.shape == output[0].shape, 'output[0] must have same shape as y-data' + + if additional_keys is not None: + total_keys = list(self.param_keys) + list(additional_keys) + else: + total_keys = self.param_keys + # check all entries in output param dictionary + for key in total_keys: + if key not in output[1]: + raise AssertionError(f'key "{key}" missing from param dictionary') + output[1].pop(key) + if output[1]: + raise AssertionError(f'unchecked keys in param dictionary: {output[1]}') + + def test_xz_ordering(self, assertion_kwargs=None, **kwargs): + """Ensures arrays are correctly sorted within the function.""" + reverse_fitter = self.algorithm_base(self.x[::-1], self.z[::-1], assume_sorted=False) + + regular_inputs_result = self.class_func(data=self.y, **self.kwargs, **kwargs)[0] + reverse_inputs_result = getattr(reverse_fitter, self.func_name)( + data=self.reverse_array(self.y), **self.kwargs, **kwargs + )[0] + + if assertion_kwargs is None: + assertion_kwargs = {} + if 'rtol' not in assertion_kwargs: + assertion_kwargs['rtol'] = 1e-10 + + assert_allclose( + regular_inputs_result, self.reverse_array(reverse_inputs_result), **assertion_kwargs + ) + + def reverse_array(self, array): + """Reverses the input along the last two dimensions.""" + return np.asarray(array)[..., ::-1, ::-1] + + +class BasePolyTester2D(BaseTester2D): + """ + A base class for testing 2D polynomial algorithms. + + Checks that the polynomial coefficients are correctly returned and that they correspond + to the polynomial used to create the baseline. + + """ + + @pytest.mark.parametrize('return_coef', (True, False)) + def test_output(self, return_coef): + """Ensures the polynomial coefficients are output if `return_coef` is True.""" + if return_coef: + additional_keys = ['coef'] + else: + additional_keys = None + super().test_output(additional_keys=additional_keys, return_coef=return_coef) + + @pytest.mark.parametrize('poly_order', (1, 2, [2, 3])) + def test_output_coefs(self, poly_order): + """ + Ensures the output coefficients can correctly reproduce the baseline. + + Checks both the manual way using the Vandermonde and directly using numpy's polyval2d. 
+ """ + baseline, params = self.class_func( + data=self.y, poly_order=poly_order, **self.kwargs, return_coef=True + ) + + assert 'coef' in params + + if isinstance(poly_order, int): + x_order = poly_order + z_order = poly_order + else: + x_order, z_order = poly_order + + X, Z = np.meshgrid(self.x, self.z, indexing='ij') + vander = np.polynomial.polynomial.polyvander2d( + X, Z, (x_order, z_order) + ).reshape((-1, (x_order + 1) * (z_order + 1))) + + recreated_poly = (vander @ params['coef'].flatten()).reshape(self.y.shape) + assert_allclose(recreated_poly, baseline, rtol=1e-10, atol=1e-12) + + numpy_poly = np.polynomial.polynomial.polyval2d(X, Z, params['coef']) + assert_allclose(numpy_poly, baseline, rtol=1e-10, atol=1e-12) diff --git a/tests/data.py b/tests/data.py index 4257787..503ce91 100644 --- a/tests/data.py +++ b/tests/data.py @@ -459,6 +459,480 @@ ]) } +STATSMODELS_QUANTILES_2D = { + 0.1: np.array([ + 1.82859650e+03, 1.65429084e+03, 1.47998518e+03, 1.30567952e+03, 1.13137386e+03, + 9.57068206e+02, 7.82762547e+02, 6.08456888e+02, 4.34151229e+02, 2.59845570e+02, + 8.55399116e+01, -8.87657472e+01, -2.63071406e+02, -4.37377065e+02, -6.11682724e+02, + -7.85988382e+02, -9.60294041e+02, -1.13459970e+03, -1.30890536e+03, -1.48321102e+03, + -1.65751668e+03, -1.83182234e+03, -2.00612799e+03, -2.18043365e+03, -2.35473931e+03, + -2.52904497e+03, -2.70335063e+03, -2.87765629e+03, -3.05196195e+03, -3.22626761e+03, + -3.40057326e+03, 1.66246682e+03, 1.50205814e+03, 1.34164945e+03, 1.18124077e+03, + 1.02083208e+03, 8.60423398e+02, 7.00014714e+02, 5.39606029e+02, 3.79197345e+02, + 2.18788660e+02, 5.83799758e+01, -1.02028709e+02, -2.62437393e+02, -4.22846078e+02, + -5.83254762e+02, -7.43663447e+02, -9.04072131e+02, -1.06448082e+03, -1.22488950e+03, + -1.38529818e+03, -1.54570687e+03, -1.70611555e+03, -1.86652424e+03, -2.02693292e+03, + -2.18734161e+03, -2.34775029e+03, -2.50815898e+03, -2.66856766e+03, -2.82897635e+03, + -2.98938503e+03, -3.14979371e+03, 1.49633714e+03, 1.34982543e+03, 1.20331372e+03, + 1.05680201e+03, 9.10290301e+02, 7.63778591e+02, 6.17266881e+02, 4.70755171e+02, + 3.24243460e+02, 1.77731750e+02, 3.12200399e+01, -1.15291670e+02, -2.61803380e+02, + -4.08315091e+02, -5.54826801e+02, -7.01338511e+02, -8.47850221e+02, -9.94361931e+02, + -1.14087364e+03, -1.28738535e+03, -1.43389706e+03, -1.58040877e+03, -1.72692048e+03, + -1.87343219e+03, -2.01994390e+03, -2.16645561e+03, -2.31296732e+03, -2.45947903e+03, + -2.60599074e+03, -2.75250245e+03, -2.89901416e+03, 1.33020746e+03, 1.19759273e+03, + 1.06497799e+03, 9.32363255e+02, 7.99748519e+02, 6.67133784e+02, 5.34519048e+02, + 4.01904312e+02, 2.69289576e+02, 1.36674840e+02, 4.06010412e+00, -1.28554632e+02, + -2.61169368e+02, -3.93784104e+02, -5.26398839e+02, -6.59013575e+02, -7.91628311e+02, + -9.24243047e+02, -1.05685778e+03, -1.18947252e+03, -1.32208725e+03, -1.45470199e+03, + -1.58731673e+03, -1.71993146e+03, -1.85254620e+03, -1.98516093e+03, -2.11777567e+03, + -2.25039041e+03, -2.38300514e+03, -2.51561988e+03, -2.64823461e+03, 1.16407778e+03, + 1.04536002e+03, 9.26642261e+02, 8.07924499e+02, 6.89206738e+02, 5.70488976e+02, + 4.51771215e+02, 3.33053453e+02, 2.14335691e+02, 9.56179299e+01, -2.30998317e+01, + -1.41817593e+02, -2.60535355e+02, -3.79253116e+02, -4.97970878e+02, -6.16688640e+02, + -7.35406401e+02, -8.54124163e+02, -9.72841924e+02, -1.09155969e+03, -1.21027745e+03, + -1.32899521e+03, -1.44771297e+03, -1.56643073e+03, -1.68514849e+03, -1.80386626e+03, + -1.92258402e+03, -2.04130178e+03, -2.16001954e+03, -2.27873730e+03, 
-2.39745506e+03, + 9.97948105e+02, 8.93127318e+02, 7.88306530e+02, 6.83485743e+02, 5.78664956e+02, + 4.73844169e+02, 3.69023381e+02, 2.64202594e+02, 1.59381807e+02, 5.45610197e+01, + -5.02597675e+01, -1.55080555e+02, -2.59901342e+02, -3.64722129e+02, -4.69542917e+02, + -5.74363704e+02, -6.79184491e+02, -7.84005278e+02, -8.88826065e+02, -9.93646853e+02, + -1.09846764e+03, -1.20328843e+03, -1.30810921e+03, -1.41293000e+03, -1.51775079e+03, + -1.62257158e+03, -1.72739236e+03, -1.83221315e+03, -1.93703394e+03, -2.04185473e+03, + -2.14667551e+03, 8.31818426e+02, 7.40894613e+02, 6.49970800e+02, 5.59046987e+02, + 4.68123174e+02, 3.77199361e+02, 2.86275548e+02, 1.95351735e+02, 1.04427923e+02, + 1.35041096e+01, -7.74197034e+01, -1.68343516e+02, -2.59267329e+02, -3.50191142e+02, + -4.41114955e+02, -5.32038768e+02, -6.22962581e+02, -7.13886394e+02, -8.04810207e+02, + -8.95734020e+02, -9.86657833e+02, -1.07758165e+03, -1.16850546e+03, -1.25942927e+03, + -1.35035308e+03, -1.44127690e+03, -1.53220071e+03, -1.62312452e+03, -1.71404834e+03, + -1.80497215e+03, -1.89589596e+03, 6.65688747e+02, 5.88661908e+02, 5.11635070e+02, + 4.34608231e+02, 3.57581393e+02, 2.80554554e+02, 2.03527715e+02, 1.26500877e+02, + 4.94740381e+01, -2.75528006e+01, -1.04579639e+02, -1.81606478e+02, -2.58633316e+02, + -3.35660155e+02, -4.12686994e+02, -4.89713832e+02, -5.66740671e+02, -6.43767509e+02, + -7.20794348e+02, -7.97821187e+02, -8.74848025e+02, -9.51874864e+02, -1.02890170e+03, + -1.10592854e+03, -1.18295538e+03, -1.25998222e+03, -1.33700906e+03, -1.41403590e+03, + -1.49106273e+03, -1.56808957e+03, -1.64511641e+03, 4.99559068e+02, 4.36429204e+02, + 3.73299339e+02, 3.10169475e+02, 2.47039611e+02, 1.83909747e+02, 1.20779882e+02, + 5.76500179e+01, -5.47984640e+00, -6.86097107e+01, -1.31739575e+02, -1.94869439e+02, + -2.57999304e+02, -3.21129168e+02, -3.84259032e+02, -4.47388897e+02, -5.10518761e+02, + -5.73648625e+02, -6.36778489e+02, -6.99908354e+02, -7.63038218e+02, -8.26168082e+02, + -8.89297947e+02, -9.52427811e+02, -1.01555768e+03, -1.07868754e+03, -1.14181740e+03, + -1.20494727e+03, -1.26807713e+03, -1.33120700e+03, -1.39433686e+03, 3.33429389e+02, + 2.84196499e+02, 2.34963609e+02, 1.85730719e+02, 1.36497829e+02, 8.72649391e+01, + 3.80320491e+01, -1.12008409e+01, -6.04337309e+01, -1.09666621e+02, -1.58899511e+02, + -2.08132401e+02, -2.57365291e+02, -3.06598181e+02, -3.55831071e+02, -4.05063961e+02, + -4.54296851e+02, -5.03529741e+02, -5.52762631e+02, -6.01995521e+02, -6.51228411e+02, + -7.00461301e+02, -7.49694191e+02, -7.98927081e+02, -8.48159971e+02, -8.97392861e+02, + -9.46625751e+02, -9.95858641e+02, -1.04509153e+03, -1.09432442e+03, -1.14355731e+03, + 1.67299710e+02, 1.31963794e+02, 9.66278787e+01, 6.12919631e+01, 2.59560474e+01, + -9.37986829e+00, -4.47157840e+01, -8.00516996e+01, -1.15387615e+02, -1.50723531e+02, + -1.86059447e+02, -2.21395362e+02, -2.56731278e+02, -2.92067194e+02, -3.27403109e+02, + -3.62739025e+02, -3.98074941e+02, -4.33410856e+02, -4.68746772e+02, -5.04082688e+02, + -5.39418603e+02, -5.74754519e+02, -6.10090435e+02, -6.45426350e+02, -6.80762266e+02, + -7.16098182e+02, -7.51434097e+02, -7.86770013e+02, -8.22105929e+02, -8.57441844e+02, + -8.92777760e+02, 1.17003110e+00, -2.02689103e+01, -4.17078516e+01, -6.31467930e+01, + -8.45857343e+01, -1.06024676e+02, -1.27463617e+02, -1.48902558e+02, -1.70341500e+02, + -1.91780441e+02, -2.13219382e+02, -2.34658324e+02, -2.56097265e+02, -2.77536207e+02, + -2.98975148e+02, -3.20414089e+02, -3.41853031e+02, -3.63291972e+02, -3.84730913e+02, + -4.06169855e+02, 
-4.27608796e+02, -4.49047737e+02, -4.70486679e+02, -4.91925620e+02, + -5.13364561e+02, -5.34803503e+02, -5.56242444e+02, -5.77681386e+02, -5.99120327e+02, + -6.20559268e+02, -6.41998210e+02, -1.64959648e+02, -1.72501615e+02, -1.80043582e+02, + -1.87585549e+02, -1.95127516e+02, -2.02669483e+02, -2.10211450e+02, -2.17753417e+02, + -2.25295384e+02, -2.32837351e+02, -2.40379318e+02, -2.47921285e+02, -2.55463252e+02, + -2.63005219e+02, -2.70547186e+02, -2.78089154e+02, -2.85631121e+02, -2.93173088e+02, + -3.00715055e+02, -3.08257022e+02, -3.15798989e+02, -3.23340956e+02, -3.30882923e+02, + -3.38424890e+02, -3.45966857e+02, -3.53508824e+02, -3.61050791e+02, -3.68592758e+02, + -3.76134725e+02, -3.83676692e+02, -3.91218659e+02, -3.31089327e+02, -3.24734320e+02, + -3.18379312e+02, -3.12024305e+02, -3.05669298e+02, -2.99314290e+02, -2.92959283e+02, + -2.86604276e+02, -2.80249269e+02, -2.73894261e+02, -2.67539254e+02, -2.61184247e+02, + -2.54829240e+02, -2.48474232e+02, -2.42119225e+02, -2.35764218e+02, -2.29409211e+02, + -2.23054203e+02, -2.16699196e+02, -2.10344189e+02, -2.03989181e+02, -1.97634174e+02, + -1.91279167e+02, -1.84924160e+02, -1.78569152e+02, -1.72214145e+02, -1.65859138e+02, + -1.59504131e+02, -1.53149123e+02, -1.46794116e+02, -1.40439109e+02, -4.97219006e+02, + -4.76967024e+02, -4.56715043e+02, -4.36463061e+02, -4.16211079e+02, -3.95959098e+02, + -3.75707116e+02, -3.55455135e+02, -3.35203153e+02, -3.14951172e+02, -2.94699190e+02, + -2.74447208e+02, -2.54195227e+02, -2.33943245e+02, -2.13691264e+02, -1.93439282e+02, + -1.73187300e+02, -1.52935319e+02, -1.32683337e+02, -1.12431356e+02, -9.21793741e+01, + -7.19273925e+01, -5.16754109e+01, -3.14234293e+01, -1.11714478e+01, 9.08053383e+00, + 2.93325154e+01, 4.95844970e+01, 6.98364786e+01, 9.00884602e+01, 1.10340442e+02, + -6.63348685e+02, -6.29199729e+02, -5.95050773e+02, -5.60901817e+02, -5.26752861e+02, + -4.92603905e+02, -4.58454949e+02, -4.24305993e+02, -3.90157038e+02, -3.56008082e+02, + -3.21859126e+02, -2.87710170e+02, -2.53561214e+02, -2.19412258e+02, -1.85263302e+02, + -1.51114346e+02, -1.16965390e+02, -8.28164345e+01, -4.86674786e+01, -1.45185227e+01, + 1.96304332e+01, 5.37793891e+01, 8.79283450e+01, 1.22077301e+02, 1.56226257e+02, + 1.90375213e+02, 2.24524169e+02, 2.58673125e+02, 2.92822080e+02, 3.26971036e+02, + 3.61119992e+02, -8.29478364e+02, -7.81432434e+02, -7.33386503e+02, -6.85340573e+02, + -6.37294643e+02, -5.89248713e+02, -5.41202782e+02, -4.93156852e+02, -4.45110922e+02, + -3.97064992e+02, -3.49019062e+02, -3.00973131e+02, -2.52927201e+02, -2.04881271e+02, + -1.56835341e+02, -1.08789411e+02, -6.07434803e+01, -1.26975501e+01, 3.53483801e+01, + 8.33943103e+01, 1.31440241e+02, 1.79486171e+02, 2.27532101e+02, 2.75578031e+02, + 3.23623961e+02, 3.71669892e+02, 4.19715822e+02, 4.67761752e+02, 5.15807682e+02, + 5.63853612e+02, 6.11899543e+02, -9.95608043e+02, -9.33665138e+02, -8.71722234e+02, + -8.09779329e+02, -7.47836425e+02, -6.85893520e+02, -6.23950616e+02, -5.62007711e+02, + -5.00064806e+02, -4.38121902e+02, -3.76178997e+02, -3.14236093e+02, -2.52293188e+02, + -1.90350284e+02, -1.28407379e+02, -6.64644748e+01, -4.52157026e+00, 5.74213343e+01, + 1.19364239e+02, 1.81307143e+02, 2.43250048e+02, 3.05192952e+02, 3.67135857e+02, + 4.29078761e+02, 4.91021666e+02, 5.52964571e+02, 6.14907475e+02, 6.76850380e+02, + 7.38793284e+02, 8.00736189e+02, 8.62679093e+02, -1.16173772e+03, -1.08589784e+03, + -1.01005796e+03, -9.34218085e+02, -8.58378206e+02, -7.82538327e+02, -7.06698449e+02, + -6.30858570e+02, -5.55018691e+02, 
-4.79178812e+02, -4.03338933e+02, -3.27499054e+02, + -2.51659176e+02, -1.75819297e+02, -9.99794179e+01, -2.41395390e+01, 5.17003398e+01, + 1.27540219e+02, 2.03380097e+02, 2.79219976e+02, 3.55059855e+02, 4.30899734e+02, + 5.06739613e+02, 5.82579492e+02, 6.58419371e+02, 7.34259249e+02, 8.10099128e+02, + 8.85939007e+02, 9.61778886e+02, 1.03761876e+03, 1.11345864e+03, -1.32786740e+03, + -1.23813055e+03, -1.14839369e+03, -1.05865684e+03, -9.68919988e+02, -8.79183135e+02, + -7.89446282e+02, -6.99709429e+02, -6.09972575e+02, -5.20235722e+02, -4.30498869e+02, + -3.40762016e+02, -2.51025163e+02, -1.61288310e+02, -7.15514565e+01, 1.81853967e+01, + 1.07922250e+02, 1.97659103e+02, 2.87395956e+02, 3.77132809e+02, 4.66869663e+02, + 5.56606516e+02, 6.46343369e+02, 7.36080222e+02, 8.25817075e+02, 9.15553928e+02, + 1.00529078e+03, 1.09502763e+03, 1.18476449e+03, 1.27450134e+03, 1.36423819e+03, + -1.49399708e+03, -1.39036325e+03, -1.28672942e+03, -1.18309560e+03, -1.07946177e+03, + -9.75827942e+02, -8.72194115e+02, -7.68560287e+02, -6.64926460e+02, -5.61292632e+02, + -4.57658805e+02, -3.54024977e+02, -2.50391150e+02, -1.46757322e+02, -4.31234950e+01, + 6.05103325e+01, 1.64144160e+02, 2.67777987e+02, 3.71411815e+02, 4.75045642e+02, + 5.78679470e+02, 6.82313297e+02, 7.85947125e+02, 8.89580952e+02, 9.93214780e+02, + 1.09684861e+03, 1.20048243e+03, 1.30411626e+03, 1.40775009e+03, 1.51138392e+03, + 1.61501774e+03, -1.66012676e+03, -1.54259596e+03, -1.42506516e+03, -1.30753435e+03, + -1.19000355e+03, -1.07247275e+03, -9.54941948e+02, -8.37411146e+02, -7.19880344e+02, + -6.02349543e+02, -4.84818741e+02, -3.67287939e+02, -2.49757137e+02, -1.32226335e+02, + -1.46955336e+01, 1.02835268e+02, 2.20366070e+02, 3.37896872e+02, 4.55427674e+02, + 5.72958475e+02, 6.90489277e+02, 8.08020079e+02, 9.25550881e+02, 1.04308168e+03, + 1.16061248e+03, 1.27814329e+03, 1.39567409e+03, 1.51320489e+03, 1.63073569e+03, + 1.74826649e+03, 1.86579730e+03, -1.82625644e+03, -1.69482866e+03, -1.56340089e+03, + -1.43197311e+03, -1.30054533e+03, -1.16911756e+03, -1.03768978e+03, -9.06262005e+02, + -7.74834229e+02, -6.43406453e+02, -5.11978677e+02, -3.80550900e+02, -2.49123124e+02, + -1.17695348e+02, 1.37324278e+01, 1.45160204e+02, 2.76587980e+02, 4.08015756e+02, + 5.39443532e+02, 6.70871308e+02, 8.02299084e+02, 9.33726861e+02, 1.06515464e+03, + 1.19658241e+03, 1.32801019e+03, 1.45943796e+03, 1.59086574e+03, 1.72229352e+03, + 1.85372129e+03, 1.98514907e+03, 2.11657685e+03, -1.99238612e+03, -1.84706137e+03, + -1.70173662e+03, -1.55641187e+03, -1.41108711e+03, -1.26576236e+03, -1.12043761e+03, + -9.75112864e+02, -8.29788113e+02, -6.84463363e+02, -5.39138612e+02, -3.93813862e+02, + -2.48489112e+02, -1.03164361e+02, 4.21603893e+01, 1.87485140e+02, 3.32809890e+02, + 4.78134641e+02, 6.23459391e+02, 7.68784141e+02, 9.14108892e+02, 1.05943364e+03, + 1.20475839e+03, 1.35008314e+03, 1.49540789e+03, 1.64073264e+03, 1.78605739e+03, + 1.93138214e+03, 2.07670690e+03, 2.22203165e+03, 2.36735640e+03, -2.15851580e+03, + -1.99929407e+03, -1.84007235e+03, -1.68085062e+03, -1.52162890e+03, -1.36240717e+03, + -1.20318545e+03, -1.04396372e+03, -8.84741998e+02, -7.25520273e+02, -5.66298548e+02, + -4.07076823e+02, -2.47855099e+02, -8.86333740e+01, 7.05883507e+01, 2.29810075e+02, + 3.89031800e+02, 5.48253525e+02, 7.07475250e+02, 8.66696974e+02, 1.02591870e+03, + 1.18514042e+03, 1.34436215e+03, 1.50358387e+03, 1.66280560e+03, 1.82202732e+03, + 1.98124905e+03, 2.14047077e+03, 2.29969250e+03, 2.45891422e+03, 2.61813595e+03 + ]), + 0.5: np.array([ + 2.05574728e+03, 
1.88233772e+03, 1.70892816e+03, 1.53551859e+03, 1.36210903e+03, + 1.18869947e+03, 1.01528991e+03, 8.41880342e+02, 6.68470780e+02, 4.95061217e+02, + 3.21651655e+02, 1.48242092e+02, -2.51674701e+01, -1.98577033e+02, -3.71986595e+02, + -5.45396158e+02, -7.18805720e+02, -8.92215283e+02, -1.06562485e+03, -1.23903441e+03, + -1.41244397e+03, -1.58585353e+03, -1.75926310e+03, -1.93267266e+03, -2.10608222e+03, + -2.27949178e+03, -2.45290135e+03, -2.62631091e+03, -2.79972047e+03, -2.97313003e+03, + -3.14653960e+03, 1.89399572e+03, 1.73424230e+03, 1.57448889e+03, 1.41473547e+03, + 1.25498206e+03, 1.09522864e+03, 9.35475231e+02, 7.75721817e+02, 6.15968403e+02, + 4.56214988e+02, 2.96461574e+02, 1.36708160e+02, -2.30452540e+01, -1.82798668e+02, + -3.42552082e+02, -5.02305496e+02, -6.62058910e+02, -8.21812325e+02, -9.81565739e+02, + -1.14131915e+03, -1.30107257e+03, -1.46082598e+03, -1.62057940e+03, -1.78033281e+03, + -1.94008622e+03, -2.09983964e+03, -2.25959305e+03, -2.41934647e+03, -2.57909988e+03, + -2.73885329e+03, -2.89860671e+03, 1.73224415e+03, 1.58614689e+03, 1.44004962e+03, + 1.29395235e+03, 1.14785509e+03, 1.00175782e+03, 8.55660557e+02, 7.09563291e+02, + 5.63466025e+02, 4.17368759e+02, 2.71271494e+02, 1.25174228e+02, -2.09230379e+01, + -1.67020304e+02, -3.13117569e+02, -4.59214835e+02, -6.05312101e+02, -7.51409367e+02, + -8.97506632e+02, -1.04360390e+03, -1.18970116e+03, -1.33579843e+03, -1.48189570e+03, + -1.62799296e+03, -1.77409023e+03, -1.92018749e+03, -2.06628476e+03, -2.21238202e+03, + -2.35847929e+03, -2.50457656e+03, -2.65067382e+03, 1.57049259e+03, 1.43805147e+03, + 1.30561035e+03, 1.17316923e+03, 1.04072812e+03, 9.08287000e+02, 7.75845882e+02, + 6.43404765e+02, 5.10963648e+02, 3.78522530e+02, 2.46081413e+02, 1.13640296e+02, + -1.88008218e+01, -1.51241939e+02, -2.83683056e+02, -4.16124174e+02, -5.48565291e+02, + -6.81006409e+02, -8.13447526e+02, -9.45888643e+02, -1.07832976e+03, -1.21077088e+03, + -1.34321200e+03, -1.47565311e+03, -1.60809423e+03, -1.74053535e+03, -1.87297646e+03, + -2.00541758e+03, -2.13785870e+03, -2.27029982e+03, -2.40274093e+03, 1.40874102e+03, + 1.28995605e+03, 1.17117108e+03, 1.05238611e+03, 9.33601146e+02, 8.14816177e+02, + 6.96031208e+02, 5.77246239e+02, 4.58461270e+02, 3.39676301e+02, 2.20891332e+02, + 1.02106363e+02, -1.66786057e+01, -1.35463575e+02, -2.54248544e+02, -3.73033513e+02, + -4.91818482e+02, -6.10603451e+02, -7.29388419e+02, -8.48173388e+02, -9.66958357e+02, + -1.08574333e+03, -1.20452830e+03, -1.32331326e+03, -1.44209823e+03, -1.56088320e+03, + -1.67966817e+03, -1.79845314e+03, -1.91723811e+03, -2.03602308e+03, -2.15480805e+03, + 1.24698946e+03, 1.14186064e+03, 1.03673182e+03, 9.31602996e+02, 8.26474175e+02, + 7.21345354e+02, 6.16216534e+02, 5.11087713e+02, 4.05958893e+02, 3.00830072e+02, + 1.95701252e+02, 9.05724310e+01, -1.45563896e+01, -1.19685210e+02, -2.24814031e+02, + -3.29942851e+02, -4.35071672e+02, -5.40200492e+02, -6.45329313e+02, -7.50458134e+02, + -8.55586954e+02, -9.60715775e+02, -1.06584460e+03, -1.17097342e+03, -1.27610224e+03, + -1.38123106e+03, -1.48635988e+03, -1.59148870e+03, -1.69661752e+03, -1.80174634e+03, + -1.90687516e+03, 1.08523789e+03, 9.93765221e+02, 9.02292548e+02, 8.10819876e+02, + 7.19347204e+02, 6.27874532e+02, 5.36401860e+02, 4.44929187e+02, 3.53456515e+02, + 2.61983843e+02, 1.70511171e+02, 7.90384987e+01, -1.24341735e+01, -1.03906846e+02, + -1.95379518e+02, -2.86852190e+02, -3.78324862e+02, -4.69797534e+02, -5.61270207e+02, + -6.52742879e+02, -7.44215551e+02, -8.35688223e+02, -9.27160895e+02, -1.01863357e+03, + 
-1.11010624e+03, -1.20157891e+03, -1.29305158e+03, -1.38452426e+03, -1.47599693e+03, + -1.56746960e+03, -1.65894227e+03, 9.23486328e+02, 8.45669804e+02, 7.67853281e+02, + 6.90036757e+02, 6.12220233e+02, 5.34403709e+02, 4.56587185e+02, 3.78770662e+02, + 3.00954138e+02, 2.23137614e+02, 1.45321090e+02, 6.75045664e+01, -1.03119574e+01, + -8.81284812e+01, -1.65945005e+02, -2.43761529e+02, -3.21578053e+02, -3.99394576e+02, + -4.77211100e+02, -5.55027624e+02, -6.32844148e+02, -7.10660672e+02, -7.88477195e+02, + -8.66293719e+02, -9.44110243e+02, -1.02192677e+03, -1.09974329e+03, -1.17755981e+03, + -1.25537634e+03, -1.33319286e+03, -1.41100939e+03, 7.61734764e+02, 6.97574388e+02, + 6.33414013e+02, 5.69253637e+02, 5.05093262e+02, 4.40932887e+02, 3.76772511e+02, + 3.12612136e+02, 2.48451760e+02, 1.84291385e+02, 1.20131009e+02, 5.59706341e+01, + -8.18974132e+00, -7.23501167e+01, -1.36510492e+02, -2.00670868e+02, -2.64831243e+02, + -3.28991618e+02, -3.93151994e+02, -4.57312369e+02, -5.21472745e+02, -5.85633120e+02, + -6.49793495e+02, -7.13953871e+02, -7.78114246e+02, -8.42274622e+02, -9.06434997e+02, + -9.70595372e+02, -1.03475575e+03, -1.09891612e+03, -1.16307650e+03, 5.99983199e+02, + 5.49478972e+02, 4.98974745e+02, 4.48470518e+02, 3.97966291e+02, 3.47462064e+02, + 2.96957837e+02, 2.46453610e+02, 1.95949383e+02, 1.45445156e+02, 9.49409288e+01, + 4.44367018e+01, -6.06752523e+00, -5.65717522e+01, -1.07075979e+02, -1.57580206e+02, + -2.08084433e+02, -2.58588660e+02, -3.09092887e+02, -3.59597114e+02, -4.10101341e+02, + -4.60605568e+02, -5.11109795e+02, -5.61614022e+02, -6.12118249e+02, -6.62622476e+02, + -7.13126704e+02, -7.63630931e+02, -8.14135158e+02, -8.64639385e+02, -9.15143612e+02, + 4.38231634e+02, 4.01383556e+02, 3.64535477e+02, 3.27687399e+02, 2.90839320e+02, + 2.53991241e+02, 2.17143163e+02, 1.80295084e+02, 1.43447005e+02, 1.06598927e+02, + 6.97508481e+01, 3.29027695e+01, -3.94530913e+00, -4.07933878e+01, -7.76414664e+01, + -1.14489545e+02, -1.51337624e+02, -1.88185702e+02, -2.25033781e+02, -2.61881860e+02, + -2.98729938e+02, -3.35578017e+02, -3.72426095e+02, -4.09274174e+02, -4.46122253e+02, + -4.82970331e+02, -5.19818410e+02, -5.56666489e+02, -5.93514567e+02, -6.30362646e+02, + -6.67210725e+02, 2.76480070e+02, 2.53288140e+02, 2.30096209e+02, 2.06904279e+02, + 1.83712349e+02, 1.60520419e+02, 1.37328488e+02, 1.14136558e+02, 9.09446279e+01, + 6.77526977e+01, 4.45607674e+01, 2.13688372e+01, -1.82309304e+00, -2.50150233e+01, + -4.82069535e+01, -7.13988838e+01, -9.45908140e+01, -1.17782744e+02, -1.40974674e+02, + -1.64166605e+02, -1.87358535e+02, -2.10550465e+02, -2.33742395e+02, -2.56934326e+02, + -2.80126256e+02, -3.03318186e+02, -3.26510116e+02, -3.49702047e+02, -3.72893977e+02, + -3.96085907e+02, -4.19277837e+02, 1.14728505e+02, 1.05192723e+02, 9.56569416e+01, + 8.61211598e+01, 7.65853779e+01, 6.70495960e+01, 5.75138142e+01, 4.79780323e+01, + 3.84422505e+01, 2.89064686e+01, 1.93706868e+01, 9.83490491e+00, 2.99123057e-01, + -9.23665880e+00, -1.87724407e+01, -2.83082225e+01, -3.78440044e+01, -4.73797862e+01, + -5.69155681e+01, -6.64513499e+01, -7.59871318e+01, -8.55229136e+01, -9.50586955e+01, + -1.04594477e+02, -1.14130259e+02, -1.23666041e+02, -1.33201823e+02, -1.42737605e+02, + -1.52273387e+02, -1.61809168e+02, -1.71344950e+02, -4.70230592e+01, -4.29026927e+01, + -3.87823262e+01, -3.46619596e+01, -3.05415931e+01, -2.64212266e+01, -2.23008600e+01, + -1.81804935e+01, -1.40601270e+01, -9.93976045e+00, -5.81939391e+00, -1.69902738e+00, + 2.42133915e+00, 6.54170568e+00, 1.06620722e+01, 
1.47824388e+01, 1.89028053e+01, + 2.30231718e+01, 2.71435384e+01, 3.12639049e+01, 3.53842714e+01, 3.95046379e+01, + 4.36250045e+01, 4.77453710e+01, 5.18657375e+01, 5.59861041e+01, 6.01064706e+01, + 6.42268371e+01, 6.83472037e+01, 7.24675702e+01, 7.65879367e+01, -2.08774624e+02, + -1.90998109e+02, -1.73221594e+02, -1.55445079e+02, -1.37668564e+02, -1.19892049e+02, + -1.02115534e+02, -8.43390194e+01, -6.65625044e+01, -4.87859895e+01, -3.10094746e+01, + -1.32329597e+01, 4.54355525e+00, 2.23200702e+01, 4.00965851e+01, 5.78731000e+01, + 7.56496149e+01, 9.34261299e+01, 1.11202645e+02, 1.28979160e+02, 1.46755675e+02, + 1.64532190e+02, 1.82308704e+02, 2.00085219e+02, 2.17861734e+02, 2.35638249e+02, + 2.53414764e+02, 2.71191279e+02, 2.88967794e+02, 3.06744309e+02, 3.24520824e+02, + -3.70526188e+02, -3.39093525e+02, -3.07660862e+02, -2.76228198e+02, -2.44795535e+02, + -2.13362872e+02, -1.81930209e+02, -1.50497545e+02, -1.19064882e+02, -8.76322186e+01, + -5.61995553e+01, -2.47668920e+01, 6.66577134e+00, 3.80984347e+01, 6.95310980e+01, + 1.00963761e+02, 1.32396425e+02, 1.63829088e+02, 1.95261751e+02, 2.26694415e+02, + 2.58127078e+02, 2.89559741e+02, 3.20992404e+02, 3.52425068e+02, 3.83857731e+02, + 4.15290394e+02, 4.46723058e+02, 4.78155721e+02, 5.09588384e+02, 5.41021048e+02, + 5.72453711e+02, -5.32277753e+02, -4.87188941e+02, -4.42100130e+02, -3.97011318e+02, + -3.51922506e+02, -3.06833694e+02, -2.61744883e+02, -2.16656071e+02, -1.71567259e+02, + -1.26478448e+02, -8.13896360e+01, -3.63008243e+01, 8.78798743e+00, 5.38767991e+01, + 9.89656108e+01, 1.44054423e+02, 1.89143234e+02, 2.34232046e+02, 2.79320858e+02, + 3.24409669e+02, 3.69498481e+02, 4.14587293e+02, 4.59676104e+02, 5.04764916e+02, + 5.49853728e+02, 5.94942540e+02, 6.40031351e+02, 6.85120163e+02, 7.30208975e+02, + 7.75297786e+02, 8.20386598e+02, -6.94029318e+02, -6.35284357e+02, -5.76539397e+02, + -5.17794437e+02, -4.59049477e+02, -4.00304517e+02, -3.41559557e+02, -2.82814597e+02, + -2.24069637e+02, -1.65324677e+02, -1.06579717e+02, -4.78347566e+01, 1.09102035e+01, + 6.96551636e+01, 1.28400124e+02, 1.87145084e+02, 2.45890044e+02, 3.04635004e+02, + 3.63379964e+02, 4.22124924e+02, 4.80869884e+02, 5.39614844e+02, 5.98359804e+02, + 6.57104764e+02, 7.15849725e+02, 7.74594685e+02, 8.33339645e+02, 8.92084605e+02, + 9.50829565e+02, 1.00957452e+03, 1.06831949e+03, -8.55780882e+02, -7.83379774e+02, + -7.10978665e+02, -6.38577557e+02, -5.66176448e+02, -4.93775340e+02, -4.21374231e+02, + -3.48973123e+02, -2.76572014e+02, -2.04170906e+02, -1.31769797e+02, -5.93686889e+01, + 1.30324196e+01, 8.54335281e+01, 1.57834637e+02, 2.30235745e+02, 3.02636854e+02, + 3.75037962e+02, 4.47439070e+02, 5.19840179e+02, 5.92241287e+02, 6.64642396e+02, + 7.37043504e+02, 8.09444613e+02, 8.81845721e+02, 9.54246830e+02, 1.02664794e+03, + 1.09904905e+03, 1.17145016e+03, 1.24385126e+03, 1.31625237e+03, -1.01753245e+03, + -9.31475190e+02, -8.45417933e+02, -7.59360676e+02, -6.73303419e+02, -5.87246162e+02, + -5.01188905e+02, -4.15131649e+02, -3.29074392e+02, -2.43017135e+02, -1.56959878e+02, + -7.09026211e+01, 1.51546357e+01, 1.01211893e+02, 1.87269149e+02, 2.73326406e+02, + 3.59383663e+02, 4.45440920e+02, 5.31498177e+02, 6.17555434e+02, 7.03612691e+02, + 7.89669947e+02, 8.75727204e+02, 9.61784461e+02, 1.04784172e+03, 1.13389897e+03, + 1.21995623e+03, 1.30601349e+03, 1.39207075e+03, 1.47812800e+03, 1.56418526e+03, + -1.17928401e+03, -1.07957061e+03, -9.79857201e+02, -8.80143795e+02, -7.80430390e+02, + -6.80716985e+02, -5.81003580e+02, -4.81290174e+02, -3.81576769e+02, 
-2.81863364e+02, + -1.82149959e+02, -8.24365534e+01, 1.72768518e+01, 1.16990257e+02, 2.16703662e+02, + 3.16417068e+02, 4.16130473e+02, 5.15843878e+02, 6.15557283e+02, 7.15270689e+02, + 8.14984094e+02, 9.14697499e+02, 1.01441090e+03, 1.11412431e+03, 1.21383771e+03, + 1.31355112e+03, 1.41326453e+03, 1.51297793e+03, 1.61269134e+03, 1.71240474e+03, + 1.81211815e+03, -1.34103558e+03, -1.22766602e+03, -1.11429647e+03, -1.00092691e+03, + -8.87557361e+02, -7.74187808e+02, -6.60818254e+02, -5.47448700e+02, -4.34079147e+02, + -3.20709593e+02, -2.07340039e+02, -9.39704857e+01, 1.93990679e+01, 1.32768622e+02, + 2.46138175e+02, 3.59507729e+02, 4.72877282e+02, 5.86246836e+02, 6.99616390e+02, + 8.12985943e+02, 9.26355497e+02, 1.03972505e+03, 1.15309460e+03, 1.26646416e+03, + 1.37983371e+03, 1.49320327e+03, 1.60657282e+03, 1.71994237e+03, 1.83331193e+03, + 1.94668148e+03, 2.06005103e+03, -1.50278714e+03, -1.37576144e+03, -1.24873574e+03, + -1.12171003e+03, -9.94684332e+02, -8.67658630e+02, -7.40632928e+02, -6.13607226e+02, + -4.86581524e+02, -3.59555822e+02, -2.32530120e+02, -1.05504418e+02, 2.15212840e+01, + 1.48546986e+02, 2.75572688e+02, 4.02598390e+02, 5.29624092e+02, 6.56649794e+02, + 7.83675496e+02, 9.10701198e+02, 1.03772690e+03, 1.16475260e+03, 1.29177830e+03, + 1.41880401e+03, 1.54582971e+03, 1.67285541e+03, 1.79988111e+03, 1.92690681e+03, + 2.05393252e+03, 2.18095822e+03, 2.30798392e+03, -1.66453870e+03, -1.52385685e+03, + -1.38317500e+03, -1.24249315e+03, -1.10181130e+03, -9.61129453e+02, -8.20447602e+02, + -6.79765752e+02, -5.39083902e+02, -3.98402051e+02, -2.57720201e+02, -1.17038350e+02, + 2.36435001e+01, 1.64325351e+02, 3.05007201e+02, 4.45689051e+02, 5.86370902e+02, + 7.27052752e+02, 8.67734603e+02, 1.00841645e+03, 1.14909830e+03, 1.28978015e+03, + 1.43046200e+03, 1.57114385e+03, 1.71182571e+03, 1.85250756e+03, 1.99318941e+03, + 2.13387126e+03, 2.27455311e+03, 2.41523496e+03, 2.55591681e+03, -1.82629027e+03, + -1.67195227e+03, -1.51761427e+03, -1.36327627e+03, -1.20893827e+03, -1.05460028e+03, + -9.00262277e+02, -7.45924278e+02, -5.91586279e+02, -4.37248280e+02, -2.82910281e+02, + -1.28572283e+02, 2.57657162e+01, 1.80103715e+02, 3.34441714e+02, 4.88779713e+02, + 6.43117711e+02, 7.97455710e+02, 9.51793709e+02, 1.10613171e+03, 1.26046971e+03, + 1.41480771e+03, 1.56914570e+03, 1.72348370e+03, 1.87782170e+03, 2.03215970e+03, + 2.18649770e+03, 2.34083570e+03, 2.49517370e+03, 2.64951170e+03, 2.80384969e+03 + ]), + 0.9: np.array([ + 2.37200934e+03, 2.19547028e+03, 2.01893121e+03, 1.84239214e+03, 1.66585308e+03, + 1.48931401e+03, 1.31277495e+03, 1.13623588e+03, 9.59696814e+02, 7.83157748e+02, + 6.06618682e+02, 4.30079616e+02, 2.53540550e+02, 7.70014835e+01, -9.95375825e+01, + -2.76076648e+02, -4.52615714e+02, -6.29154781e+02, -8.05693847e+02, -9.82232913e+02, + -1.15877198e+03, -1.33531104e+03, -1.51185011e+03, -1.68838918e+03, -1.86492824e+03, + -2.04146731e+03, -2.21800637e+03, -2.39454544e+03, -2.57108451e+03, -2.74762357e+03, + -2.92416264e+03, 2.20595640e+03, 2.04331369e+03, 1.88067097e+03, 1.71802825e+03, + 1.55538554e+03, 1.39274282e+03, 1.23010011e+03, 1.06745739e+03, 9.04814676e+02, + 7.42171961e+02, 5.79529245e+02, 4.16886530e+02, 2.54243814e+02, 9.16010988e+01, + -7.10416168e+01, -2.33684332e+02, -3.96327048e+02, -5.58969763e+02, -7.21612479e+02, + -8.84255194e+02, -1.04689791e+03, -1.20954063e+03, -1.37218334e+03, -1.53482606e+03, + -1.69746877e+03, -1.86011149e+03, -2.02275420e+03, -2.18539692e+03, -2.34803963e+03, + -2.51068235e+03, -2.67332507e+03, 2.03990346e+03, 1.89115709e+03, 
1.74241073e+03, + 1.59366436e+03, 1.44491800e+03, 1.29617163e+03, 1.14742527e+03, 9.98678904e+02, + 8.49932539e+02, 7.01186174e+02, 5.52439809e+02, 4.03693444e+02, 2.54947079e+02, + 1.06200714e+02, -4.25456511e+01, -1.91292016e+02, -3.40038381e+02, -4.88784746e+02, + -6.37531111e+02, -7.86277476e+02, -9.35023841e+02, -1.08377021e+03, -1.23251657e+03, + -1.38126294e+03, -1.53000930e+03, -1.67875567e+03, -1.82750203e+03, -1.97624840e+03, + -2.12499476e+03, -2.27374113e+03, -2.42248749e+03, 1.87385052e+03, 1.73900050e+03, + 1.60415049e+03, 1.46930048e+03, 1.33445046e+03, 1.19960045e+03, 1.06475043e+03, + 9.29900417e+02, 7.95050402e+02, 6.60200388e+02, 5.25350373e+02, 3.90500358e+02, + 2.55650344e+02, 1.20800329e+02, -1.40496854e+01, -1.48899700e+02, -2.83749715e+02, + -4.18599729e+02, -5.53449744e+02, -6.88299758e+02, -8.23149773e+02, -9.57999788e+02, + -1.09284980e+03, -1.22769982e+03, -1.36254983e+03, -1.49739985e+03, -1.63224986e+03, + -1.76709988e+03, -1.90194989e+03, -2.03679990e+03, -2.17164992e+03, 1.70779758e+03, + 1.58684391e+03, 1.46589025e+03, 1.34493659e+03, 1.22398292e+03, 1.10302926e+03, + 9.82075593e+02, 8.61121929e+02, 7.40168265e+02, 6.19214601e+02, 4.98260937e+02, + 3.77307273e+02, 2.56353609e+02, 1.35399944e+02, 1.44462803e+01, -1.06507384e+02, + -2.27461048e+02, -3.48414712e+02, -4.69368376e+02, -5.90322040e+02, -7.11275704e+02, + -8.32229369e+02, -9.53183033e+02, -1.07413670e+03, -1.19509036e+03, -1.31604403e+03, + -1.43699769e+03, -1.55795135e+03, -1.67890502e+03, -1.79985868e+03, -1.92081235e+03, + 1.54174464e+03, 1.43468732e+03, 1.32763001e+03, 1.22057270e+03, 1.11351538e+03, + 1.00645807e+03, 8.99400755e+02, 7.92343442e+02, 6.85286128e+02, 5.78228814e+02, + 4.71171501e+02, 3.64114187e+02, 2.57056873e+02, 1.49999560e+02, 4.29422460e+01, + -6.41150677e+01, -1.71172381e+02, -2.78229695e+02, -3.85287009e+02, -4.92344322e+02, + -5.99401636e+02, -7.06458950e+02, -8.13516263e+02, -9.20573577e+02, -1.02763089e+03, + -1.13468820e+03, -1.24174552e+03, -1.34880283e+03, -1.45586015e+03, -1.56291746e+03, + -1.66997477e+03, 1.37569170e+03, 1.28253073e+03, 1.18936977e+03, 1.09620881e+03, + 1.00304784e+03, 9.09886880e+02, 8.16725917e+02, 7.23564954e+02, 6.30403991e+02, + 5.37243028e+02, 4.44082064e+02, 3.50921101e+02, 2.57760138e+02, 1.64599175e+02, + 7.14382117e+01, -2.17227515e+01, -1.14883715e+02, -2.08044678e+02, -3.01205641e+02, + -3.94366604e+02, -4.87527567e+02, -5.80688531e+02, -6.73849494e+02, -7.67010457e+02, + -8.60171420e+02, -9.53332383e+02, -1.04649335e+03, -1.13965431e+03, -1.23281527e+03, + -1.32597624e+03, -1.41913720e+03, 1.20963876e+03, 1.13037414e+03, 1.05110953e+03, + 9.71844917e+02, 8.92580304e+02, 8.13315692e+02, 7.34051079e+02, 6.54786466e+02, + 5.75521854e+02, 4.96257241e+02, 4.16992628e+02, 3.37728015e+02, 2.58463403e+02, + 1.79198790e+02, 9.99341773e+01, 2.06695646e+01, -5.85950481e+01, -1.37859661e+02, + -2.17124273e+02, -2.96388886e+02, -3.75653499e+02, -4.54918112e+02, -5.34182724e+02, + -6.13447337e+02, -6.92711950e+02, -7.71976562e+02, -8.51241175e+02, -9.30505788e+02, + -1.00977040e+03, -1.08903501e+03, -1.16829963e+03, 1.04358581e+03, 9.78217552e+02, + 9.12849290e+02, 8.47481028e+02, 7.82112765e+02, 7.16744503e+02, 6.51376241e+02, + 5.86007979e+02, 5.20639716e+02, 4.55271454e+02, 3.89903192e+02, 3.24534930e+02, + 2.59166668e+02, 1.93798405e+02, 1.28430143e+02, 6.30618808e+01, -2.30638144e+00, + -6.76746437e+01, -1.33042906e+02, -1.98411168e+02, -2.63779430e+02, -3.29147693e+02, + -3.94515955e+02, -4.59884217e+02, -5.25252479e+02, -5.90620742e+02, 
-6.55989004e+02, + -7.21357266e+02, -7.86725528e+02, -8.52093791e+02, -9.17462053e+02, 8.77532873e+02, + 8.26060962e+02, 7.74589050e+02, 7.23117138e+02, 6.71645226e+02, 6.20173315e+02, + 5.68701403e+02, 5.17229491e+02, 4.65757579e+02, 4.14285668e+02, 3.62813756e+02, + 3.11341844e+02, 2.59869932e+02, 2.08398020e+02, 1.56926109e+02, 1.05454197e+02, + 5.39822852e+01, 2.51037343e+00, -4.89615383e+01, -1.00433450e+02, -1.51905362e+02, + -2.03377274e+02, -2.54849185e+02, -3.06321097e+02, -3.57793009e+02, -4.09264921e+02, + -4.60736832e+02, -5.12208744e+02, -5.63680656e+02, -6.15152568e+02, -6.66624480e+02, + 7.11479933e+02, 6.73904371e+02, 6.36328810e+02, 5.98753249e+02, 5.61177687e+02, + 5.23602126e+02, 4.86026565e+02, 4.48451003e+02, 4.10875442e+02, 3.73299881e+02, + 3.35724320e+02, 2.98148758e+02, 2.60573197e+02, 2.22997636e+02, 1.85422074e+02, + 1.47846513e+02, 1.10270952e+02, 7.26953905e+01, 3.51198292e+01, -2.45573205e+00, + -4.00312933e+01, -7.76068546e+01, -1.15182416e+02, -1.52757977e+02, -1.90333539e+02, + -2.27909100e+02, -2.65484661e+02, -3.03060222e+02, -3.40635784e+02, -3.78211345e+02, + -4.15786906e+02, 5.45426992e+02, 5.21747781e+02, 4.98068570e+02, 4.74389359e+02, + 4.50710148e+02, 4.27030937e+02, 4.03351727e+02, 3.79672516e+02, 3.55993305e+02, + 3.32314094e+02, 3.08634883e+02, 2.84955673e+02, 2.61276462e+02, 2.37597251e+02, + 2.13918040e+02, 1.90238829e+02, 1.66559618e+02, 1.42880408e+02, 1.19201197e+02, + 9.55219860e+01, 7.18427752e+01, 4.81635644e+01, 2.44843535e+01, 8.05142715e-01, + -2.28740681e+01, -4.65532789e+01, -7.02324897e+01, -9.39117006e+01, -1.17590911e+02, + -1.41270122e+02, -1.64949333e+02, 3.79374051e+02, 3.69591190e+02, 3.59808330e+02, + 3.50025470e+02, 3.40242609e+02, 3.30459749e+02, 3.20676889e+02, 3.10894028e+02, + 3.01111168e+02, 2.91328308e+02, 2.81545447e+02, 2.71762587e+02, 2.61979726e+02, + 2.52196866e+02, 2.42414006e+02, 2.32631145e+02, 2.22848285e+02, 2.13065425e+02, + 2.03282564e+02, 1.93499704e+02, 1.83716844e+02, 1.73933983e+02, 1.64151123e+02, + 1.54368263e+02, 1.44585402e+02, 1.34802542e+02, 1.25019682e+02, 1.15236821e+02, + 1.05453961e+02, 9.56711006e+01, 8.58882402e+01, 2.13321110e+02, 2.17434600e+02, + 2.21548090e+02, 2.25661580e+02, 2.29775070e+02, 2.33888560e+02, 2.38002050e+02, + 2.42115541e+02, 2.46229031e+02, 2.50342521e+02, 2.54456011e+02, 2.58569501e+02, + 2.62682991e+02, 2.66796481e+02, 2.70909971e+02, 2.75023462e+02, 2.79136952e+02, + 2.83250442e+02, 2.87363932e+02, 2.91477422e+02, 2.95590912e+02, 2.99704402e+02, + 3.03817892e+02, 3.07931383e+02, 3.12044873e+02, 3.16158363e+02, 3.20271853e+02, + 3.24385343e+02, 3.28498833e+02, 3.32612323e+02, 3.36725813e+02, 4.72681688e+01, + 6.52780094e+01, 8.32878500e+01, 1.01297691e+02, 1.19307531e+02, 1.37317372e+02, + 1.55327212e+02, 1.73337053e+02, 1.91346894e+02, 2.09356734e+02, 2.27366575e+02, + 2.45376415e+02, 2.63386256e+02, 2.81396097e+02, 2.99405937e+02, 3.17415778e+02, + 3.35425618e+02, 3.53435459e+02, 3.71445300e+02, 3.89455140e+02, 4.07464981e+02, + 4.25474821e+02, 4.43484662e+02, 4.61494503e+02, 4.79504343e+02, 4.97514184e+02, + 5.15524024e+02, 5.33533865e+02, 5.51543705e+02, 5.69553546e+02, 5.87563387e+02, + -1.18784772e+02, -8.68785810e+01, -5.49723899e+01, -2.30661989e+01, 8.83999220e+00, + 4.07461833e+01, 7.26523743e+01, 1.04558565e+02, 1.36464756e+02, 1.68370948e+02, + 2.00277139e+02, 2.32183330e+02, 2.64089521e+02, 2.95995712e+02, 3.27901903e+02, + 3.59808094e+02, 3.91714285e+02, 4.23620476e+02, 4.55526667e+02, 4.87432858e+02, + 5.19339049e+02, 5.51245240e+02, 5.83151431e+02, 
6.15057622e+02, 6.46963814e+02, + 6.78870005e+02, 7.10776196e+02, 7.42682387e+02, 7.74588578e+02, 8.06494769e+02, + 8.38400960e+02, -2.84837713e+02, -2.39035171e+02, -1.93232630e+02, -1.47430088e+02, + -1.01627547e+02, -5.58250053e+01, -1.00224638e+01, 3.57800778e+01, 8.15826193e+01, + 1.27385161e+02, 1.73187702e+02, 2.18990244e+02, 2.64792785e+02, 3.10595327e+02, + 3.56397869e+02, 4.02200410e+02, 4.48002952e+02, 4.93805493e+02, 5.39608035e+02, + 5.85410576e+02, 6.31213118e+02, 6.77015659e+02, 7.22818201e+02, 7.68620742e+02, + 8.14423284e+02, 8.60225825e+02, 9.06028367e+02, 9.51830909e+02, 9.97633450e+02, + 1.04343599e+03, 1.08923853e+03, -4.50890654e+02, -3.91191762e+02, -3.31492870e+02, + -2.71793978e+02, -2.12095086e+02, -1.52396194e+02, -9.26973018e+01, -3.29984098e+01, + 2.67004822e+01, 8.63993742e+01, 1.46098266e+02, 2.05797158e+02, 2.65496050e+02, + 3.25194942e+02, 3.84893834e+02, 4.44592726e+02, 5.04291618e+02, 5.63990510e+02, + 6.23689402e+02, 6.83388294e+02, 7.43087186e+02, 8.02786078e+02, 8.62484970e+02, + 9.22183862e+02, 9.81882754e+02, 1.04158165e+03, 1.10128054e+03, 1.16097943e+03, + 1.22067832e+03, 1.28037721e+03, 1.34007611e+03, -6.16943595e+02, -5.43348352e+02, + -4.69753110e+02, -3.96157867e+02, -3.22562625e+02, -2.48967382e+02, -1.75372140e+02, + -1.01776897e+02, -2.81816550e+01, 4.54135875e+01, 1.19008830e+02, 1.92604072e+02, + 2.66199315e+02, 3.39794557e+02, 4.13389800e+02, 4.86985042e+02, 5.60580285e+02, + 6.34175527e+02, 7.07770770e+02, 7.81366012e+02, 8.54961255e+02, 9.28556497e+02, + 1.00215174e+03, 1.07574698e+03, 1.14934222e+03, 1.22293747e+03, 1.29653271e+03, + 1.37012795e+03, 1.44372319e+03, 1.51731844e+03, 1.59091368e+03, -7.82996536e+02, + -6.95504943e+02, -6.08013350e+02, -5.20521757e+02, -4.33030164e+02, -3.45538571e+02, + -2.58046978e+02, -1.70555385e+02, -8.30637921e+01, 4.42780085e+00, 9.19193938e+01, + 1.79410987e+02, 2.66902580e+02, 3.54394173e+02, 4.41885766e+02, 5.29377359e+02, + 6.16868952e+02, 7.04360544e+02, 7.91852137e+02, 8.79343730e+02, 9.66835323e+02, + 1.05432692e+03, 1.14181851e+03, 1.22931010e+03, 1.31680170e+03, 1.40429329e+03, + 1.49178488e+03, 1.57927647e+03, 1.66676807e+03, 1.75425966e+03, 1.84175125e+03, + -9.49049477e+02, -8.47661533e+02, -7.46273590e+02, -6.44885646e+02, -5.43497703e+02, + -4.42109760e+02, -3.40721816e+02, -2.39333873e+02, -1.37945929e+02, -3.65579858e+01, + 6.48299576e+01, 1.66217901e+02, 2.67605844e+02, 3.68993788e+02, 4.70381731e+02, + 5.71769675e+02, 6.73157618e+02, 7.74545562e+02, 8.75933505e+02, 9.77321448e+02, + 1.07870939e+03, 1.18009734e+03, 1.28148528e+03, 1.38287322e+03, 1.48426117e+03, + 1.58564911e+03, 1.68703705e+03, 1.78842500e+03, 1.88981294e+03, 1.99120088e+03, + 2.09258883e+03, -1.11510242e+03, -9.99818124e+02, -8.84533830e+02, -7.69249536e+02, + -6.53965242e+02, -5.38680948e+02, -4.23396654e+02, -3.08112360e+02, -1.92828066e+02, + -7.75437725e+01, 3.77405214e+01, 1.53024815e+02, 2.68309109e+02, 3.83593403e+02, + 4.98877697e+02, 6.14161991e+02, 7.29446285e+02, 8.44730579e+02, 9.60014873e+02, + 1.07529917e+03, 1.19058346e+03, 1.30586775e+03, 1.42115205e+03, 1.53643634e+03, + 1.65172064e+03, 1.76700493e+03, 1.88228922e+03, 1.99757352e+03, 2.11285781e+03, + 2.22814211e+03, 2.34342640e+03, -1.28115536e+03, -1.15197471e+03, -1.02279407e+03, + -8.93613425e+02, -7.64432781e+02, -6.35252137e+02, -5.06071492e+02, -3.76890848e+02, + -2.47710204e+02, -1.18529559e+02, 1.06510852e+01, 1.39831730e+02, 2.69012374e+02, + 3.98193018e+02, 5.27373663e+02, 6.56554307e+02, 7.85734951e+02, 9.14915596e+02, + 
1.04409624e+03, 1.17327688e+03, 1.30245753e+03, 1.43163817e+03, 1.56081882e+03, + 1.68999946e+03, 1.81918011e+03, 1.94836075e+03, 2.07754140e+03, 2.20672204e+03, + 2.33590268e+03, 2.46508333e+03, 2.59426397e+03, -1.44720830e+03, -1.30413130e+03, + -1.16105431e+03, -1.01797731e+03, -8.74900320e+02, -7.31823325e+02, -5.88746330e+02, + -4.45669336e+02, -3.02592341e+02, -1.59515346e+02, -1.64383510e+01, 1.26638644e+02, + 2.69715639e+02, 4.12792634e+02, 5.55869628e+02, 6.98946623e+02, 8.42023618e+02, + 9.85100613e+02, 1.12817761e+03, 1.27125460e+03, 1.41433160e+03, 1.55740859e+03, + 1.70048559e+03, 1.84356258e+03, 1.98663958e+03, 2.12971657e+03, 2.27279357e+03, + 2.41587056e+03, 2.55894756e+03, 2.70202455e+03, 2.84510155e+03, -1.61326124e+03, + -1.45628789e+03, -1.29931455e+03, -1.14234120e+03, -9.85367859e+02, -8.28394514e+02, + -6.71421168e+02, -5.14447823e+02, -3.57474478e+02, -2.00501133e+02, -4.35277872e+01, + 1.13445558e+02, 2.70418903e+02, 4.27392249e+02, 5.84365594e+02, 7.41338939e+02, + 8.98312285e+02, 1.05528563e+03, 1.21225898e+03, 1.36923232e+03, 1.52620567e+03, + 1.68317901e+03, 1.84015236e+03, 1.99712570e+03, 2.15409905e+03, 2.31107239e+03, + 2.46804574e+03, 2.62501908e+03, 2.78199243e+03, 2.93896577e+03, 3.09593912e+03 + ]) +} + # continuous wavetlet transform using the Haar wavelet from pywavelets, after # patching with pywavelets issue #365 and pywavelets pull request #580 to make # pywavelets's cwt work with the Haar wavelet; the input array was: diff --git a/tests/test_algorithm_setup.py b/tests/test_algorithm_setup.py index c196611..c2f4fa2 100644 --- a/tests/test_algorithm_setup.py +++ b/tests/test_algorithm_setup.py @@ -9,10 +9,10 @@ import numpy as np from numpy.testing import assert_allclose, assert_array_equal import pytest -from scipy.sparse import dia_matrix from pybaselines import _algorithm_setup, optimizers, polynomial, whittaker from pybaselines.utils import ParameterWarning +from pybaselines._compat import dia_object from .conftest import get_data @@ -48,7 +48,7 @@ def test_setup_whittaker_diff_matrix(small_data, algorithm, lam, diff_order, ) numpy_diff = np.diff(np.eye(small_data.shape[0]), diff_order, 0) - desired_diagonals = dia_matrix(lam * (numpy_diff.T @ numpy_diff)).data[::-1] + desired_diagonals = dia_object(lam * (numpy_diff.T @ numpy_diff)).data[::-1] if allow_lower and not algorithm.whittaker_system.using_pentapy: # only include the lower diagonals desired_diagonals = desired_diagonals[diff_order:] @@ -202,6 +202,24 @@ def test_setup_polynomial_vandermonde(small_data, algorithm, vander_enum, includ assert_allclose(desired_pinv, pinv_matrix, 1e-10) +def test_setup_polynomial_negative_polyorder_fails(small_data, algorithm): + """Ensures a negative poly_order raises an exception.""" + with pytest.raises(ValueError): + algorithm._setup_polynomial(small_data, poly_order=-1) + + +def test_setup_polynomial_too_large_polyorder_fails(small_data, algorithm): + """Ensures an exception is raised if poly_order has more than one value.""" + with pytest.raises(ValueError): + algorithm._setup_polynomial(small_data, poly_order=[1, 2]) + + with pytest.raises(ValueError): + algorithm._setup_polynomial(small_data, poly_order=[1, 2, 3]) + + with pytest.raises(ValueError): + algorithm._setup_polynomial(small_data, poly_order=np.array([1, 2])) + + def test_setup_smooth_shape(small_data, algorithm): """Ensures output y is correctly padded.""" pad_length = 4 @@ -270,7 +288,7 @@ def test_setup_spline_diff_matrix(small_data, lam, diff_order, spline_degree, nu num_bases = 
num_knots + spline_degree - 1 numpy_diff = np.diff(np.eye(num_bases), diff_order, axis=0) - desired_diagonals = lam * dia_matrix(numpy_diff.T @ numpy_diff).data[::-1][diff_order:] + desired_diagonals = lam * dia_object(numpy_diff.T @ numpy_diff).data[::-1][diff_order:] if diff_order < spline_degree: padding = np.zeros((spline_degree - diff_order, desired_diagonals.shape[1])) desired_diagonals = np.concatenate((desired_diagonals, padding)) @@ -349,6 +367,32 @@ def test_setup_spline_negative_lam_fails(small_data): ) +@pytest.mark.parametrize('weight_enum', (0, 1, 2, 3)) +def test_setup_spline_weights(small_data, algorithm, weight_enum): + """Ensures output weight array is correct.""" + if weight_enum == 0: + # no weights specified + weights = None + desired_weights = np.ones_like(small_data) + elif weight_enum == 1: + # uniform 1 weighting + weights = np.ones_like(small_data) + desired_weights = weights.copy() + elif weight_enum == 2: + # different weights for all points + weights = np.arange(small_data.shape[0]) + desired_weights = np.arange(small_data.shape[0]) + elif weight_enum == 3: + # different weights for all points, and weights input as a list + weights = np.arange(small_data.shape[0]).tolist() + desired_weights = np.arange(small_data.shape[0]) + + _, weight_array = algorithm._setup_spline(small_data, lam=1, diff_order=2, weights=weights) + + assert isinstance(weight_array, np.ndarray) + assert_array_equal(weight_array, desired_weights) + + def test_setup_spline_array_lam(small_data): """Ensures a lam that is a single array passes while larger arrays fail.""" _algorithm_setup._Algorithm(np.arange(len(small_data)))._setup_spline(small_data, lam=[1]) @@ -466,10 +510,9 @@ def test_algorithm_class_init(input_x, check_finite, assume_sorted, output_dtype else: assert algorithm._len is None - if not assume_sorted and input_x: + if not assume_sorted and change_order and input_x: order = np.arange(len(x)) - if change_order: - order[sort_order] = order[sort_order][::-1] + order[sort_order] = order[sort_order][::-1] assert_array_equal(algorithm._sort_order, order) assert_array_equal(algorithm._inverted_order, order.argsort()) else: @@ -548,6 +591,7 @@ class SubClass(_algorithm_setup._Algorithm): # 'a' values will be sorted and 'b' values will be kept the same @_algorithm_setup._Algorithm._register(sort_keys=('a',)) def func(self, data, *args, **kwargs): + """For checking sorting of output parameters.""" expected_x = np.arange(20) if change_order and assume_sorted: expected_x[sort_indices] = expected_x[sort_indices][::-1] diff --git a/tests/test_api.py b/tests/test_api.py index b76e1b2..1b83efd 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -46,7 +46,11 @@ def get_public_methods(klass): """ methods = [] for method in dir(klass): - if not (method.startswith('_') or method.startswith('pentapy_solver')): + if ( + not (method.startswith('_') + or method.startswith('pentapy_solver') + or method.startswith('get_method')) + ): methods.append(method) return methods @@ -107,13 +111,13 @@ def test_all_methods(self, method_and_class): baseline_class(self.x, check_finite=False, assume_sorted=True), method )(fit_data, **kwargs) - assert_allclose(api_baseline, class_baseline, rtol=1e-14, atol=1e-14) + assert_allclose(api_baseline, class_baseline, rtol=1e-12, atol=1e-12) assert len(api_params.keys()) == len(class_params.keys()) for key, value in api_params.items(): assert key in class_params class_value = class_params[key] if isinstance(value, (int, float, np.ndarray, list, tuple)): - 
assert_allclose(value, class_value, rtol=1e-14, atol=1e-14) + assert_allclose(value, class_value, rtol=1e-12, atol=1e-12) else: assert value == class_value @@ -147,3 +151,17 @@ def test_pentapy_solver(self): fitter.pentapy_solver = 3 assert fitter.whittaker_system.pentapy_solver == fitter.pentapy_solver + + def test_get_method(self): + """Ensures the get_method helper function works as intended.""" + method = self.algorithm._get_method('asls') + assert method == self.algorithm.asls + + # also ensure capitalization does not matter + method2 = self.algorithm._get_method('AsLS') + assert method2 == self.algorithm.asls + + def test_get_method_fails(self): + """Ensures the get_method helper function fails when an incorrect name is given.""" + with pytest.raises(AttributeError): + self.algorithm._get_method('aaaaaaaaaaaaa') diff --git a/tests/test_banded_utils.py b/tests/test_banded_utils.py index 667d386..fbc2543 100644 --- a/tests/test_banded_utils.py +++ b/tests/test_banded_utils.py @@ -9,10 +9,10 @@ import numpy as np from numpy.testing import assert_allclose, assert_array_equal import pytest -from scipy.sparse import diags, identity, spdiags from scipy.sparse.linalg import spsolve from pybaselines import _banded_utils, _spline_utils +from pybaselines._compat import diags, dia_object, identity from .conftest import has_pentapy @@ -107,6 +107,37 @@ def test_diff_penalty_diagonals_datasize_too_small(): _banded_utils.diff_penalty_diagonals(-1) + +@pytest.mark.parametrize('data_size', (10, 51)) +@pytest.mark.parametrize('diff_order', (1, 2, 3, 4)) +def test_diff_penalty_matrix(data_size, diff_order): + """Ensures the penalty matrix shortcut works correctly.""" + diff_matrix = _banded_utils.difference_matrix(data_size, diff_order) + expected_matrix = diff_matrix.T @ diff_matrix + + output = _banded_utils.diff_penalty_matrix(data_size, diff_order) + + assert_allclose(expected_matrix.toarray(), output.toarray(), rtol=1e-12, atol=1e-12) + + +@pytest.mark.parametrize('data_size', (3, 6)) +@pytest.mark.parametrize('diff_order', (1, 2, 3, 4)) +def test_diff_penalty_matrix_too_few_data(data_size, diff_order): + """Ensures the penalty matrix shortcut works correctly.""" + diff_matrix = _banded_utils.difference_matrix(data_size, diff_order) + expected_matrix = diff_matrix.T @ diff_matrix + + if data_size <= diff_order: + with pytest.raises(ValueError): + _banded_utils.diff_penalty_matrix(data_size, diff_order) + # the actual matrix should be just zeros + actual_result = np.zeros((data_size, data_size)) + assert_allclose(actual_result, expected_matrix.toarray(), rtol=1e-12, atol=1e-12) + else: + output = _banded_utils.diff_penalty_matrix(data_size, diff_order) + assert_allclose(output.toarray(), expected_matrix.toarray(), rtol=1e-12, atol=1e-12) + + def test_shift_rows_2_diags(): """Ensures rows are correctly shifted for a matrix with two off-diagonals on either side.""" matrix = np.array([ @@ -273,14 +304,14 @@ def test_add_diagonals(diff_order_1, diff_order_2, lower_only): a_offsets = np.arange(diff_order_1, -diff_order_1 - 1, -1) b_offsets = np.arange(diff_order_2, -diff_order_2 - 1, -1) - a_matrix = spdiags( - _banded_utils.diff_penalty_diagonals(points, diff_order_1, False), - a_offsets, points, points, 'csr' - ) - b_matrix = spdiags( - _banded_utils.diff_penalty_diagonals(points, diff_order_2, False), - b_offsets, points, points, 'csr' - ) + a_matrix = dia_object( + (_banded_utils.diff_penalty_diagonals(points, diff_order_1, False), a_offsets), + shape=(points, points) + ).tocsr() + b_matrix = 
dia_object( + (_banded_utils.diff_penalty_diagonals(points, diff_order_2, False), b_offsets), + shape=(points, points) + ).tocsr() expected_output = (a_matrix + b_matrix).todia().data[::-1] if lower_only: expected_output = expected_output[len(expected_output) // 2:] @@ -544,11 +575,10 @@ def test_penalized_system_solve(data_fixture, diff_order, allow_lower, allow_pen expected_penalty = _banded_utils.diff_penalty_diagonals( data_size, diff_order=diff_order, lower_only=False ) - sparse_penalty = spdiags( - lam * expected_penalty, np.arange(diff_order, -(diff_order + 1), -1), - data_size, data_size, 'csr' - - ) + sparse_penalty = dia_object( + (lam * expected_penalty, np.arange(diff_order, -(diff_order + 1), -1)), + shape=(data_size, data_size) + ).tocsr() expected_solution = spsolve(identity(data_size, format='csr') + sparse_penalty, y) penalized_system = _banded_utils.PenalizedSystem( @@ -687,9 +717,9 @@ def test_penalized_system_add_diagonal_after_penalty(data_size, diff_order, allo additional_penalty = _banded_utils.diff_penalty_diagonals( data_size, penalty_order, lower_only=False ) - additional_penalty_matrix = spdiags( - additional_penalty, np.arange(penalty_order, -penalty_order - 1, -1), data_size, - data_size + additional_penalty_matrix = dia_object( + (additional_penalty, np.arange(penalty_order, -penalty_order - 1, -1)), + shape=(data_size, data_size) ) total_penalty = penalty + additional_penalty_matrix diff --git a/tests/test_classification.py b/tests/test_classification.py index d6c9009..7dabe32 100644 --- a/tests/test_classification.py +++ b/tests/test_classification.py @@ -10,7 +10,6 @@ from numpy.testing import assert_allclose, assert_array_equal import pytest import scipy -from scipy.signal import cwt from pybaselines import classification from pybaselines.utils import ParameterWarning, whittaker_smooth @@ -261,7 +260,7 @@ def test_haar_cwt_comparison_to_pywavelets(scale): y = np.zeros(100) y[50] = 1 - haar_cwt = cwt(y, classification._haar, [scale])[0] + haar_cwt = classification._cwt(y, classification._haar, [scale])[0] # test absolute tolerance rather than relative tolerance since # some values are very close to 0 assert_allclose(haar_cwt**2, PYWAVELETS_HAAR[scale]**2, 0, 1e-14) diff --git a/tests/test_compat.py b/tests/test_compat.py index 8f90ae4..adc588a 100644 --- a/tests/test_compat.py +++ b/tests/test_compat.py @@ -6,9 +6,13 @@ """ -from numpy.testing import assert_array_equal +from unittest import mock + +import numpy as np +from numpy.testing import assert_array_equal, assert_allclose import pytest -from scipy import integrate +import scipy +from scipy import integrate, sparse from pybaselines import _compat @@ -145,8 +149,179 @@ def _add4(a, b): def test_trapezoid(): - """Ensures the trapezoid integration function within scipy is correctly used.""" + """ + Ensures the trapezoid integration function within scipy is correctly used. + + Rather than checking equality with the expected function, just check that + it works correctly. + """ + data = [1., 2., 3.] + output = _compat.trapezoid(data) + assert_allclose(output, 4.0, rtol=0, atol=1e-14) + if hasattr(integrate, 'trapezoid'): - assert _compat.trapezoid is integrate.trapezoid + comparison_func = integrate.trapezoid + else: + comparison_func = integrate.trapz + + assert_allclose(output, comparison_func(data), rtol=0, atol=1e-14) + + +def _scipy_below_1_12(): + """ + Checks that the installed scipy version is new enough to use sparse arrays. 
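+
+    As a rough sketch, assuming the version string is semantic, the check is
+    equivalent to::
+
+        major, minor = (int(val) for val in scipy.__version__.split('.')[:2])
+        below_1_12 = (major, minor) < (1, 12)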
+ + This check is wrapped into a function just in case parsing the version fails, so + that the tests can still be collected without error. + + Returns + ------- + bool + True if the installed scipy version is below 1.12; False otherwise. + + Notes + ----- + Scipy introduced its sparse arrays in version 1.8, but the interface and helper + functions were not stable until version 1.12; a warning will be emitted in scipy + 1.13 when using the matrix interface, so the sparse array interface should be + used as early as possible. + + """ + try: + _scipy_version = [int(val) for val in scipy.__version__.lstrip('v').split('.')[:2]] + except Exception: + # in case in the far future scipy stops using semantic versioning; probably + # bigger problems than this check at that point, so just return False and + # assume a new scipy + return False + + return not (_scipy_version[0] > 1 or (_scipy_version[0] == 1 and _scipy_version[1] >= 12)) + + +def test_use_sparse_arrays(): + """ + Ensures the scipy version check works correctly. + + Use try-finally so that even if the test fails, the mocked values do + not remain, which would cause subsequent tests to fail. + """ + try: + _compat._use_sparse_arrays.cache_clear() + # sanity check that cache was cleared + assert _compat._use_sparse_arrays.cache_info().currsize == 0 + with mock.patch.object(scipy, '__version__', '0.1'): + assert not _compat._use_sparse_arrays() + + _compat._use_sparse_arrays.cache_clear() + # sanity check that cache was cleared + assert _compat._use_sparse_arrays.cache_info().currsize == 0 + with mock.patch.object(scipy, '__version__', '1.11'): + assert not _compat._use_sparse_arrays() + + _compat._use_sparse_arrays.cache_clear() + # sanity check that cache was cleared + assert _compat._use_sparse_arrays.cache_info().currsize == 0 + with mock.patch.object(scipy, '__version__', '1.12'): + assert _compat._use_sparse_arrays() + + _compat._use_sparse_arrays.cache_clear() + # sanity check that cache was cleared + assert _compat._use_sparse_arrays.cache_info().currsize == 0 + with mock.patch.object(scipy, '__version__', '2.0'): + assert _compat._use_sparse_arrays() + + _compat._use_sparse_arrays.cache_clear() + # sanity check that cache was cleared + assert _compat._use_sparse_arrays.cache_info().currsize == 0 + # check that it returns True when an error reading the scipy version occurs + with mock.patch.object(scipy, '__version__', 'abc'): + assert _compat._use_sparse_arrays() + finally: + _compat._use_sparse_arrays.cache_clear() + # ensure the cache is cleared so that the next call fills it with the correct value + assert _compat._use_sparse_arrays.cache_info().currsize == 0 + + +def test_dia_object(): + """Ensures the compatibility for dia_matrix and dia_array works as intended.""" + data = np.array([ + [1, 2, 0], + [4, 5, 6], + [0, 8, 9] + ]) + offsets = [-1, 0, 1] + output = _compat.dia_object((data, offsets), shape=(3, 3)) + + expected_output = np.array([ + [4, 8, 0], + [1, 5, 9], + [0, 2, 6] + ]) + + assert sparse.issparse(output) + assert output.format == 'dia' + assert_allclose(output.toarray(), expected_output, rtol=0, atol=1e-14) + if _scipy_below_1_12(): + assert sparse.isspmatrix(output) + else: + assert not sparse.isspmatrix(output) + + +def test_csr_object(): + """Ensures the compatibility for csr_matrix and csr_array works as intended.""" + row = np.array([0, 1, 1, 2]) + col = np.array([0, 0, 2, 0]) + data = np.array([3, 5, 7, 9]) + output = _compat.csr_object((data, (row, col)), shape=(3, 3)) + + 
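+ # (data, (row, col)) is COO-style input: data[k] is placed at (row[k], col[k]), + # so the entries 5 and 7 both fall in row 1 of the dense output below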
expected_output = np.array([ + [3, 0, 0], + [5, 0, 7], + [9, 0, 0] + ]) + + assert sparse.issparse(output) + assert output.format == 'csr' + assert_allclose(output.toarray(), expected_output, rtol=0, atol=1e-14) + if _scipy_below_1_12(): + assert sparse.isspmatrix(output) + else: + assert not sparse.isspmatrix(output) + + +@pytest.mark.parametrize('sparse_format', ('csc', 'csr', 'dia')) +@pytest.mark.parametrize('size', (1, 3, 6)) +def test_identity(size, sparse_format): + """Ensures the sparse identity function works correctly.""" + output = _compat.identity(size, format=sparse_format) + + assert sparse.issparse(output) + assert output.format == sparse_format + assert_allclose(output.toarray(), np.eye(size), rtol=0, atol=1e-14) + if _scipy_below_1_12(): + assert sparse.isspmatrix(output) + else: + assert not sparse.isspmatrix(output) + + +@pytest.mark.parametrize('sparse_format', ('csc', 'csr', 'dia')) +def test_diags(sparse_format): + """Ensures the sparse diags function works as intended.""" + data = [-1, 2, 1] + offsets = [-1, 0, 1] + output = _compat.diags(data, offsets=offsets, shape=(3, 3), format=sparse_format) + + expected_output = np.array([ + [2, 1, 0], + [-1, 2, 1], + [0, -1, 2] + ]) + + assert sparse.issparse(output) + assert output.format == sparse_format + assert_allclose(output.toarray(), expected_output, rtol=0, atol=1e-14) + if _scipy_below_1_12(): + assert sparse.isspmatrix(output) else: - assert _compat.trapezoid is integrate.trapz + assert not sparse.isspmatrix(output) diff --git a/tests/test_meta.py b/tests/test_meta.py index 75d483c..ccea8c4 100644 --- a/tests/test_meta.py +++ b/tests/test_meta.py @@ -12,7 +12,10 @@ from numpy.testing import assert_allclose import pytest -from .conftest import BaseTester, BasePolyTester, dummy_wrapper, get_data +from .conftest import ( + BaseTester, BaseTester2D, BasePolyTester, InputWeightsMixin, dummy_wrapper, get_data, + get_data2d +) class DummyModule: @@ -48,6 +51,35 @@ def good_poly_func(data, x_data=None, return_coef=False, **kwargs): return baseline, params + @staticmethod + def bad_poly_func(data, x_data=None, return_coef=False, **kwargs): + """A bad polynomial algorithm.""" + params = {'a': 1} + if not return_coef: + params['coef'] = np.zeros(5) + + return np.ones_like(data), params + + @staticmethod + def good_weights_func(data, x_data=None, weights=None, **kwargs): + """A good algorithm that can take weights.""" + return np.ones_like(data), {'a': 1, 'weights': np.ones_like(data)} + + @staticmethod + def good_mask_func(data, x_data=None, weights=None, **kwargs): + """A good algorithm that can take weights and outputs them as the 'mask' key.""" + return np.ones_like(data), {'a': 1, 'mask': np.ones_like(data)} + + @staticmethod + def bad_weights_func(data, x_data=None, weights=None, **kwargs): + """An algorithm that incorrectly uses weights.""" + return np.ones_like(data), {'a': 1, 'weights': np.arange(len(data))} + + @staticmethod + def bad_weights_func_no_weights(data, x_data=None, weights=None, **kwargs): + """An algorithm that does not include weights in the output parameters.""" + return np.ones_like(data), {'a': 1} + @staticmethod def change_y(data, x_data=None): """Changes the input data values, which is unwanted.""" @@ -148,10 +180,11 @@ def different_x_ordering(data=None, x_data=None): class DummyAlgorithm: - """A dummy object to serve as a fake Algorithm subclass.""" + """A dummy object to serve as a fake Algorithm and Algorithm2D subclass.""" - def __init__(self, x_data=None, *args, **kwargs): + def __init__(self, 
x_data=None, z_data=None, *args, **kwargs): self.x = x_data + self.z = z_data self.calls = 0 @dummy_wrapper @@ -171,6 +204,41 @@ def good_poly_func(self, data, return_coef=False, **kwargs): data=data, x_data=self.x, return_coef=return_coef, **kwargs ) + @dummy_wrapper + def bad_poly_func(self, data, return_coef=False, **kwargs): + """A bad polynomial algorithm.""" + return DummyModule.bad_poly_func( + data=data, x_data=self.x, return_coef=return_coef, **kwargs + ) + + @dummy_wrapper + def good_weights_func(self, data, weights=None, **kwargs): + """A good algorithm that can take weights.""" + return DummyModule.good_weights_func( + data=data, x_data=self.x, weights=weights, **kwargs + ) + + @dummy_wrapper + def good_mask_func(self, data, weights=None, **kwargs): + """A good algorithm that can take weights and outputs them as the 'mask' key.""" + return DummyModule.good_mask_func( + data=data, x_data=self.x, weights=weights, **kwargs + ) + + @dummy_wrapper + def bad_weights_func(self, data, weights=None, **kwargs): + """An algorithm that incorrectly uses weights.""" + return DummyModule.bad_weights_func( + data=data, x_data=self.x, weights=weights, **kwargs + ) + + @dummy_wrapper + def bad_weights_func_no_weights(self, data, weights=None, **kwargs): + """An algorithm that does not include weights in the output parameters.""" + return DummyModule.bad_weights_func_no_weights( + data=data, x_data=self.x, weights=weights, **kwargs + ) + @dummy_wrapper def change_y(self, data): """Changes the input data values, which is unwanted.""" @@ -180,7 +248,13 @@ def change_y(self, data): @dummy_wrapper def change_x(self, data): """Changes the input x-data values, which is unwanted.""" - self.x[0] = 200000 + self.x[0] = self.x[0] + 5 return data, {} + + @dummy_wrapper + def change_z(self, data): + """Changes the input z-data values, which is unwanted.""" + self.z[0] += 5 return data, {} @dummy_wrapper @@ -268,6 +342,24 @@ def different_x_ordering(self, data=None): """Gives different output depending on the x-value sorting.""" return data[np.argsort(self.x)], {} + @dummy_wrapper + def different_z_ordering(self, data=None): + """Gives different output depending on the z-value sorting.""" + return data[(..., np.argsort(self.z))], {} + + @dummy_wrapper + def different_xz_output(self, data=None): + """Gives different output depending on the x-values and z-values.""" + if self.x is None or self.z is None: + return data, {} + else: + return 10 * data, {} + + @dummy_wrapper + def different_xz_ordering(self, data=None): + """Gives different output depending on the x-value and z-value sorting.""" + return data[np.argsort(self.x)[:, None], np.argsort(self.z)[None, :]], {} + class TestBaseTesterWorks(BaseTester): """Ensures a basic subclass of BaseTester works.""" @@ -289,10 +381,20 @@ def test_setup(self): assert callable(self.class_func) assert self.kwargs == {'key': 1} assert self.param_keys == ['a'] + assert not self.two_d + + def test_reverse_array(self): + """Ensures the reverse_array function works correctly.""" + assert_allclose(self.reverse_array(self.y), self.y[..., ::-1]) class TestBaseTesterWorks2d(BaseTester): - """Ensures a basic subclass of BaseTester works for a two dimensional algorithm.""" + """ + Ensures a basic subclass of BaseTester works for a two dimensional algorithm. + + Note: this is for one dimensional algorithms that take two dimensional data, not + for two dimensional algorithms. 
+ """ module = DummyModule algorithm_base = DummyAlgorithm @@ -306,6 +408,12 @@ def test_setup(self): assert_allclose(self.y, np.vstack((expected_y, expected_y)), rtol=1e-14, atol=1e-14) assert self.kwargs == {} assert self.param_keys == [] + assert self.two_d + + + def test_reverse_array(self): + """Ensures the reverse_array funcion works correctly.""" + assert_allclose(self.reverse_array(self.y), self.y[..., ::-1]) class TestBaseTesterFailures(BaseTester): @@ -477,3 +585,287 @@ class TestBasePolyTesterWorks(BasePolyTester): algorithm_base = DummyAlgorithm func_name = 'good_poly_func' checked_keys = ['a'] + + +class TestBasePolyTesterFailures(BasePolyTester): + """Tests the various BasePolyTester methods for functions with incorrect output.""" + + module = DummyModule + algorithm_base = DummyAlgorithm + func_name = 'bad_poly_func' + checked_keys = ['a'] + + @pytest.mark.parametrize('return_coef', (True, False)) + def test_output(self, return_coef): + """Ensures failure if the coefficients are not correctly returned.""" + with pytest.raises(AssertionError): + super().test_output(return_coef=return_coef) + + def test_output_coefs(self): + """Ensures failure if the coefficients cannot recreate the output baseline.""" + with pytest.raises(AssertionError): + super().test_output_coefs() + + +class TestInputWeightsMixinWorks(BaseTester, InputWeightsMixin): + """Ensures a basic subclass of InputWeightsMixin works.""" + + module = DummyModule + algorithm_base = DummyAlgorithm + func_name = 'good_weights_func' + checked_keys = ['a', 'weights'] + required_kwargs = {'key': 1} + + @contextmanager + def set_func(self, func_name, checked_keys=None, weight_key=None): + """Temporarily sets a new function for the class.""" + original_name = self.func_name + original_keys = self.param_keys + original_weight_key = self.weight_keys + try: + self.__class__.func_name = func_name + self.__class__.checked_keys = checked_keys + self.__class__.weight_keys = weight_key + self.__class__.setup_class() + yield self + finally: + self.__class__.func_name = original_name + self.__class__.checked_keys = original_keys + self.__class__.weight_keys = original_weight_key + self.__class__.setup_class() + + def test_input_weights(self): + """Ensures weight testing works for different weight keys in the parameter dictionary.""" + super().test_input_weights() + with self.set_func('good_mask_func', weight_key=('mask',), checked_keys=('a', 'mask')): + super().test_input_weights() + + +class TestInputWeightsMixinFails(BaseTester, InputWeightsMixin): + """Tests the various BasePolyTester methods for functions with incorrect output.""" + + module = DummyModule + algorithm_base = DummyAlgorithm + func_name = 'bad_weights_func' + checked_keys = ['a', 'weights'] + required_kwargs = {'key': 1} + + @contextmanager + def set_func(self, func_name, checked_keys=None, weight_key=('weights',)): + """Temporarily sets a new function for the class.""" + original_name = self.func_name + original_keys = self.param_keys + original_weight_key = self.weight_keys + try: + self.__class__.func_name = func_name + self.__class__.checked_keys = checked_keys + self.__class__.weight_keys = weight_key + self.__class__.setup_class() + yield self + finally: + self.__class__.func_name = original_name + self.__class__.checked_keys = original_keys + self.__class__.weight_keys = original_weight_key + self.__class__.setup_class() + + def test_input_weights(self): + """Ensures weight testing works for different weight keys in the parameter dictionary.""" + with 
pytest.raises(AssertionError): + super().test_input_weights() + + def test_has_no_weights(self): + """Ensures failure occurs if the weight key is not present in the parameter dictionary.""" + with self.set_func('bad_weights_func_no_weights', checked_keys=('a',)): + with pytest.raises(AssertionError): + super().test_input_weights() + + +class TestBaseTester2DWorks(BaseTester2D): + """Ensures a basic subclass of BaseTester2D works.""" + + module = DummyModule + algorithm_base = DummyAlgorithm + func_name = 'good_func' + checked_keys = ['a'] + required_kwargs = {'key': 1} + + def test_setup(self): + """Ensures the `setup_class` class method is done correctly.""" + expected_x, expected_z, expected_y = get_data2d() + assert_allclose(self.x, expected_x, rtol=1e-14, atol=1e-14) + assert_allclose(self.z, expected_z, rtol=1e-14, atol=1e-14) + assert_allclose(self.y, expected_y, rtol=1e-14, atol=1e-14) + assert issubclass(self.algorithm_base, DummyAlgorithm) + assert isinstance(self.algorithm, DummyAlgorithm) + assert callable(self.class_func) + assert self.kwargs == {'key': 1} + assert self.param_keys == ['a'] + assert not self.three_d + + def test_reverse_array(self): + """Ensures the reverse_array function works correctly.""" + assert_allclose(self.reverse_array(self.y), self.y[..., ::-1, ::-1]) + + +class TestBaseTester2DWorks3d(BaseTester2D): + """ + Ensures a basic subclass of BaseTester2D works for a two dimensional algorithm. + + Note: this is for two dimensional algorithms that take three dimensional data, not + for three dimensional algorithms. + """ + + module = DummyModule + algorithm_base = DummyAlgorithm + func_name = 'good_func2' + three_d = True + + def test_setup(self): + """Ensures the `setup_class` class method is done correctly.""" + expected_x, expected_z, expected_y = get_data2d() + assert_allclose(self.x, expected_x, rtol=1e-14, atol=1e-14) + assert_allclose(self.z, expected_z, rtol=1e-14, atol=1e-14) + assert_allclose(self.y, np.array((expected_y, expected_y)), rtol=1e-14, atol=1e-14) + assert self.kwargs == {} + assert self.param_keys == [] + assert self.three_d + + def test_reverse_array(self): + """Ensures the reverse_array function works correctly.""" + assert_allclose(self.reverse_array(self.y), self.y[..., ::-1, ::-1]) + + +class TestBaseTester2DFailures(BaseTester2D): + """Tests the various BaseTester2D methods for functions with incorrect output.""" + + module = DummyModule + algorithm_base = DummyAlgorithm + func_name = 'no_func' + + @contextmanager + def set_func(self, func_name, checked_keys=None): + """Temporarily sets a new function for the class.""" + original_name = self.func_name + original_keys = self.param_keys + try: + self.__class__.func_name = func_name + self.__class__.checked_keys = checked_keys + self.__class__.setup_class() + yield self + finally: + self.__class__.func_name = original_name + self.__class__.checked_keys = original_keys + self.__class__.setup_class() + + def test_ensure_wrapped(self): + """Ensures a function without the wrapper fails.""" + with self.set_func('no_wrapper'): + with pytest.raises(AssertionError): + super().test_ensure_wrapped() + + @pytest.mark.parametrize('new_instance', (True, False)) + @pytest.mark.parametrize('func', ('change_x', 'change_y', 'change_z')) + def test_unchanged_data(self, new_instance, func): + """Ensures changing the x, y, or z data fails.""" + with self.set_func(func): + with pytest.raises(AssertionError): + super().test_unchanged_data(new_instance) + + def test_repeated_fits(self): + """Ensures failure when repeated fits change the output.""" + with 
self.set_func('repitition_changes'): + with pytest.raises(AssertionError): + super().test_repeated_fits() + + def test_list_input(self): + """Ensures test fails when func gives different outputs for different input types.""" + with self.set_func('different_output'): + with pytest.raises(AssertionError): + super().test_list_input() + + @pytest.mark.parametrize('has_x', (True, False)) + @pytest.mark.parametrize('has_z', (True, False)) + def test_no_xz(self, has_x, has_z): + """Ensures failure occurs when the output changes if no x or z is given.""" + if has_x and has_z: + return # the one test case that would not produce any difference, so just skip + with self.set_func('different_xz_output'): + with pytest.raises(AssertionError): + super().test_no_xz(has_x, has_z) + + def test_output(self): + """Ensures failure occurs when the output is not correct.""" + with self.set_func('single_output'): + with pytest.raises(AssertionError): + super().test_output() + + with self.set_func('output_list'): + with pytest.raises(AssertionError): + super().test_output() + + with self.set_func('output_nondict'): + with pytest.raises(AssertionError): + super().test_output() + + with self.set_func('output_wrong_shape'): + with pytest.raises(AssertionError): + super().test_output() + + # also ensure keys are checked + with self.set_func('good_func'): + with pytest.raises(AssertionError): + super().test_output() + with pytest.raises(AssertionError): + super().test_output(additional_keys=['b', 'c']) + + with self.set_func('good_func', checked_keys=('a', 'b')): + with pytest.raises(AssertionError): + super().test_output() + + @pytest.mark.parametrize('func', + ('different_x_ordering', 'different_z_ordering', 'different_xz_ordering') + ) + def test_xz_ordering(self, func): + """Ensures failure when output is dependent on x-value or z-value sorting.""" + with self.set_func(func): + with pytest.raises(AssertionError): + super().test_xz_ordering() + + +class TestBaseTester2DNoFunc(BaseTester2D): + """Ensures the BaseTester2D fails if not set up correctly.""" + + @pytest.mark.parametrize('new_instance', (True, False)) + def test_unchanged_data(self, new_instance): + """Ensures that input data is unchanged by the function.""" + with pytest.raises(NotImplementedError): + super().test_unchanged_data(new_instance) + + def test_repeated_fits(self): + """Ensures the setup is properly reset when using class api.""" + with pytest.raises(NotImplementedError): + super().test_repeated_fits() + + def test_list_input(self): + """Ensures that function works the same for both array and list inputs.""" + with pytest.raises(NotImplementedError): + super().test_list_input() + + @pytest.mark.parametrize('has_x', (True, False)) + @pytest.mark.parametrize('has_z', (True, False)) + def test_no_xz(self, has_x, has_z): + """Ensures that function output is the same when no x or z is input.""" + if has_x and has_z: + return # the one test case that would not produce any difference, so just skip + with pytest.raises(NotImplementedError): + super().test_no_xz(has_x, has_z) + + def test_output(self): + """Ensures that the output has the desired format.""" + with pytest.raises(NotImplementedError): + super().test_output() + + def test_xz_ordering(self): + """Ensures arrays are correctly sorted within the function.""" + with pytest.raises(NotImplementedError): + super().test_xz_ordering() diff --git a/tests/test_misc.py b/tests/test_misc.py index 2b31b90..35759c0 100644 --- a/tests/test_misc.py +++ b/tests/test_misc.py @@ -11,9 +11,10 @@ import numpy as np from 
numpy.testing import assert_allclose, assert_array_equal import pytest -from scipy.sparse import dia_matrix, diags, spdiags, vstack +from scipy.sparse import vstack from pybaselines import _banded_utils, misc +from pybaselines._compat import dia_object, diags from .conftest import BaseTester, get_data @@ -162,7 +163,7 @@ def test_array_lam_fails(self): def test_banded_dot_vector(): """Ensures the dot product of a banded matrix and a vector is correct.""" # random, square, non-symmetric banded matrix - matrix_1 = dia_matrix(np.array([ + matrix_1 = dia_object(np.array([ [0, 1, 0, 0, 0], [1, 3, 4, 0, 0], [2, 4, 9, 8, 0], @@ -175,11 +176,11 @@ def test_banded_dot_vector(): banded_output_1 = misc._banded_dot_vector( bands_1, vector_1, (3, 1), matrix_1.shape ) - assert_array_equal(banded_output_1, matrix_1 * vector_1) + assert_array_equal(banded_output_1, matrix_1 @ vector_1) # random, square, symmetric banded matrix - matrix_2 = dia_matrix(np.array([ - [0, 1, 22, 0, 0, 0.0], + matrix_2 = dia_object(np.array([ + [0, 1., 22, 0, 0, 0.0], [1, 3, 4, 5, 0, 0], [22, 4, 9, 97, -3, 0], [0, 5, 97, -4, 19, 12], @@ -192,16 +193,16 @@ def test_banded_dot_vector(): banded_output_2 = misc._banded_dot_vector( bands_2, vector_2, (2, 2), matrix_2.shape ) - assert_allclose(banded_output_2, matrix_2 * vector_2, rtol=1e-11) + assert_allclose(banded_output_2, matrix_2 @ vector_2, rtol=1e-11) def test_banded_dot_banded(): """Ensures the dot product of two square banded matrices is correct.""" # random, square, non-symmetric banded matrix; tests that the number of upper and # lower diagonals in the output is capped by the shape of the matrix rather than the - # number of diagonals, since matrix_1 * matrix_1 would otherwise have more diagonals + # number of diagonals, since matrix_1 @ matrix_1 would otherwise have more diagonals # than allowed in the shape - matrix_1 = dia_matrix(np.array([ + matrix_1 = dia_object(np.array([ [0, 1, 0, 0, 0], [1, 3, 4, 0, 0], [2, 4, 9, 8, 0], @@ -210,14 +211,14 @@ def test_banded_dot_banded(): ])) bands_1 = matrix_1.todia().data[::-1] - actual_output_1 = (matrix_1 * matrix_1).todia().data[::-1] + actual_output_1 = (matrix_1 @ matrix_1).todia().data[::-1] banded_output_1 = misc._banded_dot_banded( bands_1, bands_1, (3, 1), (3, 1), matrix_1.shape, matrix_1.shape ) assert_array_equal(banded_output_1, actual_output_1) # random, square, symmetric banded matrix - matrix_2 = dia_matrix(np.array([ + matrix_2 = dia_object(np.array([ [0, 1, 22, 0, 0, 0], [1, 3, 4, 5, 0, 0], [22, 4, 9, 97, -3, 0], @@ -227,13 +228,13 @@ def test_banded_dot_banded(): ])) bands_2 = matrix_2.todia().data[::-1] - actual_output_2 = (matrix_2 * matrix_2).todia().data[::-1] + actual_output_2 = (matrix_2 @ matrix_2).todia().data[::-1] banded_output_2 = misc._banded_dot_banded( bands_2, bands_2, (2, 2), (2, 2), matrix_2.shape, matrix_2.shape ) assert_array_equal(banded_output_2, actual_output_2) - # also test symmetric_output=True since matrix_2 * matrix_2 is also symmetric + # also test symmetric_output=True since matrix_2 @ matrix_2 is also symmetric banded_output_3 = misc._banded_dot_banded( bands_2, bands_2, (2, 2), (2, 2), matrix_2.shape, matrix_2.shape, True ) @@ -289,8 +290,8 @@ def test_high_pass_filter_simple(filter_type): [0., 4.58885438, 10.35541753, 27.53312629, 10.35541753], [0., 0., 4.58885438, 10.35541753, 27.53312629] ]) - desired_A_banded = dia_matrix(desired_A_full).data[::-1] - desired_B_banded = dia_matrix(desired_B_full).data[::-1] + desired_A_banded = dia_object(desired_A_full).data[::-1] + 
desired_B_banded = dia_object(desired_B_full).data[::-1] A_sparse, B_sparse = misc._high_pass_filter(num_points, freq_cutoff, filter_type, True) A_banded, B_banded = misc._high_pass_filter(num_points, freq_cutoff, filter_type, False) @@ -392,15 +393,15 @@ def beads_data(): @pytest.mark.parametrize('freq_cutoff', (0.49, 0.01, 0.001)) def test_beads_diff_matrix_calculation(beads_data, filter_type, freq_cutoff): """ - Check that the lam * (D.T * Lam * D) and A.T * M * A calculations are correct. + Check that the lam * (D.T @ Lam @ D) and A.T @ M @ A calculations are correct. D is the stacked first and second order difference matrices, Lam is a diagonal matrix, - and lam is a scalar. M is the output of Gamma + lam * (D.T * Lam * D), and can let + and lam is a scalar. M is the output of Gamma + lam * (D.T @ Lam @ D), and one can let Gamma just be 0 for the test. - The actual calculation for D.T * Lam * D uses just the banded structure, which allows + The actual calculation for D.T @ Lam @ D uses just the banded structure, which allows using arrays rather than having to use and update three separate sparse matrices (the - full calculation is Gamma + D.T * Lam * D, where both Gamma and Lam are sparse matrices + full calculation is Gamma + D.T @ Lam @ D, where both Gamma and Lam are sparse matrices with one diagonal that gets updated each iteration), which is much faster and has no significant effect on memory. @@ -420,23 +421,23 @@ def test_beads_diff_matrix_calculation(beads_data, filter_type, freq_cutoff): d_y = np.concatenate((d1_y, d2_y)) diff_matrix = vstack((diff_1_matrix, diff_2_matrix)) # the full difference matrix, D - # D.T * diags(weight_function(derivative of y)) * D, + # D.T @ diags(weight_function(derivative of y)) @ D, # let weight_function(d_y) just return d_y since it doesn't matter. 
# the calculation as written in the beads paper (see docstring of beads function for reference) true_calculation = ( - lam_1 * diff_1_matrix.T * diags(d1_y) * diff_1_matrix - + lam_2 * diff_2_matrix.T * diags(d2_y) * diff_2_matrix + lam_1 * diff_1_matrix.T @ diags(d1_y) @ diff_1_matrix + + lam_2 * diff_2_matrix.T @ diags(d2_y) @ diff_2_matrix ) # the calculation as written in the MATLAB beads function, puts lam_1 and lam_2 within Lam - matlab_calculation = diff_matrix.T * diags(lam_12_array * d_y) * diff_matrix + matlab_calculation = diff_matrix.T @ diags(lam_12_array * d_y) @ diff_matrix assert_allclose(true_calculation.toarray(), matlab_calculation.toarray()) # now do the same calculation, using the banded matrices diff_1_banded = np.zeros((5, num_points)) diff_2_banded = np.zeros((5, num_points)) - # D.T * L * D == D_1.T * L_1 * D_1 + D_2.T * L_2 + D_2, so can calculate the + # D.T @ L @ D == D_1.T @ L_1 @ D_1 + D_2.T @ L_2 @ D_2, so can calculate the # individual differences separately d1_y_output, d2_y_output = misc._abs_diff(y) diff_1_banded[1][1:] = diff_1_banded[3][:-1] = -d1_y_output @@ -456,15 +457,17 @@ def test_beads_diff_matrix_calculation(beads_data, filter_type, freq_cutoff): assert_allclose(matlab_calculation.todia().data[::-1], banded_calculation) - # now test calculation of A.T * M * A where A is the D.T * Lam * D results - ATMA_actual = A.T * true_calculation * A + # now test calculation of A.T @ M @ A where M is the D.T @ Lam @ D result + ATMA_actual = A.T @ true_calculation @ A ATMA_actual_bands = ATMA_actual.todia().data[::-1] - sparse_DTD = spdiags(banded_calculation, np.arange(2, -3, -1), num_points, num_points) + sparse_DTD = dia_object( + (banded_calculation, np.arange(2, -3, -1)), shape=(num_points, num_points) + ) - assert_allclose(ATMA_actual.toarray(), (A.T * sparse_DTD * A).toarray()) + assert_allclose(ATMA_actual.toarray(), (A.T @ sparse_DTD @ A).toarray()) # also check without transposing A since A is symmetric and that's what is used in pybaselines - assert_allclose(ATMA_actual.toarray(), (A * sparse_DTD * A).toarray()) + assert_allclose(ATMA_actual.toarray(), (A @ sparse_DTD @ A).toarray()) # now check banded result; banded calculation also uses A instead of A.T ATMA_banded = misc._banded_dot_banded( @@ -493,7 +496,7 @@ def test_beads_diff_matrix_calculation(beads_data, filter_type, freq_cutoff): @pytest.mark.parametrize('freq_cutoff', (0.49, 0.01, 0.001)) def test_beads_BTB(beads_data, filter_type, freq_cutoff): """ - Check that B.T * B calculation is correct for sparse and banded matrices. + Check that B.T @ B calculation is correct for sparse and banded matrices. The calculation used in pybaselines does not use the transpose of B since it should be symmetric. 
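+ For instance (a minimal sketch of the symmetry property relied on here, using + scipy's sparse interface directly rather than any pybaselines code): + + B = diags([1., 2., 1.], offsets=[-1, 0, 1], shape=(5, 5), format='csr') + assert np.allclose((B.T @ B).toarray(), (B @ B).toarray())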
@@ -504,11 +507,11 @@ def test_beads_BTB(beads_data, filter_type, freq_cutoff): A, B = misc._high_pass_filter(num_points, freq_cutoff, filter_type, True) A_banded, B_banded = misc._high_pass_filter(num_points, freq_cutoff, filter_type, False) - # check that B.T * B is the same as B * B since B is symmetric - actual_BTB = B.T * B + # check that B.T @ B is the same as B @ B since B is symmetric + actual_BTB = B.T @ B actual_BTB_banded = actual_BTB.todia().data[::-1] - assert_allclose(actual_BTB.toarray(), (B * B).toarray()) + assert_allclose(actual_BTB.toarray(), (B @ B).toarray()) banded_BTB = misc._banded_dot_banded( B_banded, B_banded, (filter_type, filter_type), (filter_type, filter_type), @@ -531,7 +534,7 @@ def test_beads_BTB(beads_data, filter_type, freq_cutoff): @pytest.mark.parametrize('freq_cutoff', (0.49, 0.01, 0.001)) def test_beads_ATb(beads_data, filter_type, freq_cutoff): """ - Check that the lam_0 * A.T * b calculation is correct. + Check that the lam_0 * A.T @ b calculation is correct. The calculation used in pybaselines does not use the transpose of A since it should be symmetric, and it puts lam_0 into b to skip a multiplication step. @@ -544,11 +547,11 @@ def test_beads_ATb(beads_data, filter_type, freq_cutoff): fill_value = -5 b = np.full(num_points, fill_value) - # first just check A.T * b - ATb_actual = A.T * b + # first just check A.T @ b + ATb_actual = A.T @ b # check that the transpose is unnecessary since A is symmetric - assert_allclose(ATb_actual, A * b) + assert_allclose(ATb_actual, A @ b) # check the banded solution ATb_banded = misc._banded_dot_vector( @@ -558,14 +561,14 @@ def test_beads_ATb(beads_data, filter_type, freq_cutoff): # use rtol=1.5e-7 with an atol since values are very small for d=2 and small freq_cutoff assert_allclose(ATb_actual, ATb_banded, rtol=1.5e-7, atol=1e-14) - # now check lam_0 * A.T * b - lam_ATb_actual = lam_0 * A.T * b + # now check lam_0 * A.T @ b + lam_ATb_actual = lam_0 * A.T @ b # actual calculation places lam_0 in the vector so that an additional # multiplication step can be skipped b_2 = np.full(num_points, lam_0 * fill_value) - assert_allclose(lam_ATb_actual, A * b_2) + assert_allclose(lam_ATb_actual, A @ b_2) # check the banded solution lam_ATb_banded = misc._banded_dot_vector( diff --git a/tests/test_polynomial.py b/tests/test_polynomial.py index 196d29c..addbb91 100644 --- a/tests/test_polynomial.py +++ b/tests/test_polynomial.py @@ -15,7 +15,7 @@ from pybaselines import polynomial from pybaselines.utils import ParameterWarning -from .conftest import BasePolyTester, InputWeightsMixin, get_data +from .conftest import BasePolyTester, InputWeightsMixin from .data import ( LOESS_X, LOESS_Y, QUANTILE_Y, STATSMODELS_LOESS_DELTA, STATSMODELS_LOESS_ITER, STATSMODELS_QUANTILES ) @@ -209,7 +209,6 @@ class TestLoess(IterativePolynomialTester): @pytest.mark.parametrize('use_threshold', (True, False)) def test_unchanged_data(self, use_class, use_threshold, conserve_memory, delta): """Ensures that input data is unchanged by the function.""" - x, y = get_data() super().test_unchanged_data( use_class, use_threshold=use_threshold, conserve_memory=conserve_memory, delta=delta ) @@ -393,7 +392,7 @@ def test_outside_quantile_fails(self, quantile): with pytest.raises(ValueError): self.class_func(self.y, quantile=quantile) - @pytest.mark.parametrize('quantile', [0.1, 0.5, 0.9]) + @pytest.mark.parametrize('quantile', tuple(STATSMODELS_QUANTILES.keys())) def test_compare_to_statsmodels(self, quantile): """ Compares the output of quant_reg to 
statsmodels's quantile regression implementation. @@ -460,7 +459,6 @@ class TestGoldindec(PolynomialTester): ) def test_unchanged_data(self, use_class, cost_function): """Ensures that input data is unchanged by the function.""" - x, y = get_data() super().test_unchanged_data(use_class, cost_function=cost_function) @pytest.mark.parametrize('cost_function', ('p_huber', '')) diff --git a/tests/test_spline.py b/tests/test_spline.py index 278d884..28c5511 100644 --- a/tests/test_spline.py +++ b/tests/test_spline.py @@ -11,9 +11,9 @@ import numpy as np from numpy.testing import assert_allclose, assert_array_equal import pytest -from scipy import integrate from pybaselines import _banded_utils, morphological, spline, utils, whittaker +from pybaselines._compat import trapezoid from .conftest import BaseTester, InputWeightsMixin @@ -105,15 +105,10 @@ def test_mixture_pdf(fraction_pos, fraction_neg): + fraction_neg * neg_uniform ) - assert_allclose(expected_pdf, output_pdf, 1e-12, 1e-12) + assert_allclose(expected_pdf, output_pdf, rtol=1e-12, atol=1e-12) # ensure pdf has an area of 1, ie total probability is 100%; accuracy is limited # by number of x-values - - if hasattr(integrate, 'trapezoid'): - trapezoid = integrate.trapezoid - else: - trapezoid = integrate.trapz - assert_allclose(1.0, trapezoid(output_pdf, x), 1e-3) + assert_allclose(1.0, trapezoid(output_pdf, x), rtol=1e-3, atol=1e-10) def compare_pspline_whittaker(pspline_class, whittaker_func, data, lam=1e5, @@ -262,9 +257,12 @@ def test_diff_orders(self, diff_order): @pytest.mark.parametrize('lam', (1e1, 1e5)) @pytest.mark.parametrize('p', (0.01, 0.1)) - def test_whittaker_comparison(self, lam, p): + @pytest.mark.parametrize('diff_order', (1, 2, 3)) + def test_whittaker_comparison(self, lam, p, diff_order): """Ensures the P-spline version is the same as the Whittaker version.""" - compare_pspline_whittaker(self, whittaker.asls, self.y, lam=lam, p=p) + compare_pspline_whittaker( + self, whittaker.asls, self.y, lam=lam, p=p, diff_order=diff_order + ) class TestPsplineIAsLS(IterativeSplineTester): @@ -288,11 +286,20 @@ def test_outside_p_fails(self, p): with pytest.raises(ValueError): self.class_func(self.y, p=p) + def test_diff_order_one_fails(self): + """Ensure that a difference order of 1 raises an exception.""" + with pytest.raises(ValueError): + self.class_func(self.y, diff_order=1) + @pytest.mark.parametrize('lam', (1e1, 1e5)) @pytest.mark.parametrize('p', (0.01, 0.1)) - def test_whittaker_comparison(self, lam, p): + @pytest.mark.parametrize('diff_order', (2, 3)) + @pytest.mark.parametrize('lam_1', (1e1, 1e3)) + def test_whittaker_comparison(self, lam, lam_1, p, diff_order): """Ensures the P-spline version is the same as the Whittaker version.""" - compare_pspline_whittaker(self, whittaker.iasls, self.y, lam=lam, p=p) + compare_pspline_whittaker( + self, whittaker.iasls, self.y, lam=lam, lam_1=lam_1, p=p, diff_order=diff_order + ) class TestPsplineAirPLS(IterativeSplineTester): @@ -330,9 +337,10 @@ def test_avoid_nonfinite_weights(self, no_noise_data_fixture): assert np.isfinite(baseline.dot(baseline)) @pytest.mark.parametrize('lam', (1e1, 1e5)) - def test_whittaker_comparison(self, lam): + @pytest.mark.parametrize('diff_order', (1, 2, 3)) + def test_whittaker_comparison(self, lam, diff_order): """Ensures the P-spline version is the same as the Whittaker version.""" - compare_pspline_whittaker(self, whittaker.airpls, self.y, lam=lam) + compare_pspline_whittaker(self, whittaker.airpls, self.y, lam=lam, diff_order=diff_order) class 
TestPsplineArPLS(IterativeSplineTester): @@ -346,6 +354,7 @@ def test_diff_orders(self, diff_order): lam = {1: 1e2, 3: 1e10}[diff_order] self.class_func(self.y, lam=lam, diff_order=diff_order) + @pytest.mark.skip(reason='overflow will be addressed next version') def test_avoid_overflow_warning(self, no_noise_data_fixture): """ Ensures no warning is emitted for exponential overflow. @@ -365,9 +374,10 @@ def test_avoid_overflow_warning(self, no_noise_data_fixture): assert np.isfinite(baseline.dot(baseline)) @pytest.mark.parametrize('lam', (1e1, 1e5)) - def test_whittaker_comparison(self, lam): + @pytest.mark.parametrize('diff_order', (1, 2, 3)) + def test_whittaker_comparison(self, lam, diff_order): """Ensures the P-spline version is the same as the Whittaker version.""" - compare_pspline_whittaker(self, whittaker.arpls, self.y, lam=lam) + compare_pspline_whittaker(self, whittaker.arpls, self.y, lam=lam, diff_order=diff_order) class TestPsplineDrPLS(IterativeSplineTester): @@ -410,14 +420,17 @@ def test_avoid_nonfinite_weights(self, no_noise_data_fixture): @pytest.mark.parametrize('lam', (1e1, 1e5)) @pytest.mark.parametrize('eta', (0.2, 0.8)) - def test_whittaker_comparison(self, lam, eta): + @pytest.mark.parametrize('diff_order', (2, 3)) + def test_whittaker_comparison(self, lam, eta, diff_order): """ Ensures the P-spline version is the same as the Whittaker version. Have to use a larger tolerance since pspline_drpls uses interpolation to get the weight at the coefficients' x-values. """ - compare_pspline_whittaker(self, whittaker.drpls, self.y, lam=lam, eta=eta, test_rtol=2e-3) + compare_pspline_whittaker( + self, whittaker.drpls, self.y, lam=lam, eta=eta, diff_order=diff_order, test_rtol=2e-3 + ) @pytest.mark.parametrize('eta', (-1, 2)) def test_outside_eta_fails(self, eta): @@ -425,6 +438,11 @@ def test_outside_eta_fails(self, eta): with pytest.raises(ValueError): self.class_func(self.y, eta=eta) + def test_diff_order_one_fails(self): + """Ensure that a difference order of 1 raises an exception.""" + with pytest.raises(ValueError): + self.class_func(self.y, diff_order=1) + class TestPsplineIArPLS(IterativeSplineTester): """Class for testing pspline_iarpls baseline.""" @@ -465,9 +483,10 @@ def test_avoid_nonfinite_weights(self, no_noise_data_fixture): assert not np.isfinite(params['tol_history'][-1]) @pytest.mark.parametrize('lam', (1e1, 1e5)) - def test_whittaker_comparison(self, lam): + @pytest.mark.parametrize('diff_order', (1, 2, 3)) + def test_whittaker_comparison(self, lam, diff_order): """Ensures the P-spline version is the same as the Whittaker version.""" - compare_pspline_whittaker(self, whittaker.iarpls, self.y, lam=lam) + compare_pspline_whittaker(self, whittaker.iarpls, self.y, lam=lam, diff_order=diff_order) class TestPsplineAsPLS(IterativeSplineTester): @@ -508,14 +527,21 @@ def test_avoid_overflow_warning(self, no_noise_data_fixture): assert np.isfinite(baseline.dot(baseline)) @pytest.mark.parametrize('lam', (1e1, 1e5)) - def test_whittaker_comparison(self, lam): + @pytest.mark.parametrize('diff_order', (1, 2, 3)) + def test_whittaker_comparison(self, lam, diff_order): """ Ensures the P-spline version is the same as the Whittaker version. Have to use a larger tolerance since pspline_aspls uses interpolation to get the alpha values at the coefficients' x-values. 
""" - compare_pspline_whittaker(self, whittaker.aspls, self.y, lam=lam, test_rtol=2e-3) + if diff_order == 2: + rtol = 2e-3 + else: + rtol = 5e-2 + compare_pspline_whittaker( + self, whittaker.aspls, self.y, lam=lam, diff_order=diff_order, test_rtol=rtol + ) class TestPsplinePsalsa(IterativeSplineTester): @@ -537,9 +563,12 @@ def test_diff_orders(self, diff_order): @pytest.mark.parametrize('lam', (1e1, 1e5)) @pytest.mark.parametrize('p', (0.01, 0.1)) - def test_whittaker_comparison(self, lam, p): + @pytest.mark.parametrize('diff_order', (1, 2, 3)) + def test_whittaker_comparison(self, lam, p, diff_order): """Ensures the P-spline version is the same as the Whittaker version.""" - compare_pspline_whittaker(self, whittaker.psalsa, self.y, lam=lam, p=p) + compare_pspline_whittaker( + self, whittaker.psalsa, self.y, lam=lam, p=p, diff_order=diff_order + ) class TestPsplineDerpsalsa(IterativeSplineTester): @@ -561,9 +590,12 @@ def test_diff_orders(self, diff_order): @pytest.mark.parametrize('lam', (1e1, 1e5)) @pytest.mark.parametrize('p', (0.01, 0.1)) - def test_whittaker_comparison(self, lam, p): + @pytest.mark.parametrize('diff_order', (1, 2, 3)) + def test_whittaker_comparison(self, lam, p, diff_order): """Ensures the P-spline version is the same as the Whittaker version.""" - compare_pspline_whittaker(self, whittaker.derpsalsa, self.y, lam=lam, p=p) + compare_pspline_whittaker( + self, whittaker.derpsalsa, self.y, lam=lam, p=p, diff_order=diff_order + ) class TestPsplineMPLS(SplineTester, InputWeightsMixin): diff --git a/tests/test_spline_utils.py b/tests/test_spline_utils.py index 85573bb..3f0c53e 100644 --- a/tests/test_spline_utils.py +++ b/tests/test_spline_utils.py @@ -12,10 +12,11 @@ from numpy.testing import assert_allclose, assert_array_equal import pytest from scipy.interpolate import BSpline, splev -from scipy.sparse import diags, issparse, spdiags +from scipy.sparse import issparse from scipy.sparse.linalg import spsolve from pybaselines import _banded_utils, _spline_utils +from pybaselines._compat import diags, dia_object def _nieve_basis_matrix(x, knots, spline_degree): @@ -230,10 +231,10 @@ def test_solve_psplines(data_fixture, num_knots, spline_degree, diff_order, lowe basis = _spline_utils._spline_basis(x, knots, spline_degree) num_bases = basis.shape[1] penalty = _banded_utils.diff_penalty_diagonals(num_bases, diff_order, lower_only) - penalty_matrix = spdiags( - _banded_utils.diff_penalty_diagonals(num_bases, diff_order, False), - np.arange(diff_order, -(diff_order + 1), -1), num_bases, num_bases, 'csr' - ) + penalty_matrix = dia_object( + (_banded_utils.diff_penalty_diagonals(num_bases, diff_order, False), + np.arange(diff_order, -(diff_order + 1), -1)), shape=(num_bases, num_bases) + ).tocsr() expected_coeffs = spsolve( basis.T @ diags(weights, format='csr') @ basis + penalty_matrix, @@ -421,7 +422,7 @@ def test_pspline_tck_none(data_fixture): assert pspline.coef is None with pytest.raises(ValueError): - pspline.tck + tck = pspline.tck def test_pspline_tck_readonly(data_fixture): diff --git a/tests/test_utils.py b/tests/test_utils.py index b938745..2ec23c2 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -10,10 +10,10 @@ from numpy.testing import assert_allclose, assert_array_equal import pytest from scipy.interpolate import BSpline -from scipy.sparse import diags, identity, spdiags from scipy.sparse.linalg import spsolve from pybaselines import _banded_utils, _spline_utils, utils +from pybaselines._compat import diags, dia_object, identity from .conftest 
import gaussian @@ -21,7 +21,13 @@ @pytest.fixture(scope='module') def _x_data(): """x-values for testing.""" - return np.linspace(-20, 20) + return np.linspace(-20, 20, 50) + + +@pytest.fixture(scope='module') +def _z_data(): + """z-values for testing.""" + return np.linspace(-10, 10, 30) @pytest.mark.parametrize('sigma', [0.1, 1, 10]) @@ -34,6 +40,12 @@ def test_gaussian(_x_data, height, center, sigma): gaussian(_x_data, height, center, sigma), 1e-12, 1e-12 ) +@pytest.mark.parametrize('sigma', (0, -1)) +def test_gaussian_non_positive_sigma(_x_data, sigma): + """Ensures a sigma value not greater than 0 raises an exception.""" + with pytest.raises(ValueError): + utils.gaussian(_x_data, sigma=sigma) + @pytest.mark.parametrize('window_size', (1, 20, 100)) @pytest.mark.parametrize('sigma', (1, 2, 5)) @@ -68,6 +80,33 @@ def test_gaussian_kernel_0_windowsize(data_fixture): assert_array_equal(y, out) +@pytest.mark.parametrize('sigma_x', [0.1, 1, 10]) +@pytest.mark.parametrize('center_x', [-10, 0, 10]) +@pytest.mark.parametrize('sigma_z', [0.1, 1, 10]) +@pytest.mark.parametrize('center_z', [-10, 0, 10]) +@pytest.mark.parametrize('height', [0.1, 1, 10]) +def test_gaussian2d(_x_data, _z_data, height, center_x, center_z, sigma_x, sigma_z): + """Ensures that gaussian2d function in pybaselines.utils is correct.""" + X, Z = np.meshgrid(_x_data, _z_data) + + expected = height * gaussian(X, 1, center_x, sigma_x) * gaussian(Z, 1, center_z, sigma_z) + assert_allclose( + utils.gaussian2d(X, Z, height, center_x, center_z, sigma_x, sigma_z), + expected, 1e-12, 1e-12 + ) + + +def test_gaussian2d_1d_raises(_x_data, _z_data): + """Ensures that gaussian2d function raises an error if the input is one dimensional.""" + X, Z = np.meshgrid(_x_data, _z_data) + with pytest.raises(ValueError): + utils.gaussian2d(_x_data, _z_data) + with pytest.raises(ValueError): + utils.gaussian2d(X, _z_data) + with pytest.raises(ValueError): + utils.gaussian2d(_x_data, Z) + + @pytest.mark.parametrize('sign', (1, -1)) def test_relative_difference_scalar(sign): """Tests relative_difference to ensure it uses abs for scalars.""" @@ -122,6 +161,25 @@ def test_interp_inplace(): assert_allclose(y_calc, y_actual, 1e-12) +@pytest.mark.parametrize('scale', (1., 10., 0.557)) +@pytest.mark.parametrize('num_coeffs', (1, 2, 5)) +def test_poly_transform_matrix(scale, num_coeffs): + """ + Tests the matrix that transforms polynomial coefficients from one domain to another. + + Only tests the simple cases where the offset is 0 since more complicated cases are + handled by the _convert_coef and _convert_coef2d tests. 
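+ + For example, with `scale=10` and `num_coeffs=3`, the expected transform matrix is + diag(1, 0.1, 0.01): a coefficient c_i fit on the mapped domain [-1, 1] becomes + c_i / 10**i on the original domain [-10, 10].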
+ """ + transform_matrix = np.eye(num_coeffs) + for i in range(num_coeffs): + transform_matrix[i, i] /= scale**i + + domain = np.array([-1, 1]) * scale + calc_matrix = utils._poly_transform_matrix(num_coeffs, domain) + + assert_allclose(calc_matrix, transform_matrix, atol=1e-12, rtol=1e-14) + + @pytest.mark.parametrize('x', (np.array([-5, -2, 0, 1, 8]), np.array([1, 2, 3, 4, 5]))) @pytest.mark.parametrize( 'coefs', ( @@ -146,6 +204,80 @@ def test_convert_coef(x, coefs): assert_allclose(converted_coefs, coefs, atol=1e-10) +@pytest.mark.parametrize('x', (np.linspace(-1, 1, 50), np.linspace(-13.5, 11.6, 51))) +@pytest.mark.parametrize('z', (np.linspace(-1, 1, 50), np.linspace(-13.5, 11.6, 51))) +@pytest.mark.parametrize( + 'coef', ( + np.array([ + [1, 0], + [1, 0] + ]), + np.array([ + [1, 1], + [0, 0] + ]), + np.array([ + [1, 0.1, 0.3, -0.5], + [1, 0.1, 0, 1], + [0.2, 0, 1.5, -0.3] + ]), + ) +) +def test_convert_coef2d(x, z, coef): + """ + Checks that polynomial coefficients are correctly converted to the original domain. + + Notes on the tested x and z values: Data from [-1, 1] has an offset of 0 and a scale + of 1, so the coefficients are unaffected, while the second set of values has an offset + not equal to 0 and a scale not equal to 1 so should be a good test of whether the + conversion is successful. + + """ + x_domain = np.polynomial.polyutils.getdomain(x) + mapped_x = np.polynomial.polyutils.mapdomain( + x, x_domain, np.array([-1., 1.]) + ) + z_domain = np.polynomial.polyutils.getdomain(z) + mapped_z = np.polynomial.polyutils.mapdomain( + z, z_domain, np.array([-1., 1.]) + ) + X, Z = np.meshgrid(x, z) + y = np.zeros_like(x) + for i in range(coef.shape[0]): + for j in range(coef.shape[1]): + y = y + coef[i, j] * X**i * Z**j + y_flat = y.ravel() + + vandermonde = np.polynomial.polynomial.polyvander2d( + *np.meshgrid(mapped_x, mapped_z), + (coef.shape[0] - 1, coef.shape[1] - 1) + ).reshape((-1, (coef.shape[0]) * (coef.shape[1]))) + + calc_coef = np.linalg.pinv(vandermonde) @ (y_flat) + calc_y = vandermonde @ calc_coef # corresponds to mapped domain + + # sanity check; use slightly higher atol than other checks since + # the fit can potentially be off by a bit + assert_allclose(calc_y, y_flat, rtol=1e-10, atol=1e-6) + + converted_coef = utils._convert_coef2d( + calc_coef, coef.shape[0] - 1, coef.shape[1] - 1, x_domain, z_domain + ) + + mapped_X, mapped_Z = np.meshgrid(mapped_x, mapped_z) + mapped_polynomial = np.polynomial.polynomial.polyval2d( + mapped_X, mapped_Z, calc_coef.reshape(coef.shape) + ) + + original_polynomial = np.polynomial.polynomial.polyval2d(X, Z, converted_coef) + + # sanity check that polyval2d recreates with the mapped coefficients + assert_allclose(mapped_polynomial, calc_y.reshape(y.shape), rtol=1e-10, atol=1e-14) + + assert_allclose(original_polynomial, mapped_polynomial, rtol=1e-10, atol=1e-14) + assert_allclose(converted_coef, coef, rtol=1e-10, atol=1e-12) + + @pytest.mark.parametrize('diff_order', (0, 1, 2, 3, 4, 5)) def test_difference_matrix(diff_order): """Tests common differential matrices.""" @@ -314,17 +446,21 @@ def test_pad_edges_extrapolate_windows(): input_array[-10:] = 1. 
extrapolate_windows = [40, 10] pad_len = 20 - output = utils.pad_edges(input_array, pad_len, extrapolate_window=extrapolate_windows) + output = utils.pad_edges( + input_array, pad_len, mode='extrapolate', extrapolate_window=extrapolate_windows + ) assert_allclose(output[:pad_len], np.full(pad_len, 0.), 1e-14) assert_allclose(output[-pad_len:], np.full(pad_len, 1.), 1e-14) -@pytest.mark.parametrize('extrapolate_window', (0, (0, 0), (5, 0), (5, -1))) +@pytest.mark.parametrize('extrapolate_window', (0, -2, (0, 0), (5, 0), (5, -1))) def test_pad_edges_extrapolate_zero_window(extrapolate_window): """Ensures an extrapolate_window <= 0 raises an exception.""" with pytest.raises(ValueError): - utils.pad_edges(np.arange(10), 10, extrapolate_window=extrapolate_window) + utils.pad_edges( + np.arange(10), 10, mode='extrapolate', extrapolate_window=extrapolate_window + ) @pytest.mark.parametrize('pad_mode', ('reflect', 'extrapolate')) @@ -352,7 +488,7 @@ def test_pad_edges_custom_pad_func(): actual_output = utils.pad_edges(input_array, pad_length, pad_func, pad_val=pad_val) - assert_array_equal(actual_output, expected_output) + assert_allclose(actual_output, expected_output, rtol=1e-12, atol=0) def test_get_edges_custom_pad_func(): @@ -408,6 +544,111 @@ def test_get_edges(pad_mode, pad_length, list_input, data_fixture): assert_allclose(right, expected_right) +@pytest.mark.parametrize( + 'pad_mode', ('reflect', 'REFLECT', 'extrapolate', 'edge', 'constant', pad_func) +) +@pytest.mark.parametrize('pad_length', (1, 2, 20, 53)) +@pytest.mark.parametrize('list_input', (False, True)) +def test_pad_edges2d(pad_mode, pad_length, list_input, data_fixture2d): + """Tests various inputs for utils.pad_edges2d.""" + *_, data = data_fixture2d + data_shape = data.shape + if list_input: + data = data.tolist() + + if not callable(pad_mode): + np_pad_mode = pad_mode.lower() + else: + np_pad_mode = pad_mode + if np_pad_mode != 'extrapolate': + expected_output = np.pad(data, pad_length, np_pad_mode) + else: + expected_output = None + + output = utils.pad_edges2d(data, pad_length, pad_mode) + assert isinstance(output, np.ndarray) + assert output.ndim == 2 + assert output.shape[0] == data_shape[0] + 2 * pad_length + assert output.shape[1] == data_shape[1] + 2 * pad_length + + if expected_output is not None: + assert_allclose(output, expected_output) + + +@pytest.mark.parametrize('pad_length', (0, 1, 2, 20, 53)) +@pytest.mark.parametrize('extrapolate_window', (None, 1, 2, 10, 1001, (10, 20), (1, 1))) +@pytest.mark.parametrize('list_input', (False, True)) +def test_pad_edges2d_extrapolate(pad_length, list_input, extrapolate_window, data_fixture2d): + """Ensures extrapolation works for utils.pad_edges.""" + *_, data = data_fixture2d + data_shape = data.shape + if list_input: + data = data.tolist() + + if np.less_equal(pad_length, 0).any(): + with pytest.raises(NotImplementedError): + utils.pad_edges2d(data, pad_length, 'extrapolate', extrapolate_window) + else: + output = utils.pad_edges2d(data, pad_length, 'extrapolate', extrapolate_window) + assert isinstance(output, np.ndarray) + assert output.shape[0] == data_shape[0] + 2 * pad_length + assert output.shape[1] == data_shape[1] + 2 * pad_length + + +def test_pad_edges2d_extrapolate_windows(): + """Ensures the separate extrapolate windows are correctly interpreted.""" + input_array = np.zeros(400).reshape(20, 20) + input_array[-10:] = 1. 
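+ # the top half (rows 0-9) is all 0 and the bottom half (rows 10-19) is all 1, so + # the extrapolated top edge should be ~0 and the extrapolated bottom edge ~1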
+ extrapolate_windows = [5, 10] + pad_len = 5 + output = utils.pad_edges2d( + input_array, pad_len, mode='extrapolate', extrapolate_window=extrapolate_windows + ) + + assert_allclose( + output[:pad_len, pad_len:-pad_len], np.full((pad_len, input_array.shape[1]), 0.), 1e-14 + ) + assert_allclose( + output[-pad_len:, pad_len:-pad_len], np.full((pad_len, input_array.shape[1]), 1.), 1e-14 + ) + + +@pytest.mark.parametrize('extrapolate_window', (0, -2, (0, 0), (5, 0), (5, -1))) +def test_pad_edges2d_extrapolate_zero_window(small_data2d, extrapolate_window): + """Ensures an extrapolate_window <= 0 raises an exception.""" + with pytest.raises(ValueError): + utils.pad_edges2d( + small_data2d, 10, mode='extrapolate', extrapolate_window=extrapolate_window + ) + + +@pytest.mark.parametrize('pad_mode', ('reflect', 'extrapolate')) +def test_pad_edges2d_negative_pad_length(pad_mode, data_fixture2d): + """Ensures a negative pad length raises an exception.""" + with pytest.raises(ValueError): + utils.pad_edges2d(data_fixture2d[-1], -5, pad_mode) + + +def test_pad_edges2d_custom_pad_func(): + """Ensures pad_edges2d works with a callable padding function, same as numpy.pad.""" + input_array = np.arange(2000).reshape(50, 40) + pad_val = 20 + pad_length = 10 + + expected_output = np.empty( + (input_array.shape[0] + 2 * pad_length, input_array.shape[1] + 2 * pad_length) + ) + expected_output[:pad_length] = pad_val + expected_output[-pad_length:] = pad_val + expected_output[:, :pad_length] = pad_val + expected_output[:, -pad_length:] = pad_val + expected_output[pad_length:-pad_length, pad_length:-pad_length] = input_array + + actual_output = utils.pad_edges2d(input_array, pad_length, pad_func, pad_val=pad_val) + + assert_allclose(actual_output, expected_output, rtol=1e-12, atol=0) + + @pytest.mark.parametrize('seed', (123, 98765)) def test_invert_sort(seed): """Ensures the inverted sort works.""" @@ -422,6 +663,100 @@ def test_invert_sort(seed): assert_array_equal(values, values[sort_order][inverted_order]) + +@pytest.mark.parametrize('needs_sorting', (True, False)) +def test_determine_sorts(needs_sorting): + """Ensures the sort and inverted sort determinations work.""" + data = np.linspace(-1, 1, 20) + original_data = data.copy() + if needs_sorting: + data[5:10] = data[5:10][::-1] + + sort_order, inverted_order = utils._determine_sorts(data) + if not needs_sorting: + assert sort_order is None + assert inverted_order is None + else: + assert_array_equal(data[sort_order], original_data) + assert_array_equal(sort_order, data.argsort(kind='mergesort')) + assert_array_equal(data[sort_order][inverted_order], data) + + +@pytest.mark.parametrize('two_d', (True, False)) +def test_sort_array_none(two_d): + """Tests the case where the sorting array is None, which should skip sorting.""" + data = np.linspace(-1, 1, 20) + if two_d: + data = data[None, :] + + assert_allclose(data, utils._sort_array(data, sort_order=None), atol=0, rtol=1e-14) + + +@pytest.mark.parametrize('two_d', (True, False)) +def test_sort_array(two_d): + """Ensures array sorting works for 1d arrays and along the last axis of 2d arrays.""" + data = np.linspace(-1, 1, 20) + reversed_data = data[::-1] + sort_order = np.arange(len(data))[::-1] + if two_d: + data = np.array([data, data]) + reversed_data = np.array([reversed_data, reversed_data]) + + assert_allclose(data, utils._sort_array(reversed_data, sort_order), atol=0, rtol=1e-14) + + +@pytest.mark.parametrize('three_d', (True, False)) +def test_sort_array2d_none(three_d): + """Tests the case where the sorting array is None, which should skip 
sorting.""" + data = np.linspace(-1, 1, 20).reshape(5, 4) + if three_d: + data = data[None, :] + + assert_allclose(data, utils._sort_array2d(data, sort_order=None), atol=0, rtol=1e-14) + + +@pytest.mark.parametrize('sort_x', (True, False, None)) +@pytest.mark.parametrize('three_d', (True, False)) +def test_sort_array2d(three_d, sort_x): + """ + Ensures sorting for 2d data works. + + Each of the three `sort_x` cases corresponds to how _Algorithm2D will make its _sort_order + attribute if given only x, only z, and both x and z, respectively. + """ + x = np.linspace(-1, 1, 20) + z = np.linspace(-2, 2, 30) + x_sort_order = np.arange(len(x)) + z_sort_order = np.arange(len(z)) + + X, Z = np.meshgrid(x, z) + data = X + 2 * Z + + if sort_x is None: # sort both x and z, so reverse both x and z + x2 = x[::-1] + x_sort_order = x_sort_order[::-1] + z2 = z[::-1] + z_sort_order = z_sort_order[::-1] + sort_order = (z_sort_order[:, None], x_sort_order[None, :]) + elif sort_x: # sort just x, so reverse just x + x2 = x[::-1] + x_sort_order = x_sort_order[::-1] + z2 = z + sort_order = (..., x_sort_order) + else: # sort just z, so reverse just z + x2 = x + z2 = z[::-1] + z_sort_order = z_sort_order[::-1] + sort_order = z_sort_order + + X2, Z2 = np.meshgrid(x2, z2) + reversed_data = X2 + 2 * Z2 + if three_d: + data = np.array([data, data]) + reversed_data = np.array([reversed_data, reversed_data]) + + assert_allclose(data, utils._sort_array2d(reversed_data, sort_order), atol=0, rtol=1e-14) + + @pytest.mark.parametrize('diff_order', (1, 2, 3)) def test_whittaker_smooth(data_fixture, diff_order): """Ensures the Whittaker smoothing function performs correctly.""" @@ -460,10 +795,10 @@ def test_pspline_smooth(data_fixture, diff_order, num_knots, spline_degree): knots = _spline_utils._spline_knots(x, num_knots, spline_degree, True) basis = _spline_utils._spline_basis(x, knots, spline_degree) num_bases = basis.shape[1] - penalty_matrix = spdiags( - _banded_utils.diff_penalty_diagonals(num_bases, diff_order, lower_only=False), - np.arange(diff_order, -(diff_order + 1), -1), num_bases, num_bases, 'csr' - ) + penalty_matrix = dia_object( + (_banded_utils.diff_penalty_diagonals(num_bases, diff_order, lower_only=False), + np.arange(diff_order, -(diff_order + 1), -1)), shape=(num_bases, num_bases) + ).tocsr() weights = diags(np.ones(len_y), format='csr') # solve the simple case for all weights are 1 @@ -478,3 +813,18 @@ def test_pspline_smooth(data_fixture, diff_order, num_knots, spline_degree): recreated_spline = BSpline(*tck)(x) assert_allclose(recreated_spline, output, rtol=1e-10) + + +@pytest.mark.parametrize('two_d', (True, False)) +def test_optimize_window(small_data2d, two_d): + """Ensures optimize_window has the correct outputs for the dimesions of the input.""" + data = small_data2d + if not two_d: + data = data.flatten() + + output = utils.optimize_window(data) + if two_d: + assert output.shape == (2,) + assert isinstance(output, np.ndarray) + else: + assert isinstance(output, int) diff --git a/tests/test_validation.py b/tests/test_validation.py index 13d9c7f..fd9caf1 100644 --- a/tests/test_validation.py +++ b/tests/test_validation.py @@ -7,7 +7,7 @@ """ import numpy as np -from numpy.testing import assert_array_equal +from numpy.testing import assert_allclose, assert_array_equal import pytest from pybaselines import _validation @@ -34,7 +34,51 @@ def test_yx_arrays_no_x(small_data): y, x = _validation._yx_arrays(small_data) assert isinstance(x, np.ndarray) - assert_array_equal(x, np.linspace(-1., 1., 
diff --git a/tests/test_validation.py b/tests/test_validation.py
index 13d9c7f..fd9caf1 100644
--- a/tests/test_validation.py
+++ b/tests/test_validation.py
@@ -7,7 +7,7 @@
 """
 
 import numpy as np
-from numpy.testing import assert_array_equal
+from numpy.testing import assert_allclose, assert_array_equal
 import pytest
 
 from pybaselines import _validation
@@ -34,7 +34,51 @@ def test_yx_arrays_no_x(small_data):
     y, x = _validation._yx_arrays(small_data)
 
     assert isinstance(x, np.ndarray)
-    assert_array_equal(x, np.linspace(-1., 1., y.shape[0]))
+    assert_allclose(x, np.linspace(-1., 1., y.shape[0]), rtol=1e-12, atol=1e-12)
+    assert isinstance(y, np.ndarray)
+    assert_allclose(y, small_data, rtol=1e-12, atol=1e-12)
+
+
+@pytest.mark.parametrize('array_enum', (0, 1))
+def test_yxz_arrays_output_array(data_fixture2d, array_enum):
+    """Ensures output y, x, and z are always numpy arrays and that x and z are not scaled."""
+    x, z, y = data_fixture2d
+    if array_enum == 1:
+        x = x.tolist()
+        z = z.tolist()
+        y = y.tolist()
+
+    y_out, x_out, z_out = _validation._yxz_arrays(y, x, z)
+
+    assert isinstance(y_out, np.ndarray)
+    assert_allclose(y_out, y, rtol=1e-12, atol=1e-12)
+    assert isinstance(x_out, np.ndarray)
+    assert_allclose(x_out, x, rtol=1e-12, atol=1e-12)
+    assert isinstance(z_out, np.ndarray)
+    assert_allclose(z_out, z, rtol=1e-12, atol=1e-12)
+
+
+@pytest.mark.parametrize('has_x', (True, False))
+@pytest.mark.parametrize('has_z', (True, False))
+def test_yx_arrays_no_xz(data_fixture2d, has_x, has_z):
+    """Ensures x and/or z arrays are created if None is input."""
+    x, z, y = data_fixture2d
+    if has_x:
+        expected_x = x
+    else:
+        x = None
+        expected_x = np.linspace(-1, 1, y.shape[0])
+    if has_z:
+        expected_z = z
+    else:
+        z = None
+        expected_z = np.linspace(-1, 1, y.shape[1])
+    y_out, x_out, z_out = _validation._yxz_arrays(y, x, z)
+
+    assert_allclose(y_out, y)
+    assert isinstance(x_out, np.ndarray)
+    assert_allclose(x_out, expected_x, rtol=1e-12, atol=1e-12)
+    assert isinstance(z_out, np.ndarray)
+    assert_allclose(z_out, expected_z, rtol=1e-12, atol=1e-12)
 
 
 @pytest.mark.parametrize('ndim', (0, 1, 2))
@@ -165,11 +209,49 @@ def test_check_scalar_length_none():
         _validation._check_scalar(data, desired_length=10000)
 
 
+def test_check_scalar_variable_single():
+    """Ensures _check_scalar_variable returns a float value for the simple 1d case."""
+    value = 3.2
+
+    output = _validation._check_scalar_variable(value)
+    assert isinstance(output, float)
+    assert_allclose(output, value, rtol=0, atol=1e-14)
+
+    output = _validation._check_scalar_variable([value])
+    assert isinstance(output, float)
+    assert_allclose(output, value, rtol=0, atol=1e-14)
+
+    output = _validation._check_scalar_variable(np.array([value]))
+    assert isinstance(output, float)
+    assert_allclose(output, value, rtol=0, atol=1e-14)
+
+
+def test_check_scalar_variable_twod():
+    """Ensures _check_scalar_variable returns a length 2 numpy array for the simple 2d case."""
+    value = 3.2
+    expected_output = np.array([value, value])
+
+    output = _validation._check_scalar_variable(value, two_d=True)
+    assert_allclose(output, expected_output, rtol=0, atol=1e-14)
+
+    output = _validation._check_scalar_variable([value], two_d=True)
+    assert_allclose(output, expected_output, rtol=0, atol=1e-14)
+
+    output = _validation._check_scalar_variable(np.array([value]), two_d=True)
+    assert_allclose(output, expected_output, rtol=0, atol=1e-14)
+
+    output = _validation._check_scalar_variable([value, value], two_d=True)
+    assert_allclose(output, expected_output, rtol=0, atol=1e-14)
+
+    output = _validation._check_scalar_variable(np.array([value, value]), two_d=True)
+    assert_allclose(output, expected_output, rtol=0, atol=1e-14)
+
+
 @pytest.mark.parametrize('lam', (5, [5], (5,), [[5]], np.array(5), np.array([5]), np.array([[5]])))
 def test_check_lam(lam):
     """Ensures scalar lam values are correctly processed."""
     output_lam = _validation._check_lam(lam)
-    assert output_lam == 5
+    assert_allclose(output_lam, 5, rtol=0, atol=1e-14)
 
 
 def test_check_lam_failures():
@@ -184,10 +266,50 @@ def test_check_lam_failures():
             _validation._check_lam(lam)
 
     # test that is allows zero if allow_zero is True
-    _validation._check_lam(0, True)
+    _validation._check_lam(0, allow_zero=True)
     for lam in range(-5, 0):
         with pytest.raises(ValueError):
-            _validation._check_lam(lam, True)
+            _validation._check_lam(lam, allow_zero=True)
+
+
+@pytest.mark.parametrize(
+    'lam', (
+        5, [5], (5,), [[5]], np.array(5), np.array([5]), np.array([[5]]),
+        [5, 5], np.array([5, 5])
+    )
+)
+def test_check_lam_twod(lam):
+    """Ensures scalar lam values are correctly processed for two dimensional inputs."""
+    output_lam = _validation._check_lam(lam, two_d=True)
+    assert_allclose(output_lam, np.array([5, 5]), rtol=0, atol=1e-14)
+
+
+def test_check_lam_twod_allow_zero():
+    """Ensures _check_lam allows zero for two dimensional inputs when allowed."""
+    expected_output = np.array([0, 0])
+
+    output = _validation._check_lam(0, allow_zero=True, two_d=True)
+    assert_allclose(output, expected_output, rtol=0, atol=1e-14)
+
+    output = _validation._check_lam([0, 0], allow_zero=True, two_d=True)
+    assert_allclose(output, expected_output, rtol=0, atol=1e-14)
+
+
+@pytest.mark.parametrize('allow_zero', (True, False))
+def test_check_lam_twod_negative_failures(allow_zero):
+    """Ensures _check_lam fails for negative two dimensional inputs."""
+    max_val = 0 if allow_zero else 1
+
+    # check scalar inputs
+    for lam in range(-5, max_val):
+        with pytest.raises(ValueError):
+            _validation._check_lam(lam, allow_zero=allow_zero, two_d=True)
+
+    # check array-like inputs
+    for lam_1 in range(-5, max_val):
+        for lam_2 in range(-5, max_val):
+            with pytest.raises(ValueError):
+                _validation._check_lam([lam_1, lam_2], allow_zero=allow_zero, two_d=True)
 
 
 @pytest.mark.parametrize(
@@ -215,12 +337,54 @@ def test_check_half_window_failures():
         _validation._check_half_window(0, True)
     for half_window in range(-5, 0):
         with pytest.raises(ValueError):
-            _validation._check_half_window(half_window, True)
+            _validation._check_half_window(half_window, allow_zero=True)
 
     # fails due to non-integer input
     with pytest.raises(TypeError):
         _validation._check_half_window(5.01)
 
 
+@pytest.mark.parametrize(
+    'half_window', (
+        5, 5.0, [5], (5,), [[5]], np.array(5), np.array([5]), np.array([[5]]),
+        np.array([5, 5]), [5, 5], [5.0, 5.0]
+    )
+)
+def test_check_half_window_twod(half_window):
+    """Ensures _check_half_window works for two dimensional inputs when allowed."""
+    output_half_window = _validation._check_half_window(half_window, two_d=True)
+    assert_allclose(output_half_window, np.array([5, 5], dtype=np.intp))
+    assert output_half_window.dtype == np.intp
+
+
+def test_check_half_window_twod_allow_zero():
+    """Ensures _check_half_window allows zero for two dimensional inputs when allowed."""
+    expected_output = np.array([0, 0], dtype=np.intp)
+
+    output = _validation._check_half_window(0, allow_zero=True, two_d=True)
+    assert_allclose(output, expected_output, rtol=0, atol=1e-14)
+
+    output = _validation._check_half_window([0, 0], allow_zero=True, two_d=True)
+    assert_allclose(output, expected_output, rtol=0, atol=1e-14)
+
+
+@pytest.mark.parametrize('allow_zero', (True, False))
+def test_check_half_window_twod_negative_failures(allow_zero):
+    """Ensures _check_half_window fails for negative two dimensional inputs."""
+    max_val = 0 if allow_zero else 1
+
+    # check scalar inputs
+    for half_window in range(-5, max_val):
+        with pytest.raises(ValueError):
+            _validation._check_half_window(half_window, allow_zero=allow_zero, two_d=True)
+
+    # check array-like inputs
+    for half_window_1 in range(-5, max_val):
+        for half_window_2 in range(-5, max_val):
+            with pytest.raises(ValueError):
+                _validation._check_half_window(
+                    [half_window_1, half_window_2], allow_zero=allow_zero, two_d=True
+                )
+
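The two_d branches of _check_lam and _check_half_window exercised above follow one pattern: a scalar is broadcast to one value per dimension, a pair is used as-is, and any value below the allowed minimum is rejected. A hypothetical standalone helper sketching that pattern (illustrative only, not pybaselines' private implementation):

import numpy as np

def check_scalar_pair(value, allow_zero=False):
    """Broadcast a scalar to two values and validate each against the allowed minimum."""
    values = np.asarray(value, dtype=float).ravel()
    if values.size == 1:
        values = np.full(2, values[0])
    elif values.size != 2:
        raise ValueError('expected a scalar or two values')
    if allow_zero:
        if np.any(values < 0):
            raise ValueError('values must be non-negative when allow_zero=True')
    elif np.any(values <= 0):
        raise ValueError('values must be positive')
    return values

check_scalar_pair(5)                   # -> array([5., 5.])
check_scalar_pair([1, 2])              # -> array([1., 2.])
check_scalar_pair(0, allow_zero=True)  # -> array([0., 0.]); raises ValueError otherwise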
 
 
 @pytest.mark.parametrize('list_input', (True, False))
 def test_check_array_dtype(small_data, list_input):
@@ -413,3 +577,24 @@ def test_optional_array_no_input():
 
     assert isinstance(output, np.ndarray)
     assert_array_equal(output, np.ones(length))
+
+
+def test_get_row_col_values():
+    """Ensures multiple input forms work for _get_row_col_values."""
+    assert_array_equal(_validation._get_row_col_values(1), [1, 1, 1, 1])
+    assert_array_equal(_validation._get_row_col_values(np.array(1)), [1, 1, 1, 1])
+    assert_array_equal(_validation._get_row_col_values(np.array([1])), [1, 1, 1, 1])
+    assert_array_equal(_validation._get_row_col_values([1.1]), [1.1, 1.1, 1.1, 1.1])
+    assert_array_equal(_validation._get_row_col_values([[1.1]]), [1.1, 1.1, 1.1, 1.1])
+    assert_array_equal(_validation._get_row_col_values([1, 2]), [1, 1, 2, 2])
+    assert_array_equal(_validation._get_row_col_values([[1], [2]]), [1, 1, 2, 2])
+    assert_array_equal(_validation._get_row_col_values(np.array([1, 2, 3, 4])), [1, 2, 3, 4])
+    assert_array_equal(_validation._get_row_col_values([1, 2, 3, 4]), [1, 2, 3, 4])
+    assert_array_equal(_validation._get_row_col_values([[1, 2], [3, 4]]), [1, 2, 3, 4])
+
+
+@pytest.mark.parametrize('values', ([1, 2, 3], [1, 2, 3, 4, 5]))
+def test_get_row_col_values_fails(values):
+    """Ensures _get_row_col_values raises an exception with incorrectly sized inputs."""
+    with pytest.raises(ValueError):
+        _validation._get_row_col_values(values)
diff --git a/tests/test_whittaker.py b/tests/test_whittaker.py
index d6a5ad8..28297b2 100644
--- a/tests/test_whittaker.py
+++ b/tests/test_whittaker.py
@@ -13,6 +13,7 @@ import pytest
 
 from pybaselines import _banded_utils, whittaker
+from pybaselines._compat import diags
 from pybaselines.utils import ParameterWarning
 
 from .conftest import BaseTester, InputWeightsMixin, has_pentapy
@@ -278,6 +279,23 @@ def test_avoid_overflow_warning(self, no_noise_data_fixture):
 
         assert np.isfinite(baseline.dot(baseline))
 
+    @pytest.mark.parametrize('diff_order', (1, 2, 3))
+    def test_alpha_multiplication(self, diff_order):
+        """Ensures multiplication of the alpha array and banded penalty is handled correctly."""
+        lam = 5.
+        num_points = len(self.y)
+        alpha = np.arange(num_points, dtype=float)
+        penalized_system = _banded_utils.PenalizedSystem(
+            num_points, lam=lam, diff_order=diff_order, allow_lower=False, reverse_diags=True
+        )
+        penalty_matrix = lam * _banded_utils.diff_penalty_matrix(num_points, diff_order=diff_order)
+
+        expected_result = (diags(alpha) @ penalty_matrix).todia().data[::-1]
+
+        result = alpha * penalized_system.penalty
+        result = _banded_utils._shift_rows(result, diff_order, diff_order)
+        assert_allclose(result, expected_result, rtol=1e-13, atol=1e-13)
+
 
 class TestPsalsa(WhittakerTester):
     """Class for testing psalsa baseline."""
diff --git a/tests/two_d/__init__.py b/tests/two_d/__init__.py
new file mode 100644
index 0000000..0c8cac4
--- /dev/null
+++ b/tests/two_d/__init__.py
@@ -0,0 +1,2 @@
+# -*- coding: utf-8 -*-
+"""Tests for pybaselines."""
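The test_setup_whittaker_diff_matrix test in the new file below verifies that the 2D Whittaker penalty is assembled as the Kronecker sum of the two 1D difference penalties, one per axis. A small numeric sketch of that construction, using only scipy.sparse and the public pybaselines.utils.difference_matrix helper that the new test file also imports (the sizes and lam values here are arbitrary):

import numpy as np
from scipy.sparse import identity, kron
from pybaselines.utils import difference_matrix

num_x, num_z = 6, 5
lam_x, lam_z = 10.0, 100.0
Dx = difference_matrix(num_x, 2)  # second-order differences along the rows
Dz = difference_matrix(num_z, 2)  # second-order differences along the columns

# Kronecker sum: each term penalizes roughness along one axis of the flattened 2D data
penalty = lam_x * kron(Dx.T @ Dx, identity(num_z)) + lam_z * kron(identity(num_x), Dz.T @ Dz)
print(penalty.shape)  # (num_x * num_z, num_x * num_z)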
diff --git a/tests/two_d/test_algorithm_setup.py b/tests/two_d/test_algorithm_setup.py
new file mode 100644
index 0000000..c472039
--- /dev/null
+++ b/tests/two_d/test_algorithm_setup.py
@@ -0,0 +1,996 @@
+# -*- coding: utf-8 -*-
+"""Tests for pybaselines.two_d._algorithm_setup.
+
+@author: Donald Erb
+Created on January 5, 2024
+
+"""
+
+import numpy as np
+from numpy.testing import assert_allclose, assert_array_equal
+import pytest
+from scipy.sparse import kron
+
+from pybaselines._compat import identity
+from pybaselines.two_d import _algorithm_setup, optimizers, polynomial, whittaker
+from pybaselines.utils import ParameterWarning, difference_matrix
+
+from ..conftest import get_data2d, get_2dspline_inputs
+
+
+@pytest.fixture
+def algorithm(small_data2d):
+    """
+    An _Algorithm2D class with x-data and z-data sized to match the shape of small_data2d.
+
+    Returns
+    -------
+    pybaselines.two_d._algorithm_setup._Algorithm2D
+        An _Algorithm2D class for testing.
+    """
+    num_x, num_z = small_data2d.shape
+    return _algorithm_setup._Algorithm2D(
+        x_data=np.arange(num_x), z_data=np.arange(num_z), assume_sorted=True, check_finite=False
+    )
+
+
+@pytest.mark.parametrize('diff_order', (1, 2, 3, (2, 3)))
+@pytest.mark.parametrize('lam', (1, 20, (2, 5)))
+def test_setup_whittaker_diff_matrix(data_fixture2d, lam, diff_order):
+    """Ensures the output penalty matrix is constructed in the desired format."""
+    x, z, y = data_fixture2d
+
+    algorithm = _algorithm_setup._Algorithm2D(x, z)
+    assert algorithm.whittaker_system is None
+
+    _ = algorithm._setup_whittaker(y, lam=lam, diff_order=diff_order)
+
+    *_, lam_x, lam_z, diff_order_x, diff_order_z = get_2dspline_inputs(
+        lam=lam, diff_order=diff_order
+    )
+
+    D1 = difference_matrix(len(x), diff_order_x)
+    D2 = difference_matrix(len(z), diff_order_z)
+
+    P1 = lam_x * kron(D1.T @ D1, identity(len(z)))
+    P2 = lam_z * kron(identity(len(x)), D2.T @ D2)
+    expected_penalty = P1 + P2
+
+    assert_allclose(
+        algorithm.whittaker_system.penalty.toarray(),
+        expected_penalty.toarray(),
+        rtol=1e-12, atol=1e-12
+    )
+
+
+@pytest.mark.parametrize('weight_enum', (0, 1, 2, 3))
+def test_setup_whittaker_weights(small_data2d, algorithm, weight_enum):
+    """Ensures output weight array is correct."""
+    if weight_enum == 0:
+        # no weights specified
+        weights = None
+        desired_weights = np.ones(small_data2d.size)
+    elif weight_enum == 1:
+        # uniform 1 weighting
+        weights = np.ones_like(small_data2d)
+        desired_weights = np.ones(small_data2d.size)
+    elif weight_enum == 2:
+        # different weights for all points
+        weights = np.arange(small_data2d.size).reshape(small_data2d.shape)
+        desired_weights = np.arange(small_data2d.size)
+    elif weight_enum == 3:
+        # different weights for all points, and weights input as a list
+        weights = np.arange(small_data2d.size).reshape(small_data2d.shape).tolist()
+        desired_weights = np.arange(small_data2d.size)
+
+    _, weight_array = algorithm._setup_whittaker(
+        small_data2d, lam=1, diff_order=2, weights=weights
+    )
+
+    assert isinstance(weight_array, np.ndarray)
+    assert_array_equal(weight_array, desired_weights)
+
+
+def test_setup_whittaker_wrong_weight_shape(small_data2d, algorithm):
+    """Ensures that an exception is raised if input weights and data are different shapes."""
+    weights = np.ones(np.array(small_data2d.shape) + 1)
+    with pytest.raises(ValueError):
+        algorithm._setup_whittaker(small_data2d, lam=1, diff_order=2, weights=weights)
+
+
+@pytest.mark.parametrize('diff_order', (0, -1))
+def test_setup_whittaker_diff_matrix_fails(small_data2d, algorithm, diff_order):
+    """Ensures using a diff_order < 1 with _setup_whittaker raises an exception."""
+    with pytest.raises(ValueError):
+        algorithm._setup_whittaker(small_data2d, lam=1, diff_order=diff_order)
+
+
+@pytest.mark.parametrize('diff_order', (4, 5))
+def 
test_setup_whittaker_diff_matrix_warns(small_data2d, algorithm, diff_order): + """Ensures using a diff_order > 3 with _setup_whittaker raises a warning.""" + with pytest.warns(ParameterWarning): + algorithm._setup_whittaker(small_data2d, lam=1, diff_order=diff_order) + + +def test_setup_whittaker_negative_lam_fails(small_data2d, algorithm): + """Ensures a negative lam value fails.""" + with pytest.raises(ValueError): + algorithm._setup_whittaker(small_data2d, lam=-1) + + +def test_setup_whittaker_array_lam(small_data2d): + """Ensures a lam that is a single array of one or two values passes while larger arrays fail.""" + num_x, num_z = small_data2d.shape + _algorithm_setup._Algorithm2D(np.arange(num_x), np.arange(num_z))._setup_whittaker( + small_data2d, lam=[1] + ) + _algorithm_setup._Algorithm2D(np.arange(num_x), np.arange(num_z))._setup_whittaker( + small_data2d, lam=[1, 2] + ) + with pytest.raises(ValueError): + _algorithm_setup._Algorithm2D(np.arange(num_x), np.arange(num_z))._setup_whittaker( + small_data2d, lam=[1, 2, 3] + ) + + +@pytest.mark.parametrize('weight_enum', (0, 1, 2, 3)) +def test_setup_polynomial_weights(small_data2d, algorithm, weight_enum): + """Ensures output weight array is correctly handled.""" + if weight_enum == 0: + # no weights specified + weights = None + desired_weights = np.ones(small_data2d.size) + elif weight_enum == 1: + # uniform 1 weighting + weights = np.ones_like(small_data2d) + desired_weights = np.ones(small_data2d.size) + elif weight_enum == 2: + # different weights for all points + weights = np.arange(small_data2d.size).reshape(small_data2d.shape) + desired_weights = np.arange(small_data2d.size) + elif weight_enum == 3: + # different weights for all points, and weights input as a list + weights = np.arange(small_data2d.size).reshape(small_data2d.shape).tolist() + desired_weights = np.arange(small_data2d.size) + + _, weight_array = algorithm._setup_polynomial(small_data2d, weights=weights) + + assert isinstance(weight_array, np.ndarray) + assert_array_equal(weight_array, desired_weights) + + +def test_setup_polynomial_wrong_weight_shape(small_data2d, algorithm): + """Ensures that an exception is raised if input weights and data are different shapes.""" + weights = np.ones(np.array(small_data2d.shape) + 1) + with pytest.raises(ValueError): + algorithm._setup_polynomial(small_data2d, weights=weights) + + +@pytest.mark.parametrize('poly_order', (0, 2, 4, (2, 4))) +@pytest.mark.parametrize('vander_enum', (0, 1, 2, 3)) +@pytest.mark.parametrize('include_pinv', (True, False)) +def test_setup_polynomial_vandermonde(small_data2d, algorithm, vander_enum, include_pinv, + poly_order): + """Ensures that the Vandermonde matrix and the pseudo-inverse matrix are correct.""" + if vander_enum == 0: + # no weights specified + weights = None + elif vander_enum == 1: + # uniform 1 weighting + weights = np.ones_like(small_data2d) + elif vander_enum == 2: + # different weights for all points + weights = np.arange(small_data2d.size).reshape(small_data2d.shape) + elif vander_enum == 3: + # different weights for all points, and weights input as a list + weights = np.arange(small_data2d.size).reshape(small_data2d.shape).tolist() + + output = algorithm._setup_polynomial( + small_data2d, weights=weights, poly_order=poly_order, calc_vander=True, + calc_pinv=include_pinv + ) + if include_pinv: + _, weight_array, pinv_matrix = output + else: + _, weight_array = output + + if isinstance(poly_order, int): + x_order = poly_order + z_order = poly_order + else: + x_order, z_order = 
poly_order + + mapped_x = np.polynomial.polyutils.mapdomain(algorithm.x, algorithm.x_domain, [-1, 1]) + mapped_z = np.polynomial.polyutils.mapdomain(algorithm.z, algorithm.z_domain, [-1, 1]) + desired_vander = np.polynomial.polynomial.polyvander2d( + *np.meshgrid(mapped_x, mapped_z, indexing='ij'), (x_order, z_order) + ).reshape((-1, (x_order + 1) * (z_order + 1))) + assert_allclose(desired_vander, algorithm.vandermonde, 1e-12) + + if include_pinv: + desired_pinv = np.linalg.pinv(np.sqrt(weight_array)[:, np.newaxis] * desired_vander) + assert_allclose(desired_pinv, pinv_matrix, 1e-10) + + +def test_setup_polynomial_negative_polyorder_fails(small_data2d, algorithm): + """Ensures a negative poly_order raises an exception.""" + with pytest.raises(ValueError): + algorithm._setup_polynomial(small_data2d, poly_order=-1) + + with pytest.raises(ValueError): + algorithm._setup_polynomial(small_data2d, poly_order=[1, -1]) + + with pytest.raises(ValueError): + algorithm._setup_polynomial(small_data2d, poly_order=[-1, 1]) + + with pytest.raises(ValueError): + algorithm._setup_polynomial(small_data2d, poly_order=[-1, -1]) + + +def test_setup_polynomial_too_large_polyorder_fails(small_data2d, algorithm): + """Ensures an exception is raised if poly_order has more than two values.""" + with pytest.raises(ValueError): + algorithm._setup_polynomial(small_data2d, poly_order=[1, 2, 3]) + + with pytest.raises(ValueError): + algorithm._setup_polynomial(small_data2d, poly_order=[1, 2, 3, 4]) + + with pytest.raises(ValueError): + algorithm._setup_polynomial(small_data2d, poly_order=np.array([1, 2, 3])) + + +def test_setup_polynomial_maxcross(small_data2d, algorithm): + """Ensures the _max_cross attribute is updated after calling _setup_polynomial.""" + algorithm._setup_polynomial(small_data2d, max_cross=[1]) + assert algorithm._max_cross == 1 + + algorithm._setup_polynomial(small_data2d, max_cross=1) + assert algorithm._max_cross == 1 + + algorithm._setup_polynomial(small_data2d, max_cross=0) + assert algorithm._max_cross == 0 + + algorithm._setup_polynomial(small_data2d, max_cross=None) + assert algorithm._max_cross is None + + +def test_setup_polynomial_too_large_maxcross_fails(small_data2d, algorithm): + """Ensures an exception is raised if max_cross has more than one value.""" + with pytest.raises(ValueError): + algorithm._setup_polynomial(small_data2d, max_cross=[1, 2]) + + with pytest.raises(ValueError): + algorithm._setup_polynomial(small_data2d, max_cross=[1, 2, 3]) + + with pytest.raises(ValueError): + algorithm._setup_polynomial(small_data2d, max_cross=np.array([1, 2])) + + +def test_setup_polynomial_negative_maxcross_fails(small_data2d, algorithm): + """Ensures an exception is raised if max_cross is negative.""" + with pytest.raises(ValueError): + algorithm._setup_polynomial(small_data2d, max_cross=[-1]) + + with pytest.raises(ValueError): + algorithm._setup_polynomial(small_data2d, max_cross=-2) + + +def test_setup_smooth_shape(small_data2d, algorithm): + """Ensures output y is correctly padded.""" + pad_length = 4 + y, hw = algorithm._setup_smooth(small_data2d, pad_length, mode='edge') + assert_array_equal( + y.shape, (small_data2d.shape[0] + 2 * pad_length, small_data2d.shape[1] + 2 * pad_length) + ) + assert_array_equal(hw, [pad_length, pad_length]) + + +@pytest.mark.parametrize('num_knots', (10, 30, (20, 30))) +@pytest.mark.parametrize('spline_degree', (1, 2, 3, 4, (2, 3))) +def test_setup_spline_spline_basis(data_fixture2d, num_knots, spline_degree): + """Ensures the spline basis function is 
correctly created.""" + x, z, y = data_fixture2d + fitter = _algorithm_setup._Algorithm2D(x, z) + assert fitter.pspline is None + + _ = fitter._setup_spline( + y, weights=None, spline_degree=spline_degree, num_knots=num_knots + ) + + if isinstance(num_knots, int): + num_knots_r = num_knots + num_knots_c = num_knots + else: + num_knots_r, num_knots_c = num_knots + if isinstance(spline_degree, int): + spline_degree_x = spline_degree + spline_degree_z = spline_degree + else: + spline_degree_x, spline_degree_z = spline_degree + + assert_array_equal( + fitter.pspline.basis_r.shape, + (len(x), num_knots_r + spline_degree_x - 1) + ) + assert_array_equal( + fitter.pspline.basis_c.shape, + (len(z), num_knots_c + spline_degree_z - 1) + ) + + +@pytest.mark.parametrize('lam', (1, 20, (3, 10))) +@pytest.mark.parametrize('diff_order', (1, 2, 3, 4, (2, 3))) +@pytest.mark.parametrize('spline_degree', (1, 2, 3, 4, (2, 3))) +@pytest.mark.parametrize('num_knots', (20, 51, (20, 30))) +def test_setup_spline_diff_matrix(data_fixture2d, lam, diff_order, spline_degree, num_knots): + """Ensures output difference matrix diagonal data is in desired format.""" + x, z, y = data_fixture2d + + algorithm = _algorithm_setup._Algorithm2D(x, z) + _ = algorithm._setup_spline( + y, weights=None, spline_degree=spline_degree, num_knots=num_knots, + diff_order=diff_order, lam=lam + ) + + ( + num_knots_r, num_knots_c, spline_degree_x, spline_degree_z, + lam_x, lam_z, diff_order_x, diff_order_z + ) = get_2dspline_inputs( + num_knots=num_knots, spline_degree=spline_degree, lam=lam, diff_order=diff_order + ) + + num_bases_x = num_knots_r + spline_degree_x - 1 + num_bases_z = num_knots_c + spline_degree_z - 1 + + D1 = difference_matrix(num_bases_x, diff_order_x) + D2 = difference_matrix(num_bases_z, diff_order_z) + + P1 = lam_x * kron(D1.T @ D1, identity(num_bases_z)) + P2 = lam_z * kron(identity(num_bases_x), D2.T @ D2) + expected_penalty = P1 + P2 + + assert_allclose( + algorithm.pspline.penalty.toarray(), + expected_penalty.toarray(), + rtol=1e-12, atol=1e-12 + ) + + +@pytest.mark.filterwarnings('ignore::UserWarning') +@pytest.mark.parametrize('spline_degree', (1, 2, 3, 4)) +@pytest.mark.parametrize('num_knots', (5, 50, 100)) +def test_setup_spline_too_high_diff_order(small_data2d, spline_degree, num_knots): + """ + Ensures an exception is raised when the difference order is >= number of basis functions. + + The number of basis functions is equal to the number of knots + the spline degree - 1. + Tests both difference order equal to and greater than the number of basis functions. 
+ + """ + num_z, num_x = small_data2d.shape + diff_order = num_knots + spline_degree - 1 + with pytest.raises(ValueError): + _algorithm_setup._Algorithm2D(np.arange(num_x), np.arange(num_z))._setup_spline( + small_data2d, weights=None, spline_degree=spline_degree, num_knots=num_knots, + penalized=True, diff_order=diff_order + ) + + diff_order += 1 + with pytest.raises(ValueError): + _algorithm_setup._Algorithm2D(np.arange(num_x), np.arange(num_z))._setup_spline( + small_data2d, weights=None, spline_degree=spline_degree, num_knots=num_knots, + penalized=True, diff_order=diff_order + ) + + +@pytest.mark.parametrize('num_knots', (0, 1)) +def test_setup_spline_too_few_knots(small_data2d, num_knots): + """Ensures an error is raised if the number of knots is less than 2.""" + num_x, num_z = small_data2d.shape + with pytest.raises(ValueError): + _algorithm_setup._Algorithm2D(np.arange(num_x), np.arange(num_z))._setup_spline( + small_data2d, weights=None, spline_degree=3, num_knots=num_knots, + penalized=True, diff_order=1 + ) + + +def test_setup_spline_wrong_weight_shape(small_data2d): + """Ensures that an exception is raised if input weights and data are different shapes.""" + weights = np.ones(np.array(small_data2d.shape) + 1) + num_x, num_z = small_data2d.shape + with pytest.raises(ValueError): + _algorithm_setup._Algorithm2D(np.arange(num_x), np.arange(num_z))._setup_spline( + small_data2d, weights=weights + ) + + +@pytest.mark.parametrize('diff_order', (0, -1)) +def test_setup_spline_diff_matrix_fails(small_data2d, diff_order): + """Ensures using a diff_order < 1 with _setup_spline raises an exception.""" + num_x, num_z = small_data2d.shape + with pytest.raises(ValueError): + _algorithm_setup._Algorithm2D(np.arange(num_x), np.arange(num_z))._setup_spline( + small_data2d, diff_order=diff_order + ) + + +@pytest.mark.parametrize('diff_order', (5, 6)) +def test_setup_spline_diff_matrix_warns(small_data2d, diff_order): + """Ensures using a diff_order > 4 with _setup_spline raises a warning.""" + num_x, num_z = small_data2d.shape + with pytest.warns(ParameterWarning): + _algorithm_setup._Algorithm2D(np.arange(num_x), np.arange(num_z))._setup_spline( + small_data2d, diff_order=diff_order + ) + + +def test_setup_spline_negative_lam_fails(small_data2d): + """Ensures a negative lam value fails.""" + num_x, num_z = small_data2d.shape + with pytest.raises(ValueError): + _algorithm_setup._Algorithm2D(np.arange(num_x), np.arange(num_z))._setup_spline( + small_data2d, lam=-1 + ) + + +def test_setup_spline_array_lam(small_data2d): + """Ensures a lam that is a single array of one or two values passes while larger arrays fail.""" + num_x, num_z = small_data2d.shape + _algorithm_setup._Algorithm2D( + np.arange(num_x), np.arange(num_z) + )._setup_spline(small_data2d, lam=[1]) + _algorithm_setup._Algorithm2D( + np.arange(num_x), np.arange(num_z) + )._setup_spline(small_data2d, lam=[1, 2]) + with pytest.raises(ValueError): + _algorithm_setup._Algorithm2D(np.arange(num_x), np.arange(num_z))._setup_spline( + small_data2d, lam=[1, 2, 3] + ) + + +@pytest.mark.parametrize('weight_enum', (0, 1, 2, 3)) +def test_setup_spline_weights(small_data2d, algorithm, weight_enum): + """Ensures output weight array is correct.""" + if weight_enum == 0: + # no weights specified + weights = None + desired_weights = np.ones_like(small_data2d) + elif weight_enum == 1: + # uniform 1 weighting + weights = np.ones_like(small_data2d) + desired_weights = np.ones_like(small_data2d) + elif weight_enum == 2: + # different weights for all 
points + weights = np.arange(small_data2d.size).reshape(small_data2d.shape) + desired_weights = np.arange(small_data2d.size).reshape(small_data2d.shape) + elif weight_enum == 3: + # different weights for all points, and weights input as a list + weights = np.arange(small_data2d.size).reshape(small_data2d.shape).tolist() + desired_weights = np.arange(small_data2d.size).reshape(small_data2d.shape) + + _, weight_array = algorithm._setup_spline( + small_data2d, lam=1, diff_order=2, weights=weights + ) + + assert isinstance(weight_array, np.ndarray) + assert_array_equal(weight_array, desired_weights) + + +@pytest.mark.parametrize('input_x', (True, False)) +@pytest.mark.parametrize('input_z', (True, False)) +@pytest.mark.parametrize('check_finite', (True, False)) +@pytest.mark.parametrize('assume_sorted', (True, False)) +@pytest.mark.parametrize('output_dtype', (None, int, float, np.float64)) +@pytest.mark.parametrize('change_order', (True, False)) +def test_algorithm_class_init(input_x, input_z, check_finite, assume_sorted, output_dtype, + change_order): + """Tests the initialization of _Algorithm2D objects.""" + sort_order = slice(0, 10) + expected_x = None + expected_z = None + x = None + z = None + if input_x or input_z: + x_, z_, _ = get_data2d() + if input_x: + x = x_ + if input_z: + z = z_ + + if input_x: + expected_x = x.copy() + if change_order: + x[sort_order] = x[sort_order][::-1] + if assume_sorted: + expected_x[sort_order] = expected_x[sort_order][::-1] + if input_z: + expected_z = z.copy() + if change_order: + z[sort_order] = z[sort_order][::-1] + if assume_sorted: + expected_z[sort_order] = expected_z[sort_order][::-1] + + algorithm = _algorithm_setup._Algorithm2D( + x, z, check_finite=check_finite, assume_sorted=assume_sorted, output_dtype=output_dtype + ) + assert_array_equal(algorithm.x, expected_x) + assert_array_equal(algorithm.z, expected_z) + assert algorithm._check_finite == check_finite + assert algorithm._dtype == output_dtype + + expected_shape = [None, None] + if input_x: + expected_shape[0] = len(x) + if input_z: + expected_shape[1] = len(z) + assert algorithm._len == expected_shape + + if not assume_sorted and change_order and (input_x or input_z): + if input_x and input_z: + x_order = np.arange(len(x)) + z_order = np.arange(len(z)) + for order in (x_order, z_order): + order[sort_order] = order[sort_order][::-1] + + for actual, expected in zip( + algorithm._sort_order, (x_order[:, None], z_order[None, :]) + ): + assert_array_equal(actual, expected) + for actual, expected in zip( + algorithm._inverted_order, (x_order.argsort()[:, None], z_order.argsort()[None, :]) + ): + assert_array_equal(actual, expected) + elif input_x: + order = np.arange(len(x)) + order[sort_order] = order[sort_order][::-1] + assert_array_equal(algorithm._sort_order, order) + assert_array_equal(algorithm._inverted_order, order.argsort()) + else: + order = np.arange(len(z)) + order[sort_order] = order[sort_order][::-1] + assert_array_equal(algorithm._sort_order[1], order) + assert_array_equal(algorithm._inverted_order[1], order.argsort()) + assert algorithm._sort_order[0] is Ellipsis + assert algorithm._inverted_order[0] is Ellipsis + else: + assert algorithm._sort_order is None + assert algorithm._inverted_order is None + + # ensure attributes are correctly initialized + assert algorithm.poly_order == -1 + assert algorithm.pspline is None + assert algorithm.whittaker_system is None + assert algorithm.vandermonde is None + + +@pytest.mark.parametrize('assume_sorted', (True, False)) 
+@pytest.mark.parametrize('output_dtype', (None, int, float, np.float64)) +@pytest.mark.parametrize('change_order', (True, False)) +@pytest.mark.parametrize('reshape_baseline', (True, False)) +@pytest.mark.parametrize('three_d', (True, False)) +def test_algorithm_return_results(assume_sorted, output_dtype, change_order, reshape_baseline, + three_d): + """Ensures the _return_results method returns the correctly sorted outputs.""" + x, z, y = get_data2d() + baseline = np.arange(y.size).reshape(y.shape) + # 'a' values will be sorted, 'b' values will be kept the same, 'c' will be reshaped, + # and 'd' will be reshaped and then sorted + params = { + 'a': np.arange(y.size).reshape(y.shape), + 'b': np.arange(len(x)), + 'c': np.arange(y.size), + 'd': np.arange(y.size), + } + if change_order: + x = x[::-1] + z = z[::-1] + y = y[::-1, ::-1] + + expected_params = { + 'a': np.arange(y.size).reshape(y.shape), + 'b': np.arange(len(x)), + 'c': np.arange(y.size).reshape(y.shape), + 'd': np.arange(y.size).reshape(y.shape), + } + if three_d: + baseline = np.array([baseline, baseline]) + expected_baseline = baseline.copy() + if reshape_baseline: + baseline = baseline.reshape(baseline.shape[0], -1) + + if change_order and not assume_sorted: + expected_baseline = expected_baseline[..., ::-1, ::-1] + expected_params['a'] = expected_params['a'][::-1, ::-1] + expected_params['d'] = expected_params['d'][::-1, ::-1] + + algorithm = _algorithm_setup._Algorithm2D( + x, z, assume_sorted=assume_sorted, output_dtype=output_dtype, check_finite=False + ) + output, output_params = algorithm._return_results( + baseline, params, dtype=output_dtype, sort_keys=('a', 'd'), + reshape_baseline=reshape_baseline, reshape_keys=('c', 'd'), + ensure_2d=not three_d + ) + + assert_allclose(output, expected_baseline, 1e-14, 1e-14) + assert output.dtype == output_dtype + for key, value in expected_params.items(): + assert_array_equal(value, output_params[key]) + + +@pytest.mark.parametrize('assume_sorted', (True, False)) +@pytest.mark.parametrize('output_dtype', (None, int, float, np.float64)) +@pytest.mark.parametrize('change_order', (True, False)) +@pytest.mark.parametrize('skip_sorting', (True, False)) +@pytest.mark.parametrize('list_input', (True, False)) +def test_algorithm_register(assume_sorted, output_dtype, change_order, skip_sorting, list_input): + """ + Ensures the _register wrapper method returns the correctly sorted and shaped outputs. + + The input y-values within the wrapped function should be correctly sorted + if `assume_sorted` is False, while the output baseline should always match + the ordering of the input y-values. The output params should have an inverted + sort order to also match the ordering of the input y-values if `assume_sorted` + is False. 
+ + """ + x, z, y = get_data2d() + + class SubClass(_algorithm_setup._Algorithm2D): + # 'a' values will be sorted and 'b' values will be kept the same + @_algorithm_setup._Algorithm2D._register(sort_keys=('a', 'd'), reshape_keys=('c', 'd')) + def func(self, data, *args, **kwargs): + """For checking sorting and reshaping output parameters.""" + expected_x, expected_z, expected_y = get_data2d() + if change_order and assume_sorted: + expected_y = expected_y[::-1, ::-1] + expected_x = expected_x[::-1] + expected_z = expected_z[::-1] + + assert isinstance(data, np.ndarray) + assert_allclose(data, expected_y, 1e-14, 1e-14) + assert isinstance(self.x, np.ndarray) + assert_allclose(self.x, expected_x, 1e-14, 1e-14) + assert isinstance(self.z, np.ndarray) + assert_allclose(self.z, expected_z, 1e-14, 1e-14) + + params = { + 'a': np.arange(data.size).reshape(data.shape), + 'b': np.arange(len(self.x)), + 'c': np.arange(data.size), + 'd': np.arange(data.size) + } + return 1 * data, params + + @_algorithm_setup._Algorithm2D._register(reshape_baseline=True) + def func2(self, data, *args, **kwargs): + """For checking reshaping output baseline.""" + expected_x, expected_z, expected_y = get_data2d() + if change_order and assume_sorted: + expected_y = expected_y[::-1, ::-1] + expected_x = expected_x[::-1] + expected_z = expected_z[::-1] + + assert isinstance(data, np.ndarray) + assert_allclose(data, expected_y, 1e-14, 1e-14) + assert isinstance(self.x, np.ndarray) + assert_allclose(self.x, expected_x, 1e-14, 1e-14) + assert isinstance(self.z, np.ndarray) + assert_allclose(self.z, expected_z, 1e-14, 1e-14) + + return 1 * data.flatten(), {} + + @_algorithm_setup._Algorithm2D._register + def func3(self, data, *args, **kwargs): + """For checking empty decorator.""" + expected_x, expected_z, expected_y = get_data2d() + if change_order and assume_sorted: + expected_y = expected_y[::-1, ::-1] + expected_x = expected_x[::-1] + expected_z = expected_z[::-1] + + assert isinstance(data, np.ndarray) + assert_allclose(data, expected_y, 1e-14, 1e-14) + assert isinstance(self.x, np.ndarray) + assert_allclose(self.x, expected_x, 1e-14, 1e-14) + assert isinstance(self.z, np.ndarray) + assert_allclose(self.z, expected_z, 1e-14, 1e-14) + + return 1 * data, {} + + @_algorithm_setup._Algorithm2D._register( + sort_keys=('a', 'd'), reshape_keys=('c', 'd'), skip_sorting=skip_sorting + ) + def func4(self, data, *args, **kwargs): + """For checking skip_sorting key.""" + expected_x, expected_z, expected_y = get_data2d() + if change_order and (assume_sorted or skip_sorting): + expected_y = expected_y[::-1, ::-1] + if change_order and assume_sorted: + expected_x = expected_x[::-1] + expected_z = expected_z[::-1] + + assert isinstance(data, np.ndarray) + assert_allclose(data, expected_y, 1e-14, 1e-14) + assert isinstance(self.x, np.ndarray) + assert_allclose(self.x, expected_x, 1e-14, 1e-14) + assert isinstance(self.z, np.ndarray) + assert_allclose(self.z, expected_z, 1e-14, 1e-14) + + params = { + 'a': np.arange(data.size).reshape(data.shape), + 'b': np.arange(len(self.x)), + 'c': np.arange(data.size), + 'd': np.arange(data.size) + } + + return 1 * data, params + + if change_order: + x = x[::-1] + z = z[::-1] + y = y[::-1, ::-1] + expected_params = { + 'a': np.arange(y.size).reshape(y.shape), + 'b': np.arange(len(x)), + 'c': np.arange(y.size).reshape(y.shape), + 'd': np.arange(y.size).reshape(y.shape), + } + expected_baseline = (1 * y).astype(output_dtype) + if output_dtype is None: + expected_dtype = y.dtype + else: + expected_dtype = 
expected_baseline.dtype + if list_input: + x = x.tolist() + z = z.tolist() + y = y.tolist() + + if change_order and not assume_sorted: + # if assume_sorted is False, the param order should be inverted to match + # the input y-order + expected_params['a'] = expected_params['a'][::-1, ::-1] + expected_params['d'] = expected_params['d'][::-1, ::-1] + + algorithm = SubClass( + x, z, assume_sorted=assume_sorted, output_dtype=output_dtype, check_finite=False + ) + output, output_params = algorithm.func(y) + + # baseline should always match y-order on the output; only sorted within the + # function + assert_allclose(output, expected_baseline, 1e-14, 1e-14) + assert isinstance(output, np.ndarray) + assert output.dtype == expected_dtype + for key, value in expected_params.items(): + assert_array_equal(value, output_params[key], err_msg=f'{key} failed') + + output2, _ = algorithm.func2(y) + assert_allclose(output2, expected_baseline, 1e-14, 1e-14) + assert isinstance(output2, np.ndarray) + assert output2.dtype == expected_dtype + + output3, _ = algorithm.func3(y) + assert_allclose(output3, expected_baseline, 1e-14, 1e-14) + assert isinstance(output3, np.ndarray) + assert output3.dtype == expected_dtype + + output4, output_params4 = algorithm.func4(y) + assert_allclose(output4, expected_baseline, 1e-14, 1e-14) + assert isinstance(output4, np.ndarray) + assert output4.dtype == expected_dtype + for key, value in expected_params.items(): + assert_array_equal(value, output_params4[key], err_msg=f'{key} failed') + + +def test_algorithm_register_no_data_fails(): + """Ensures an error is raised if the input data is None.""" + + class SubClass(_algorithm_setup._Algorithm2D): + + @_algorithm_setup._Algorithm2D._register + def func(self, data, *args, **kwargs): + """For checking empty decorator.""" + return data, {} + + @_algorithm_setup._Algorithm2D._register() + def func2(self, data, *args, **kwargs): + """For checking closed decorator.""" + return data, {} + + with pytest.raises(TypeError, match='"data" cannot be None'): + SubClass().func() + with pytest.raises(TypeError, match='"data" cannot be None'): + SubClass().func2() + + +def test_algorithm_register_1d_fails(data_fixture): + """Ensures an error is raised if 1D data is used for 2D algorithms.""" + + class SubClass(_algorithm_setup._Algorithm2D): + + @_algorithm_setup._Algorithm2D._register + def func(self, data, *args, **kwargs): + """For checking empty decorator.""" + return data, {} + + @_algorithm_setup._Algorithm2D._register() + def func2(self, data, *args, **kwargs): + """For checking closed decorator.""" + return data, {} + + x, y = data_fixture + algorithm = SubClass() + with pytest.raises(ValueError, match='input data must be a two dimensional'): + algorithm.func(y) + with pytest.raises(ValueError, match='input data must be a two dimensional'): + algorithm.func2(y) + + # also test when given x values + algorithm = SubClass(None, x) # x would correspond to the columns in 2D y + with pytest.raises(ValueError, match='input data must be a two dimensional'): + algorithm.func(y) + with pytest.raises(ValueError, match='input data must be a two dimensional'): + algorithm.func2(y) + + # and when y is 2D but only has one row + y_2d = np.atleast_2d(y) + algorithm = SubClass() + with pytest.raises(ValueError, match='input data must be a two dimensional'): + algorithm.func(y_2d) + with pytest.raises(ValueError, match='input data must be a two dimensional'): + algorithm.func2(y_2d) + + algorithm = SubClass(None, x) # x would correspond to the columns in 2D 
y
+    with pytest.raises(ValueError, match='input data must be a two dimensional'):
+        algorithm.func(y_2d)
+    with pytest.raises(ValueError, match='input data must be a two dimensional'):
+        algorithm.func2(y_2d)
+
+    # and when y is 2D but only has one column
+    y_2d_transposed = np.atleast_2d(y).T
+    algorithm = SubClass()
+    with pytest.raises(ValueError, match='input data must be a two dimensional'):
+        algorithm.func(y_2d_transposed)
+    with pytest.raises(ValueError, match='input data must be a two dimensional'):
+        algorithm.func2(y_2d_transposed)
+
+    algorithm = SubClass(x)  # x now corresponds to the rows in 2D y
+    with pytest.raises(ValueError, match='input data must be a two dimensional'):
+        algorithm.func(y_2d_transposed)
+    with pytest.raises(ValueError, match='input data must be a two dimensional'):
+        algorithm.func2(y_2d_transposed)
+
+
+def test_override_x(algorithm):
+    """Ensures the `override_x` method correctly initializes with the new x values."""
+    new_len = 20
+    new_x = np.arange(new_len)
+    with pytest.raises(NotImplementedError):
+        with algorithm._override_x(new_x) as new_algorithm:
+            assert len(new_algorithm.x) == new_len
+            assert new_algorithm._len == new_len
+            assert new_algorithm.poly_order == -1
+            assert new_algorithm.vandermonde is None
+            assert new_algorithm.whittaker_system is None
+            assert new_algorithm.pspline is None
+
+
+@pytest.mark.parametrize(
+    'method_and_outputs', (
+        ('collab_pls', 'collab_pls', 'optimizers'),
+        ('COLLAB_pls', 'collab_pls', 'optimizers'),
+        ('modpoly', 'modpoly', 'polynomial'),
+        ('asls', 'asls', 'whittaker')
+    )
+)
+def test_get_function(algorithm, method_and_outputs):
+    """Ensures _get_function gets the correct method, regardless of case."""
+    method, expected_func, expected_module = method_and_outputs
+    tested_modules = [optimizers, polynomial, whittaker]
+    selected_func, module, class_object = algorithm._get_function(
+        method, tested_modules
+    )
+    assert selected_func.__name__ == expected_func
+    assert module == expected_module
+    assert isinstance(class_object, _algorithm_setup._Algorithm2D)
+
+
+def test_get_function_fails_wrong_method(algorithm):
+    """Ensures _get_function fails when no function with the input name is available."""
+    with pytest.raises(AttributeError):
+        algorithm._get_function('unknown function', [optimizers])
+
+
+def test_get_function_fails_no_module(algorithm):
+    """Ensures _get_function fails when not given any modules to search."""
+    with pytest.raises(AttributeError):
+        algorithm._get_function('collab_pls', [])
+
+
+def test_get_function_sorting_x():
+    """Ensures the sort order is correct for the output class object when x is reversed."""
+    num_points = 10
+    x = np.arange(num_points)
+    ordering = np.arange(num_points)
+    algorithm = _algorithm_setup._Algorithm2D(x[::-1], assume_sorted=False)
+    func, func_module, class_object = algorithm._get_function('asls', [whittaker])
+
+    assert_array_equal(class_object.x, x)
+    assert_array_equal(class_object._sort_order, ordering[::-1])
+    assert_array_equal(class_object._inverted_order, ordering[::-1])
+    assert_array_equal(class_object._sort_order, algorithm._sort_order)
+    assert_array_equal(class_object._inverted_order, algorithm._inverted_order)
+
+
+def test_get_function_sorting_z():
+    """Ensures the sort order is correct for the output class object when z is reversed."""
+    num_points = 10
+    z = np.arange(num_points)
+    ordering = np.arange(num_points)
+    algorithm = _algorithm_setup._Algorithm2D(None, z[::-1], assume_sorted=False)
+    func, func_module, class_object = algorithm._get_function('asls', [whittaker])
+
+    assert_array_equal(class_object.z, z)
+    assert class_object._sort_order[0] is Ellipsis
+    assert class_object._inverted_order[0] is Ellipsis
+    assert algorithm._sort_order[0] is Ellipsis
+    assert algorithm._inverted_order[0] is Ellipsis
+    assert_array_equal(class_object._sort_order[1], ordering[::-1])
+    assert_array_equal(class_object._inverted_order[1], ordering[::-1])
+    assert_array_equal(class_object._sort_order[1], algorithm._sort_order[1])
+    assert_array_equal(class_object._inverted_order[1], algorithm._inverted_order[1])
+
+
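The z-only case above relies on a small numpy indexing convention: pairing Ellipsis with a column order sorts a 2D array along its second dimension while leaving the first untouched, and broadcastable row/column index arrays handle the both-axes case. A plain numpy sketch of both conventions (the array values here are arbitrary):

import numpy as np

data = np.arange(12).reshape(3, 4)
col_order = np.array([3, 2, 1, 0])
row_order = np.array([2, 1, 0])

cols_sorted = data[..., col_order]  # (Ellipsis, order): same as data[:, col_order]
both_sorted = data[row_order[:, None], col_order[None, :]]  # broadcast row and column orders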
+def test_get_function_sorting_xz():
+    """Ensures the sort order is correct for the output class object when x and z are reversed."""
+    num_x_points = 10
+    num_z_points = 11
+    x = np.arange(num_x_points)
+    x_ordering = np.arange(num_x_points)
+    z = np.arange(num_z_points)
+    z_ordering = np.arange(num_z_points)
+
+    algorithm = _algorithm_setup._Algorithm2D(x[::-1], z[::-1], assume_sorted=False)
+    func, func_module, class_object = algorithm._get_function('asls', [whittaker])
+
+    assert_array_equal(class_object.x, x)
+    assert_array_equal(class_object.z, z)
+    assert_array_equal(class_object._sort_order[0], x_ordering[::-1][:, None])
+    assert_array_equal(class_object._sort_order[1], z_ordering[::-1][None, :])
+    assert_array_equal(class_object._inverted_order[0], x_ordering[::-1][:, None])
+    assert_array_equal(class_object._inverted_order[1], z_ordering[::-1][None, :])
+    assert_array_equal(class_object._sort_order[0], algorithm._sort_order[0])
+    assert_array_equal(class_object._sort_order[1], algorithm._sort_order[1])
+    assert_array_equal(class_object._inverted_order[0], algorithm._inverted_order[0])
+    assert_array_equal(class_object._inverted_order[1], algorithm._inverted_order[1])
+
+
+@pytest.mark.parametrize('method_kwargs', (None, {'a': 2}))
+def test_setup_optimizer(small_data2d, algorithm, method_kwargs):
+    """Ensures output of _setup_optimizer is correct."""
+    y, fit_func, func_module, output_kwargs, class_object = algorithm._setup_optimizer(
+        small_data2d, 'asls', [whittaker], method_kwargs
+    )
+
+    assert isinstance(y, np.ndarray)
+    assert_allclose(y, small_data2d)
+    assert fit_func.__name__ == 'asls'
+    assert func_module == 'whittaker'
+    assert isinstance(output_kwargs, dict)
+    assert isinstance(class_object, _algorithm_setup._Algorithm2D)
+
+
+@pytest.mark.parametrize('copy_kwargs', (True, False))
+def test_setup_optimizer_copy_kwargs(small_data2d, algorithm, copy_kwargs):
+    """Ensures the copy behavior of the input keyword argument dictionary."""
+    input_kwargs = {'a': 1}
+    y, _, _, output_kwargs, _ = algorithm._setup_optimizer(
+        small_data2d, 'asls', [whittaker], input_kwargs, copy_kwargs
+    )
+
+    output_kwargs['a'] = 2
+    if copy_kwargs:
+        assert input_kwargs['a'] == 1
+    else:
+        assert input_kwargs['a'] == 2
diff --git a/tests/two_d/test_api.py b/tests/two_d/test_api.py
new file mode 100644
index 0000000..bba287f
--- /dev/null
+++ b/tests/two_d/test_api.py
@@ -0,0 +1,153 @@
+# -*- coding: utf-8 -*-
+"""Tests for pybaselines.api.
+ +@author: Donald Erb +Created on July 3, 2021 + +""" + +import numpy as np +from numpy.testing import assert_allclose +import pytest + +from pybaselines.two_d import ( + api, morphological, optimizers, polynomial, smooth, spline, whittaker +) + +from ..conftest import get_data2d + + +_ALL_CLASSES = ( + morphological._Morphological, + optimizers._Optimizers, + polynomial._Polynomial, + smooth._Smooth, + spline._Spline, + whittaker._Whittaker +) + + +def get_public_methods(klass): + """ + Gets all public methods from a class. + + Parameters + ---------- + klass : class + The class to use. + + Returns + ------- + list[str, ...] + The list of all public methods of the input class. + + """ + methods = [] + for method in dir(klass): + if ( + not (method.startswith('_') + or method.startswith('pentapy_solver') + or method.startswith('get_method')) + ): + methods.append(method) + return methods + + +# will be like [('asls', whittaker._Whittaker), ('modpoly', polynomial._Polynomial), ...] +_ALL_CLASSES_AND_METHODS = [] +for klass in _ALL_CLASSES: + for method in get_public_methods(klass): + _ALL_CLASSES_AND_METHODS.append((method, klass)) + + +class TestBaseline2D: + """Class for testing the Baseline2D class.""" + + algorithm_base = api.Baseline2D + + @classmethod + def setup_class(cls): + """Sets up the class for testing.""" + cls.x, cls.z, cls.y = get_data2d() + cls.algorithm = cls.algorithm_base(cls.x, cls.z, check_finite=False, assume_sorted=True) + + @classmethod + def teardown_class(cls): + """ + Resets class attributes after testing. + + Probably not needed, but done anyway to catch changes in how pytest works. + + """ + cls.x = None + cls.z = None + cls.y = None + cls.algorithm = None + + @pytest.mark.parametrize('method_and_class', _ALL_CLASSES_AND_METHODS) + def test_all_methods(self, method_and_class): + """Ensures all available methods work the same when accessing through Baseline class.""" + method, baseline_class = method_and_class + # collab_pls needs 2D input data + if method == 'collab_pls': + fit_data = np.array((self.y, self.y)) + else: + fit_data = self.y + + # need to handle some specific methods + if method == 'optimize_extended_range': + kwargs = {'method': 'modpoly'} + elif method == 'interp_pts': + kwargs = {'baseline_points': ((5, 10), (10, 20), (90, 100))} + elif method == 'golotvin': + # have to set kwargs for golotvin or else no baseline points are found + kwargs = {'half_window': 15, 'num_std': 6} + else: + kwargs = {} + + api_baseline, api_params = getattr(self.algorithm, method)(fit_data, **kwargs) + class_baseline, class_params = getattr( + baseline_class(self.x, self.z, check_finite=False, assume_sorted=True), method + )(fit_data, **kwargs) + + assert_allclose(api_baseline, class_baseline, rtol=1e-12, atol=1e-12) + assert len(api_params.keys()) == len(class_params.keys()) + for key, value in api_params.items(): + assert key in class_params + class_value = class_params[key] + if isinstance(value, (int, float, np.ndarray, list, tuple)): + assert_allclose(value, class_value, rtol=1e-12, atol=1e-12) + else: + assert value == class_value + + def test_method_availability(self): + """Ensures all public algorithms are available through the Baseline class.""" + total_methods_list = get_public_methods(api.Baseline2D) + total_methods = set(total_methods_list) + + # ensure no repeated methods + assert len(total_methods) == len(total_methods_list) + + for klass in _ALL_CLASSES: + assert issubclass(self.algorithm_base, klass) + class_methods = set(get_public_methods(klass)) + 
# all individual class methods should be in Baseline + assert len(class_methods - total_methods) == 0 + total_methods = total_methods - class_methods + + # no additional methods should be available + assert len(total_methods) == 0 + + def test_get_method(self): + """Ensures the get_method helper function works as intended.""" + method = self.algorithm._get_method('asls') + assert method == self.algorithm.asls + + # also ensure capitalization does not matter + method2 = self.algorithm._get_method('AsLS') + assert method2 == self.algorithm.asls + + def test_get_method_fails(self): + """Ensures the get_method helper function fails when an incorrect name is given.""" + with pytest.raises(AttributeError): + self.algorithm._get_method('aaaaaaaaaaaaa') diff --git a/tests/two_d/test_morphological.py b/tests/two_d/test_morphological.py new file mode 100644 index 0000000..0191dd2 --- /dev/null +++ b/tests/two_d/test_morphological.py @@ -0,0 +1,86 @@ +# -*- coding: utf-8 -*- +"""Tests for pybaselines.morphological. + +@author: Donald Erb +Created on March 20, 2021 + +""" + +import numpy as np +import pytest + +from pybaselines.two_d import morphological + +from ..conftest import BaseTester2D + + +class MorphologicalTester(BaseTester2D): + """Base testing class for morphological functions.""" + + module = morphological + algorithm_base = morphological._Morphological + checked_keys = ('half_window',) + + @pytest.mark.parametrize('half_window', (None, 10, [10, 12], np.array([12, 10]))) + def test_half_window(self, half_window): + """Ensures that different inputs for half_window work.""" + self.class_func(self.y, half_window=half_window) + + +class IterativeMorphologicalTester(MorphologicalTester): + """Base testing class for iterative morphological functions.""" + + checked_keys = ('half_window', 'tol_history') + + def test_tol_history(self): + """Ensures the 'tol_history' item in the parameter output is correct.""" + max_iter = 5 + _, params = self.class_func(self.y, max_iter=max_iter, tol=-1) + + assert params['tol_history'].size == max_iter + 1 + + +class TestMor(MorphologicalTester): + """Class for testing mor baseline.""" + + func_name = 'mor' + + +class TestIMor(IterativeMorphologicalTester): + """Class for testing imor baseline.""" + + func_name = 'imor' + + +class TestRollingBall(MorphologicalTester): + """Class for testing rolling_ball baseline.""" + + func_name = 'rolling_ball' + + @pytest.mark.parametrize('new_instance', (True, False)) + @pytest.mark.parametrize('half_window', (None, 10, [10, 12])) + @pytest.mark.parametrize('smooth_half_window', (None, 0, 1)) + def test_unchanged_data(self, new_instance, half_window, smooth_half_window): + """Ensures that input data is unchanged by the function.""" + super().test_unchanged_data( + new_instance, half_window=half_window, smooth_half_window=smooth_half_window + ) + + @pytest.mark.parametrize('smooth_half_window', (None, 0, 10, [0, 0], [10, 10])) + def test_smooth_half_windows(self, smooth_half_window): + """Ensures smooth-half-window is correctly processed.""" + output = self.class_func(self.y, smooth_half_window=smooth_half_window) + + assert output[0].shape == self.y.shape + + @pytest.mark.parametrize('smooth_half_window', (-1, [5, -1], [-1, 5], [-2, -3])) + def test_negative_smooth_half_window_fails(self, smooth_half_window): + """Ensures a negative smooth-half-window raises an exception.""" + with pytest.raises(ValueError): + self.class_func(self.y, smooth_half_window=smooth_half_window) + + +class TestTophat(MorphologicalTester): + 
"""Class for testing tophat baseline.""" + + func_name = 'tophat' diff --git a/tests/two_d/test_optimizers.py b/tests/two_d/test_optimizers.py new file mode 100644 index 0000000..733c1d6 --- /dev/null +++ b/tests/two_d/test_optimizers.py @@ -0,0 +1,224 @@ +# -*- coding: utf-8 -*- +"""Tests for pybaselines.optimizers. + +@author: Donald Erb +Created on January 14, 2024 + +""" + +import numpy as np +from numpy.testing import assert_allclose, assert_array_equal +import pytest + +from pybaselines import utils +from pybaselines.two_d import optimizers, polynomial + +from ..conftest import BaseTester2D, InputWeightsMixin + + +class OptimizerInputWeightsMixin(InputWeightsMixin): + """Passes weights within the `method_kwargs` dictionary.""" + + def test_input_weights(self, assertion_kwargs=None, **kwargs): + """ + Ensures arrays are correctly sorted within the function. + + Returns the output for further testing. + + """ + # TODO replace with np.random.default_rng when min numpy version is >= 1.17 + weights = np.random.RandomState(0).normal(0.8, 0.05, self.y.shape[-2:]) + weights = np.clip(weights, 0, 1).astype(float, copy=False) + + reverse_fitter = self.algorithm_base(self.x[::-1], self.z[::-1], assume_sorted=False) + + regular_output, regular_output_params = self.class_func( + data=self.y, method_kwargs={'weights': weights}, **self.kwargs, **kwargs + ) + reverse_output, reverse_output_params = getattr(reverse_fitter, self.func_name)( + data=self.reverse_array(self.y), method_kwargs={'weights': self.reverse_array(weights)}, + **self.kwargs, **kwargs + ) + + if assertion_kwargs is None: + assertion_kwargs = {} + if 'rtol' not in assertion_kwargs: + assertion_kwargs['rtol'] = 1e-10 + if 'atol' not in assertion_kwargs: + assertion_kwargs['atol'] = 1e-14 + + for key in self.weight_keys: + assert_allclose( + regular_output_params[key], self.reverse_array(reverse_output_params[key]), + **assertion_kwargs + ) + assert_allclose( + regular_output, self.reverse_array(reverse_output), **assertion_kwargs + ) + + +class OptimizersTester(BaseTester2D): + """Base testing class for optimizer functions.""" + + module = optimizers + algorithm_base = optimizers._Optimizers + + +class TestCollabPLS(OptimizersTester, OptimizerInputWeightsMixin): + """Class for testing collab_pls baseline.""" + + func_name = "collab_pls" + # will need to change checked_keys if default method is changed + checked_keys = ('average_weights', 'weights', 'tol_history') + three_d = True + weight_keys = ('average_weights', 'weights') + + @pytest.mark.parametrize( + 'method', + ( + 'asls', 'iasls', 'airpls', 'arpls', 'drpls', 'iarpls', 'aspls', 'psalsa', + 'mixture_model', 'irsqr', 'pspline_asls', + 'pspline_airpls', 'pspline_arpls', + 'pspline_iarpls', 'pspline_psalsa', + ) + ) + def test_all_methods(self, method): + """Ensures all available methods work.""" + self.class_func(self.y, method=method) + + def test_unknown_method_fails(self): + """Ensures function fails when an unknown function is given.""" + with pytest.raises(AttributeError): + self.class_func(self.y, method='unknown function') + + def test_single_dataset_fails(self): + """Ensures an error is raised if the input has the shape (M, N).""" + with pytest.raises(ValueError, match='the input data must'): + self.class_func(np.arange(self.y[0].size).reshape(self.y.shape[-2:])) + + +@pytest.mark.parametrize( + 'baseline_ptp', (0.01, 0.1, 0.3, 0.5, 1, 5, 10, 40, 100, 200, 300, 500, 600, 1000) +) +def test_determine_polyorders(baseline_ptp): + """Ensures the correct polynomials are 
selected based on the signal to baseline ratio."""
+    x = np.linspace(0, 100, 500)
+    z = np.linspace(0, 100, 400)
+    X, Z = np.meshgrid(x, z, indexing='ij')
+    # set y such that max(y) - min(y) is ~ 1 so that
+    # ptp(baseline) / ptp(y) ~= ptp(baseline)
+    y = (
+        utils.gaussian2d(X, Z, 1, 25, 25, 2, 2)
+        + utils.gaussian2d(X, Z, 0.5, 50, 50, 2, 2)
+        + utils.gaussian2d(X, Z, 1, 75, 75, 2, 2)
+    )
+    # use a linear baseline so that it's easy to set the peak-to-peak of the baseline
+    true_baseline = X * baseline_ptp / (x.max() - x.min())
+
+    # double-check to make sure the system is set up as expected
+    assert_allclose(np.ptp(true_baseline), baseline_ptp, 0, 1e-3)
+    assert_allclose(np.ptp(y), 1, 0, 1e-3)
+
+    fitter = polynomial._Polynomial(x, z, check_finite=False, assume_sorted=True)
+
+    fit_baseline = fitter.modpoly(y + true_baseline, poly_order=1)[0]
+    # sanity check to make sure the internal baseline fit was correct
+    assert_allclose(np.ptp(fit_baseline), baseline_ptp, 0, 5e-3)
+
+    if baseline_ptp < 0.2:
+        expected_orders = (1, 2)
+    elif baseline_ptp < 0.75:
+        expected_orders = (2, 3)
+    elif baseline_ptp < 8.5:
+        expected_orders = (3, 4)
+    elif baseline_ptp < 55:
+        expected_orders = (4, 5)
+    elif baseline_ptp < 240:
+        expected_orders = (5, 6)
+    elif baseline_ptp < 517:
+        expected_orders = (6, 7)
+    else:
+        expected_orders = (6, 8)
+
+    output_orders = optimizers._determine_polyorders(
+        y + true_baseline, poly_order=1, weights=None, fit_function=fitter.modpoly
+    )
+
+    assert_array_equal(output_orders, expected_orders)
+
+
+class TestAdaptiveMinMax(OptimizersTester, InputWeightsMixin):
+    """Class for testing adaptive_minmax baseline."""
+
+    func_name = 'adaptive_minmax'
+    checked_keys = ('weights', 'constrained_weights', 'poly_order')
+    weight_keys = ('weights', 'constrained_weights')
+
+    @pytest.mark.parametrize('method', ('modpoly', 'imodpoly'))
+    def test_methods(self, method):
+        """Ensures all available methods work."""
+        self.class_func(self.y, method=method)
+
+    def test_unknown_method_fails(self):
+        """Ensures function fails when an unknown function is given."""
+        with pytest.raises(AttributeError):
+            self.class_func(self.y, method='unknown')
+
+    @pytest.mark.parametrize('poly_order', (None, 0, [0], (0, 1)))
+    def test_polyorder_inputs(self, poly_order):
+        """Tests valid inputs for poly_order."""
+        self.class_func(self.y, poly_order)
+
+    @pytest.mark.parametrize('poly_order', (0, [0], (0, 1)))
+    def test_polyorder_outputs(self, poly_order):
+        """Ensures that the correct polynomial orders were used."""
+        _, params = self.class_func(self.y, poly_order)
+        assert_array_equal(params['poly_order'], np.array([0, 1]))
+
+    @pytest.mark.parametrize('poly_order', ([0, 1, 2], (0, 1, 2, 3)))
+    def test_too_many_polyorders_fails(self, poly_order):
+        """Ensures an error is raised if poly_order has more than two items."""
+        with pytest.raises(ValueError):
+            self.class_func(self.y, poly_order)
+
+    @pytest.mark.parametrize(
+        'constrained_fraction', (0.01, [0.01], (0, 0.01), [0.01, 1], [0.01, 0.01, 0.01, 0.01])
+    )
+    def test_constrained_fraction_inputs(self, constrained_fraction):
+        """Tests valid inputs for constrained_fraction."""
+        self.class_func(self.y, constrained_fraction=constrained_fraction)
+
+    @pytest.mark.parametrize(
+        'constrained_fraction', ([0.01, 0.02, 0.02], (0.01, 0.01, 0.01, 0.01, 0.01))
+    )
+    def test_too_many_constrained_fraction(self, constrained_fraction):
+        """Ensures an error is raised if constrained_fraction has more than two items."""
+        with pytest.raises(ValueError):
+            self.class_func(self.y, constrained_fraction=constrained_fraction)
+
+    @pytest.mark.parametrize('constrained_fraction', (-0.5, [-0.01, 0.02], 1.1, [0.05, 1.1]))
+    def test_invalid_constrained_fraction(self, constrained_fraction):
+        """Ensures an error is raised if constrained_fraction is outside of [0, 1]."""
+        with pytest.raises(ValueError):
+            self.class_func(self.y, constrained_fraction=constrained_fraction)
+
+    @pytest.mark.parametrize('constrained_weight', (1e5, [1e5], (1e3, 1e5), [1e3, 1e3, 1e3, 1e3]))
+    def test_constrained_weight_inputs(self, constrained_weight):
+        """Tests valid inputs for constrained_weight."""
+        self.class_func(self.y, constrained_weight=constrained_weight)
+
+    @pytest.mark.parametrize('constrained_weight', ([1e4, 1e2, 1e5], (1e3, 1e3, 1e3, 1e3, 1e3)))
+    def test_too_many_constrained_weight(self, constrained_weight):
+        """Ensures an error is raised if constrained_weight has more than two items."""
+        with pytest.raises(ValueError):
+            self.class_func(self.y, constrained_weight=constrained_weight)
+
+    def test_input_weights(self):
+        """Ensures the input weights are sorted correctly."""
+        # use different weightings and constrained fractions for the left and right
+        # sides so that, if the weights are reversed, there is a clear difference
+        weightings = np.array([1e4, 1e5, 1e4, 1e5])
+        constrained_fractions = np.array([0.01, 0.02, 0.01, 0.02])
+        super().test_input_weights(
+            constrained_weight=weightings, constrained_fraction=constrained_fractions
+        )
diff --git a/tests/two_d/test_polynomial.py b/tests/two_d/test_polynomial.py
new file mode 100644
index 0000000..ef84ad4
--- /dev/null
+++ b/tests/two_d/test_polynomial.py
@@ -0,0 +1,266 @@
+# -*- coding: utf-8 -*-
+"""Tests for pybaselines.two_d.polynomial.
+
+@author: Donald Erb
+Created on March 20, 2021
+
+"""
+
+from math import ceil
+
+import numpy as np
+from numpy.testing import assert_allclose
+import pytest
+
+from pybaselines.two_d import polynomial
+
+from ..conftest import BasePolyTester2D, InputWeightsMixin
+from ..data import STATSMODELS_QUANTILES_2D
+
+
+class PolynomialTester(BasePolyTester2D, InputWeightsMixin):
+    """Base testing class for polynomial functions."""
+
+    module = polynomial
+    algorithm_base = polynomial._Polynomial
+    checked_keys = ('weights',)
+
+
+class IterativePolynomialTester(PolynomialTester):
+    """Base testing class for iterative polynomial functions."""
+
+    checked_keys = ('weights', 'tol_history')
+    allows_zero_iteration = True  # whether max_iter=0 will return an initial baseline
+
+    def test_tol_history(self):
+        """Ensures the 'tol_history' item in the parameter output is correct."""
+        max_iter = 5
+        _, params = self.class_func(self.y, max_iter=max_iter, tol=-1)
+
+        if self.allows_zero_iteration:
+            assert params['tol_history'].size == max_iter
+        else:
+            assert params['tol_history'].size == max_iter + 1
+
+
+class TestPoly(PolynomialTester):
+    """Class for testing regular polynomial baseline."""
+
+    func_name = 'poly'
+
+
+class TestModPoly(IterativePolynomialTester):
+    """Class for testing modpoly baseline."""
+
+    func_name = 'modpoly'
+
+    @pytest.mark.parametrize('new_instance', (True, False))
+    @pytest.mark.parametrize('use_original', (True, False))
+    @pytest.mark.parametrize('mask_initial_peaks', (True, False))
+    def test_unchanged_data(self, new_instance, use_original, mask_initial_peaks):
+        """Ensures that input data is unchanged by the function."""
+        super().test_unchanged_data(
+            new_instance, use_original=use_original, mask_initial_peaks=mask_initial_peaks
+        )
+
+
+class TestIModPoly(IterativePolynomialTester):
+    """Class for testing imodpoly baseline."""
+
+    func_name = 'imodpoly'
+
+    @pytest.mark.parametrize('new_instance', (True, False))
+    @pytest.mark.parametrize('use_original', (True, False))
+    @pytest.mark.parametrize('mask_initial_peaks', (True, False))
+    def test_unchanged_data(self, new_instance, use_original, mask_initial_peaks):
+        """Ensures that input data is unchanged by the function."""
+        super().test_unchanged_data(
+            new_instance, use_original=use_original, mask_initial_peaks=mask_initial_peaks
+        )
+
+    @pytest.mark.parametrize('num_std', (-1, -0.01, 0, 1))
+    def test_negative_num_std_fails(self, num_std):
+        """Ensures `num_std` values less than 0 raise an exception."""
+        if num_std < 0:
+            with pytest.raises(ValueError):
+                self.class_func(self.y, num_std=num_std)
+        else:
+            self.class_func(self.y, num_std=num_std)
+
+
+class TestPenalizedPoly(IterativePolynomialTester):
+    """Class for testing penalized_poly baseline."""
+
+    func_name = 'penalized_poly'
+
+    @pytest.mark.parametrize('new_instance', (True, False))
+    @pytest.mark.parametrize(
+        'cost_function',
+        (
+            'asymmetric_truncated_quadratic',
+            'symmetric_truncated_quadratic',
+            'a_truncated_quadratic',  # test that 'a' and 's' work as well
+            's_truncated_quadratic',
+            'asymmetric_huber',
+            'symmetric_huber',
+            'asymmetric_indec',
+            'symmetric_indec'
+        )
+    )
+    def test_unchanged_data(self, new_instance, cost_function):
+        """Ensures that input data is unchanged by the function."""
+        super().test_unchanged_data(new_instance, cost_function=cost_function)
+
+    @pytest.mark.parametrize('cost_function', ('huber', 'p_huber', ''))
+    def test_unknown_cost_function_prefix_fails(self, cost_function):
+        """Ensures a cost function with no prefix or a wrong prefix fails."""
+        with pytest.raises(ValueError):
+            self.class_func(self.y, cost_function=cost_function)
+
+    def test_unknown_cost_function_fails(self):
+        """Ensures that an unknown cost function fails."""
+        with pytest.raises(KeyError):
+            self.class_func(self.y, cost_function='a_hub')
+
+    @pytest.mark.parametrize('weight_enum', (0, 1, 2, 3))
+    def test_weighting(self, weight_enum):
+        """
+        Tests that weighting is correctly applied by comparing to other algorithms.
+
+        Weights were not included in the original penalized_poly method developed
+        in [1]_, so need to ensure that their usage in pybaselines is correct.
+
+        According to [1]_ (and independently verified), the penalized_poly function
+        with the asymmetric truncated quadratic cost function, a threshold of 0, and
+        an alpha_factor of 1 should be the same as the output of the ModPoly algorithm.
+
+        Furthermore, the penalized_poly with any symmetric cost function and a threshold
+        of infinity should be equal to the output of a regular polynomial fit.
+
+        Therefore, to ensure that weighting is correct for the penalized_poly, check
+        both conditions.
+
+        References
+        ----------
+        .. [1] Mazet, V., et al. Background removal from spectra by designing and
+               minimising a non-quadratic cost function. Chemometrics and Intelligent
+               Laboratory Systems, 2005, 76(2), 121–133.
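+
+        As a point of reference, a minimal sketch of the asymmetric truncated
+        quadratic cost (illustrative only; the exact form lives in the
+        pybaselines source)::
+
+            def asymmetric_truncated_quadratic(residual, threshold):
+                # quadratic below the threshold; positive residuals (points
+                # above the baseline, ie. peaks) are capped at a constant
+                # cost so that they stop influencing the fit
+                cost = residual**2
+                cost[residual > threshold] = threshold**2
+                return cost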
+ + """ + if weight_enum == 0: + # all weights = 1 + weights = None + elif weight_enum == 1: + # same as all weights = 1, but would cause issues if weights were + # incorrectly multiplied + weights = 2 * np.ones_like(self.y) + elif weight_enum == 2: + # binary mask, only fitting the first half of the data + weights = np.ones_like(self.y) + weights[self.x < 0.5 * (np.max(self.x) + np.min(self.x))] = 0 + else: + # weight array where the two endpoints have weighting >> 1 + weights = np.ones_like(self.y) + fraction = max(1, ceil(self.y.shape[0] * 0.1)) + weights[:fraction] = 100 + weights[-fraction:] = 100 + + poly_order = 2 + tol = 1e-3 + + poly_baseline = polynomial._Polynomial(self.x, self.z).poly( + self.y, poly_order, weights=weights + )[0] + penalized_poly_1 = self.class_func( + self.y, poly_order, cost_function='s_huber', + threshold=1e10, weights=weights + )[0] + + assert_allclose(poly_baseline, penalized_poly_1, 1e-10) + + modpoly_baseline = polynomial._Polynomial(self.x, self.z).modpoly( + self.y, poly_order, tol=tol, weights=weights, use_original=True + )[0] + penalized_poly_2 = self.class_func( + self.y, poly_order, cost_function='a_truncated_quadratic', + threshold=0, weights=weights, alpha_factor=1, tol=tol + )[0] + + assert_allclose(modpoly_baseline, penalized_poly_2, 1e-10) + + @pytest.mark.parametrize('alpha_factor', (-0.1, 0, 1.01)) + def test_wrong_alpha_factor_fails(self, alpha_factor): + """Ensures an alpha factor outside of (0, 1] fails.""" + with pytest.raises(ValueError): + self.class_func(self.y, alpha_factor=alpha_factor) + + +class TestQuantReg(IterativePolynomialTester): + """Class for testing quant_reg baseline.""" + + func_name = 'quant_reg' + required_kwargs = {'tol': 1e-9} + + @pytest.mark.parametrize('quantile', (0, 1, -0.1, 1.1)) + def test_outside_quantile_fails(self, quantile): + """Ensures quantile values outside of (0, 1) raise an exception.""" + with pytest.raises(ValueError): + self.class_func(self.y, quantile=quantile) + + @pytest.mark.parametrize('quantile', tuple(STATSMODELS_QUANTILES_2D.keys())) + def test_compare_to_statsmodels(self, quantile): + """ + Compares the output of quant_reg to statsmodels's quantile regression implementation. + + The library statsmodels has a well-tested quantile regression implementation, + so can compare the output of polynomial.quant_reg to statsmodels to ensure + that the pybaselines implementation is correct. + + The outputs from statsmodels were created using:: + + from statsmodels.regression.quantile_regression import QuantReg + # map x and z to [-1, 1] to improve numerical stability for the Vandermonde + # within statsmodels + mapped_x = np.polynomial.polyutils.mapdomain( + x, np.polynomial.polyutils.getdomain(x), np.array([-1., 1.]) + ) + mapped_z = np.polynomial.polyutils.mapdomain( + z, np.polynomial.polyutils.getdomain(z), np.array([-1., 1.]) + ) + vander = np.polynomial.polynomial.polyvander2d( + *np.meshgrid(mapped_x, mapped_z, indexing='ij'), 1 + ).reshape((-1, 4)) + fitter = QuantReg(y.ravel(), vander).fit(quantile, max_iter=1000, p_tol=1e-9).predict() + + with statsmodels version 0.13.2. 
+
+        Could also compare with the "true" quantile regression result using linear
+        programming such as detailed in:
+
+        https://stats.stackexchange.com/questions/384909/formulating-quantile-regression-as-
+        linear-programming-problem
+
+        but the comparison to statsmodels is good enough since it uses an iteratively
+        reweighted least squares calculation for the quantile regression, similar to the
+        pybaselines implementation, and the linear programming approach requires a scipy
+        version of at least 1.0 or 1.6 to get a fast, reliable result, since the older
+        solvers did not work as well.
+
+        """
+        x = np.linspace(-1000, 1000, 25)
+        z = np.linspace(-200, 301, 31)
+
+        X, Z = np.meshgrid(x, z, indexing='ij')
+        y = (
+            3 + 1e-2 * X - 5e-1 * Z + 1e-2 * X * Z
+        ) + np.random.default_rng(0).normal(0, 200, X.shape)
+
+        output = self.algorithm_base(x, z, check_finite=False, assume_sorted=True).quant_reg(
+            y, poly_order=1, quantile=quantile, tol=1e-9, eps=1e-12
+        )
+
+        # use a slightly high rtol since the number of data points is kept small for 2D
+        # to not bog down the data file; for a higher number of points, rtol and atol
+        # could be reduced
+        assert_allclose(
+            output[0].ravel(), STATSMODELS_QUANTILES_2D[quantile], rtol=1e-5, atol=1e-10
+        )
diff --git a/tests/two_d/test_smooth.py b/tests/two_d/test_smooth.py
new file mode 100644
index 0000000..d23a099
--- /dev/null
+++ b/tests/two_d/test_smooth.py
@@ -0,0 +1,38 @@
+# -*- coding: utf-8 -*-
+"""Tests for pybaselines.two_d.smooth.
+
+@author: Donald Erb
+Created on January 14, 2024
+
+"""
+
+import pytest
+
+from pybaselines.two_d import smooth
+
+from ..conftest import BaseTester2D
+
+
+class SmoothTester(BaseTester2D):
+    """Base testing class for smooth functions."""
+
+    module = smooth
+    algorithm_base = smooth._Smooth
+
+
+class TestNoiseMedian(SmoothTester):
+    """Class for testing noise_median baseline."""
+
+    func_name = 'noise_median'
+    required_kwargs = {'half_window': 15}
+
+    @pytest.mark.parametrize('new_instance', (True, False))
+    @pytest.mark.parametrize('smooth_hw', (None, 0, 2))
+    def test_unchanged_data(self, new_instance, smooth_hw):
+        """Ensures that input data is unchanged by the function."""
+        super().test_unchanged_data(new_instance, smooth_half_window=smooth_hw)
+
+    @pytest.mark.parametrize('half_window', (None, 15, [15, 15]))
+    def test_half_windows(self, half_window):
+        """Tests possible inputs for `half_window`."""
+        self.class_func(self.y, half_window=half_window)
diff --git a/tests/two_d/test_spline.py b/tests/two_d/test_spline.py
new file mode 100644
index 0000000..ed05ae9
--- /dev/null
+++ b/tests/two_d/test_spline.py
@@ -0,0 +1,453 @@
+# -*- coding: utf-8 -*-
+"""Tests for pybaselines.two_d.spline.
+
+@author: Donald Erb
+Created on March 20, 2021
+
+"""
+
+from unittest import mock
+
+import numpy as np
+from numpy.testing import assert_allclose, assert_array_equal
+import pytest
+
+from pybaselines import utils
+from pybaselines.two_d import spline, whittaker
+from pybaselines._compat import trapezoid
+
+from ..conftest import BaseTester2D, InputWeightsMixin
+
+
+@pytest.mark.parametrize('use_numba', (True, False))
+def test_mapped_histogram_simple(use_numba):
+    """Compares the output with numpy and the bin_mapping, testing corner cases."""
+    num_bins = 10
+    values = np.array([0, 0.01, 1, 1.5, 8, 9, 9.1, 10])
+    expected_bin_edges = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=float)
+    expected_bin_mapping = np.array([0, 0, 1, 1, 8, 9, 9, 9], dtype=np.intp)
+
+    np_histogram, np_bin_edges = np.histogram(values, num_bins, density=True)
+    assert_allclose(np_bin_edges, expected_bin_edges, rtol=0, atol=1e-12)
+
+    with mock.patch.object(spline, '_HAS_NUMBA', use_numba):
+        histogram, bin_edges, bin_mapping = spline._mapped_histogram(values, num_bins)
+
+    assert_allclose(histogram, np_histogram)
+    assert_allclose(bin_edges, np_bin_edges)
+    assert_array_equal(bin_mapping, expected_bin_mapping)
+
+
+@pytest.mark.parametrize('rng_seed', (0, 1))
+@pytest.mark.parametrize('num_bins', (10, 100, 1000))
+@pytest.mark.parametrize('use_numba', (True, False))
+def test_mapped_histogram(rng_seed, num_bins, use_numba):
+    """Compares the output with numpy and the bin_mapping with a naive version."""
+    # TODO replace with np.random.default_rng when min numpy version is >= 1.17
+    rng = np.random.RandomState(rng_seed)
+    values = rng.normal(0, 20, 1000)
+    np_histogram, np_bin_edges = np.histogram(values, num_bins, density=True)
+    with mock.patch.object(spline, '_HAS_NUMBA', use_numba):
+        histogram, bin_edges, bin_mapping = spline._mapped_histogram(values, num_bins)
+
+    assert_allclose(histogram, np_histogram)
+    assert_allclose(bin_edges, np_bin_edges)
+
+    expected_bin_mapping = np.zeros_like(values)
+    for i, left_bin in enumerate(bin_edges[:-1]):
+        mask = (values >= left_bin) & (values < bin_edges[i + 1])
+        expected_bin_mapping[mask] = i
+    expected_bin_mapping[values >= bin_edges[-1]] = num_bins - 1
+
+    assert_array_equal(bin_mapping, expected_bin_mapping)
+
+
+@pytest.mark.parametrize('fraction_pos', (0, 0.4))
+@pytest.mark.parametrize('fraction_neg', (0, 0.3))
+def test_mixture_pdf(fraction_pos, fraction_neg):
+    """Ensures the probability density function for the Gaussian-uniform mixture model is right."""
+    x = np.linspace(-5, 10, 1000)
+    actual_sigma = 0.5
+    sigma = np.log10(actual_sigma)
+    # the gaussian should be area-normalized, so set the height accordingly
+    height = 1 / (actual_sigma * np.sqrt(2 * np.pi))
+    expected_gaussian = utils.gaussian(x, height, 0, actual_sigma)
+
+    fraction_gaus = 1 - fraction_pos - fraction_neg
+    if fraction_pos > 0:
+        pos_uniform = np.zeros_like(x)
+        pos_uniform[x >= 0] = 1 / abs(x.max())
+    elif fraction_neg > 0:
+        pos_uniform = None
+    else:
+        pos_uniform = 0
+
+    if fraction_neg > 0:
+        neg_uniform = np.zeros_like(x)
+        neg_uniform[x <= 0] = 1 / abs(x.min())
+    elif fraction_pos > 0:
+        neg_uniform = None
+    else:
+        neg_uniform = 0
+
+    output_pdf = spline._mixture_pdf(
+        x, fraction_gaus, sigma, fraction_pos, pos_uniform, neg_uniform
+    )
+
+    # now ensure neg_uniform and pos_uniform are not None
+    if pos_uniform is None:
+        pos_uniform = 0
+    if neg_uniform is None:
+        neg_uniform = 0
+
+    expected_pdf = (
+        fraction_gaus * expected_gaussian
+        + fraction_pos * pos_uniform
+        +
fraction_neg * neg_uniform + ) + + assert_allclose(expected_pdf, output_pdf, rtol=1e-12, atol=1e-12) + # ensure pdf has an area of 1, ie total probability is 100%; accuracy is limited + # by number of x-values + assert_allclose(1.0, trapezoid(output_pdf, x), rtol=1e-3) + + +def compare_pspline_whittaker(pspline_class, whittaker_func, data, lam=1e5, + test_rtol=1e-6, test_atol=1e-12, uses_eigenvalues=True, **kwargs): + """ + Compares the output of the penalized spline (P-spline) versions of Whittaker functions. + + The number of knots for the P-splines are set to ``len(data) + 1`` and the spline + degree is set to 0; the result is that the spline basis becomes the identity matrix, + and the P-spline version should give the same output as the Whittaker version if + the weighting and linear systems were correctly set up. + + """ + if uses_eigenvalues: + added_kwargs = {'eigenvalues': None} + else: + added_kwargs = {} + whittaker_output = getattr( + whittaker._Whittaker(pspline_class.x, pspline_class.z), whittaker_func + )(data, lam=lam, **kwargs, **added_kwargs)[0] + + num_knots = np.array(data.shape) + 1 + if hasattr(pspline_class, 'class_func'): + spline_output = pspline_class.class_func( + data, lam=lam, num_knots=num_knots, spline_degree=0, **kwargs + )[0] + else: + spline_output = pspline_class._call_func( + data, lam=lam, num_knots=num_knots, spline_degree=0, **kwargs + )[0] + + assert_allclose(spline_output, whittaker_output, rtol=test_rtol, atol=test_atol) + + +class SplineTester(BaseTester2D): + """Base testing class for spline functions.""" + + module = spline + algorithm_base = spline._Spline + + +class IterativeSplineTester(SplineTester, InputWeightsMixin): + """Base testing class for iterative spline functions.""" + + checked_keys = ('weights', 'tol_history') + + def test_tol_history(self): + """Ensures the 'tol_history' item in the parameter output is correct.""" + max_iter = 5 + _, params = self.class_func(self.y, max_iter=max_iter, tol=-1) + + assert params['tol_history'].size == max_iter + 1 + + +class TestMixtureModel(IterativeSplineTester): + """Class for testing mixture_model baseline.""" + + func_name = 'mixture_model' + + @pytest.mark.parametrize('use_class', (True, False)) + @pytest.mark.parametrize('weight_bool', (True, False)) + def test_unchanged_data(self, use_class, weight_bool): + """Ensures that input data is unchanged by the function.""" + if weight_bool: + weights = np.ones_like(self.y) + else: + weights = None + super().test_unchanged_data(use_class, weights=weights) + + @pytest.mark.parametrize('symmetric', (False, True)) + def test_output(self, symmetric): + """Ensures that the output has the desired format.""" + initial_y = self.y + try: + if symmetric: + # make data with both positive and negative peaks; roll so peaks are not overlapping + self.y = np.roll(self.y, -50) - np.roll(self.y, 50) + p = 0.5 + else: + p = 0.01 + super().test_output(p=p, symmetric=symmetric) + finally: + self.y = initial_y + + @pytest.mark.parametrize('p', (-1, 2)) + def test_outside_p_fails(self, p): + """Ensures p values outside of [0, 1] raise an exception.""" + with pytest.raises(ValueError): + self.class_func(self.y, p=p) + + @pytest.mark.parametrize('diff_order', (1, 2, 3, [2, 3])) + def test_diff_orders(self, diff_order): + """Ensure that other difference orders work.""" + self.class_func(self.y, diff_order=diff_order) + + +class TestIRSQR(IterativeSplineTester): + """Class for testing irsqr baseline.""" + + func_name = 'irsqr' + + @pytest.mark.parametrize('quantile', (-1, 
2)) + def test_outside_p_fails(self, quantile): + """Ensures quantile values outside of [0, 1] raise an exception.""" + with pytest.raises(ValueError): + self.class_func(self.y, quantile=quantile) + + @pytest.mark.parametrize('diff_order', (1, 2, 3, [2, 3])) + def test_diff_orders(self, diff_order): + """Ensure that other difference orders work.""" + self.class_func(self.y, diff_order=diff_order) + + @pytest.mark.parametrize('has_x', (True, False)) + @pytest.mark.parametrize('has_z', (True, False)) + def test_no_xz(self, has_x, has_z): + """Ensures the output is not affected by not having x or z values.""" + super().test_no_xz(has_x, has_z, rtol=1e-5, atol=1e-4) + + +class TestPsplineAsLS(IterativeSplineTester): + """Class for testing pspline_asls baseline.""" + + func_name = 'pspline_asls' + + @pytest.mark.parametrize('p', (-1, 2)) + def test_outside_p_fails(self, p): + """Ensures p values outside of [0, 1] raise an exception.""" + with pytest.raises(ValueError): + self.class_func(self.y, p=p) + + @pytest.mark.parametrize('diff_order', (1, 3, [2, 3])) + def test_diff_orders(self, diff_order): + """Ensure that other difference orders work.""" + self.class_func(self.y, diff_order=diff_order) + + @pytest.mark.parametrize('lam', (1e1, 1e5, [1e1, 1e5])) + @pytest.mark.parametrize('p', (0.01, 0.1)) + @pytest.mark.parametrize('diff_order', (1, 3, [2, 3])) + def test_whittaker_comparison(self, lam, p, diff_order): + """Ensures the P-spline version is the same as the Whittaker version.""" + compare_pspline_whittaker(self, 'asls', self.y, lam=lam, p=p, diff_order=diff_order) + + +class TestPsplineIAsLS(IterativeSplineTester): + """Class for testing pspline_iasls baseline.""" + + func_name = 'pspline_iasls' + + @pytest.mark.parametrize('use_instance', (True, False)) + @pytest.mark.parametrize('weight_bool', (True, False)) + def test_unchanged_data(self, use_instance, weight_bool): + """Ensures that input data is unchanged by the function.""" + if weight_bool: + weights = np.ones_like(self.y) + else: + weights = None + super().test_unchanged_data(use_instance, weights=weights) + + @pytest.mark.parametrize('p', (-1, 2)) + def test_outside_p_fails(self, p): + """Ensures p values outside of [0, 1] raise an exception.""" + with pytest.raises(ValueError): + self.class_func(self.y, p=p) + + def test_diff_order_one_fails(self): + """Ensure that a difference order of 1 raises an exception.""" + with pytest.raises(ValueError): + self.class_func(self.y, diff_order=1) + with pytest.raises(ValueError): + self.class_func(self.y, diff_order=[1, 1]) + with pytest.raises(ValueError): + self.class_func(self.y, diff_order=[1, 2]) + with pytest.raises(ValueError): + self.class_func(self.y, diff_order=[2, 1]) + + + @pytest.mark.parametrize('lam', (1e1, 1e5, [1e1, 1e5])) + @pytest.mark.parametrize('lam_1', (1e1, [1e1, 1e5])) + @pytest.mark.parametrize('p', (0.01, 0.1)) + @pytest.mark.parametrize('diff_order', (3, [2, 3])) + def test_whittaker_comparison(self, lam, lam_1, p, diff_order): + """Ensures the P-spline version is the same as the Whittaker version.""" + compare_pspline_whittaker( + self, 'iasls', self.y, lam=lam, lam_1=lam_1, p=p, diff_order=diff_order, + uses_eigenvalues=False, test_rtol=1e-5 + ) + + +class TestPsplineAirPLS(IterativeSplineTester): + """Class for testing pspline_airpls baseline.""" + + func_name = 'pspline_airpls' + + @pytest.mark.parametrize('diff_order', (1, 3, [2, 3])) + def test_diff_orders(self, diff_order): + """Ensure that other difference orders work.""" + self.class_func(self.y, 
diff_order=diff_order)
+
+    @pytest.mark.skip(reason='test is too slow')
+    # ignore the RuntimeWarning that occurs from using +/- inf or nan
+    @pytest.mark.filterwarnings('ignore::RuntimeWarning')
+    def test_avoid_nonfinite_weights(self, no_noise_data_fixture2d):
+        """
+        Ensures that the function gracefully exits when errors occur.
+
+        When there are no negative residuals, which occurs when a low tol value is used with
+        a high max_iter value, the weighting function would produce values all ~0, which
+        can fail the solvers. The returned baseline should be the last iteration that was
+        successful, and thus should not contain nan or +/- inf.
+
+        Use data without noise since the lack of noise makes it easier to induce failure.
+        Set tol to -1 so that it is never reached, and set max_iter to a high value.
+        Uses np.isfinite on the baseline to verify that the nan or inf values did not
+        propagate into the returned output.
+
+        """
+        x, z, y = no_noise_data_fixture2d
+        with pytest.warns(utils.ParameterWarning):
+            baseline, _ = getattr(self.algorithm_base(x, z), self.func_name)(
+                y, tol=-1, max_iter=7000
+            )
+        assert np.isfinite(baseline).all()
+
+    @pytest.mark.parametrize('lam', (1e1, 1e5, [1e1, 1e5]))
+    @pytest.mark.parametrize('diff_order', (1, 3, [2, 3]))
+    def test_whittaker_comparison(self, lam, diff_order):
+        """Ensures the P-spline version is the same as the Whittaker version."""
+        compare_pspline_whittaker(self, 'airpls', self.y, lam=lam, diff_order=diff_order)
+
+
+class TestPsplineArPLS(IterativeSplineTester):
+    """Class for testing pspline_arpls baseline."""
+
+    func_name = 'pspline_arpls'
+
+    @pytest.mark.parametrize('diff_order', (1, 3, [2, 3]))
+    def test_diff_orders(self, diff_order):
+        """Ensure that other difference orders work."""
+        self.class_func(self.y, diff_order=diff_order)
+
+    @pytest.mark.skip(reason='test is too slow')
+    def test_avoid_overflow_warning(self, no_noise_data_fixture2d):
+        """
+        Ensures no warning is emitted for exponential overflow.
+
+        The weighting is 1 / (1 + exp(values)), so if values is too high,
+        exp(values) is inf, which should usually emit an overflow warning.
+        However, the resulting weight is 0, which is fine, so the warning is
+        not needed and should be avoided. This test ensures the overflow warning
+        is not emitted, and also ensures that the output is all finite, just in
+        case the weighting was not actually stable.
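+
+        A minimal numpy illustration of the limiting behavior this relies on
+        (an assumption for illustration, not the exact pybaselines code)::
+
+            residual = np.array([1e3])  # a huge positive residual
+            with np.errstate(over='ignore'):
+                weights = 1 / (1 + np.exp(residual))  # exp overflows to inf
+            assert weights[0] == 0.0  # inf in the denominator still gives the correct limit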
+ + """ + x, z, y = no_noise_data_fixture2d + with np.errstate(over='raise'): + baseline, params = getattr(self.algorithm_base(x, z), self.func_name)( + y, tol=-1, max_iter=1000 + ) + + assert np.isfinite(baseline).all() + + @pytest.mark.parametrize('lam', (1e1, 1e5, [1e1, 1e5])) + @pytest.mark.parametrize('diff_order', (1, 3, [2, 3])) + def test_whittaker_comparison(self, lam, diff_order): + """Ensures the P-spline version is the same as the Whittaker version.""" + compare_pspline_whittaker(self, 'arpls', self.y, lam=lam, diff_order=diff_order) + + +class TestPsplineIArPLS(IterativeSplineTester): + """Class for testing pspline_iarpls baseline.""" + + func_name = 'pspline_iarpls' + + @pytest.mark.parametrize('diff_order', (1, 3, [2, 3])) + def test_diff_orders(self, diff_order): + """Ensure that other difference orders work.""" + self.class_func(self.y, diff_order=diff_order) + + @pytest.mark.skip(reason='test is too slow') + # ignore the RuntimeWarning that occurs from using +/- inf or nan + @pytest.mark.filterwarnings('ignore::RuntimeWarning') + def test_avoid_nonfinite_weights(self, no_noise_data_fixture2d): + """ + Ensures that the function gracefully exits when non-finite weights are created. + + When there are no negative residuals or exp(iterations) / std is very high, both + of which occur when a low tol value is used with a high max_iter value, the + weighting function would produce non-finite values. The returned baseline should + be the last iteration that was successful, and thus should not contain nan or +/- inf. + + Use data without noise since the lack of noise makes it easier to induce failure. + Set tol to -1 so that it is never reached, and set max_iter to a high value. + Uses np.isfinite on the dot product of the baseline since the dot product is fast, + would propogate the nan or inf, and will create only a single value to check + for finite-ness. 
+ + """ + x, z, y = no_noise_data_fixture2d + with pytest.warns(utils.ParameterWarning): + baseline, params = getattr(self.algorithm_base(x, z), self.func_name)( + y, tol=-1, max_iter=1000 + ) + + assert np.isfinite(baseline).all() + # ensure last tolerence calculation was non-finite as a double-check that + # this test is actually doing what it should be doing + assert not np.isfinite(params['tol_history'][-1]) + + @pytest.mark.parametrize('lam', (1e1, 1e5, [1e1, 1e5])) + @pytest.mark.parametrize('diff_order', (1, 3, [2, 3])) + def test_whittaker_comparison(self, lam, diff_order): + """Ensures the P-spline version is the same as the Whittaker version.""" + compare_pspline_whittaker(self, 'iarpls', self.y, lam=lam, diff_order=diff_order) + + +class TestPsplinePsalsa(IterativeSplineTester): + """Class for testing pspline_psalsa baseline.""" + + func_name = 'pspline_psalsa' + + @pytest.mark.parametrize('p', (-1, 2)) + def test_outside_p_fails(self, p): + """Ensures p values outside of [0, 1] raise an exception.""" + with pytest.raises(ValueError): + self.class_func(self.y, p=p) + + @pytest.mark.parametrize('diff_order', (1, 3, [2, 3])) + def test_diff_orders(self, diff_order): + """Ensure that other difference orders work.""" + self.class_func(self.y, diff_order=diff_order) + + @pytest.mark.parametrize('lam', (1e1, 1e5, [1e1, 1e5])) + @pytest.mark.parametrize('p', (0.01, 0.1)) + @pytest.mark.parametrize('diff_order', (2, 3, [2, 3])) + def test_whittaker_comparison(self, lam, p, diff_order): + """Ensures the P-spline version is the same as the Whittaker version.""" + compare_pspline_whittaker( + self, 'psalsa', self.y, lam=lam, p=p, diff_order=diff_order, test_rtol=1e5 + ) + diff --git a/tests/two_d/test_spline_utils.py b/tests/two_d/test_spline_utils.py new file mode 100644 index 0000000..0195717 --- /dev/null +++ b/tests/two_d/test_spline_utils.py @@ -0,0 +1,279 @@ +# -*- coding: utf-8 -*- +"""Tests for pybaselines.two_d._spline_utils. + +@author: Donald Erb +Created on January 8, 2024 + +""" + +import numpy as np +from numpy.testing import assert_allclose, assert_array_equal +import pytest +from scipy.sparse import issparse, kron +from scipy.sparse.linalg import spsolve + +from pybaselines.two_d import _spline_utils +from pybaselines.utils import difference_matrix +from pybaselines._compat import identity + +from ..conftest import get_2dspline_inputs + + +@pytest.mark.parametrize('num_knots', (10, 40, (10, 20))) +@pytest.mark.parametrize('spline_degree', (0, 1, 2, 3, 4, 5, (2, 3))) +@pytest.mark.parametrize('diff_order', (1, 2, 3, 4, (2, 3))) +@pytest.mark.parametrize('lam', (1e-2, 1e2, (1e1, 1e2))) +def test_solve_psplines(data_fixture2d, num_knots, spline_degree, diff_order, lam): + """ + Tests the accuracy of the penalized spline solvers. + + Uses the nieve way to solve 2D PSplines from Eilers's paper as the expected result, which + uses the flattened `y` and weight values, while pybaselines uses the second, more efficient + method in Eiler's paper which directly uses the 2D `y` and weights. + + References + ---------- + Eilers, P., et al. Fast and compact smoothing on large multidimensional grids. Computational + Statistics and Data Analysis, 2006, 50(1), 61-76. 
+ + """ + x, z, y = data_fixture2d + ( + num_knots_r, num_knots_c, spline_degree_x, spline_degree_z, + lam_x, lam_z, diff_order_x, diff_order_z + ) = get_2dspline_inputs(num_knots, spline_degree, lam, diff_order) + + knots_r = _spline_utils._spline_knots(x, num_knots_r, spline_degree_x, True) + basis_r = _spline_utils._spline_basis(x, knots_r, spline_degree_x) + + knots_c = _spline_utils._spline_knots(z, num_knots_c, spline_degree_z, True) + basis_c = _spline_utils._spline_basis(z, knots_c, spline_degree_z) + + num_bases = (basis_r.shape[1], basis_c.shape[1]) + # TODO replace with np.random.default_rng when min numpy version is >= 1.17 + weights = np.random.RandomState(0).normal(0.8, 0.05, y.size) + weights = np.clip(weights, 0, 1).astype(float, copy=False) + + # note: within Eiler's paper, the basis was defined as kron(basis_z, basis_x), + # but the rows and columns were switched, ie. it should be kron(basis_rows, basis_columns), + # so it is just a nomenclature difference + basis = kron(basis_r, basis_c) + CWT = basis.multiply( + np.repeat(weights.flatten(), num_bases[0] * num_bases[1]).reshape(len(x) * len(z), -1) + ).T + D1 = difference_matrix(num_bases[0], diff_order_x) + D2 = difference_matrix(num_bases[1], diff_order_z) + + P1 = lam_x * kron(D1.T @ D1, identity(num_bases[1])) + P2 = lam_z * kron(identity(num_bases[0]), D2.T @ D2) + penalty = P1 + P2 + + expected_coeffs = spsolve(CWT @ basis + penalty, CWT @ y.flatten()) + expected_result = basis @ expected_coeffs + + pspline = _spline_utils.PSpline2D( + x, z, num_knots=num_knots, spline_degree=spline_degree, + lam=lam, diff_order=diff_order, check_finite=False + ) + + output = pspline.solve(y, weights=weights.reshape(y.shape)) + + assert_allclose(output.flatten(), expected_result, rtol=1e-8, atol=1e-8) + assert_allclose(pspline.coef, expected_coeffs, rtol=1e-8, atol=1e-8) + + # also ensure that the pspline's basis can use the solved coefficients + basis_output = pspline.basis @ pspline.coef + assert_allclose(basis_output, expected_result, rtol=1e-8, atol=1e-8) + + +@pytest.mark.parametrize('spline_degree', (1, 2, 3, [2, 3])) +@pytest.mark.parametrize('num_knots', (10, 50, [20, 30])) +@pytest.mark.parametrize('diff_order', (1, 2, 3, [1, 3])) +@pytest.mark.parametrize('lam', (5, (3, 5))) +def test_pspline_setup(data_fixture2d, num_knots, spline_degree, diff_order, lam): + """Ensure the PSpline2D setup is correct.""" + x, z, y = data_fixture2d + ( + num_knots_r, num_knots_c, spline_degree_x, spline_degree_z, + lam_x, lam_z, diff_order_x, diff_order_z + ) = get_2dspline_inputs(num_knots, spline_degree, lam, diff_order) + + knots_r = _spline_utils._spline_knots(x, num_knots_r, spline_degree_x, True) + basis_r = _spline_utils._spline_basis(x, knots_r, spline_degree_x) + + knots_c = _spline_utils._spline_knots(z, num_knots_c, spline_degree_z, True) + basis_c = _spline_utils._spline_basis(z, knots_c, spline_degree_z) + + num_bases = (basis_r.shape[1], basis_c.shape[1]) + + D1 = difference_matrix(num_bases[0], diff_order_x) + D2 = difference_matrix(num_bases[1], diff_order_z) + + P1 = lam_x * kron(D1.T @ D1, identity(num_bases[1])) + P2 = lam_z * kron(identity(num_bases[0]), D2.T @ D2) + penalty = P1 + P2 + + pspline = _spline_utils.PSpline2D( + x, z, num_knots=num_knots, spline_degree=spline_degree, + lam=lam, diff_order=diff_order, check_finite=False + ) + + assert pspline.basis_r.shape == (len(x), len(knots_r) - spline_degree_x - 1) + assert pspline.basis_c.shape == (len(z), len(knots_c) - spline_degree_z - 1) + 
assert_array_equal(pspline._num_bases, num_bases) + + assert issparse(pspline.basis_r) + assert issparse(pspline.basis_c) + + assert_allclose(pspline.basis_r.toarray(), basis_r.toarray(), rtol=1e-12, atol=1e-12) + assert_allclose(pspline.basis_c.toarray(), basis_c.toarray(), rtol=1e-12, atol=1e-12) + assert_allclose(pspline.penalty.toarray(), penalty.toarray(), rtol=1e-12, atol=1e-12) + + assert_array_equal(pspline.diff_order, (diff_order_x, diff_order_z)) + assert_array_equal(pspline.num_knots, (num_knots_r, num_knots_c)) + assert_array_equal(pspline.spline_degree, (spline_degree_x, spline_degree_z)) + assert_array_equal(pspline.lam, (lam_x, lam_z)) + assert pspline.coef is None # None since the solve method has not been called + assert pspline.basis_r.shape == (len(x), num_knots_r + spline_degree_x - 1) + assert pspline.basis_c.shape == (len(z), num_knots_c + spline_degree_z - 1) + assert_array_equal( + pspline._num_bases, + (num_knots_r + spline_degree_x - 1, num_knots_c + spline_degree_z - 1) + ) + assert pspline.knots_r.shape == (num_knots_r + 2 * spline_degree_x,) + assert pspline.knots_c.shape == (num_knots_c + 2 * spline_degree_z,) + assert isinstance(pspline.x, np.ndarray) + assert isinstance(pspline.z, np.ndarray) + + # _basis should be None since the basis attribute has not been accessed yet + assert pspline._basis is None + + expected_basis = kron(basis_r, basis_c).toarray() + + assert_allclose(pspline.basis.toarray(), expected_basis, rtol=1e-12, atol=1e-12) + assert_allclose(pspline._basis.toarray(), expected_basis, rtol=1e-12, atol=1e-12) + + +def test_pspline_same_basis(data_fixture2d): + """Ensures PSpline2D.same_basis works correctly.""" + x, z, y = data_fixture2d + + num_knots = (20, 30) + spline_degree = (2, 3) + + pspline = _spline_utils.PSpline2D( + x, z, num_knots=num_knots, spline_degree=spline_degree, check_finite=False + ) + + assert pspline.same_basis(num_knots, spline_degree) + assert not pspline.same_basis(num_knots[::-1], spline_degree) + assert not pspline.same_basis(num_knots, spline_degree[::-1]) + assert not pspline.same_basis(10, spline_degree) + assert not pspline.same_basis(num_knots, 1) + assert not pspline.same_basis(10, 1) + + +@pytest.mark.parametrize('diff_order', (0, -1, [0, 0], [1, 0], [0, 1], [-1, 1], [1, -1])) +def test_pspline_diff_order_zero_fails(data_fixture2d, diff_order): + """Ensures a difference order of 0 fails.""" + x, z, y = data_fixture2d + with pytest.raises(ValueError): + _spline_utils.PSpline2D(x, z, diff_order=diff_order) + + +@pytest.mark.parametrize('spline_degree', (-2, -1, [-1, 1], [1, -1])) +def test_pspline_negative_spline_degree_fails(data_fixture2d, spline_degree): + """Ensures a spline degree less than 0 fails.""" + x, z, y = data_fixture2d + with pytest.raises(ValueError): + _spline_utils.PSpline2D(x, z, spline_degree=spline_degree) + + +@pytest.mark.parametrize('lam', (-2, 0, [-1, 1], [1, -1], [1, 0], [0, 1])) +def test_pspline_negative_lam_fails(data_fixture2d, lam): + """Ensures a lam value less than or equal to 0 fails.""" + x, z, y = data_fixture2d + with pytest.raises(ValueError): + _spline_utils.PSpline2D(x, z, lam=lam) + + +def test_pspline_non_finite_fails(): + """Ensure non-finite values raise an exception when check_finite is True.""" + x = np.linspace(-1, 1, 100) + z = np.linspace(-1, 1, 50) + original_x_value = x[0] + original_z_value = z[0] + for value in (np.nan, np.inf, -np.inf): + x[0] = value + with pytest.raises(ValueError): + _spline_utils.PSpline2D(x, z, check_finite=True) + x[0] = original_x_value 
+ + for value in (np.nan, np.inf, -np.inf): + z[0] = value + with pytest.raises(ValueError): + _spline_utils.PSpline2D(x, z, check_finite=True) + z[0] = original_z_value + + +@pytest.mark.parametrize('spline_degree', (1, 2, 3, (2, 3))) +@pytest.mark.parametrize('num_knots', (10, 40, (20, 30))) +@pytest.mark.parametrize('diff_order', (1, 2, (1, 2))) +@pytest.mark.parametrize('lam', (1e-2, 1e2, (1e1, 1e2))) +def test_pspline_tck(data_fixture2d, num_knots, spline_degree, diff_order, lam): + """Ensures the tck attribute can correctly recreate the solved spline.""" + x, z, y = data_fixture2d + pspline = _spline_utils.PSpline2D( + x, z, num_knots=num_knots, spline_degree=spline_degree, diff_order=diff_order, lam=lam + ) + fit_spline = pspline.solve(y, weights=np.ones_like(y)) + + # ensure tck is the knots, coefficients, and spline degree + assert len(pspline.tck) == 3 + (knots_r, knots_c), coeffs, (degree_x, degree_z) = pspline.tck + + assert_allclose(knots_r, pspline.knots_r, rtol=1e-12, atol=1e-12) + assert_allclose(knots_c, pspline.knots_c, rtol=1e-12, atol=1e-12) + assert_allclose(coeffs, pspline.coef.reshape(pspline._num_bases), rtol=1e-12, atol=1e-12) + if isinstance(spline_degree, int): + assert degree_x == spline_degree + assert degree_z == spline_degree + else: + assert degree_x == spline_degree[0] + assert degree_z == spline_degree[1] + + # Now recreate the spline with scipy's NdBSpline and ensure it is the same; + # NdBSpline was introduced in scipy 1.12.0 + import scipy + major, minor = [int(val) for val in scipy.__version__.split('.')[:2]] + if major > 1 or (major == 1 and minor >= 12): + from scipy.interpolate import NdBSpline + # np.array(np.meshgrid(x, z)).T is the same as doing + # np.array(np.meshgrid(x, z, indexing='ij')).transpose([1, 2, 0]), which + # is just zipping the meshgrid of each x and z value + recreated_spline = NdBSpline(*pspline.tck)(np.array(np.meshgrid(x, z)).T) + + assert_allclose(recreated_spline, fit_spline, rtol=1e-10, atol=1e-12) + + +def test_pspline_tck_none(data_fixture2d): + """Ensures an exception is raised when tck attribute is accessed without first solving once.""" + x, z, y = data_fixture2d + pspline = _spline_utils.PSpline2D(x, z) + + assert pspline.coef is None + with pytest.raises(ValueError): + tck = pspline.tck + + +def test_pspline_tck_readonly(data_fixture2d): + """Ensures the tck attribute is read-only.""" + x, z, y = data_fixture2d + pspline = _spline_utils.PSpline2D(x, z) + + with pytest.raises(AttributeError): + pspline.tck = (1, 2, 3) + + pspline.solve(y, np.ones_like(y)) + with pytest.raises(AttributeError): + pspline.tck = (1, 2, 3) diff --git a/tests/two_d/test_whittaker.py b/tests/two_d/test_whittaker.py new file mode 100644 index 0000000..ccc6d90 --- /dev/null +++ b/tests/two_d/test_whittaker.py @@ -0,0 +1,305 @@ +# -*- coding: utf-8 -*- +"""Tests for pybaselines.whittaker. 
+
+@author: Donald Erb
+Created on March 20, 2021
+
+"""
+
+import numpy as np
+import pytest
+
+from pybaselines.two_d import whittaker
+from pybaselines.utils import ParameterWarning
+
+from ..conftest import BaseTester2D, InputWeightsMixin
+
+
+class WhittakerTester(BaseTester2D, InputWeightsMixin):
+    """Base testing class for whittaker functions."""
+
+    module = whittaker
+    algorithm_base = whittaker._Whittaker
+    checked_keys = ('weights', 'tol_history')
+
+    def test_tol_history(self):
+        """Ensures the 'tol_history' item in the parameter output is correct."""
+        max_iter = 5
+        _, params = self.class_func(self.y, max_iter=max_iter, tol=-1)
+
+        assert params['tol_history'].size == max_iter + 1
+
+
+class TestAsLS(WhittakerTester):
+    """Class for testing asls baseline."""
+
+    func_name = 'asls'
+
+    @pytest.mark.parametrize('p', (-1, 2))
+    def test_outside_p_fails(self, p):
+        """Ensures p values outside of [0, 1] raise an exception."""
+        with pytest.raises(ValueError):
+            self.class_func(self.y, p=p)
+
+    @pytest.mark.parametrize('diff_order', (1, [1, 2]))
+    def test_diff_orders(self, diff_order):
+        """Ensure that other difference orders work."""
+        self.class_func(self.y, diff_order=diff_order)
+
+
+class TestIAsLS(WhittakerTester):
+    """Class for testing iasls baseline."""
+
+    func_name = 'iasls'
+
+    @pytest.mark.parametrize('p', (-1, 2))
+    def test_outside_p_fails(self, p):
+        """Ensures p values outside of [0, 1] raise an exception."""
+        with pytest.raises(ValueError):
+            self.class_func(self.y, p=p)
+
+    @pytest.mark.parametrize('diff_order', (2, [3, 2]))
+    def test_diff_orders(self, diff_order):
+        """Ensure that other difference orders work."""
+        self.class_func(self.y, diff_order=diff_order)
+
+    def test_diff_order_one_fails(self):
+        """Ensure that a difference order of 1 raises an exception."""
+        with pytest.raises(ValueError):
+            self.class_func(self.y, lam=1e2, diff_order=1)
+        with pytest.raises(ValueError):
+            self.class_func(self.y, lam=1e2, diff_order=[1, 1])
+        with pytest.raises(ValueError):
+            self.class_func(self.y, lam=1e2, diff_order=[1, 2])
+        with pytest.raises(ValueError):
+            self.class_func(self.y, lam=1e2, diff_order=[2, 1])
+
+
+class TestAirPLS(WhittakerTester):
+    """Class for testing airpls baseline."""
+
+    func_name = 'airpls'
+
+    @pytest.mark.parametrize('diff_order', (1, [1, 2]))
+    def test_diff_orders(self, diff_order):
+        """Ensure that other difference orders work."""
+        self.class_func(self.y, diff_order=diff_order)
+
+    @pytest.mark.skip(reason='test is too slow')
+    # ignore the RuntimeWarning that occurs from using +/- inf or nan
+    @pytest.mark.filterwarnings('ignore::RuntimeWarning')
+    def test_avoid_nonfinite_weights(self, no_noise_data_fixture2d):
+        """
+        Ensures that the function gracefully exits when errors occur.
+
+        When there are no negative residuals, which occurs when a low tol value is used with
+        a high max_iter value, the weighting function would produce values all ~0, which
+        can fail the solvers. The returned baseline should be the last iteration that was
+        successful, and thus should not contain nan or +/- inf.
+
+        Use data without noise since the lack of noise makes it easier to induce failure.
+        Set tol to -1 so that it is never reached, and set max_iter to a high value.
+        Uses np.isfinite on the baseline to verify that the nan or inf values did not
+        propagate into the returned output.
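+
+        A rough sketch of the airpls-style weighting that breaks down (an
+        illustration only; the exact form lives in the pybaselines source)::
+
+            residual = y - baseline
+            neg_norm = np.abs(residual[residual < 0]).sum()
+            # as neg_norm -> 0 (no negative residuals), the exponent blows up
+            # and the weights become ~0 or non-finite, which can fail the solver
+            weights = np.where(
+                residual < 0, np.exp(iteration * np.abs(residual) / neg_norm), 0
+            )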
+ + """ + x, z, y = no_noise_data_fixture2d + with pytest.warns(ParameterWarning): + baseline, _ = getattr(self.algorithm_base(x, z), self.func_name)( + y, tol=-1, max_iter=3000 + ) + + assert np.isfinite(baseline).all() + + +class TestArPLS(WhittakerTester): + """Class for testing arpls baseline.""" + + func_name = 'arpls' + + @pytest.mark.parametrize('diff_order', (1, [1, 2])) + def test_diff_orders(self, diff_order): + """Ensure that other difference orders work.""" + self.class_func(self.y, diff_order=diff_order) + + @pytest.mark.skip(reason='test is too slow') + def test_avoid_overflow_warning(self, no_noise_data_fixture2d): + """ + Ensures no warning is emitted for exponential overflow. + + The weighting is 1 / (1 + exp(values)), so if values is too high, + exp(values) is inf, which should usually emit an overflow warning. + However, the resulting weight is 0, which is fine, so the warning is + not needed and should be avoided. This test ensures the overflow warning + is not emitted, and also ensures that the output is all finite, just in + case the weighting was not actually stable. + + """ + x, z, y = no_noise_data_fixture2d + with np.errstate(over='raise'): + baseline, _ = getattr(self.algorithm_base(x, z), self.func_name)( + y, tol=-1, max_iter=1000 + ) + + assert np.isfinite(baseline).all() + + +class TestDrPLS(WhittakerTester): + """Class for testing drpls baseline.""" + + func_name = 'drpls' + + @pytest.mark.parametrize('eta', (-1, 2)) + def test_outside_eta_fails(self, eta): + """Ensures eta values outside of [0, 1] raise an exception.""" + with pytest.raises(ValueError): + self.class_func(self.y, eta=eta) + + @pytest.mark.parametrize('diff_order', (2, [3, 2])) + def test_diff_orders(self, diff_order): + """Ensure that other difference orders work.""" + self.class_func(self.y, diff_order=diff_order) + + def test_diff_order_one_fails(self): + """Ensure that a difference order of 1 raises an exception.""" + with pytest.raises(ValueError): + self.class_func(self.y, lam=1e2, diff_order=1) + with pytest.raises(ValueError): + self.class_func(self.y, lam=1e2, diff_order=[1, 1]) + with pytest.raises(ValueError): + self.class_func(self.y, lam=1e2, diff_order=[1, 2]) + with pytest.raises(ValueError): + self.class_func(self.y, lam=1e2, diff_order=[2, 1]) + + @pytest.mark.skip(reason='test is too slow') + # ignore the RuntimeWarning that occurs from using +/- inf or nan + @pytest.mark.filterwarnings('ignore::RuntimeWarning') + def test_avoid_nonfinite_weights(self, no_noise_data_fixture2d): + """ + Ensures that the function gracefully exits when non-finite weights are created. + + When there are no negative residuals or exp(iterations) / std is very high, both + of which occur when a low tol value is used with a high max_iter value, the + weighting function would produce non-finite values. The returned baseline should + be the last iteration that was successful, and thus should not contain nan or +/- inf. + + Use data without noise since the lack of noise makes it easier to induce failure. + Set tol to -1 so that it is never reached, and set max_iter to a high value. + Uses np.isfinite on the dot product of the baseline since the dot product is fast, + would propogate the nan or inf, and will create only a single value to check + for finite-ness. 
+ + """ + x, z, y = no_noise_data_fixture2d + with pytest.warns(ParameterWarning): + baseline, params = getattr(self.algorithm_base(x, z), self.func_name)( + y, tol=-1, max_iter=1000 + ) + + assert np.isfinite(baseline).all() + # ensure last tolerence calculation was non-finite as a double-check that + # this test is actually doing what it should be doing + assert not np.isfinite(params['tol_history'][-1]) + + +class TestIArPLS(WhittakerTester): + """Class for testing iarpls baseline.""" + + func_name = 'iarpls' + + @pytest.mark.parametrize('diff_order', (1, [1, 2])) + def test_diff_orders(self, diff_order): + """Ensure that other difference orders work.""" + self.class_func(self.y, diff_order=diff_order) + + @pytest.mark.skip(reason='test is too slow') + # ignore the RuntimeWarning that occurs from using +/- inf or nan + @pytest.mark.filterwarnings('ignore::RuntimeWarning') + def test_avoid_nonfinite_weights(self, no_noise_data_fixture2d): + """ + Ensures that the function gracefully exits when non-finite weights are created. + + When there are no negative residuals or exp(iterations) / std is very high, both + of which occur when a low tol value is used with a high max_iter value, the + weighting function would produce non-finite values. The returned baseline should + be the last iteration that was successful, and thus should not contain nan or +/- inf. + + Use data without noise since the lack of noise makes it easier to induce failure. + Set tol to -1 so that it is never reached, and set max_iter to a high value. + Uses np.isfinite on the dot product of the baseline since the dot product is fast, + would propogate the nan or inf, and will create only a single value to check + for finite-ness. + + """ + x, z, y = no_noise_data_fixture2d + with pytest.warns(ParameterWarning): + baseline, params = getattr(self.algorithm_base(x, z), self.func_name)( + y, tol=-1, max_iter=1000 + ) + + assert np.isfinite(baseline).all() + # ensure last tolerence calculation was non-finite as a double-check that + # this test is actually doing what it should be doing + assert not np.isfinite(params['tol_history'][-1]) + + +class TestAsPLS(WhittakerTester): + """Class for testing aspls baseline.""" + + func_name = 'aspls' + checked_keys = ('weights', 'alpha', 'tol_history') + weight_keys = ('weights', 'alpha') + + @pytest.mark.parametrize('diff_order', (1, [1, 2])) + def test_diff_orders(self, diff_order): + """Ensure that other difference orders work.""" + self.class_func(self.y, diff_order=diff_order) + + @pytest.mark.parametrize('alpha_enum', (0, 1)) + def test_wrong_alpha_shape(self, alpha_enum): + """Ensures that an exception is raised if input alpha and data are different shapes.""" + if alpha_enum == 0: + alpha = np.ones(np.array(self.y.shape) + 1) + else: + alpha = np.ones(self.y.size) + with pytest.raises(ValueError): + self.class_func(self.y, alpha=alpha) + + @pytest.mark.skip(reason='test is too slow') + def test_avoid_overflow_warning(self, no_noise_data_fixture2d): + """ + Ensures no warning is emitted for exponential overflow. + + The weighting is 1 / (1 + exp(values)), so if values is too high, + exp(values) is inf, which should usually emit an overflow warning. + However, the resulting weight is 0, which is fine, so the warning is + not needed and should be avoided. This test ensures the overflow warning + is not emitted, and also ensures that the output is all finite, just in + case the weighting was not actually stable. 
+ + """ + x, z, y = no_noise_data_fixture2d + with np.errstate(over='raise'): + baseline, _ = getattr(self.algorithm_base(x, z), self.func_name)( + y, tol=-1, max_iter=1000 + ) + + assert np.isfinite(baseline).all() + + +class TestPsalsa(WhittakerTester): + """Class for testing psalsa baseline.""" + + func_name = 'psalsa' + + @pytest.mark.parametrize('p', (-1, 2)) + def test_outside_p_fails(self, p): + """Ensures p values outside of [0, 1] raise an exception.""" + with pytest.raises(ValueError): + self.class_func(self.y, p=p) + + @pytest.mark.parametrize('diff_order', (1, [1, 2])) + def test_diff_orders(self, diff_order): + """Ensure that other difference orders work.""" + self.class_func(self.y, diff_order=diff_order) diff --git a/tests/two_d/test_whittaker_utils.py b/tests/two_d/test_whittaker_utils.py new file mode 100644 index 0000000..962fb1e --- /dev/null +++ b/tests/two_d/test_whittaker_utils.py @@ -0,0 +1,347 @@ +# -*- coding: utf-8 -*- +"""Tests for pybaselines._banded_utils. + +@author: Donald Erb +Created on Dec. 11, 2021 + +""" + +import numpy as np +from numpy.testing import assert_allclose, assert_array_equal +import pytest +from scipy.linalg import eig_banded +from scipy.sparse import issparse, kron +from scipy.sparse.linalg import spsolve + +from pybaselines._banded_utils import diff_penalty_diagonals +from pybaselines._compat import identity, dia_object +from pybaselines.two_d import _spline_utils, _whittaker_utils +from pybaselines.utils import difference_matrix + +from ..conftest import get_2dspline_inputs + + +@pytest.mark.parametrize('diff_order', (1, 2, 3, 4, (2, 3))) +@pytest.mark.parametrize('lam', (1e-2, 1e2, (1e1, 1e2))) +def test_solve_penalized_system(small_data2d, diff_order, lam): + """ + Tests the accuracy of the penalized system solver. + + Not really useful at the moment, but will be more useful if the solver changes + from the current basic sparse solver. 
+ + """ + *_, lam_x, lam_z, diff_order_x, diff_order_z = get_2dspline_inputs( + lam=lam, diff_order=diff_order + ) + + num_bases = small_data2d.shape + + D1 = difference_matrix(num_bases[0], diff_order_x) + D2 = difference_matrix(num_bases[1], diff_order_z) + + P1 = lam_x * kron(D1.T @ D1, identity(num_bases[1])) + P2 = lam_z * kron(identity(num_bases[0]), D2.T @ D2) + penalty = P1 + P2 + + penalized_system = _whittaker_utils.PenalizedSystem2D( + small_data2d.shape, lam=lam, diff_order=diff_order + ) + + weights = np.random.default_rng(0).normal(0.8, 0.05, small_data2d.size) + weights = np.clip(weights, 1e-12, 1).astype(float, copy=False).ravel() + + penalty.setdiag(penalty.diagonal() + weights) + + expected_result = spsolve(penalty, weights * small_data2d.flatten()) + output = penalized_system.solve(small_data2d.flatten(), weights) + + assert_allclose(output.flatten(), expected_result, rtol=1e-8, atol=1e-8) + + +@pytest.mark.parametrize('diff_order', (1, 2, 3, [1, 3])) +@pytest.mark.parametrize('lam', (5, (3, 5))) +def test_penalized_system_setup(small_data2d, diff_order, lam): + """Ensure the PenalizedSystem2D setup is correct.""" + *_, lam_x, lam_z, diff_order_x, diff_order_z = get_2dspline_inputs( + lam=lam, diff_order=diff_order + ) + + num_bases = small_data2d.shape + + D1 = difference_matrix(num_bases[0], diff_order_x) + D2 = difference_matrix(num_bases[1], diff_order_z) + + P1 = lam_x * kron(D1.T @ D1, identity(num_bases[1])) + P2 = lam_z * kron(identity(num_bases[0]), D2.T @ D2) + penalty = P1 + P2 + + penalized_system = _whittaker_utils.PenalizedSystem2D( + small_data2d.shape, lam=lam, diff_order=diff_order + ) + + assert_array_equal(penalized_system._num_bases, num_bases) + + assert issparse(penalized_system.penalty) + assert_allclose( + penalized_system.penalty.toarray(), penalty.toarray(), rtol=1e-12, atol=1e-12 + ) + + assert_array_equal(penalized_system.diff_order, (diff_order_x, diff_order_z)) + assert_array_equal(penalized_system.lam, (lam_x, lam_z)) + + +@pytest.mark.parametrize('diff_order', (0, -1, [0, 0], [1, 0], [0, 1], [-1, 1], [1, -1])) +def test_penalized_system_diff_order_fails(small_data2d, diff_order): + """Ensures a difference order of less than 1 fails.""" + with pytest.raises(ValueError): + _whittaker_utils.PenalizedSystem2D(small_data2d.shape, diff_order=diff_order) + + +@pytest.mark.parametrize('lam', (-2, 0, [-1, 1], [1, -1], [1, 0], [0, 1])) +def test_penalized_system_negative_lam_fails(small_data2d, lam): + """Ensures a lam value less than or equal to 0 fails.""" + with pytest.raises(ValueError): + _whittaker_utils.PenalizedSystem2D(small_data2d.shape, lam=lam) + + +@pytest.mark.parametrize('diff_order', (1, 2, 3, [1, 3])) +@pytest.mark.parametrize('lam', (5, (3, 5))) +def test_compare_to_psplines(data_fixture2d, lam, diff_order): + """ + Ensures 2D Whittaker and PSpline outputs are the same for specific condition. + + If the number of basis functions for splines is equal to the number of data points, and + the spline degree is set to 0, then the spline basis becomes the identity function + and should produce the same analytical equation as Whittaker smoothing. + + Since the 2D PSpline case is known from Eiler's paper, and the implementation of + 2D Whittaker smoothing in pybaselines was adapted from that, need to verify the Whittaker + smoothing implementation. 
+
+    """
+    x, z, y = data_fixture2d
+
+    pspline = _spline_utils.PSpline2D(
+        x, z, num_knots=(len(x) + 1, len(z) + 1), spline_degree=0, lam=lam, diff_order=diff_order,
+        check_finite=False
+    )
+
+    # sanity check to ensure it was set up correctly
+    assert_array_equal(pspline.basis_r.shape, (len(x), len(x)))
+    assert_array_equal(pspline.basis_c.shape, (len(z), len(z)))
+
+    whittaker_system = _whittaker_utils.PenalizedSystem2D(
+        y.shape, lam=lam, diff_order=diff_order
+    )
+
+    weights = np.random.default_rng(0).normal(0.8, 0.05, y.shape)
+    weights = np.clip(weights, 1e-12, 1).astype(float, copy=False)
+
+    spline_output = pspline.solve(y, weights=weights)
+    whittaker_output = whittaker_system.solve(y.ravel(), weights=weights.ravel())
+
+    assert_allclose(whittaker_output.reshape(y.shape), spline_output, rtol=1e-12, atol=1e-12)
+
+
+@pytest.mark.parametrize('diff_order', (1, 2, 3, 4))
+def test_penalized_system_add_penalty(diff_order):
+    """Tests adding a penalty to a PenalizedSystem2D."""
+    data_size = (40, 51)
+    lam = 5
+
+    whittaker_system = _whittaker_utils.PenalizedSystem2D(
+        data_size, lam=lam, diff_order=diff_order
+    )
+    added_penalty = 5 * identity(np.prod(data_size))
+
+    expected_output = (added_penalty + whittaker_system.penalty).toarray()
+    expected_diagonal = expected_output.diagonal()
+
+    output = whittaker_system.add_penalty(added_penalty)
+
+    assert_allclose(output.toarray(), expected_output, rtol=1e-12, atol=1e-13)
+    # should also modify the penalty attribute
+    assert_allclose(whittaker_system.penalty.toarray(), expected_output, rtol=1e-12, atol=1e-13)
+    # and the main diagonal
+    assert_allclose(whittaker_system.main_diagonal, expected_diagonal, rtol=1e-12, atol=1e-13)
+
+
+def test_face_splitting():
+    """Ensures the face-splitting algorithm works as intended."""
+    basis = np.array([
+        [1., 2, 3],
+        [4, 5, 6],
+        [7, 8, 9],
+        [10, 11, 12]
+    ])
+
+    output = _whittaker_utils._face_splitting(basis)
+
+    assert output.shape == (basis.shape[0], basis.shape[1]**2)
+    assert issparse(output)
+
+    expected_output = kron(basis, np.ones((1, basis.shape[1]))).multiply(
+        kron(np.ones((1, basis.shape[1])), basis)
+    )
+    assert_allclose(output.toarray(), expected_output.toarray(), rtol=0, atol=1e-12)
+
+
+@pytest.mark.parametrize('diff_order', (1, 2, 3, 4, (2, 3)))
+@pytest.mark.parametrize('lam', (1e-2, 1e2, (1e1, 1e2)))
+def test_solve_whittaker_system(small_data2d, diff_order, lam):
+    """
+    Tests the accuracy of the Whittaker system solver when not using eigendecomposition.
+
+    Not particularly useful at the moment, but will become more useful if the solver
+    ever changes from the current basic sparse solver.
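+
+    With ``max_eigens=None``, WhittakerSystem2D should solve the same sparse system
+    as PenalizedSystem2D, ``(diag(weights) + P) v = weights * y``, which is what is
+    checked against ``spsolve`` below.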
+
+    """
+    *_, lam_x, lam_z, diff_order_x, diff_order_z = get_2dspline_inputs(
+        lam=lam, diff_order=diff_order
+    )
+
+    num_bases = small_data2d.shape
+
+    D1 = difference_matrix(num_bases[0], diff_order_x)
+    D2 = difference_matrix(num_bases[1], diff_order_z)
+
+    P1 = lam_x * kron(D1.T @ D1, identity(num_bases[1]))
+    P2 = lam_z * kron(identity(num_bases[0]), D2.T @ D2)
+    penalty = P1 + P2
+
+    penalized_system = _whittaker_utils.WhittakerSystem2D(
+        small_data2d.shape, lam=lam, diff_order=diff_order, max_eigens=None
+    )
+
+    weights = np.random.default_rng(0).normal(0.8, 0.05, small_data2d.size)
+    weights = np.clip(weights, 1e-12, 1).astype(float, copy=False).ravel()
+
+    penalty.setdiag(penalty.diagonal() + weights)
+
+    expected_result = spsolve(penalty, weights * small_data2d.flatten())
+    output = penalized_system.solve(small_data2d.flatten(), weights)
+
+    assert_allclose(output.flatten(), expected_result, rtol=1e-8, atol=1e-8)
+
+
+@pytest.mark.parametrize('diff_order', (1, 2, 3, [1, 3]))
+@pytest.mark.parametrize('lam', (5, (3, 5)))
+def test_whittaker_system_setup_no_eigenvalues(small_data2d, diff_order, lam):
+    """Ensure the WhittakerSystem2D setup is correct when not using eigendecomposition."""
+    *_, lam_x, lam_z, diff_order_x, diff_order_z = get_2dspline_inputs(
+        lam=lam, diff_order=diff_order
+    )
+
+    num_bases = small_data2d.shape
+
+    D1 = difference_matrix(num_bases[0], diff_order_x)
+    D2 = difference_matrix(num_bases[1], diff_order_z)
+
+    P1 = lam_x * kron(D1.T @ D1, identity(num_bases[1]))
+    P2 = lam_z * kron(identity(num_bases[0]), D2.T @ D2)
+    penalty = P1 + P2
+
+    penalized_system = _whittaker_utils.WhittakerSystem2D(
+        small_data2d.shape, lam=lam, diff_order=diff_order, max_eigens=None
+    )
+
+    assert_array_equal(penalized_system._num_bases, num_bases)
+
+    assert issparse(penalized_system.penalty)
+    assert_allclose(
+        penalized_system.penalty.toarray(), penalty.toarray(), rtol=1e-12, atol=1e-12
+    )
+
+    assert_array_equal(penalized_system.diff_order, (diff_order_x, diff_order_z))
+    assert_array_equal(penalized_system.lam, (lam_x, lam_z))
+
+
+@pytest.mark.parametrize('diff_order', (1, 2, 3, [1, 3]))
+@pytest.mark.parametrize('lam', (5, (3, 5)))
+def test_whittaker_system_setup_eigenvalues(small_data2d, diff_order, lam):
+    """Ensure the WhittakerSystem2D setup is correct when using eigendecomposition."""
+    *_, lam_x, lam_z, diff_order_x, diff_order_z = get_2dspline_inputs(
+        lam=lam, diff_order=diff_order
+    )
+    max_eigens = np.array([5, 10])
+
+    penalized_system = _whittaker_utils.WhittakerSystem2D(
+        small_data2d.shape, lam=lam, diff_order=diff_order, max_eigens=max_eigens
+    )
+
+    assert_array_equal(penalized_system._num_bases, max_eigens)
+
+    eigenvalues_rows, expected_basis_rows = eig_banded(
+        diff_penalty_diagonals(small_data2d.shape[0], diff_order_x, lower_only=True),
+        lower=True, overwrite_a_band=True, select='i', select_range=(0, max_eigens[0] - 1)
+    )
+    penalty_rows = kron(
+        lam_x * dia_object((eigenvalues_rows, 0), shape=(max_eigens[0], max_eigens[0])),
+        identity(max_eigens[1])
+    )
+
+    eigenvalues_cols, expected_basis_cols = eig_banded(
+        diff_penalty_diagonals(small_data2d.shape[1], diff_order_z, lower_only=True),
+        lower=True, overwrite_a_band=True, select='i', select_range=(0, max_eigens[1] - 1)
+    )
+    penalty_cols = kron(
+        identity(max_eigens[0]),
+        lam_z * dia_object((eigenvalues_cols, 0), shape=(max_eigens[1], max_eigens[1]))
+    )
+
+    assert penalized_system.penalty.shape == (np.prod(max_eigens),)
+    assert_allclose(
+        penalized_system.penalty, (penalty_rows + penalty_cols).diagonal(), rtol=1e-12, atol=1e-12
+    )
+    assert_allclose(
+        penalized_system.basis_r, expected_basis_rows, rtol=1e-12, atol=1e-12
+    )
+    assert_allclose(
+        penalized_system.basis_c, expected_basis_cols, rtol=1e-12, atol=1e-12
+    )
+
+    assert_array_equal(penalized_system.diff_order, (diff_order_x, diff_order_z))
+    assert_array_equal(penalized_system.lam, (lam_x, lam_z))
+
+
+@pytest.mark.parametrize('diff_order', (0, -1, [0, 0], [1, 0], [0, 1], [-1, 1], [1, -1]))
+def test_whittaker_system_diff_order_fails(small_data2d, diff_order):
+    """Ensures a difference order of less than 1 fails."""
+    with pytest.raises(ValueError):
+        _whittaker_utils.WhittakerSystem2D(
+            small_data2d.shape, diff_order=diff_order, max_eigens=None
+        )
+    with pytest.raises(ValueError):
+        _whittaker_utils.WhittakerSystem2D(
+            small_data2d.shape, diff_order=diff_order, max_eigens=(5, 5)
+        )
+
+
+@pytest.mark.parametrize('lam', (-2, 0, [-1, 1], [1, -1], [1, 0], [0, 1]))
+def test_whittaker_system_negative_lam_fails(small_data2d, lam):
+    """Ensures a lam value less than or equal to 0 fails."""
+    with pytest.raises(ValueError):
+        _whittaker_utils.WhittakerSystem2D(small_data2d.shape, lam=lam, max_eigens=None)
+    with pytest.raises(ValueError):
+        _whittaker_utils.WhittakerSystem2D(
+            small_data2d.shape, lam=lam, max_eigens=(5, 5)
+        )
+
+
+@pytest.mark.parametrize('max_eigens', (-2, 0, [-1, 1], [1, -1], [1, 0], [0, 1]))
+def test_whittaker_system_negative_maxeigens_fails(small_data2d, max_eigens):
+    """Ensures a max_eigens value less than or equal to 0 fails."""
+    with pytest.raises(ValueError):
+        _whittaker_utils.WhittakerSystem2D(
+            small_data2d.shape, max_eigens=max_eigens
+        )
+
+
+@pytest.mark.parametrize('max_eigens', ([None, 5], [3, None], np.array([None, 6])))
+def test_whittaker_system_None_and_nonNone_maxeigens_fails(small_data2d, max_eigens):
+    """Ensures max_eigens cannot mix None with non-None values."""
+    with pytest.raises(ValueError):
+        _whittaker_utils.WhittakerSystem2D(
+            small_data2d.shape, max_eigens=max_eigens
+        )