baselines.py

import numpy as np
from scipy import sparse
from scipy.sparse.linalg import spsolve
import pyqtgraph as pg

def savitzky_golay(y, window_size, order, deriv=0, rate=1):
    """From SciPy cookbook
https://scipy-cookbook.readthedocs.io/items/SavitzkyGolay.html

Not particularly useful for undersampled data!!

Smooth (and optionally differentiate) data with a Savitzky-Golay filter.
The Savitzky-Golay filter removes high frequency noise from data.
It has the advantage of preserving the original shape and
features of the signal better than other types of filtering
approaches, such as moving averages techniques.
Parameters
----------
y : array_like, shape (N,)
    the values of the time history of the signal.
window_size : int
    the length of the window. Must be an odd integer number.
order : int
    the order of the polynomial used in the filtering.
    Must be less then `window_size` - 1.
deriv: int
    the order of the derivative to compute (default = 0 means only smoothing)
Returns
-------
ys : ndarray, shape (N)
    the smoothed signal (or it's n-th derivative).
Notes
-----
The Savitzky-Golay is a type of low-pass filter, particularly
suited for smoothing noisy data. The main idea behind this
approach is to make for each point a least-square fit with a
polynomial of high order over a odd-sized window centered at
the point.
Examples
--------
t = np.linspace(-4, 4, 500)
y = np.exp( -t**2 ) + np.random.normal(0, 0.05, t.shape)
ysg = savitzky_golay(y, window_size=31, order=4)
import matplotlib.pyplot as plt
plt.plot(t, y, label='Noisy signal')
plt.plot(t, np.exp(-t**2), 'k', lw=1.5, label='Original signal')
plt.plot(t, ysg, 'r', label='Filtered signal')
plt.legend()
plt.show()
References
----------
.. [1] A. Savitzky, M. J. E. Golay, Smoothing and Differentiation of
   Data by Simplified Least Squares Procedures. Analytical
   Chemistry, 1964, 36 (8), pp 1627-1639.
.. [2] Numerical Recipes 3rd Edition: The Art of Scientific Computing
   W.H. Press, S.A. Teukolsky, W.T. Vetterling, B.P. Flannery
   Cambridge University Press ISBN-13: 9780521880688
    """
    #import numpy as np
    from math import factorial

    try:
        window_size = np.abs(np.int(window_size))
        order = np.abs(np.int(order))
    except ValueError as msg:
        print (str(msg))
        raise ValueError("window_size and order have to be of type int")
    if window_size % 2 != 1 or window_size < 1:
        raise TypeError("window_size size must be a positive odd number")
    if window_size < order + 2:
        raise TypeError("window_size is too small for the polynomials order")
    order_range = range(order+1)
    half_window = (window_size -1) // 2
    # precompute coefficients
    b = np.mat([[k**i for i in order_range] for k in range(-half_window, half_window+1)])
    m = np.linalg.pinv(b).A[deriv] * rate**deriv * factorial(deriv)
    # pad the signal at the extremes with
    # values taken from the signal itself
    firstvals = y[0] - np.abs( y[1:half_window+1][::-1] - y[0] )
    lastvals = y[-1] + np.abs(y[-half_window-1:-1][::-1] - y[-1])
    y = np.concatenate((firstvals, y, lastvals))
    return np.convolve( m[::-1], y, mode='valid')
    
def baseline_als(y, lam, p, niter=20, quiet=False):
    
    """ y is a numpy array, niter is the number of iterations
    
    from https://stackoverflow.com/questions/29156532
    Reference: "Asymmetric Least Squares Smoothing" by P. Eilers and H. Boelens in 2005.
    There are two parameters: p for asymmetry and λ for smoothness. Both have to be tuned to the data at hand. We found that generally
    0.001 ≤ p ≤ 0.1 is a good choice (for a signal with positive peaks) and 10^2 ≤ λ ≤ 10^9 , but exceptions may occur. In any case one
    should vary λ on a grid that is approximately linear for log λ"""
    
    if not quiet: print('Asymmetric Baseline subtraction with lambda {0:.3f} and p {1:.3f}.'.format(lam, p))
    L = len(y)
    D = sparse.diags([1, -2, 1],[0, -1, -2], shape=(L, L-2))
    w = np.ones(L)
    for i in range(niter):
        WW = sparse.spdiags(w, 0, L, L)
        ZZ = WW + lam * D.dot(D.transpose())
        z = spsolve(ZZ, w * y)
        w = p * (y > z) + (1-p) * (y < z)
    return z

def baselineIterator(data, lam, p, niter=20):
    """ iterate baseline subtraction over dictionary of dataframes of ROI traces """
    
    #is there a problem with running it twice? No, the problem was peaks were being chucked out
    
    bdata = {}
    for _set, df in data.items():
        print ("Auto baseline for {0} set. lambda: {1:.3f} and p: {2:.3f}".format(_set, lam, p))
        
        maxVal = len (df.columns)
        progMsg = "Auto baseline for {0} traces".format(maxVal)
        with pg.ProgressDialog(progMsg, 0, maxVal) as dlg:
            dlg.setMinimumWidth(300)
            for col in df:
                dlg += 1
                y = np.asarray(df[col])
                # subtract appropriate baseline from each column of df
                df[col] -= baseline_als(y, lam, p, niter=20, quiet=True)
        bdata[_set] = df
    return bdata