diff --git a/src/pqm/pqm.py b/src/pqm/pqm.py index 4cc7097..a5bbdfa 100644 --- a/src/pqm/pqm.py +++ b/src/pqm/pqm.py @@ -7,7 +7,7 @@ __all__ = ("pqm_chi2", "pqm_pvalue") -def _pqm_test(x_samples: np.ndarray, y_samples: np.ndarray, num_refs: int): +def _pqm_test(x_samples: np.ndarray, y_samples: np.ndarray, num_refs: int, whiten: bool): """ Helper function to perform the PQM test and return the results from chi2_contingency. @@ -19,6 +19,8 @@ def _pqm_test(x_samples: np.ndarray, y_samples: np.ndarray, num_refs: int): Samples from the second distribution, reference samples. num_refs : int Number of reference samples to use. + whiten : bool + If True, whiten the samples by subtracting the mean and dividing by the standard deviation. Returns ------- @@ -33,6 +35,11 @@ def _pqm_test(x_samples: np.ndarray, y_samples: np.ndarray, num_refs: int): print( "Warning: Number of y_samples is small (less than twice the number of reference samples). Result may have high variance." ) + if whiten: + mean = np.mean(y_samples, axis=0) + std = np.std(y_samples, axis=0) + y_samples = (y_samples - mean) / std + x_samples = (x_samples - mean) / std refs = np.random.choice(len(y_samples), num_refs, replace=False) N = np.arange(len(y_samples)) @@ -60,6 +67,7 @@ def pqm_pvalue( y_samples: np.ndarray, num_refs: int = 100, bootstrap: Optional[int] = None, + whiten: bool = False, ): """ Perform the PQM test of the null hypothesis that `x_samples` and `y_samples` are drawn form the same distribution. @@ -74,6 +82,8 @@ def pqm_pvalue( Number of reference samples to use. Note that these will be drawn from y_samples, and then removed from the y_samples array. bootstrap : Optional[int] Number of bootstrap iterations to perform. No bootstrap if None (default). + whiten : bool + If True, whiten the samples by subtracting the mean and dividing by the standard deviation. Returns ------- @@ -81,8 +91,11 @@ def pqm_pvalue( pvalue(s). Null hypothesis that both samples are drawn from the same distribution. """ if bootstrap is not None: - return [pqm_pvalue(x_samples, y_samples, num_refs=num_refs) for _ in range(bootstrap)] - _, pvalue, _, _ = _pqm_test(x_samples, y_samples, num_refs) + return [ + pqm_pvalue(x_samples, y_samples, num_refs=num_refs, whiten=whiten) + for _ in range(bootstrap) + ] + _, pvalue, _, _ = _pqm_test(x_samples, y_samples, num_refs, whiten) return pvalue @@ -91,6 +104,7 @@ def pqm_chi2( y_samples: np.ndarray, num_refs: int = 100, bootstrap: Optional[int] = None, + whiten: bool = False, ): """ Perform the PQM test of the null hypothesis that `x_samples` and `y_samples` are drawn form the same distribution. @@ -105,6 +119,8 @@ def pqm_chi2( Number of reference samples to use. Note that these will be drawn from y_samples, and then removed from the y_samples array. bootstrap : Optional[int] Number of bootstrap iterations to perform. No bootstrap if None (default). + whiten : bool + If True, whiten the samples by subtracting the mean and dividing by the standard deviation. Returns ------- @@ -112,8 +128,11 @@ def pqm_chi2( chi2 statistic(s) and degree(s) of freedom. """ if bootstrap is not None: - return [pqm_chi2(x_samples, y_samples, num_refs=num_refs) for _ in range(bootstrap)] - chi2_stat, _, dof, _ = _pqm_test(x_samples, y_samples, num_refs) + return [ + pqm_chi2(x_samples, y_samples, num_refs=num_refs, whiten=whiten) + for _ in range(bootstrap) + ] + chi2_stat, _, dof, _ = _pqm_test(x_samples, y_samples, num_refs, whiten) if dof != num_refs - 1: # Rescale chi2 to new value which has the same cumulative probability cp = chi2.cdf(chi2_stat, dof)