From 051ffb8e974569a733d88d7ed63f3712a90ac04f Mon Sep 17 00:00:00 2001 From: Connor Stone Date: Fri, 31 May 2024 12:30:10 -0400 Subject: [PATCH] fix dof to always be the same for chi2 result --- src/pqm/pqm.py | 26 ++++++++++++++++++++++---- tests/test_gaussian.py | 37 +++++++++++++++++++++++++++++++++++-- 2 files changed, 57 insertions(+), 6 deletions(-) diff --git a/src/pqm/pqm.py b/src/pqm/pqm.py index 9da9111..4cc7097 100644 --- a/src/pqm/pqm.py +++ b/src/pqm/pqm.py @@ -1,11 +1,12 @@ from typing import Optional import numpy as np -from scipy.stats import chi2_contingency +from scipy.stats import chi2_contingency, chi2 from scipy.spatial import KDTree __all__ = ("pqm_chi2", "pqm_pvalue") + def _pqm_test(x_samples: np.ndarray, y_samples: np.ndarray, num_refs: int): """ Helper function to perform the PQM test and return the results from chi2_contingency. @@ -53,7 +54,13 @@ def _pqm_test(x_samples: np.ndarray, y_samples: np.ndarray, num_refs: int): return chi2_contingency(np.array([counts_x, counts_y])) -def pqm_pvalue(x_samples: np.ndarray, y_samples: np.ndarray, num_refs: int = 100, bootstrap: Optional[int] = None): + +def pqm_pvalue( + x_samples: np.ndarray, + y_samples: np.ndarray, + num_refs: int = 100, + bootstrap: Optional[int] = None, +): """ Perform the PQM test of the null hypothesis that `x_samples` and `y_samples` are drawn form the same distribution. @@ -78,7 +85,13 @@ def pqm_pvalue(x_samples: np.ndarray, y_samples: np.ndarray, num_refs: int = 100 _, pvalue, _, _ = _pqm_test(x_samples, y_samples, num_refs) return pvalue -def pqm_chi2(x_samples: np.ndarray, y_samples: np.ndarray, num_refs: int = 100, bootstrap: Optional[int] = None): + +def pqm_chi2( + x_samples: np.ndarray, + y_samples: np.ndarray, + num_refs: int = 100, + bootstrap: Optional[int] = None, +): """ Perform the PQM test of the null hypothesis that `x_samples` and `y_samples` are drawn form the same distribution. @@ -101,4 +114,9 @@ def pqm_chi2(x_samples: np.ndarray, y_samples: np.ndarray, num_refs: int = 100, if bootstrap is not None: return [pqm_chi2(x_samples, y_samples, num_refs=num_refs) for _ in range(bootstrap)] chi2_stat, _, dof, _ = _pqm_test(x_samples, y_samples, num_refs) - return chi2_stat, dof \ No newline at end of file + if dof != num_refs - 1: + # Rescale chi2 to new value which has the same cumulative probability + cp = chi2.cdf(chi2_stat, dof) + chi2_stat = chi2.ppf(cp, num_refs - 1) + dof = num_refs - 1 + return chi2_stat, dof diff --git a/tests/test_gaussian.py b/tests/test_gaussian.py index 2c113b8..4e3eb1c 100644 --- a/tests/test_gaussian.py +++ b/tests/test_gaussian.py @@ -1,8 +1,8 @@ import numpy as np -from pqm import pqm_pvalue +from pqm import pqm_pvalue, pqm_chi2 -def test(): +def test_pass_pvalue(): new = [] for _ in range(100): y_samples = np.random.normal(size=(500, 50)) @@ -11,3 +11,36 @@ def test(): new.append(pqm_pvalue(x_samples, y_samples)) assert np.abs(np.mean(new) - 0.5) < 0.15 + + +def test_pass_chi2(): + new = [] + for _ in range(100): + y_samples = np.random.normal(size=(500, 50)) + x_samples = np.random.normal(size=(250, 50)) + + new.append(pqm_chi2(x_samples, y_samples, num_refs=100)) + new = np.array(new) + assert np.abs(np.mean(new[:, 0]) / 99 - 1) < 0.15 + + +def test_fail_pvalue(): + new = [] + for _ in range(100): + y_samples = np.random.normal(size=(500, 50)) + x_samples = np.random.normal(size=(250, 50)) + 0.5 + + new.append(pqm_pvalue(x_samples, y_samples)) + + assert np.mean(new) < 1e-3 + + +def test_fail_chi2(): + new = [] + for _ in range(100): + y_samples = np.random.normal(size=(500, 50)) + x_samples = np.random.normal(size=(250, 50)) + 0.5 + + new.append(pqm_chi2(x_samples, y_samples, num_refs=100)) + new = np.array(new) + assert np.mean(new[:, 0]) / 99 > 10