From 051ffb8e974569a733d88d7ed63f3712a90ac04f Mon Sep 17 00:00:00 2001
From: Connor Stone <connorstone628@gmail.com>
Date: Fri, 31 May 2024 12:30:10 -0400
Subject: [PATCH] fix dof to always be the same for chi2 result

---
 src/pqm/pqm.py         | 26 ++++++++++++++++++++++----
 tests/test_gaussian.py | 37 +++++++++++++++++++++++++++++++++++--
 2 files changed, 57 insertions(+), 6 deletions(-)

diff --git a/src/pqm/pqm.py b/src/pqm/pqm.py
index 9da9111..4cc7097 100644
--- a/src/pqm/pqm.py
+++ b/src/pqm/pqm.py
@@ -1,11 +1,12 @@
 from typing import Optional
 
 import numpy as np
-from scipy.stats import chi2_contingency
+from scipy.stats import chi2_contingency, chi2
 from scipy.spatial import KDTree
 
 __all__ = ("pqm_chi2", "pqm_pvalue")
 
+
 def _pqm_test(x_samples: np.ndarray, y_samples: np.ndarray, num_refs: int):
     """
     Helper function to perform the PQM test and return the results from chi2_contingency.
@@ -53,7 +54,13 @@ def _pqm_test(x_samples: np.ndarray, y_samples: np.ndarray, num_refs: int):
 
     return chi2_contingency(np.array([counts_x, counts_y]))
 
-def pqm_pvalue(x_samples: np.ndarray, y_samples: np.ndarray, num_refs: int = 100, bootstrap: Optional[int] = None):
+
+def pqm_pvalue(
+    x_samples: np.ndarray,
+    y_samples: np.ndarray,
+    num_refs: int = 100,
+    bootstrap: Optional[int] = None,
+):
     """
     Perform the PQM test of the null hypothesis that `x_samples` and `y_samples` are drawn form the same distribution.
 
@@ -78,7 +85,13 @@ def pqm_pvalue(x_samples: np.ndarray, y_samples: np.ndarray, num_refs: int = 100
     _, pvalue, _, _ = _pqm_test(x_samples, y_samples, num_refs)
     return pvalue
 
-def pqm_chi2(x_samples: np.ndarray, y_samples: np.ndarray, num_refs: int = 100, bootstrap: Optional[int] = None):
+
+def pqm_chi2(
+    x_samples: np.ndarray,
+    y_samples: np.ndarray,
+    num_refs: int = 100,
+    bootstrap: Optional[int] = None,
+):
     """
     Perform the PQM test of the null hypothesis that `x_samples` and `y_samples` are drawn form the same distribution.
 
@@ -101,4 +114,9 @@ def pqm_chi2(x_samples: np.ndarray, y_samples: np.ndarray, num_refs: int = 100,
     if bootstrap is not None:
         return [pqm_chi2(x_samples, y_samples, num_refs=num_refs) for _ in range(bootstrap)]
     chi2_stat, _, dof, _ = _pqm_test(x_samples, y_samples, num_refs)
-    return chi2_stat, dof
\ No newline at end of file
+    if dof != num_refs - 1:
+        # Rescale chi2 to new value which has the same cumulative probability
+        cp = chi2.cdf(chi2_stat, dof)
+        chi2_stat = chi2.ppf(cp, num_refs - 1)
+        dof = num_refs - 1
+    return chi2_stat, dof
diff --git a/tests/test_gaussian.py b/tests/test_gaussian.py
index 2c113b8..4e3eb1c 100644
--- a/tests/test_gaussian.py
+++ b/tests/test_gaussian.py
@@ -1,8 +1,8 @@
 import numpy as np
-from pqm import pqm_pvalue
+from pqm import pqm_pvalue, pqm_chi2
 
 
-def test():
+def test_pass_pvalue():
     new = []
     for _ in range(100):
         y_samples = np.random.normal(size=(500, 50))
@@ -11,3 +11,36 @@ def test():
         new.append(pqm_pvalue(x_samples, y_samples))
 
     assert np.abs(np.mean(new) - 0.5) < 0.15
+
+
+def test_pass_chi2():
+    new = []
+    for _ in range(100):
+        y_samples = np.random.normal(size=(500, 50))
+        x_samples = np.random.normal(size=(250, 50))
+
+        new.append(pqm_chi2(x_samples, y_samples, num_refs=100))
+    new = np.array(new)
+    assert np.abs(np.mean(new[:, 0]) / 99 - 1) < 0.15
+
+
+def test_fail_pvalue():
+    new = []
+    for _ in range(100):
+        y_samples = np.random.normal(size=(500, 50))
+        x_samples = np.random.normal(size=(250, 50)) + 0.5
+
+        new.append(pqm_pvalue(x_samples, y_samples))
+
+    assert np.mean(new) < 1e-3
+
+
+def test_fail_chi2():
+    new = []
+    for _ in range(100):
+        y_samples = np.random.normal(size=(500, 50))
+        x_samples = np.random.normal(size=(250, 50)) + 0.5
+
+        new.append(pqm_chi2(x_samples, y_samples, num_refs=100))
+    new = np.array(new)
+    assert np.mean(new[:, 0]) / 99 > 10