From 8930f1080220c1fdeca4e754c7c530bad5893928 Mon Sep 17 00:00:00 2001
From: Connor Stone <connorstone628@gmail.com>
Date: Fri, 2 Aug 2024 10:37:54 -0400
Subject: [PATCH 1/3] feat: reference points can now be drawn from either x/y
 sample

---
 src/pqm/pqm.py | 121 ++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 94 insertions(+), 27 deletions(-)

diff --git a/src/pqm/pqm.py b/src/pqm/pqm.py
index ed1601f..a1d9099 100644
--- a/src/pqm/pqm.py
+++ b/src/pqm/pqm.py
@@ -7,9 +7,16 @@
 __all__ = ("pqm_chi2", "pqm_pvalue")
 
 
-def _pqm_test(x_samples: np.ndarray, y_samples: np.ndarray, num_refs: int, whiten: bool):
+def _pqm_test(
+    x_samples: np.ndarray,
+    y_samples: np.ndarray,
+    num_refs: int,
+    whiten: bool,
+    x_frac: Optional[float] = None,
+):
     """
-    Helper function to perform the PQM test and return the results from chi2_contingency.
+    Helper function to perform the PQM test and return the results from
+    chi2_contingency.
 
     Parameters
     ----------
@@ -20,7 +27,14 @@ def _pqm_test(x_samples: np.ndarray, y_samples: np.ndarray, num_refs: int, white
     num_refs : int
         Number of reference samples to use.
     whiten : bool
-        If True, whiten the samples by subtracting the mean and dividing by the standard deviation.
+        If True, whiten the samples by subtracting the mean and dividing by the
+        standard deviation.
+    x_frac : float
+        Fraction of x_samples to use as reference samples. ``x_frac = 1`` will
+        use only x_samples as reference samples, ``x_frac = 0`` will use only
+        y_samples as reference samples. Ideally, ``x_frac = len(x_samples) /
+        (len(x_samples) + len(y_samples))`` which is what is done for x_frac =
+        None (default).
 
     Returns
     -------
@@ -41,12 +55,34 @@ def _pqm_test(x_samples: np.ndarray, y_samples: np.ndarray, num_refs: int, white
         y_samples = (y_samples - mean) / std
         x_samples = (x_samples - mean) / std
 
-    refs = np.random.choice(len(y_samples), num_refs, replace=False)
-    N = np.arange(len(y_samples))
-    N[refs] = -1
-    N = N[N >= 0]
-    refs, y_samples = y_samples[refs], y_samples[N]
-
+    # Determine fraction of x_samples to use as reference samples
+    if x_frac is None:
+        x_frac = len(x_samples) / (len(x_samples) + len(y_samples))
+
+    # Collect reference samples from x_samples
+    if x_frac > 0:
+        xrefs = np.random.choice(len(x_samples), int(x_frac * num_refs), replace=False)
+        N = np.arange(len(x_samples))
+        N[xrefs] = -1
+        N = N[N >= 0]
+        xrefs, x_samples = x_samples[xrefs], x_samples[N]
+    else:
+        xrefs = np.zeros((0,) + x_samples.shape[1:])
+
+    # Collect reference samples from y_samples
+    if x_frac < 1:
+        yrefs = np.random.choice(len(y_samples), int((1.0 - x_frac) * num_refs), replace=False)
+        N = np.arange(len(y_samples))
+        N[yrefs] = -1
+        N = N[N >= 0]
+        yrefs, y_samples = y_samples[yrefs], y_samples[N]
+    else:
+        yrefs = np.zeros((0,) + y_samples.shape[1:])
+
+    # Join the full set of reference samples
+    refs = np.concatenate([xrefs, yrefs], axis=0)
+
+    # Build KDtree to measure distances
     tree = KDTree(refs)
 
     idx = tree.query(x_samples, k=1, workers=-1)[1]
@@ -68,34 +104,50 @@ def pqm_pvalue(
     num_refs: int = 100,
     re_tessellation: Optional[int] = None,
     whiten: bool = False,
+    x_frac: Optional[float] = None,
 ):
     """
-    Perform the PQM test of the null hypothesis that `x_samples` and `y_samples` are drawn form the same distribution.
+    Perform the PQM test of the null hypothesis that `x_samples` and `y_samples`
+    are drawn form the same distribution.
 
     Parameters
     ----------
     x_samples : np.ndarray
-        Samples from the first distribution, test samples. Must have shape (N, *D) N is the number of x samples, and D is the dimensionality of the samples.
+        Samples from the first distribution, test samples. Must have shape (N,
+        *D) N is the number of x samples, and D is the dimensionality of the
+        samples.
     y_samples : np.ndarray
-        Samples from the second distribution, reference samples. Must have shape (M, *D) M is the number of y samples, and D is the dimensionality of the samples.
+        Samples from the second distribution, reference samples. Must have shape
+        (M, *D) M is the number of y samples, and D is the dimensionality of the
+        samples.
     num_refs : int
-        Number of reference samples to use. Note that these will be drawn from y_samples, and then removed from the y_samples array.
+        Number of reference samples to use. Note that these will be drawn from
+        y_samples, and then removed from the y_samples array.
     re_tessellation : Optional[int]
-        Number of times pqm_pvalue is called, re tesselating the space. No re_tessellation if None (default).
+        Number of times pqm_pvalue is called, re-tesselating the space. No
+        re_tessellation if None (default).
     whiten : bool
-        If True, whiten the samples by subtracting the mean and dividing by the standard deviation.
+        If True, whiten the samples by subtracting the mean and dividing by the
+        standard deviation.
+    x_frac : float
+        Fraction of x_samples to use as reference samples. ``x_frac = 1`` will
+        use only x_samples as reference samples, ``x_frac = 0`` will use only
+        y_samples as reference samples. Ideally, ``x_frac = len(x_samples) /
+        (len(x_samples) + len(y_samples))`` which is what is done for x_frac =
+        None (default).
 
     Returns
     -------
     float or list
-        pvalue(s). Null hypothesis that both samples are drawn from the same distribution.
+        pvalue(s). Null hypothesis that both samples are drawn from the same
+        distribution.
     """
     if re_tessellation is not None:
         return [
-            pqm_pvalue(x_samples, y_samples, num_refs=num_refs, whiten=whiten)
+            pqm_pvalue(x_samples, y_samples, num_refs=num_refs, whiten=whiten, x_frac=x_frac)
             for _ in range(re_tessellation)
         ]
-    _, pvalue, _, _ = _pqm_test(x_samples, y_samples, num_refs, whiten)
+    _, pvalue, _, _ = _pqm_test(x_samples, y_samples, num_refs, whiten, x_frac)
     return pvalue
 
 
@@ -105,22 +157,37 @@ def pqm_chi2(
     num_refs: int = 100,
     re_tessellation: Optional[int] = None,
     whiten: bool = False,
+    x_frac: Optional[float] = None,
 ):
     """
-    Perform the PQM test of the null hypothesis that `x_samples` and `y_samples` are drawn form the same distribution.
+    Perform the PQM test of the null hypothesis that `x_samples` and `y_samples`
+    are drawn form the same distribution.
 
     Parameters
     ----------
     x_samples : np.ndarray
-        Samples from the first distribution, test samples. Must have shape (N, *D) N is the number of x samples, and D is the dimensionality of the samples.
+        Samples from the first distribution, test samples. Must have shape (N,
+        *D) N is the number of x samples, and D is the dimensionality of the
+        samples.
     y_samples : np.ndarray
-        Samples from the second distribution, reference samples. Must have shape (M, *D) M is the number of y samples, and D is the dimensionality of the samples.
+        Samples from the second distribution, reference samples. Must have shape
+        (M, *D) M is the number of y samples, and D is the dimensionality of the
+        samples.
     num_refs : int
-        Number of reference samples to use. Note that these will be drawn from y_samples, and then removed from the y_samples array.
+        Number of reference samples to use. Note that these will be drawn from
+        y_samples, and then removed from the y_samples array.
     re_tessellation : Optional[int]
-        Number of times pqm_chi2 is called, re tesselating the space. No re_tessellation if None (default).
+        Number of times pqm_chi2 is called, re-tesselating the space. No
+        re_tessellation if None (default).
     whiten : bool
-        If True, whiten the samples by subtracting the mean and dividing by the standard deviation.
+        If True, whiten the samples by subtracting the mean and dividing by the
+        standard deviation.
+    x_frac : float
+        Fraction of x_samples to use as reference samples. ``x_frac = 1`` will
+        use only x_samples as reference samples, ``x_frac = 0`` will use only
+        y_samples as reference samples. Ideally, ``x_frac = len(x_samples) /
+        (len(x_samples) + len(y_samples))`` which is what is done for x_frac =
+        None (default).
 
     Returns
     -------
@@ -129,10 +196,10 @@ def pqm_chi2(
     """
     if re_tessellation is not None:
         return [
-            pqm_chi2(x_samples, y_samples, num_refs=num_refs, whiten=whiten)
+            pqm_chi2(x_samples, y_samples, num_refs=num_refs, whiten=whiten, x_frac=x_frac)
             for _ in range(re_tessellation)
         ]
-    chi2_stat, _, dof, _ = _pqm_test(x_samples, y_samples, num_refs, whiten)
+    chi2_stat, _, dof, _ = _pqm_test(x_samples, y_samples, num_refs, whiten, x_frac)
     if dof != num_refs - 1:
         # Rescale chi2 to new value which has the same cumulative probability
         if chi2_stat / dof < 10:
@@ -141,4 +208,4 @@ def pqm_chi2(
         else:
             chi2_stat = chi2_stat * (num_refs - 1) / dof
         dof = num_refs - 1
-    return chi2_stat
\ No newline at end of file
+    return chi2_stat

From a2c49572681cc048cec6145952d1d52e1d3c2019 Mon Sep 17 00:00:00 2001
From: Connor Stone <connorstone628@gmail.com>
Date: Tue, 13 Aug 2024 10:43:24 -0400
Subject: [PATCH 2/3] Can now add gaussian samples to the reference points

---
 src/pqm/pqm.py | 51 ++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 45 insertions(+), 6 deletions(-)

diff --git a/src/pqm/pqm.py b/src/pqm/pqm.py
index a1d9099..6b28719 100644
--- a/src/pqm/pqm.py
+++ b/src/pqm/pqm.py
@@ -13,6 +13,7 @@ def _pqm_test(
     num_refs: int,
     whiten: bool,
     x_frac: Optional[float] = None,
+    gauss_frac: Optional[float] = None,
 ):
     """
     Helper function to perform the PQM test and return the results from
@@ -35,6 +36,11 @@ def _pqm_test(
         y_samples as reference samples. Ideally, ``x_frac = len(x_samples) /
         (len(x_samples) + len(y_samples))`` which is what is done for x_frac =
         None (default).
+    gauss_frac : float
+        Fraction of samples to take from gaussian distribution with mean/std
+        determined from the other reference samples. This ensures full support
+        of the reference samples if pathological behavior is expected.
+        Default: 0.0 no gaussian samples.
 
     Returns
     -------
@@ -59,9 +65,15 @@ def _pqm_test(
     if x_frac is None:
         x_frac = len(x_samples) / (len(x_samples) + len(y_samples))
 
+    # Determine fraction of samples to take from gaussian distribution
+    if gauss_frac is None:
+        gauss_frac = 0.0
+
     # Collect reference samples from x_samples
     if x_frac > 0:
-        xrefs = np.random.choice(len(x_samples), int(x_frac * num_refs), replace=False)
+        xrefs = np.random.choice(
+            len(x_samples), int(x_frac * (1.0 - gauss_frac) * num_refs), replace=False
+        )
         N = np.arange(len(x_samples))
         N[xrefs] = -1
         N = N[N >= 0]
@@ -71,7 +83,9 @@ def _pqm_test(
 
     # Collect reference samples from y_samples
     if x_frac < 1:
-        yrefs = np.random.choice(len(y_samples), int((1.0 - x_frac) * num_refs), replace=False)
+        yrefs = np.random.choice(
+            len(y_samples), int((1.0 - x_frac) * (1.0 - gauss_frac) * num_refs), replace=False
+        )
         N = np.arange(len(y_samples))
         N[yrefs] = -1
         N = N[N >= 0]
@@ -82,6 +96,15 @@ def _pqm_test(
     # Join the full set of reference samples
     refs = np.concatenate([xrefs, yrefs], axis=0)
 
+    # get gaussian reference points if requested
+    if gauss_frac > 0:
+        gauss_refs = np.random.normal(
+            loc=np.mean(refs, axis=0),
+            scale=np.std(refs, axis=0),
+            size=(int(gauss_frac * num_refs), *refs.shape[1:]),
+        )
+        refs = np.concatenate([refs, gauss_refs], axis=0)
+
     # Build KDtree to measure distances
     tree = KDTree(refs)
 
@@ -105,6 +128,7 @@ def pqm_pvalue(
     re_tessellation: Optional[int] = None,
     whiten: bool = False,
     x_frac: Optional[float] = None,
+    gauss_frac: Optional[float] = None,
 ):
     """
     Perform the PQM test of the null hypothesis that `x_samples` and `y_samples`
@@ -144,10 +168,17 @@ def pqm_pvalue(
     """
     if re_tessellation is not None:
         return [
-            pqm_pvalue(x_samples, y_samples, num_refs=num_refs, whiten=whiten, x_frac=x_frac)
+            pqm_pvalue(
+                x_samples,
+                y_samples,
+                num_refs=num_refs,
+                whiten=whiten,
+                x_frac=x_frac,
+                gauss_frac=gauss_frac,
+            )
             for _ in range(re_tessellation)
         ]
-    _, pvalue, _, _ = _pqm_test(x_samples, y_samples, num_refs, whiten, x_frac)
+    _, pvalue, _, _ = _pqm_test(x_samples, y_samples, num_refs, whiten, x_frac, gauss_frac)
     return pvalue
 
 
@@ -158,6 +189,7 @@ def pqm_chi2(
     re_tessellation: Optional[int] = None,
     whiten: bool = False,
     x_frac: Optional[float] = None,
+    gauss_frac: Optional[float] = None,
 ):
     """
     Perform the PQM test of the null hypothesis that `x_samples` and `y_samples`
@@ -196,10 +228,17 @@ def pqm_chi2(
     """
     if re_tessellation is not None:
         return [
-            pqm_chi2(x_samples, y_samples, num_refs=num_refs, whiten=whiten, x_frac=x_frac)
+            pqm_chi2(
+                x_samples,
+                y_samples,
+                num_refs=num_refs,
+                whiten=whiten,
+                x_frac=x_frac,
+                gauss_frac=gauss_frac,
+            )
             for _ in range(re_tessellation)
         ]
-    chi2_stat, _, dof, _ = _pqm_test(x_samples, y_samples, num_refs, whiten, x_frac)
+    chi2_stat, _, dof, _ = _pqm_test(x_samples, y_samples, num_refs, whiten, x_frac, gauss_frac)
     if dof != num_refs - 1:
         # Rescale chi2 to new value which has the same cumulative probability
         if chi2_stat / dof < 10:

From 9db7a7a2c35981a10e439769dbd47addcde05a88 Mon Sep 17 00:00:00 2001
From: Connor Stone <connorstone628@gmail.com>
Date: Tue, 13 Aug 2024 10:52:00 -0400
Subject: [PATCH 3/3] tweaks and cleanup

---
 src/pqm/pqm.py | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/src/pqm/pqm.py b/src/pqm/pqm.py
index 6b28719..8ef12bd 100644
--- a/src/pqm/pqm.py
+++ b/src/pqm/pqm.py
@@ -13,7 +13,7 @@ def _pqm_test(
     num_refs: int,
     whiten: bool,
     x_frac: Optional[float] = None,
-    gauss_frac: Optional[float] = None,
+    gauss_frac: float = 0.0,
 ):
     """
     Helper function to perform the PQM test and return the results from
@@ -65,10 +65,6 @@ def _pqm_test(
     if x_frac is None:
         x_frac = len(x_samples) / (len(x_samples) + len(y_samples))
 
-    # Determine fraction of samples to take from gaussian distribution
-    if gauss_frac is None:
-        gauss_frac = 0.0
-
     # Collect reference samples from x_samples
     if x_frac > 0:
         xrefs = np.random.choice(
@@ -128,7 +124,7 @@ def pqm_pvalue(
     re_tessellation: Optional[int] = None,
     whiten: bool = False,
     x_frac: Optional[float] = None,
-    gauss_frac: Optional[float] = None,
+    gauss_frac: float = 0.0,
 ):
     """
     Perform the PQM test of the null hypothesis that `x_samples` and `y_samples`
@@ -159,6 +155,11 @@ def pqm_pvalue(
         y_samples as reference samples. Ideally, ``x_frac = len(x_samples) /
         (len(x_samples) + len(y_samples))`` which is what is done for x_frac =
         None (default).
+    gauss_frac : float
+        Fraction of samples to take from gaussian distribution with mean/std
+        determined from the other reference samples. This ensures full support
+        of the reference samples if pathological behavior is expected.
+        Default: 0.0 no gaussian samples
 
     Returns
     -------
@@ -189,7 +190,7 @@ def pqm_chi2(
     re_tessellation: Optional[int] = None,
     whiten: bool = False,
     x_frac: Optional[float] = None,
-    gauss_frac: Optional[float] = None,
+    gauss_frac: float = 0.0,
 ):
     """
     Perform the PQM test of the null hypothesis that `x_samples` and `y_samples`
@@ -220,6 +221,11 @@ def pqm_chi2(
         y_samples as reference samples. Ideally, ``x_frac = len(x_samples) /
         (len(x_samples) + len(y_samples))`` which is what is done for x_frac =
         None (default).
+    gauss_frac : float
+        Fraction of samples to take from gaussian distribution with mean/std
+        determined from the other reference samples. This ensures full support
+        of the reference samples if pathological behavior is expected.
+        Default: 0.0 no gaussian samples
 
     Returns
     -------