From 8930f1080220c1fdeca4e754c7c530bad5893928 Mon Sep 17 00:00:00 2001 From: Connor Stone Date: Fri, 2 Aug 2024 10:37:54 -0400 Subject: [PATCH 1/3] feat: reference points can now be drawn from either x/y sample --- src/pqm/pqm.py | 121 ++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 94 insertions(+), 27 deletions(-) diff --git a/src/pqm/pqm.py b/src/pqm/pqm.py index ed1601f..a1d9099 100644 --- a/src/pqm/pqm.py +++ b/src/pqm/pqm.py @@ -7,9 +7,16 @@ __all__ = ("pqm_chi2", "pqm_pvalue") -def _pqm_test(x_samples: np.ndarray, y_samples: np.ndarray, num_refs: int, whiten: bool): +def _pqm_test( + x_samples: np.ndarray, + y_samples: np.ndarray, + num_refs: int, + whiten: bool, + x_frac: Optional[float] = None, +): """ - Helper function to perform the PQM test and return the results from chi2_contingency. + Helper function to perform the PQM test and return the results from + chi2_contingency. Parameters ---------- @@ -20,7 +27,14 @@ def _pqm_test(x_samples: np.ndarray, y_samples: np.ndarray, num_refs: int, white num_refs : int Number of reference samples to use. whiten : bool - If True, whiten the samples by subtracting the mean and dividing by the standard deviation. + If True, whiten the samples by subtracting the mean and dividing by the + standard deviation. + x_frac : float + Fraction of x_samples to use as reference samples. ``x_frac = 1`` will + use only x_samples as reference samples, ``x_frac = 0`` will use only + y_samples as reference samples. Ideally, ``x_frac = len(x_samples) / + (len(x_samples) + len(y_samples))`` which is what is done for x_frac = + None (default). Returns ------- @@ -41,12 +55,34 @@ def _pqm_test(x_samples: np.ndarray, y_samples: np.ndarray, num_refs: int, white y_samples = (y_samples - mean) / std x_samples = (x_samples - mean) / std - refs = np.random.choice(len(y_samples), num_refs, replace=False) - N = np.arange(len(y_samples)) - N[refs] = -1 - N = N[N >= 0] - refs, y_samples = y_samples[refs], y_samples[N] - + # Determine fraction of x_samples to use as reference samples + if x_frac is None: + x_frac = len(x_samples) / (len(x_samples) + len(y_samples)) + + # Collect reference samples from x_samples + if x_frac > 0: + xrefs = np.random.choice(len(x_samples), int(x_frac * num_refs), replace=False) + N = np.arange(len(x_samples)) + N[xrefs] = -1 + N = N[N >= 0] + xrefs, x_samples = x_samples[xrefs], x_samples[N] + else: + xrefs = np.zeros((0,) + x_samples.shape[1:]) + + # Collect reference samples from y_samples + if x_frac < 1: + yrefs = np.random.choice(len(y_samples), int((1.0 - x_frac) * num_refs), replace=False) + N = np.arange(len(y_samples)) + N[yrefs] = -1 + N = N[N >= 0] + yrefs, y_samples = y_samples[yrefs], y_samples[N] + else: + yrefs = np.zeros((0,) + y_samples.shape[1:]) + + # Join the full set of reference samples + refs = np.concatenate([xrefs, yrefs], axis=0) + + # Build KDtree to measure distances tree = KDTree(refs) idx = tree.query(x_samples, k=1, workers=-1)[1] @@ -68,34 +104,50 @@ def pqm_pvalue( num_refs: int = 100, re_tessellation: Optional[int] = None, whiten: bool = False, + x_frac: Optional[float] = None, ): """ - Perform the PQM test of the null hypothesis that `x_samples` and `y_samples` are drawn form the same distribution. + Perform the PQM test of the null hypothesis that `x_samples` and `y_samples` + are drawn form the same distribution. Parameters ---------- x_samples : np.ndarray - Samples from the first distribution, test samples. Must have shape (N, *D) N is the number of x samples, and D is the dimensionality of the samples. + Samples from the first distribution, test samples. Must have shape (N, + *D) N is the number of x samples, and D is the dimensionality of the + samples. y_samples : np.ndarray - Samples from the second distribution, reference samples. Must have shape (M, *D) M is the number of y samples, and D is the dimensionality of the samples. + Samples from the second distribution, reference samples. Must have shape + (M, *D) M is the number of y samples, and D is the dimensionality of the + samples. num_refs : int - Number of reference samples to use. Note that these will be drawn from y_samples, and then removed from the y_samples array. + Number of reference samples to use. Note that these will be drawn from + y_samples, and then removed from the y_samples array. re_tessellation : Optional[int] - Number of times pqm_pvalue is called, re tesselating the space. No re_tessellation if None (default). + Number of times pqm_pvalue is called, re-tesselating the space. No + re_tessellation if None (default). whiten : bool - If True, whiten the samples by subtracting the mean and dividing by the standard deviation. + If True, whiten the samples by subtracting the mean and dividing by the + standard deviation. + x_frac : float + Fraction of x_samples to use as reference samples. ``x_frac = 1`` will + use only x_samples as reference samples, ``x_frac = 0`` will use only + y_samples as reference samples. Ideally, ``x_frac = len(x_samples) / + (len(x_samples) + len(y_samples))`` which is what is done for x_frac = + None (default). Returns ------- float or list - pvalue(s). Null hypothesis that both samples are drawn from the same distribution. + pvalue(s). Null hypothesis that both samples are drawn from the same + distribution. """ if re_tessellation is not None: return [ - pqm_pvalue(x_samples, y_samples, num_refs=num_refs, whiten=whiten) + pqm_pvalue(x_samples, y_samples, num_refs=num_refs, whiten=whiten, x_frac=x_frac) for _ in range(re_tessellation) ] - _, pvalue, _, _ = _pqm_test(x_samples, y_samples, num_refs, whiten) + _, pvalue, _, _ = _pqm_test(x_samples, y_samples, num_refs, whiten, x_frac) return pvalue @@ -105,22 +157,37 @@ def pqm_chi2( num_refs: int = 100, re_tessellation: Optional[int] = None, whiten: bool = False, + x_frac: Optional[float] = None, ): """ - Perform the PQM test of the null hypothesis that `x_samples` and `y_samples` are drawn form the same distribution. + Perform the PQM test of the null hypothesis that `x_samples` and `y_samples` + are drawn form the same distribution. Parameters ---------- x_samples : np.ndarray - Samples from the first distribution, test samples. Must have shape (N, *D) N is the number of x samples, and D is the dimensionality of the samples. + Samples from the first distribution, test samples. Must have shape (N, + *D) N is the number of x samples, and D is the dimensionality of the + samples. y_samples : np.ndarray - Samples from the second distribution, reference samples. Must have shape (M, *D) M is the number of y samples, and D is the dimensionality of the samples. + Samples from the second distribution, reference samples. Must have shape + (M, *D) M is the number of y samples, and D is the dimensionality of the + samples. num_refs : int - Number of reference samples to use. Note that these will be drawn from y_samples, and then removed from the y_samples array. + Number of reference samples to use. Note that these will be drawn from + y_samples, and then removed from the y_samples array. re_tessellation : Optional[int] - Number of times pqm_chi2 is called, re tesselating the space. No re_tessellation if None (default). + Number of times pqm_chi2 is called, re-tesselating the space. No + re_tessellation if None (default). whiten : bool - If True, whiten the samples by subtracting the mean and dividing by the standard deviation. + If True, whiten the samples by subtracting the mean and dividing by the + standard deviation. + x_frac : float + Fraction of x_samples to use as reference samples. ``x_frac = 1`` will + use only x_samples as reference samples, ``x_frac = 0`` will use only + y_samples as reference samples. Ideally, ``x_frac = len(x_samples) / + (len(x_samples) + len(y_samples))`` which is what is done for x_frac = + None (default). Returns ------- @@ -129,10 +196,10 @@ def pqm_chi2( """ if re_tessellation is not None: return [ - pqm_chi2(x_samples, y_samples, num_refs=num_refs, whiten=whiten) + pqm_chi2(x_samples, y_samples, num_refs=num_refs, whiten=whiten, x_frac=x_frac) for _ in range(re_tessellation) ] - chi2_stat, _, dof, _ = _pqm_test(x_samples, y_samples, num_refs, whiten) + chi2_stat, _, dof, _ = _pqm_test(x_samples, y_samples, num_refs, whiten, x_frac) if dof != num_refs - 1: # Rescale chi2 to new value which has the same cumulative probability if chi2_stat / dof < 10: @@ -141,4 +208,4 @@ def pqm_chi2( else: chi2_stat = chi2_stat * (num_refs - 1) / dof dof = num_refs - 1 - return chi2_stat \ No newline at end of file + return chi2_stat From a2c49572681cc048cec6145952d1d52e1d3c2019 Mon Sep 17 00:00:00 2001 From: Connor Stone Date: Tue, 13 Aug 2024 10:43:24 -0400 Subject: [PATCH 2/3] Can now add gaussian samples to the reference points --- src/pqm/pqm.py | 51 ++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 45 insertions(+), 6 deletions(-) diff --git a/src/pqm/pqm.py b/src/pqm/pqm.py index a1d9099..6b28719 100644 --- a/src/pqm/pqm.py +++ b/src/pqm/pqm.py @@ -13,6 +13,7 @@ def _pqm_test( num_refs: int, whiten: bool, x_frac: Optional[float] = None, + gauss_frac: Optional[float] = None, ): """ Helper function to perform the PQM test and return the results from @@ -35,6 +36,11 @@ def _pqm_test( y_samples as reference samples. Ideally, ``x_frac = len(x_samples) / (len(x_samples) + len(y_samples))`` which is what is done for x_frac = None (default). + gauss_frac : float + Fraction of samples to take from gaussian distribution with mean/std + determined from the other reference samples. This ensures full support + of the reference samples if pathological behavior is expected. + Default: 0.0 no gaussian samples. Returns ------- @@ -59,9 +65,15 @@ def _pqm_test( if x_frac is None: x_frac = len(x_samples) / (len(x_samples) + len(y_samples)) + # Determine fraction of samples to take from gaussian distribution + if gauss_frac is None: + gauss_frac = 0.0 + # Collect reference samples from x_samples if x_frac > 0: - xrefs = np.random.choice(len(x_samples), int(x_frac * num_refs), replace=False) + xrefs = np.random.choice( + len(x_samples), int(x_frac * (1.0 - gauss_frac) * num_refs), replace=False + ) N = np.arange(len(x_samples)) N[xrefs] = -1 N = N[N >= 0] @@ -71,7 +83,9 @@ def _pqm_test( # Collect reference samples from y_samples if x_frac < 1: - yrefs = np.random.choice(len(y_samples), int((1.0 - x_frac) * num_refs), replace=False) + yrefs = np.random.choice( + len(y_samples), int((1.0 - x_frac) * (1.0 - gauss_frac) * num_refs), replace=False + ) N = np.arange(len(y_samples)) N[yrefs] = -1 N = N[N >= 0] @@ -82,6 +96,15 @@ def _pqm_test( # Join the full set of reference samples refs = np.concatenate([xrefs, yrefs], axis=0) + # get gaussian reference points if requested + if gauss_frac > 0: + gauss_refs = np.random.normal( + loc=np.mean(refs, axis=0), + scale=np.std(refs, axis=0), + size=(int(gauss_frac * num_refs), *refs.shape[1:]), + ) + refs = np.concatenate([refs, gauss_refs], axis=0) + # Build KDtree to measure distances tree = KDTree(refs) @@ -105,6 +128,7 @@ def pqm_pvalue( re_tessellation: Optional[int] = None, whiten: bool = False, x_frac: Optional[float] = None, + gauss_frac: Optional[float] = None, ): """ Perform the PQM test of the null hypothesis that `x_samples` and `y_samples` @@ -144,10 +168,17 @@ def pqm_pvalue( """ if re_tessellation is not None: return [ - pqm_pvalue(x_samples, y_samples, num_refs=num_refs, whiten=whiten, x_frac=x_frac) + pqm_pvalue( + x_samples, + y_samples, + num_refs=num_refs, + whiten=whiten, + x_frac=x_frac, + gauss_frac=gauss_frac, + ) for _ in range(re_tessellation) ] - _, pvalue, _, _ = _pqm_test(x_samples, y_samples, num_refs, whiten, x_frac) + _, pvalue, _, _ = _pqm_test(x_samples, y_samples, num_refs, whiten, x_frac, gauss_frac) return pvalue @@ -158,6 +189,7 @@ def pqm_chi2( re_tessellation: Optional[int] = None, whiten: bool = False, x_frac: Optional[float] = None, + gauss_frac: Optional[float] = None, ): """ Perform the PQM test of the null hypothesis that `x_samples` and `y_samples` @@ -196,10 +228,17 @@ def pqm_chi2( """ if re_tessellation is not None: return [ - pqm_chi2(x_samples, y_samples, num_refs=num_refs, whiten=whiten, x_frac=x_frac) + pqm_chi2( + x_samples, + y_samples, + num_refs=num_refs, + whiten=whiten, + x_frac=x_frac, + gauss_frac=gauss_frac, + ) for _ in range(re_tessellation) ] - chi2_stat, _, dof, _ = _pqm_test(x_samples, y_samples, num_refs, whiten, x_frac) + chi2_stat, _, dof, _ = _pqm_test(x_samples, y_samples, num_refs, whiten, x_frac, gauss_frac) if dof != num_refs - 1: # Rescale chi2 to new value which has the same cumulative probability if chi2_stat / dof < 10: From 9db7a7a2c35981a10e439769dbd47addcde05a88 Mon Sep 17 00:00:00 2001 From: Connor Stone Date: Tue, 13 Aug 2024 10:52:00 -0400 Subject: [PATCH 3/3] tweaks and cleanup --- src/pqm/pqm.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/src/pqm/pqm.py b/src/pqm/pqm.py index 6b28719..8ef12bd 100644 --- a/src/pqm/pqm.py +++ b/src/pqm/pqm.py @@ -13,7 +13,7 @@ def _pqm_test( num_refs: int, whiten: bool, x_frac: Optional[float] = None, - gauss_frac: Optional[float] = None, + gauss_frac: float = 0.0, ): """ Helper function to perform the PQM test and return the results from @@ -65,10 +65,6 @@ def _pqm_test( if x_frac is None: x_frac = len(x_samples) / (len(x_samples) + len(y_samples)) - # Determine fraction of samples to take from gaussian distribution - if gauss_frac is None: - gauss_frac = 0.0 - # Collect reference samples from x_samples if x_frac > 0: xrefs = np.random.choice( @@ -128,7 +124,7 @@ def pqm_pvalue( re_tessellation: Optional[int] = None, whiten: bool = False, x_frac: Optional[float] = None, - gauss_frac: Optional[float] = None, + gauss_frac: float = 0.0, ): """ Perform the PQM test of the null hypothesis that `x_samples` and `y_samples` @@ -159,6 +155,11 @@ def pqm_pvalue( y_samples as reference samples. Ideally, ``x_frac = len(x_samples) / (len(x_samples) + len(y_samples))`` which is what is done for x_frac = None (default). + gauss_frac : float + Fraction of samples to take from gaussian distribution with mean/std + determined from the other reference samples. This ensures full support + of the reference samples if pathological behavior is expected. + Default: 0.0 no gaussian samples Returns ------- @@ -189,7 +190,7 @@ def pqm_chi2( re_tessellation: Optional[int] = None, whiten: bool = False, x_frac: Optional[float] = None, - gauss_frac: Optional[float] = None, + gauss_frac: float = 0.0, ): """ Perform the PQM test of the null hypothesis that `x_samples` and `y_samples` @@ -220,6 +221,11 @@ def pqm_chi2( y_samples as reference samples. Ideally, ``x_frac = len(x_samples) / (len(x_samples) + len(y_samples))`` which is what is done for x_frac = None (default). + gauss_frac : float + Fraction of samples to take from gaussian distribution with mean/std + determined from the other reference samples. This ensures full support + of the reference samples if pathological behavior is expected. + Default: 0.0 no gaussian samples Returns -------