Skip to content

Commit

Permalink
Merge pull request #14 from Ciela-Institute/fullcoverage
Browse files Browse the repository at this point in the history
Allow a fraction of samples to come from a gaussian distribution allowing for full coverage and certain statistical guarantees
  • Loading branch information
SammyS15 authored Aug 13, 2024
2 parents 9da4b31 + 9db7a7a commit 33f2c39
Showing 1 changed file with 138 additions and 26 deletions.
164 changes: 138 additions & 26 deletions src/pqm/pqm.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,17 @@
__all__ = ("pqm_chi2", "pqm_pvalue")


def _pqm_test(x_samples: np.ndarray, y_samples: np.ndarray, num_refs: int, whiten: bool):
def _pqm_test(
x_samples: np.ndarray,
y_samples: np.ndarray,
num_refs: int,
whiten: bool,
x_frac: Optional[float] = None,
gauss_frac: float = 0.0,
):
"""
Helper function to perform the PQM test and return the results from chi2_contingency.
Helper function to perform the PQM test and return the results from
chi2_contingency.
Parameters
----------
Expand All @@ -20,7 +28,19 @@ def _pqm_test(x_samples: np.ndarray, y_samples: np.ndarray, num_refs: int, white
num_refs : int
Number of reference samples to use.
whiten : bool
If True, whiten the samples by subtracting the mean and dividing by the standard deviation.
If True, whiten the samples by subtracting the mean and dividing by the
standard deviation.
x_frac : float
Fraction of x_samples to use as reference samples. ``x_frac = 1`` will
use only x_samples as reference samples, ``x_frac = 0`` will use only
y_samples as reference samples. Ideally, ``x_frac = len(x_samples) /
(len(x_samples) + len(y_samples))`` which is what is done for x_frac =
None (default).
gauss_frac : float
Fraction of samples to take from gaussian distribution with mean/std
determined from the other reference samples. This ensures full support
of the reference samples if pathological behavior is expected.
Default: 0.0 no gaussian samples.
Returns
-------
Expand All @@ -41,12 +61,47 @@ def _pqm_test(x_samples: np.ndarray, y_samples: np.ndarray, num_refs: int, white
y_samples = (y_samples - mean) / std
x_samples = (x_samples - mean) / std

refs = np.random.choice(len(y_samples), num_refs, replace=False)
N = np.arange(len(y_samples))
N[refs] = -1
N = N[N >= 0]
refs, y_samples = y_samples[refs], y_samples[N]
# Determine fraction of x_samples to use as reference samples
if x_frac is None:
x_frac = len(x_samples) / (len(x_samples) + len(y_samples))

# Collect reference samples from x_samples
if x_frac > 0:
xrefs = np.random.choice(
len(x_samples), int(x_frac * (1.0 - gauss_frac) * num_refs), replace=False
)
N = np.arange(len(x_samples))
N[xrefs] = -1
N = N[N >= 0]
xrefs, x_samples = x_samples[xrefs], x_samples[N]
else:
xrefs = np.zeros((0,) + x_samples.shape[1:])

# Collect reference samples from y_samples
if x_frac < 1:
yrefs = np.random.choice(
len(y_samples), int((1.0 - x_frac) * (1.0 - gauss_frac) * num_refs), replace=False
)
N = np.arange(len(y_samples))
N[yrefs] = -1
N = N[N >= 0]
yrefs, y_samples = y_samples[yrefs], y_samples[N]
else:
yrefs = np.zeros((0,) + y_samples.shape[1:])

# Join the full set of reference samples
refs = np.concatenate([xrefs, yrefs], axis=0)

# get gaussian reference points if requested
if gauss_frac > 0:
gauss_refs = np.random.normal(
loc=np.mean(refs, axis=0),
scale=np.std(refs, axis=0),
size=(int(gauss_frac * num_refs), *refs.shape[1:]),
)
refs = np.concatenate([refs, gauss_refs], axis=0)

# Build KDtree to measure distances
tree = KDTree(refs)

idx = tree.query(x_samples, k=1, workers=-1)[1]
Expand All @@ -68,34 +123,63 @@ def pqm_pvalue(
num_refs: int = 100,
re_tessellation: Optional[int] = None,
whiten: bool = False,
x_frac: Optional[float] = None,
gauss_frac: float = 0.0,
):
"""
Perform the PQM test of the null hypothesis that `x_samples` and `y_samples` are drawn form the same distribution.
Perform the PQM test of the null hypothesis that `x_samples` and `y_samples`
are drawn form the same distribution.
Parameters
----------
x_samples : np.ndarray
Samples from the first distribution, test samples. Must have shape (N, *D) N is the number of x samples, and D is the dimensionality of the samples.
Samples from the first distribution, test samples. Must have shape (N,
*D) N is the number of x samples, and D is the dimensionality of the
samples.
y_samples : np.ndarray
Samples from the second distribution, reference samples. Must have shape (M, *D) M is the number of y samples, and D is the dimensionality of the samples.
Samples from the second distribution, reference samples. Must have shape
(M, *D) M is the number of y samples, and D is the dimensionality of the
samples.
num_refs : int
Number of reference samples to use. Note that these will be drawn from y_samples, and then removed from the y_samples array.
Number of reference samples to use. Note that these will be drawn from
y_samples, and then removed from the y_samples array.
re_tessellation : Optional[int]
Number of times pqm_pvalue is called, re tesselating the space. No re_tessellation if None (default).
Number of times pqm_pvalue is called, re-tesselating the space. No
re_tessellation if None (default).
whiten : bool
If True, whiten the samples by subtracting the mean and dividing by the standard deviation.
If True, whiten the samples by subtracting the mean and dividing by the
standard deviation.
x_frac : float
Fraction of x_samples to use as reference samples. ``x_frac = 1`` will
use only x_samples as reference samples, ``x_frac = 0`` will use only
y_samples as reference samples. Ideally, ``x_frac = len(x_samples) /
(len(x_samples) + len(y_samples))`` which is what is done for x_frac =
None (default).
gauss_frac : float
Fraction of samples to take from gaussian distribution with mean/std
determined from the other reference samples. This ensures full support
of the reference samples if pathological behavior is expected.
Default: 0.0 no gaussian samples
Returns
-------
float or list
pvalue(s). Null hypothesis that both samples are drawn from the same distribution.
pvalue(s). Null hypothesis that both samples are drawn from the same
distribution.
"""
if re_tessellation is not None:
return [
pqm_pvalue(x_samples, y_samples, num_refs=num_refs, whiten=whiten)
pqm_pvalue(
x_samples,
y_samples,
num_refs=num_refs,
whiten=whiten,
x_frac=x_frac,
gauss_frac=gauss_frac,
)
for _ in range(re_tessellation)
]
_, pvalue, _, _ = _pqm_test(x_samples, y_samples, num_refs, whiten)
_, pvalue, _, _ = _pqm_test(x_samples, y_samples, num_refs, whiten, x_frac, gauss_frac)
return pvalue


Expand All @@ -105,22 +189,43 @@ def pqm_chi2(
num_refs: int = 100,
re_tessellation: Optional[int] = None,
whiten: bool = False,
x_frac: Optional[float] = None,
gauss_frac: float = 0.0,
):
"""
Perform the PQM test of the null hypothesis that `x_samples` and `y_samples` are drawn form the same distribution.
Perform the PQM test of the null hypothesis that `x_samples` and `y_samples`
are drawn form the same distribution.
Parameters
----------
x_samples : np.ndarray
Samples from the first distribution, test samples. Must have shape (N, *D) N is the number of x samples, and D is the dimensionality of the samples.
Samples from the first distribution, test samples. Must have shape (N,
*D) N is the number of x samples, and D is the dimensionality of the
samples.
y_samples : np.ndarray
Samples from the second distribution, reference samples. Must have shape (M, *D) M is the number of y samples, and D is the dimensionality of the samples.
Samples from the second distribution, reference samples. Must have shape
(M, *D) M is the number of y samples, and D is the dimensionality of the
samples.
num_refs : int
Number of reference samples to use. Note that these will be drawn from y_samples, and then removed from the y_samples array.
Number of reference samples to use. Note that these will be drawn from
y_samples, and then removed from the y_samples array.
re_tessellation : Optional[int]
Number of times pqm_chi2 is called, re tesselating the space. No re_tessellation if None (default).
Number of times pqm_chi2 is called, re-tesselating the space. No
re_tessellation if None (default).
whiten : bool
If True, whiten the samples by subtracting the mean and dividing by the standard deviation.
If True, whiten the samples by subtracting the mean and dividing by the
standard deviation.
x_frac : float
Fraction of x_samples to use as reference samples. ``x_frac = 1`` will
use only x_samples as reference samples, ``x_frac = 0`` will use only
y_samples as reference samples. Ideally, ``x_frac = len(x_samples) /
(len(x_samples) + len(y_samples))`` which is what is done for x_frac =
None (default).
gauss_frac : float
Fraction of samples to take from gaussian distribution with mean/std
determined from the other reference samples. This ensures full support
of the reference samples if pathological behavior is expected.
Default: 0.0 no gaussian samples
Returns
-------
Expand All @@ -129,10 +234,17 @@ def pqm_chi2(
"""
if re_tessellation is not None:
return [
pqm_chi2(x_samples, y_samples, num_refs=num_refs, whiten=whiten)
pqm_chi2(
x_samples,
y_samples,
num_refs=num_refs,
whiten=whiten,
x_frac=x_frac,
gauss_frac=gauss_frac,
)
for _ in range(re_tessellation)
]
chi2_stat, _, dof, _ = _pqm_test(x_samples, y_samples, num_refs, whiten)
chi2_stat, _, dof, _ = _pqm_test(x_samples, y_samples, num_refs, whiten, x_frac, gauss_frac)
if dof != num_refs - 1:
# Rescale chi2 to new value which has the same cumulative probability
if chi2_stat / dof < 10:
Expand All @@ -141,4 +253,4 @@ def pqm_chi2(
else:
chi2_stat = chi2_stat * (num_refs - 1) / dof
dof = num_refs - 1
return chi2_stat
return chi2_stat

0 comments on commit 33f2c39

Please sign in to comment.