Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sammy dev #9

Merged
merged 10 commits into from
Jul 19, 2024
10 changes: 3 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,20 +15,19 @@ pip install pqm
This is the main use case:

```python
from pqm import pqm_pvalue
from pqm import pqm_pvalue, pqm_chi2
import numpy as np

x_sample = np.random.normal(size = (500, 10))
y_sample = np.random.normal(size = (400, 10))

# To get pvalues from PQMass
pvalues = pqm_pvalue(x_sample, y_sample, num_refs = 100, bootstrap = 50)
pvalues = pqm_pvalue(x_sample, y_sample, num_refs = 100, re_tessellation = 50)
print(np.mean(pvalues), np.std(pvalues))

# To get chi^2 from PQMass
chi2_stat, dof = pqm_chi2(x_sample, y_sample, num_refs = 100, bootstrap = 50)
chi2_stat = pqm_chi2(x_sample, y_sample, num_refs = 100, re_tessellation = 50)
print(np.mean(chi2_stat), np.std(chi2_stat))
print(np.unqiue(dof)) # This should be the same as num_refs - 1, if it is not, we suggest you use pqm_pvalue
```

If your two samples are drawn from the same distribution, then the p-value should
ConnorStoneAstro marked this conversation as resolved.
Show resolved Hide resolved
Expand All @@ -44,9 +43,6 @@ it suggests that the samples are out of distribution. Conversely, if the histogr
to the left, it indicates potential duplication or memorization (particularly relevant
for generative models).

Note that the chi^2 metric faces limitations if you have only a few samples. One solution
could be to use bootstrapping; another is to use pqm_pvalue instead. We leave it to the user to
identify the best solution for their problem.

## Developing

Expand Down
148 changes: 111 additions & 37 deletions notebooks/mnist.ipynb

Large diffs are not rendered by default.

99 changes: 53 additions & 46 deletions notebooks/test.ipynb

Large diffs are not rendered by default.

114 changes: 30 additions & 84 deletions notebooks/time_series.ipynb

Large diffs are not rendered by default.

22 changes: 11 additions & 11 deletions src/pqm/pqm.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def pqm_pvalue(
x_samples: np.ndarray,
y_samples: np.ndarray,
num_refs: int = 100,
bootstrap: Optional[int] = None,
re_tessellation: Optional[int] = None,
whiten: bool = False,
):
"""
Expand All @@ -80,8 +80,8 @@ def pqm_pvalue(
Samples from the second distribution, reference samples. Must have shape (M, *D) M is the number of y samples, and D is the dimensionality of the samples.
num_refs : int
Number of reference samples to use. Note that these will be drawn from y_samples, and then removed from the y_samples array.
bootstrap : Optional[int]
Number of bootstrap iterations to perform. No bootstrap if None (default).
re_tessellation : Optional[int]
Number of times pqm_pvalue is called, re-tessellating the space each time. No re_tessellation if None (default).
whiten : bool
If True, whiten the samples by subtracting the mean and dividing by the standard deviation.

Expand All @@ -90,10 +90,10 @@ def pqm_pvalue(
float or list
pvalue(s). Null hypothesis that both samples are drawn from the same distribution.
"""
if bootstrap is not None:
if re_tessellation is not None:
return [
pqm_pvalue(x_samples, y_samples, num_refs=num_refs, whiten=whiten)
for _ in range(bootstrap)
for _ in range(re_tessellation)
]
_, pvalue, _, _ = _pqm_test(x_samples, y_samples, num_refs, whiten)
return pvalue
Expand All @@ -103,7 +103,7 @@ def pqm_chi2(
x_samples: np.ndarray,
y_samples: np.ndarray,
num_refs: int = 100,
bootstrap: Optional[int] = None,
re_tessellation: Optional[int] = None,
whiten: bool = False,
):
"""
Expand All @@ -117,8 +117,8 @@ def pqm_chi2(
Samples from the second distribution, reference samples. Must have shape (M, *D) M is the number of y samples, and D is the dimensionality of the samples.
num_refs : int
Number of reference samples to use. Note that these will be drawn from y_samples, and then removed from the y_samples array.
bootstrap : Optional[int]
Number of bootstrap iterations to perform. No bootstrap if None (default).
re_tessellation : Optional[int]
Number of times pqm_chi2 is called, re-tessellating the space each time. No re_tessellation if None (default).
whiten : bool
If True, whiten the samples by subtracting the mean and dividing by the standard deviation.

Expand All @@ -127,10 +127,10 @@ def pqm_chi2(
float or list
chi2 statistic(s) and degree(s) of freedom.
"""
if bootstrap is not None:
if re_tessellation is not None:
return [
pqm_chi2(x_samples, y_samples, num_refs=num_refs, whiten=whiten)
for _ in range(bootstrap)
for _ in range(re_tessellation)
]
chi2_stat, _, dof, _ = _pqm_test(x_samples, y_samples, num_refs, whiten)
if dof != num_refs - 1:
Expand All @@ -141,4 +141,4 @@ def pqm_chi2(
else:
chi2_stat = chi2_stat * (num_refs - 1) / dof
dof = num_refs - 1
return chi2_stat, dof
return chi2_stat
9 changes: 5 additions & 4 deletions tests/test_gaussian.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ def test_pass_chi2():

new.append(pqm_chi2(x_samples, y_samples, num_refs=100))
new = np.array(new)
assert np.abs(np.mean(new[:, 0]) / 99 - 1) < 0.15
print("np.abs(np.mean(new) / 99 - 1) < 0.15")
assert np.abs(np.mean(new) / 99 - 1) < 0.15


def test_fail_pvalue():
Expand All @@ -35,12 +36,12 @@ def test_fail_pvalue():
assert np.mean(new) < 1e-3


def test_fail_chi2():
def test_fail_chi2(num_refs = 50):
new = []
for _ in range(100):
y_samples = np.random.normal(size=(500, 50))
x_samples = np.random.normal(size=(250, 50)) + 0.5

new.append(pqm_chi2(x_samples, y_samples, num_refs=100))
new.append(pqm_chi2(x_samples, y_samples, num_refs=num_refs))
new = np.array(new)
assert np.mean(new[:, 0]) / 99 > 2
assert np.mean(new) / num_refs-1 > 2
Loading