diff --git a/README.md b/README.md
index 5e89055..20dd82f 100644
--- a/README.md
+++ b/README.md
@@ -36,18 +36,22 @@ chi2_stat = pqm_chi2(x_sample, y_sample, num_refs = 100, re_tessellation = 50)
 print(np.mean(chi2_stat), np.std(chi2_stat))
 ```
 
-If your two samples are drawn from the same distribution, then the p-value should
-be drawn from the random uniform(0,1) distribution. This means that if you get a
-very small value (i.e., 1e-6), then you have failed the null hypothesis test, and
-the two samples are not drawn from the same distribution.
+If your two samples are drawn from the same distribution, then the p-value
+should be drawn from the random uniform(0,1) distribution. This means that if
+you get a very small value (e.g., 1e-6), then you have failed the null
+hypothesis test, and the two samples are not drawn from the same distribution.
+If you get values approximately equal to 1 every time, that suggests potential
+duplication of samples between `x_samples` and `y_samples`.
 
 For the chi^2 metric, given your two sets of samples, if they come from the same
-distribution, the histogram of your chi² values should follow the chi² distribution.
-The peak of this distribution will be at DoF - 2, and the standard deviation will
-be √(2 * DoF). If your histogram shifts to the right of the expected chi² distribution,
-it suggests that the samples are out of distribution. Conversely, if the histogram shifts
-to the left, it indicates potential duplication or memorization (particularly relevant
-for generative models).
+distribution, the histogram of your chi^2 values should follow the chi^2
+distribution. The degrees of freedom (DoF) will equal `DoF = num_refs - 1`. The
+peak of this distribution will be at `DoF - 2`, the mean will equal `DoF`, and
+the standard deviation will be `sqrt(2 * DoF)`. If your chi^2 values are too
+high (`chi^2 / DoF > 1`), it suggests that the samples are out of distribution.
+Conversely, if the values are too low (`chi^2 / DoF < 1`), it indicates
+potential duplication of samples between `x_samples` and `y_samples` (i.e.,
+memorization for generative models).
 
 ## Developing
@@ -60,3 +64,5 @@ cd PQM
 git checkout -b my-new-branch
 pip install -e .
 ```
+
+But please open an issue first so we can discuss implementation ideas.
\ No newline at end of file
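A quick way to put the numbers above into practice (an illustrative aside, not part of the patch): compare the `pqm_chi2` statistics to `DoF = num_refs - 1` and convert them to p-values with `scipy.stats.chi2`. The toy Gaussian arrays below stand in for your own `x_sample`/`y_sample`.

```python
import numpy as np
from scipy.stats import chi2
from pqm import pqm_chi2

# Toy in-distribution samples; replace with your own arrays.
x_sample = np.random.normal(size=(250, 10))
y_sample = np.random.normal(size=(500, 10))

num_refs = 100
dof = num_refs - 1  # DoF = num_refs - 1, as described above

chi2_stat = np.array(pqm_chi2(x_sample, y_sample, num_refs=num_refs, re_tessellation=50))
print("mean chi^2 / DoF:", np.mean(chi2_stat) / dof)  # ~1 when the samples match
print("std:", np.std(chi2_stat), "expected ~", np.sqrt(2 * dof))
print("median p-value:", np.median(chi2.sf(chi2_stat, df=dof)))  # ~0.5 under the null
```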
diff --git a/src/pqm/pqm.py b/src/pqm/pqm.py
index c016a47..7cca092 100644
--- a/src/pqm/pqm.py
+++ b/src/pqm/pqm.py
@@ -8,6 +8,21 @@
 __all__ = ("pqm_chi2", "pqm_pvalue")
 
 
+def _mean_std(sample1, sample2, axis=0):
+    """Get the mean and std of two combined samples, without actually combining them, to save memory."""
+    n1, *_ = sample1.shape
+    n2, *_ = sample2.shape
+    # Get mean/std of combined sample
+    mx, sx = np.mean(sample1, axis=axis), np.std(sample1, axis=axis)
+    my, sy = np.mean(sample2, axis=axis), np.std(sample2, axis=axis)
+    m = (n1 * mx + n2 * my) / (n1 + n2)
+    s = np.sqrt(
+        ((n1 - 1) * (sx**2) + (n2 - 1) * (sy**2) + n1 * n2 * (mx - my) ** 2 / (n1 + n2))
+        / (n1 + n2 - 1)
+    )
+    return m, s
+
+
 def _pqm_test(
     x_samples: np.ndarray,
     y_samples: np.ndarray,
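The `_mean_std` helper above returns the mean and a pooled standard deviation of two sample sets without concatenating them. A small self-contained check (illustrative only; because the pooled formula uses `n - 1` weights, it matches the statistics of the concatenated array closely but not exactly):

```python
import numpy as np

rng = np.random.default_rng(0)
a = rng.normal(loc=0.0, scale=1.0, size=(5000, 3))
b = rng.normal(loc=2.0, scale=0.5, size=(3000, 3))

# Same pooled formula as _mean_std, written out for one pair of arrays
n1, n2 = len(a), len(b)
mx, sx = a.mean(axis=0), a.std(axis=0)
my, sy = b.mean(axis=0), b.std(axis=0)
m = (n1 * mx + n2 * my) / (n1 + n2)
s = np.sqrt(
    ((n1 - 1) * sx**2 + (n2 - 1) * sy**2 + n1 * n2 * (mx - my) ** 2 / (n1 + n2))
    / (n1 + n2 - 1)
)

combined = np.concatenate([a, b], axis=0)
print(np.allclose(m, combined.mean(axis=0)))            # True: the mean is exact
print(np.allclose(s, combined.std(axis=0), rtol=1e-3))  # True for large n1 + n2
```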
@@ -39,86 +54,71 @@ def _pqm_test(
         None (default).
     gauss_frac : float
         Fraction of samples to take from gaussian distribution with mean/std
-        determined from the other reference samples. This ensures full support
-        of the reference samples if pathological behavior is expected. Default:
-        0.0 no gaussian samples.
+        determined from the combined x_samples/y_samples. This ensures full
+        support of the reference samples if pathological behavior is expected.
+        Default: 0.0 no gaussian samples.
 
     Note
     ----
     When using ``x_frac`` and ``gauss_frac``, note that the number of
-    reference samples from the x_samples, y_samples, and gaussian
+    reference samples from the x_samples, y_samples, and Gaussian
     distribution will be determined by a multinomial distribution. This
     means that the actual number of reference samples from each distribution
     may not be exactly equal to the requested fractions, but will on average
-    equal those numbers. For best results, we suggest using a large number
-    of re-tessellations, though this is our recommendation in any case.
+    equal those numbers. The mean relative number of reference samples drawn
+    from x_samples, y_samples, and Gaussian is ``Nx=x_frac*(1-gauss_frac)``,
+    ``Ny=(1-x_frac)*(1-gauss_frac)``, and ``Ng=gauss_frac``, respectively.
+    For best results, we suggest using a large number of re-tessellations,
+    though this is our recommendation in any case.
 
     Returns
     -------
     tuple
         Results from scipy.stats.chi2_contingency function.
     """
-    if len(y_samples) < num_refs:
+    nx, *D = x_samples.shape
+    ny, *D = y_samples.shape
+    if (nx + ny) <= num_refs + 2:
         raise ValueError(
-            "Number of reference samples must be less than the number of true samples."
+            "Number of reference samples (num_refs) must be less than the number of x/y samples. Ideally much less."
         )
-    elif len(y_samples) < 2 * num_refs:
-        print(
-            "Warning: Number of y_samples is small (less than twice the number of reference samples). Result may have high variance."
+    elif (nx + ny) < 2 * num_refs:
+        warnings.warn(
+            "Number of samples is small (less than twice the number of reference samples). Result will have high variance and/or be non-discriminating."
         )
 
     if whiten:
-        mean = np.mean(y_samples, axis=0)
-        std = np.std(y_samples, axis=0)
+        mean, std = _mean_std(x_samples, y_samples)
         y_samples = (y_samples - mean) / std
         x_samples = (x_samples - mean) / std
 
     # Determine fraction of x_samples to use as reference samples
-    if x_frac is None:
-        x_frac = len(x_samples) / (len(x_samples) + len(y_samples))
+    x_frac = nx / (nx + ny) if x_frac is None else x_frac
 
     # Determine number of samples from each distribution (x_samples, y_samples, gaussian)
     Nx, Ny, Ng = np.random.multinomial(
         num_refs,
         [x_frac * (1.0 - gauss_frac), (1.0 - x_frac) * (1.0 - gauss_frac), gauss_frac],
     )
+    assert (Nx + Ny + Ng) == num_refs, f"Something went wrong. Nx={Nx}, Ny={Ny}, Ng={Ng} should sum to num_refs={num_refs}"  # fmt: skip
 
     # Collect reference samples from x_samples
-    if Nx > 0:
-        xrefs = np.random.choice(len(x_samples), Nx, replace=False)
-        N = np.arange(len(x_samples))
-        N[xrefs] = -1
-        N = N[N >= 0]
-        xrefs, x_samples = x_samples[xrefs], x_samples[N]
-    else:
-        xrefs = np.zeros((0,) + x_samples.shape[1:])
+    xrefs = np.random.choice(nx, Nx, replace=False)
+    xrefs, x_samples = x_samples[xrefs], np.delete(x_samples, xrefs, axis=0)
 
     # Collect reference samples from y_samples
-    if Ny > 0:
-        yrefs = np.random.choice(len(y_samples), Ny, replace=False)
-        N = np.arange(len(y_samples))
-        N[yrefs] = -1
-        N = N[N >= 0]
-        yrefs, y_samples = y_samples[yrefs], y_samples[N]
-    else:
-        yrefs = np.zeros((0,) + y_samples.shape[1:])
+    yrefs = np.random.choice(ny, Ny, replace=False)
+    yrefs, y_samples = y_samples[yrefs], np.delete(y_samples, yrefs, axis=0)
 
     # Join the full set of reference samples
     refs = np.concatenate([xrefs, yrefs], axis=0)
 
     # get gaussian reference points if requested
     if Ng > 0:
-        if Nx + Ny > 2:
-            m, s = np.mean(refs, axis=0), np.std(refs, axis=0)
-        else:
-            warnings.warn(
-                f"Very low number of x/y reference samples used ({Nx+Ny}). Initializing gaussian from all y_samples. We suggest increasing num_refs or decreasing gauss_frac.",
-                UserWarning,
-            )
-            m, s = np.mean(y_samples, axis=0), np.std(y_samples, axis=0)
+        m, s = _mean_std(x_samples, y_samples)
         gauss_refs = np.random.normal(
             loc=m,
             scale=s,
-            size=(Ng, *x_samples.shape[1:]),
+            size=(Ng, *D),
         )
         refs = np.concatenate([refs, gauss_refs], axis=0)
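How the `num_refs` budget is split by the multinomial draw above, shown in isolation (the fractions below are arbitrary example values, not from the patch): on average the three counts follow the documented `x_frac`/`gauss_frac` proportions, while any single draw varies around them.

```python
import numpy as np

num_refs, x_frac, gauss_frac = 100, 0.25, 0.1
probs = [x_frac * (1 - gauss_frac), (1 - x_frac) * (1 - gauss_frac), gauss_frac]

draws = np.random.multinomial(num_refs, probs, size=10_000)
print(draws[0], draws[0].sum())  # a single (Nx, Ny, Ng); always sums to num_refs
print(draws.mean(axis=0))        # ~[22.5, 67.5, 10.0] on average
```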
@@ -135,7 +135,27 @@
     C = (counts_x > 0) | (counts_y > 0)
     counts_x, counts_y = counts_x[C], counts_y[C]
 
-    return chi2_contingency(np.array([counts_x, counts_y]))
+    n_filled_bins = np.sum(C)
+    if n_filled_bins == 1:
+        raise ValueError(
+            """
+            Only one Voronoi cell has samples, so chi^2 cannot
+            be computed. This is likely due to a small number
+            of samples or a pathological distribution. If possible,
+            increase the number of x_samples and y_samples.
+            """
+        )
+    if n_filled_bins < (num_refs // 2):
+        warnings.warn(
+            """
+            Less than half of the Voronoi cells have any samples in them.
+            Possibly due to a small number of samples or a pathological
+            distribution. Result may be unreliable. If possible, increase the
+            number of x_samples and y_samples.
+            """
+        )
+
+    return chi2_contingency(np.stack([counts_x, counts_y]))
 
 
 def pqm_pvalue(
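For reference, the value returned above is scipy's chi-squared test of independence applied to a 2 x (kept cells) table of counts. A toy call (made-up counts, not from the package) showing the shape of the result:

```python
import numpy as np
from scipy.stats import chi2_contingency

counts_x = np.array([12, 7, 9, 11, 10])   # x_samples per kept Voronoi cell
counts_y = np.array([20, 15, 18, 22, 19])  # y_samples per kept Voronoi cell

chi2_stat, p_value, dof, expected = chi2_contingency(np.stack([counts_x, counts_y]))
print(chi2_stat, p_value, dof)  # dof = number of kept cells - 1
```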
@@ -155,22 +175,22 @@
     Parameters
     ----------
     x_samples : np.ndarray
-        Samples from the first distribution, test samples. Must have shape (N,
-        *D) N is the number of x samples, and D is the dimensionality of the
-        samples.
+        Samples from the first distribution. Must have shape (N, *D) where N
+        is the number of x samples and D is the dimensionality of the samples.
     y_samples : np.ndarray
-        Samples from the second distribution, reference samples. Must have shape
-        (M, *D) M is the number of y samples, and D is the dimensionality of the
-        samples.
+        Samples from the second distribution. Must have shape (M, *D) where M
+        is the number of y samples and D is the dimensionality of the samples.
     num_refs : int
-        Number of reference samples to use. Note that these will be drawn from
-        y_samples, and then removed from the y_samples array.
+        Number of reference samples to use. These samples will be drawn from
+        x_samples, y_samples, and/or a Gaussian distribution; see the note
+        below.
     re_tessellation : Optional[int]
         Number of times pqm_pvalue is called, re-tesselating the space. No
         re_tessellation if None (default).
     whiten : bool
         If True, whiten the samples by subtracting the mean and dividing by the
-        standard deviation.
+        standard deviation. The mean and std are calculated from the combined
+        x_samples and y_samples.
     x_frac : float
         Fraction of x_samples to use as reference samples. ``x_frac = 1`` will
         use only x_samples as reference samples, ``x_frac = 0`` will use only
@@ -179,19 +199,22 @@
         None (default).
     gauss_frac : float
         Fraction of samples to take from gaussian distribution with mean/std
-        determined from the other reference samples. This ensures full support
-        of the reference samples if pathological behavior is expected. Default:
-        0.0 no gaussian samples.
+        determined from the combined x_samples/y_samples. This ensures full
+        support of the reference samples if pathological behavior is expected.
+        Default: 0.0 no gaussian samples.
 
     Note
     ----
     When using ``x_frac`` and ``gauss_frac``, note that the number of
-    reference samples from the x_samples, y_samples, and gaussian
+    reference samples from the x_samples, y_samples, and Gaussian
     distribution will be determined by a multinomial distribution. This
     means that the actual number of reference samples from each distribution
     may not be exactly equal to the requested fractions, but will on average
-    equal those numbers. For best results, we suggest using a large number
-    of re-tessellations, though this is our recommendation in any case.
+    equal those numbers. The mean relative number of reference samples drawn
+    from x_samples, y_samples, and Gaussian is ``Nx=x_frac*(1-gauss_frac)``,
+    ``Ny=(1-x_frac)*(1-gauss_frac)``, and ``Ng=gauss_frac``, respectively.
+    For best results, we suggest using a large number of re-tessellations,
+    though this is our recommendation in any case.
 
     Returns
     -------
@@ -232,22 +255,22 @@
     Parameters
     ----------
     x_samples : np.ndarray
-        Samples from the first distribution, test samples. Must have shape (N,
-        *D) N is the number of x samples, and D is the dimensionality of the
-        samples.
+        Samples from the first distribution. Must have shape (N, *D) where N
+        is the number of x samples and D is the dimensionality of the samples.
     y_samples : np.ndarray
-        Samples from the second distribution, reference samples. Must have shape
-        (M, *D) M is the number of y samples, and D is the dimensionality of the
-        samples.
+        Samples from the second distribution. Must have shape (M, *D) where M
+        is the number of y samples and D is the dimensionality of the samples.
     num_refs : int
-        Number of reference samples to use. Note that these will be drawn from
-        y_samples, and then removed from the y_samples array.
+        Number of reference samples to use. These samples will be drawn from
+        x_samples, y_samples, and/or a Gaussian distribution; see the note
+        below.
     re_tessellation : Optional[int]
-        Number of times pqm_chi2 is called, re-tesselating the space. No
+        Number of times pqm_chi2 is called, re-tessellating the space. No
         re_tessellation if None (default).
     whiten : bool
         If True, whiten the samples by subtracting the mean and dividing by the
-        standard deviation.
+        standard deviation. The mean and std are calculated from the combined
+        x_samples and y_samples.
     x_frac : float
         Fraction of x_samples to use as reference samples. ``x_frac = 1`` will
         use only x_samples as reference samples, ``x_frac = 0`` will use only
@@ -256,19 +279,22 @@
         None (default).
     gauss_frac : float
         Fraction of samples to take from gaussian distribution with mean/std
-        determined from the other reference samples. This ensures full support
-        of the reference samples if pathological behavior is expected. Default:
-        0.0 no gaussian samples.
+        determined from the combined x_samples/y_samples. This ensures full
+        support of the reference samples if pathological behavior is expected.
+        Default: 0.0 no gaussian samples.
 
     Note
     ----
     When using ``x_frac`` and ``gauss_frac``, note that the number of
-    reference samples from the x_samples, y_samples, and gaussian
+    reference samples from the x_samples, y_samples, and Gaussian
     distribution will be determined by a multinomial distribution. This
     means that the actual number of reference samples from each distribution
    may not be exactly equal to the requested fractions, but will on average
-    equal those numbers. For best results, we suggest using a large number
-    of re-tessellations, though this is our recommendation in any case.
+    equal those numbers. The mean relative number of reference samples drawn
+    from x_samples, y_samples, and Gaussian is ``Nx=x_frac*(1-gauss_frac)``,
+    ``Ny=(1-x_frac)*(1-gauss_frac)``, and ``Ng=gauss_frac``, respectively.
+    For best results, we suggest using a large number of re-tessellations,
+    though this is our recommendation in any case.
 
     Note
     ----
@@ -299,6 +325,7 @@
             for _ in range(re_tessellation)
         ]
     chi2_stat, _, dof, _ = _pqm_test(x_samples, y_samples, num_refs, whiten, x_frac, gauss_frac)
+
     if dof != num_refs - 1:
         # Rescale chi2 to new value which has the same cumulative probability
         if chi2_stat / dof < 10:
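The hunk above only adds the guard; the rescaling itself is not shown here. One way to map a chi^2 statistic to a different number of degrees of freedom while preserving its cumulative probability (a sketch under that assumption, not necessarily the package's exact implementation) is to pass it through the chi^2 CDF and back:

```python
from scipy.stats import chi2


def rescale_chi2(chi2_stat, orig_dof, target_dof):
    """Value with the same cumulative probability under target_dof as chi2_stat has under orig_dof."""
    # For very large statistics the CDF saturates at 1.0; working with the
    # survival function (chi2.sf / chi2.isf) is numerically safer there.
    return chi2.ppf(chi2.cdf(chi2_stat, df=orig_dof), df=target_dof)


print(rescale_chi2(45.0, orig_dof=40, target_dof=99))
```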
diff --git a/tests/test_errors.py b/tests/test_errors.py
new file mode 100644
index 0000000..6c68ebd
--- /dev/null
+++ b/tests/test_errors.py
@@ -0,0 +1,20 @@
+import pytest
+from pqm import pqm_pvalue, pqm_chi2
+import numpy as np
+
+
+def test_num_refs():
+    with pytest.raises(ValueError):
+        pqm_pvalue(np.random.normal(size=(10, 50)), np.random.normal(size=(15, 50)), num_refs=100)
+    with pytest.raises(ValueError):
+        pqm_chi2(np.random.normal(size=(15, 50)), np.random.normal(size=(10, 50)), num_refs=100)
+
+    with pytest.warns(UserWarning):
+        pqm_pvalue(
+            np.random.normal(size=(100, 100)), np.random.normal(size=(110, 100)), num_refs=150
+        )
+
+
+def test_filled_bins():
+    with pytest.raises(ValueError):
+        pqm_pvalue(np.zeros(shape=(500, 50)), np.zeros(shape=(250, 50)), num_refs=10)
diff --git a/tests/test_gaussian.py b/tests/test_gaussian.py
index 1ce89d5..f4d011c 100644
--- a/tests/test_gaussian.py
+++ b/tests/test_gaussian.py
@@ -1,47 +1,71 @@
 import numpy as np
 from pqm import pqm_pvalue, pqm_chi2
+import pytest
 
 
-def test_pass_pvalue():
+@pytest.mark.parametrize("whiten", [True, False])
+@pytest.mark.parametrize("num_refs", [20, 100])
+@pytest.mark.parametrize("ndim", [1, 50])
+def test_pass_pvalue(whiten, num_refs, ndim):
     new = []
-    for _ in range(100):
-        y_samples = np.random.normal(size=(500, 50))
-        x_samples = np.random.normal(size=(250, 50))
+    for _ in range(50):
+        y_samples = np.random.normal(size=(500, ndim))
+        x_samples = np.random.normal(size=(250, ndim))
 
-        new.append(pqm_pvalue(x_samples, y_samples))
+        new.append(pqm_pvalue(x_samples, y_samples, whiten=whiten, num_refs=num_refs))
 
+    # Check for roughly uniform distribution of p-values
     assert np.abs(np.mean(new) - 0.5) < 0.15
 
 
-def test_pass_chi2():
+@pytest.mark.parametrize("num_refs", [20, 100])
+@pytest.mark.parametrize("ndim", [1, 50])
+def test_pass_chi2(num_refs, ndim):
     new = []
-    for _ in range(100):
-        y_samples = np.random.normal(size=(500, 50))
-        x_samples = np.random.normal(size=(250, 50))
+    for _ in range(50):
+        y_samples = np.random.normal(size=(500, ndim))
+        x_samples = np.random.normal(size=(250, ndim))
 
-        new.append(pqm_chi2(x_samples, y_samples, num_refs=100))
+        new.append(pqm_chi2(x_samples, y_samples, num_refs=num_refs))
 
     new = np.array(new)
-    print("np.abs(np.mean(new) / 99 - 1) < 0.15")
-    assert np.abs(np.mean(new) / 99 - 1) < 0.15
+    assert np.abs(np.mean(new) / (num_refs - 1) - 1) < 0.15
 
 
-def test_fail_pvalue():
+@pytest.mark.parametrize("num_refs", [20, 100])
+@pytest.mark.parametrize("ndim", [1, 50])
+def test_fail_pvalue(num_refs, ndim):
     new = []
-    for _ in range(100):
-        y_samples = np.random.normal(size=(500, 50))
-        x_samples = np.random.normal(size=(250, 50)) + 0.5
+    for _ in range(50):
+        y_samples = np.random.normal(size=(500, ndim))
+        y_samples[:, 0] += 5  # one dim off by 5sigma
+        x_samples = np.random.normal(size=(250, ndim))
 
-        new.append(pqm_pvalue(x_samples, y_samples))
+        new.append(pqm_pvalue(x_samples, y_samples, num_refs=num_refs))
 
     assert np.mean(new) < 1e-3
 
 
-def test_fail_chi2(num_refs = 50):
+@pytest.mark.parametrize("whiten", [True, False])
+@pytest.mark.parametrize("num_refs", [20, 100])
+@pytest.mark.parametrize("ndim", [1, 50])
+def test_fail_chi2(whiten, num_refs, ndim):
     new = []
     for _ in range(100):
-        y_samples = np.random.normal(size=(500, 50))
-        x_samples = np.random.normal(size=(250, 50)) + 0.5
+        y_samples = np.random.normal(size=(500, ndim))
+        y_samples[:, 0] += 5  # one dim off by 5sigma
+        x_samples = np.random.normal(size=(250, ndim))
 
-        new.append(pqm_chi2(x_samples, y_samples, num_refs=num_refs))
+        new.append(pqm_chi2(x_samples, y_samples, whiten=whiten, num_refs=num_refs))
 
     new = np.array(new)
-    assert np.mean(new) / num_refs-1 > 2
+    assert (np.mean(new) / (num_refs - 1)) > 1.5
+
+
+@pytest.mark.parametrize("x_frac", [None, 0.0, 0.5, 1.0])
+@pytest.mark.parametrize("gauss_frac", [0.0, 0.5, 1.0])
+def test_fracs(x_frac, gauss_frac):
+    x_samples = np.random.normal(size=(500, 50))
+    y_samples = np.random.normal(size=(250, 50))
+    x_samples[:, 0] += 5  # one dim off by 5sigma
+
+    pval = pqm_pvalue(x_samples, y_samples, x_frac=x_frac, gauss_frac=gauss_frac)
+    assert pval < 1e-3
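A stricter companion to the mean-based uniformity check in `test_pass_pvalue` (a sketch, not part of the test suite): under the null the p-values should be roughly Uniform(0, 1), which a Kolmogorov-Smirnov test can assess directly.

```python
import numpy as np
from scipy.stats import kstest
from pqm import pqm_pvalue

pvals = [
    pqm_pvalue(np.random.normal(size=(250, 50)), np.random.normal(size=(500, 50)), num_refs=100)
    for _ in range(100)
]
print(kstest(pvals, "uniform"))  # a large KS p-value is consistent with Uniform(0, 1)
```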