Skip to content

Commit

Permalink
Merge pull request #93 from stefanradev93/Development
Browse files Browse the repository at this point in the history
Development
  • Loading branch information
stefanradev93 authored Aug 13, 2023
2 parents 26de450 + 18c10fe commit 98d895c
Show file tree
Hide file tree
Showing 4 changed files with 137 additions and 2 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -67,3 +67,10 @@ General Improvements:
1. Bugfix in ``SetTransformer`` affecting saving and loading when using the version with inducing points.
2. Bugfix in ``SetTransformer`` when using ``train_offline`` and batches result in unequal shapes.
3. Improved documentation with examples

1.1.3 Series
------------

1. Bugfix in ``SimulationMemory`` affecting the use of empty folders for initializing a ``Trainer``
2. Bugfix in ``Trainer.train_from_presimulation()`` for model comparison tasks
3. Added a classifier two-sample test function ``c2st`` in ``computational_utilities``
86 changes: 86 additions & 0 deletions bayesflow/computational_utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
import tensorflow as tf
from scipy import stats
from sklearn.calibration import calibration_curve
from sklearn.model_selection import KFold, cross_val_score
from sklearn.neural_network import MLPClassifier

from bayesflow.default_settings import MMD_BANDWIDTH_LIST
from bayesflow.exceptions import ShapeError
Expand Down Expand Up @@ -517,3 +519,87 @@ def aggregated_rmse(x_true, x_pred):
return aggregated_error(
x_true=x_true, x_pred=x_pred, inner_error_fun=root_mean_squared_error, outer_aggregation_fun=np.mean
)


def c2st(
    source_samples,
    target_samples,
    n_folds=5,
    scoring="accuracy",
    normalize=True,
    seed=123,
    hidden_units_per_dim=16,
    aggregate_output=True,
):
    """C2ST metric [1] using an sklearn neural network classifier (i.e., MLP).

    A two-layer MLP is trained to tell source samples (label 0) apart from target
    samples (label 1) and is evaluated with shuffled k-fold cross-validation.

    Code adapted from https://github.com/sbi-benchmark/sbibm/blob/main/sbibm/metrics/c2st.py

    [1] Lopez-Paz, D., & Oquab, M. (2016). Revisiting classifier two-sample tests. arXiv:1610.06545.

    Parameters
    ----------
    source_samples       : np.ndarray or tf.Tensor
        Source samples (e.g., approximate posterior samples)
    target_samples       : np.ndarray or tf.Tensor
        Target samples (e.g., samples from a reference posterior)
    n_folds              : int, optional, default: 5
        Number of folds in k-fold cross-validation for the classifier evaluation
    scoring              : str, optional, default: "accuracy"
        Evaluation score of the sklearn MLP classifier
    normalize            : bool, optional, default: True
        Whether the data shall be z-standardized relative to source_samples
    seed                 : int, optional, default: 123
        RNG seed for the MLP and k-fold CV
    hidden_units_per_dim : int, optional, default: 16
        Number of hidden units in the MLP, relative to the input dimensions.
        Example: source samples are 5D, hidden_units_per_dim=16 -> 80 hidden units per layer
    aggregate_output     : bool, optional, default: True
        Whether to return a single value aggregated over all cross-validation runs
        or all values from all runs. If left at default, the empirical mean will be returned

    Returns
    -------
    c2st_score : np.ndarray
        If ``aggregate_output`` is True, the mean cross-validation score as a 0-d
        ``float32`` array. Otherwise, the array of per-fold scores as returned by
        ``cross_val_score`` (one entry per fold).

    Raises
    ------
    ShapeError
        If the inputs are not two-dimensional or differ in their second dimension.
    """

    x = np.array(source_samples)
    y = np.array(target_samples)

    # Validate on the converted arrays so that the checks (and the error messages)
    # also work for inputs without a ``.shape`` attribute, e.g. nested lists.
    if x.ndim != 2 or y.ndim != 2:
        raise ShapeError(
            "source_samples and target_samples must be 2D arrays of shape (num_observations, num_dims), "
            f"found shapes: source_samples {x.shape}, target_samples {y.shape}"
        )

    num_dims = x.shape[1]
    if num_dims != y.shape[1]:
        raise ShapeError(
            "source_samples and target_samples can have different number of observations (1st dim) "
            "but must have the same dimensionality (2nd dim); "
            f"found: source_samples {x.shape[1]}, target_samples {y.shape[1]}"
        )

    if normalize:
        x_mean = np.mean(x, axis=0)
        x_std = np.std(x, axis=0)
        # A constant source dimension has zero std; dividing by it would produce
        # NaN/inf and make the classifier fail, so leave such dimensions unscaled.
        x_std = np.where(x_std > 0, x_std, 1.0)
        x = (x - x_mean) / x_std
        y = (y - x_mean) / x_std

    hidden_units = hidden_units_per_dim * num_dims
    clf = MLPClassifier(
        activation="relu",
        hidden_layer_sizes=(hidden_units, hidden_units),
        max_iter=10000,
        solver="adam",
        random_state=seed,
    )

    # Binary classification task: source samples are class 0, target samples class 1
    data = np.concatenate((x, y))
    target = np.concatenate(
        (
            np.zeros((x.shape[0],)),
            np.ones((y.shape[0],)),
        )
    )

    shuffle = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
    scores = cross_val_score(clf, data, target, cv=shuffle, scoring=scoring)

    if aggregate_output:
        c2st_score = np.asarray(np.mean(scores)).astype(np.float32)
        return c2st_score

    # Previously this path fell through and returned None
    return scores
2 changes: 1 addition & 1 deletion bayesflow/helper_classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -744,7 +744,7 @@ def load_from_file(self, file_path):
memory_path = os.path.join(file_path, f"{SimulationMemory.file_name}.pkl")

# Case memory file exists
if os.path.exists(file_path):
if os.path.exists(memory_path):
# Load pickle and fill in attributes
with open(memory_path, "rb") as f:
full_memory_dict = pickle.load(f)
Expand Down
44 changes: 43 additions & 1 deletion tests/test_computational_utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@
import pytest
import numpy as np
from bayesflow import computational_utilities
from bayesflow.exceptions import ArgumentError
from bayesflow.exceptions import ArgumentError, ShapeError
from bayesflow.trainers import Trainer
import tensorflow as tf


@pytest.mark.parametrize("x_true, x_pred, output",
Expand Down Expand Up @@ -93,3 +94,44 @@ def test_aggregated_error(x_true, x_pred, inner_error_fun, outer_aggregation_fun
outer_aggregation_fun=outer_aggregation_fun
)
assert aggregated_error_result == pytest.approx(output)


def test_c2st_shape_error():
    """c2st raises a ShapeError when the inputs differ in their second dimension."""
    samples_2d = np.random.random(size=(5, 2))
    samples_3d = np.random.random(size=(5, 3))

    with pytest.raises(ShapeError):
        computational_utilities.c2st(samples_2d, samples_3d)


@pytest.mark.parametrize(
    ("source_samples", "target_samples"),
    [
        # equal number of observations
        (np.random.random((5, 2)), np.random.random((5, 2))),
        # unequal number of observations
        (np.random.random((10, 2)), np.random.random((5, 2))),
        # tensorflow tensors instead of numpy arrays
        (tf.constant(np.random.random((5, 2))), tf.constant(np.random.random((5, 2)))),
    ],
)
def test_c2st(source_samples, target_samples):
    """With default settings, the score returned by c2st lies in the interval [0, 1]."""
    score = computational_utilities.c2st(source_samples, target_samples)
    assert score >= 0.0 and score <= 1.0


@pytest.mark.parametrize(
    ("n_folds", "scoring", "normalize", "seed", "hidden_units_per_dim"),
    [
        (3, "accuracy", False, 42, 5),
        (7, "f1", True, 12, 10),
    ],
)
def test_c2st_params(n_folds, scoring, normalize, seed, hidden_units_per_dim):
    """Smoke test: c2st runs with non-default keyword arguments without raising."""
    call_kwargs = {
        "source_samples": np.random.random((5, 2)),
        "target_samples": np.random.random((10, 2)),
        "n_folds": n_folds,
        "scoring": scoring,
        "normalize": normalize,
        "seed": seed,
        "hidden_units_per_dim": hidden_units_per_dim,
    }
    # Only successful execution is checked; the returned score is discarded
    computational_utilities.c2st(**call_kwargs)

0 comments on commit 98d895c

Please sign in to comment.