Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: correct whitening in HilbertCPCCA models #230

Merged
merged 2 commits into from
Sep 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions tests/models/cross/test_hilbert_cpcca.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,18 @@ def generate_well_conditioned_data(lazy=False):
return X, Y


@pytest.mark.parametrize("use_pca", [True, False])
def test_singular_values(use_pca):
    """Singular values of the Hilbert CCA are canonical correlations and so
    must be bounded above by 1, with and without the PCA preprocessing step."""
    X, Y = generate_well_conditioned_data()
    model = HilbertCPCCA(n_modes=2, alpha=0.0, use_pca=use_pca, n_pca_modes=2)
    model.fit(X, Y, "sample")
    singular_values = model.data["singular_values"]

    # Canonical correlations cannot exceed 1
    assert (singular_values <= 1).all()


# Currently, netCDF4 does not support complex numbers, so skip this test
@pytest.mark.parametrize("engine", ["zarr"])
@pytest.mark.parametrize("alpha", [0.0, 0.5, 1.0])
Expand Down
2 changes: 1 addition & 1 deletion tests/models/single/test_pop.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def test_init():
# Assert preprocessor has been initialized
assert hasattr(pop, "_params")
assert hasattr(pop, "preprocessor")
assert hasattr(pop, "whitener")
assert hasattr(pop, "pca")


def test_fit(mock_data_array):
Expand Down
146 changes: 146 additions & 0 deletions tests/preprocessing/test_pca.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
import numpy as np
import pytest
import xarray as xr

from xeofs.preprocessing import PCA

from ..utilities import (
assert_expected_coords,
assert_expected_dims,
data_is_dask,
)

# =============================================================================
# GENERALLY VALID TEST CASES
# =============================================================================
N_SAMPLE_DIMS = [1]
N_FEATURE_DIMS = [1]
INDEX_POLICY = ["index"]
NAN_POLICY = ["no_nan"]
DASK_POLICY = ["no_dask", "dask"]
SEED = [0]

# Cartesian product of all policy axes; the dask policy varies fastest
# (innermost loop), matching the original nesting order.
VALID_TEST_DATA = [
    (n_sample, n_feature, index_policy, nan_policy, dask_policy)
    for n_sample in N_SAMPLE_DIMS
    for n_feature in N_FEATURE_DIMS
    for index_policy in INDEX_POLICY
    for nan_policy in NAN_POLICY
    for dask_policy in DASK_POLICY
]


def generate_well_conditioned_data(lazy=False, seed=None):
    """Generate a small, well-conditioned (200, 3) test DataArray.

    The data is a noisy sine signal with per-feature nonlinear distortions
    (cubing, square root of absolute value), centered along the sample
    dimension so it is suitable for PCA-style decompositions.

    Parameters
    ----------
    lazy : bool
        If True, return a dask-backed DataArray chunked along ``sample``.
    seed : int | None
        Optional seed for reproducible noise. The default (None) keeps the
        original non-deterministic behavior.

    Returns
    -------
    xr.DataArray
        Centered data with dims ("sample", "feature").
    """
    # Use the modern Generator API so callers can opt into reproducibility
    # without touching NumPy's global random state.
    rng = np.random.default_rng(seed)
    t = np.linspace(0, 50, 200)
    std = 0.1
    X = np.sin(t)[:, None] + rng.normal(0, std, size=(200, 3))
    X[:, 1] = X[:, 1] ** 3
    X[:, 2] = abs(X[:, 2]) ** (0.5)
    X = xr.DataArray(
        X,
        dims=["sample", "feature"],
        coords={"sample": np.arange(200), "feature": np.arange(3)},
        name="X",
    )
    # Center along the sample dimension
    X = X - X.mean("sample")
    if lazy:
        X = X.chunk({"sample": 5, "feature": -1})
    return X


# TESTS
# =============================================================================
@pytest.mark.parametrize("lazy", [False, True])
def test_fit(lazy):
    """PCA.fit completes without error on both eager and lazy input."""
    X = generate_well_conditioned_data(lazy)
    model = PCA(n_modes=2)
    model.fit(X)


@pytest.mark.parametrize("lazy", [False, True])
@pytest.mark.parametrize("use_pca", [True, False])
def test_transform(lazy, use_pca):
    """Transform is deterministic, yields a 2D (sample, feature) DataArray,
    and preserves the dask-backing of its input."""
    X = generate_well_conditioned_data(lazy)

    model = PCA(n_modes=2, use_pca=use_pca)
    model.fit(X)

    # Two successive transforms of identical data must agree exactly
    first = model.transform(X)
    second = model.transform(X)
    assert first.identical(second)

    assert isinstance(first, xr.DataArray)
    assert first.ndim == 2
    assert first.dims == ("sample", "feature")

    # Transforming must not change whether the data is dask-backed
    assert data_is_dask(X) == data_is_dask(first)


@pytest.mark.parametrize("lazy", [False, True])
@pytest.mark.parametrize("use_pca", [True, False])
def test_fit_transform(lazy, use_pca):
    """fit_transform matches a subsequent transform of the same data and
    returns a 2D (sample, feature) DataArray with unchanged dask-backing."""
    X = generate_well_conditioned_data(lazy)

    model = PCA(n_modes=2, use_pca=use_pca)

    # fit_transform must be equivalent to fit followed by transform
    scores = model.fit_transform(X)
    scores_again = model.transform(X)
    assert scores.identical(scores_again)

    assert isinstance(scores, xr.DataArray)
    assert scores.ndim == 2
    assert scores.dims == ("sample", "feature")

    # fit_transform must not change whether the data is dask-backed
    assert data_is_dask(X) == data_is_dask(scores)


@pytest.mark.parametrize("lazy", [False, True])
@pytest.mark.parametrize("use_pca", [True, False])
def test_inverse_transform_data(lazy, use_pca):
    """Round-tripping through transform / inverse_transform_data restores the
    original dims, coords, and dask-backing.

    Renamed from the misspelled ``test_invserse_transform_data`` so the test
    is discoverable by name-based filters (e.g. ``pytest -k inverse``).
    """
    data = generate_well_conditioned_data(lazy)

    pca = PCA(n_modes=2, use_pca=use_pca)
    pca.fit(data)

    transformed = pca.transform(data)
    untransformed = pca.inverse_transform_data(transformed)

    is_dask_before = data_is_dask(data)
    is_dask_after = data_is_dask(untransformed)

    # Unstacked data has dimensions of original data
    assert_expected_dims(data, untransformed, policy="all")
    # Unstacked data has coordinates of original data
    assert_expected_coords(data, untransformed, policy="all")
    # Inverse transform should not change dask-ness
    assert is_dask_before == is_dask_after


@pytest.mark.parametrize("n_modes", [1, 2, 3])
def test_transform_pca_n_modes(n_modes):
    """With PCA enabled, the feature dimension is reduced to n_modes."""
    X = generate_well_conditioned_data()

    scores = PCA(use_pca=True, n_modes=n_modes).fit_transform(X)

    # PCA reduces dimensionality to exactly the requested number of modes
    assert scores.shape[1] == n_modes


@pytest.mark.parametrize("use_pca", [True, False])
def test_transform_keep_coordinates(use_pca):
    """fit_transform keeps the same number of coordinates as the input."""
    data = generate_well_conditioned_data()

    model = PCA(use_pca=use_pca, n_modes="all")
    result = model.fit_transform(data)

    assert len(result.coords) == len(data.coords)
Loading
Loading