From c84c4a4ccc9bf46e15e658e0af3686258589ff8a Mon Sep 17 00:00:00 2001 From: Niclas Rieger Date: Sat, 14 Sep 2024 00:00:45 +0200 Subject: [PATCH 1/2] fix: correct whitening in HilbertCPCCA --- tests/models/cross/test_hilbert_cpcca.py | 12 ++ tests/models/single/test_pop.py | 2 +- tests/preprocessing/test_pca.py | 146 ++++++++++++++++++ tests/preprocessing/test_whitener.py | 182 +++++------------------ xeofs/cross/base_model_cross_set.py | 48 ++++-- xeofs/cross/cpcca.py | 46 ++++-- xeofs/cross/cpcca_rotator.py | 14 +- xeofs/preprocessing/__init__.py | 2 + xeofs/preprocessing/pca.py | 181 ++++++++++++++++++++++ xeofs/preprocessing/whitener.py | 122 +++------------ xeofs/single/pop.py | 23 ++- 11 files changed, 495 insertions(+), 283 deletions(-) create mode 100644 tests/preprocessing/test_pca.py create mode 100644 xeofs/preprocessing/pca.py diff --git a/tests/models/cross/test_hilbert_cpcca.py b/tests/models/cross/test_hilbert_cpcca.py index 02f6455..d30b449 100644 --- a/tests/models/cross/test_hilbert_cpcca.py +++ b/tests/models/cross/test_hilbert_cpcca.py @@ -52,6 +52,18 @@ def generate_well_conditioned_data(lazy=False): return X, Y +@pytest.mark.parametrize("use_pca", [True, False]) +def test_singular_values(use_pca): + """Test that the singular values of the Hilbert CCA are less than 1.""" + X, Y = generate_well_conditioned_data() + cpcca = HilbertCPCCA(n_modes=2, alpha=0.0, use_pca=use_pca, n_pca_modes=2) + cpcca.fit(X, Y, "sample") + s_values = cpcca.data["singular_values"] + + # Singular values are the canonical correlations, so they should be less than 1 + assert np.all(s_values <= 1) + + # Currently, netCDF4 does not support complex numbers, so skip this test @pytest.mark.parametrize("engine", ["zarr"]) @pytest.mark.parametrize("alpha", [0.0, 0.5, 1.0]) diff --git a/tests/models/single/test_pop.py b/tests/models/single/test_pop.py index b42e974..fe9f9b4 100644 --- a/tests/models/single/test_pop.py +++ b/tests/models/single/test_pop.py @@ -12,7 +12,7 @@ def test_init(): # Assert preprocessor has been initialized assert hasattr(pop, "_params") assert hasattr(pop, "preprocessor") - assert hasattr(pop, "whitener") + assert hasattr(pop, "pca") def test_fit(mock_data_array): diff --git a/tests/preprocessing/test_pca.py b/tests/preprocessing/test_pca.py new file mode 100644 index 0000000..dec39f5 --- /dev/null +++ b/tests/preprocessing/test_pca.py @@ -0,0 +1,146 @@ +import numpy as np +import pytest +import xarray as xr + +from xeofs.preprocessing import PCA + +from ..utilities import ( + assert_expected_coords, + assert_expected_dims, + data_is_dask, +) + +# ============================================================================= +# GENERALLY VALID TEST CASES +# ============================================================================= +N_SAMPLE_DIMS = [1] +N_FEATURE_DIMS = [1] +INDEX_POLICY = ["index"] +NAN_POLICY = ["no_nan"] +DASK_POLICY = ["no_dask", "dask"] +SEED = [0] + +VALID_TEST_DATA = [ + (ns, nf, index, nan, dask) + for ns in N_SAMPLE_DIMS + for nf in N_FEATURE_DIMS + for index in INDEX_POLICY + for nan in NAN_POLICY + for dask in DASK_POLICY +] + + +def generate_well_conditioned_data(lazy=False): + t = np.linspace(0, 50, 200) + std = 0.1 + X = np.sin(t)[:, None] + np.random.normal(0, std, size=(200, 3)) + X[:, 1] = X[:, 1] ** 3 + X[:, 2] = abs(X[:, 2]) ** (0.5) + X = xr.DataArray( + X, + dims=["sample", "feature"], + coords={"sample": np.arange(200), "feature": np.arange(3)}, + name="X", + ) + X = X - X.mean("sample") + if lazy: + X = X.chunk({"sample": 5, "feature": 
-1})
+    return X
+
+
+# TESTS
+# =============================================================================
+@pytest.mark.parametrize("lazy", [False, True])
+def test_fit(lazy):
+    data = generate_well_conditioned_data(lazy)
+
+    pca = PCA(n_modes=2)
+    pca.fit(data)
+
+
+@pytest.mark.parametrize("lazy", [False, True])
+@pytest.mark.parametrize("use_pca", [True, False])
+def test_transform(lazy, use_pca):
+    data = generate_well_conditioned_data(lazy)
+
+    pca = PCA(n_modes=2, use_pca=use_pca)
+    pca.fit(data)
+
+    # Transform data
+    transformed_data = pca.transform(data)
+    transformed_data2 = pca.transform(data)
+    assert transformed_data.identical(transformed_data2)
+
+    assert isinstance(transformed_data, xr.DataArray)
+    assert transformed_data.ndim == 2
+    assert transformed_data.dims == ("sample", "feature")
+
+    # Consistent dask behaviour
+    is_dask_before = data_is_dask(data)
+    is_dask_after = data_is_dask(transformed_data)
+    assert is_dask_before == is_dask_after
+
+
+@pytest.mark.parametrize("lazy", [False, True])
+@pytest.mark.parametrize("use_pca", [True, False])
+def test_fit_transform(lazy, use_pca):
+    data = generate_well_conditioned_data(lazy)
+
+    pca = PCA(n_modes=2, use_pca=use_pca)
+
+    # Transform data
+    transformed_data = pca.fit_transform(data)
+    transformed_data2 = pca.transform(data)
+    assert transformed_data.identical(transformed_data2)
+
+    assert isinstance(transformed_data, xr.DataArray)
+    assert transformed_data.ndim == 2
+    assert transformed_data.dims == ("sample", "feature")
+
+    # Consistent dask behaviour
+    is_dask_before = data_is_dask(data)
+    is_dask_after = data_is_dask(transformed_data)
+    assert is_dask_before == is_dask_after
+
+
+@pytest.mark.parametrize("lazy", [False, True])
+@pytest.mark.parametrize("use_pca", [True, False])
+def test_inverse_transform_data(lazy, use_pca):
+    data = generate_well_conditioned_data(lazy)
+
+    pca = PCA(n_modes=2, use_pca=use_pca)
+    pca.fit(data)
+
+    transformed = pca.transform(data)
+    untransformed = pca.inverse_transform_data(transformed)
+
+    is_dask_before = data_is_dask(data)
+    is_dask_after = data_is_dask(untransformed)
+
+    # Inverse-transformed data has dimensions of original data
+    assert_expected_dims(data, untransformed, policy="all")
+    # Inverse-transformed data has coordinates of original data
+    assert_expected_coords(data, untransformed, policy="all")
+    # inverse transform should not change dask-ness
+    assert is_dask_before == is_dask_after
+
+
+@pytest.mark.parametrize("n_modes", [1, 2, 3])
+def test_transform_pca_n_modes(n_modes):
+    data = generate_well_conditioned_data()
+
+    pca = PCA(use_pca=True, n_modes=n_modes)
+    transformed = pca.fit_transform(data)
+
+    # PCA reduces dimensionality
+    assert transformed.shape[1] == n_modes
+
+
+@pytest.mark.parametrize("use_pca", [True, False])
+def test_transform_keep_coordinates(use_pca):
+    X = generate_well_conditioned_data()
+
+    pca = PCA(use_pca=use_pca, n_modes="all")
+    transformed = pca.fit_transform(X)
+
+    assert len(transformed.coords) == len(X.coords)
diff --git a/tests/preprocessing/test_whitener.py b/tests/preprocessing/test_whitener.py
index a1511b9..54b82ea 100644
--- a/tests/preprocessing/test_whitener.py
+++ b/tests/preprocessing/test_whitener.py
@@ -60,43 +60,51 @@ def generate_well_conditioned_data(lazy=False):
 def test_fit(lazy):
     data = generate_well_conditioned_data(lazy)
 
-    whitener = Whitener(n_modes=2)
+    whitener = Whitener()
     whitener.fit(data)
 
 
-@pytest.mark.parametrize(
-    "lazy",
-    [False, True],
-)
-def test_transform(lazy):
+@pytest.mark.parametrize("lazy", [False, True]) 
+@pytest.mark.parametrize("alpha", [0.0, 0.5, 1.0, 1.5]) +def test_transform(lazy, alpha): data = generate_well_conditioned_data(lazy) - whitener = Whitener(n_modes=2) + whitener = Whitener() whitener.fit(data) # Transform data transformed_data = whitener.transform(data) transformed_data2 = whitener.transform(data) - assert transformed_data.identical(transformed_data2) assert isinstance(transformed_data, xr.DataArray) assert transformed_data.ndim == 2 assert transformed_data.dims == ("sample", "feature") + # Transformed data is identical + assert transformed_data.identical(transformed_data2) + + # Check that for full whitening, transformed data is uncorrelated + # and has variance one + if math.isclose(alpha, 0.0, abs_tol=1e-6): + # Transformed data has variance = 1 + assert np.allclose(transformed_data.var("sample").values, 1.0) + + # Transformed data is uncorrelated + C = np.corrcoef(transformed_data.values, rowvar=False) + target = np.identity(data.shape[1]) + np.testing.assert_allclose(C, target, atol=1e-6) + # Consistent dask behaviour is_dask_before = data_is_dask(data) is_dask_after = data_is_dask(transformed_data) assert is_dask_before == is_dask_after -@pytest.mark.parametrize( - "lazy", - [False, True], -) +@pytest.mark.parametrize("lazy", [False, True]) def test_fit_transform(lazy): data = generate_well_conditioned_data(lazy) - whitener = Whitener(n_modes=2) + whitener = Whitener() # Transform data transformed_data = whitener.fit_transform(data) @@ -113,14 +121,12 @@ def test_fit_transform(lazy): assert is_dask_before == is_dask_after -@pytest.mark.parametrize( - "lazy", - [False, True], -) -def test_invserse_transform_data(lazy): +@pytest.mark.parametrize("lazy", [False, True]) +@pytest.mark.parametrize("alpha", [0.0, 0.5, 1.0, 1.5]) +def test_inverse_transform_data(lazy, alpha): data = generate_well_conditioned_data(lazy) - whitener = Whitener(n_modes=2) + whitener = Whitener(alpha) whitener.fit(data) whitened_data = whitener.transform(data) @@ -136,55 +142,8 @@ def test_invserse_transform_data(lazy): # inverse transform should not change dask-ness assert is_dask_before == is_dask_after - -@pytest.mark.parametrize( - "alpha,use_pca", - [ - (0.0, False), - (0.5, False), - (1.0, False), - (1.5, False), - (0.0, True), - (0.5, True), - (1.0, True), - (1.5, True), - ], -) -def test_transform_alpha(alpha, use_pca): - data = data = generate_well_conditioned_data() - - whitener = Whitener(alpha=alpha, use_pca=use_pca) - data_whitened = whitener.fit_transform(data) - - nc = data.shape[0] - 1 - norm = (data_whitened**2).sum("sample") / nc - ones = norm / norm - # Check that for alpha=0 full whitening is performed - if math.isclose(alpha, 0.0, abs_tol=1e-6): - xr.testing.assert_allclose(norm, ones, atol=1e-6) - - -@pytest.mark.parametrize( - "alpha,use_pca", - [ - (0.0, False), - (0.5, False), - (1.0, False), - (1.5, False), - (0.0, True), - (0.5, True), - (1.0, True), - (1.5, True), - ], -) -def test_inverse_transform_alpha(alpha, use_pca): - # Use data with more samples than features and high signal-to-noise ratio - data = generate_well_conditioned_data() - whitener = Whitener(alpha=alpha, use_pca=use_pca) - data_whitened = whitener.fit_transform(data) - data_unwhitened = whitener.inverse_transform_data(data_whitened) - - xr.testing.assert_allclose(data, data_unwhitened, atol=1e-6) + # Unwhitened data is identical to original data + xr.testing.assert_allclose(data, unwhitened_data, atol=1e-6) def test_invalid_alpha(): @@ -193,7 +152,7 @@ def test_invalid_alpha(): err_msg = "`alpha` 
must be greater than or equal to 0" with pytest.raises(ValueError, match=err_msg): - Whitener(n_modes=2, alpha=-1.0) + Whitener(alpha=-1.0) def test_raise_warning_ill_conditioned(): @@ -207,24 +166,10 @@ def test_raise_warning_ill_conditioned(): _ = whitener.fit_transform(data) -@pytest.mark.parametrize( - "n_modes", - [1, 2, 3], -) -def test_transform_pca_n_modes(n_modes): - data = generate_well_conditioned_data() - - whitener = Whitener(use_pca=True, n_modes=n_modes) - transformed = whitener.fit_transform(data) - - # PCA-Whitener reduces dimensionality - assert transformed.shape[1] == n_modes - - def test_whitener_identity_transformation(): data = generate_well_conditioned_data() - whitener = Whitener(alpha=1.0, use_pca=False) + whitener = Whitener(alpha=1.0) transformed = whitener.fit_transform(data) reconstructed = whitener.inverse_transform_data(transformed) @@ -233,20 +178,8 @@ def test_whitener_identity_transformation(): xr.testing.assert_identical(data, reconstructed) -@pytest.mark.parametrize( - "alpha,use_pca", - [ - (0.0, True), - (0.5, True), - (1.0, True), - (1.5, True), - (0.0, False), - (0.5, False), - (1.0, False), - (1.5, False), - ], -) -def test_unwhiten_cross_covariance_matrix(alpha, use_pca): +@pytest.mark.parametrize("alpha", [0.0, 0.5, 1.0, 1.5]) +def test_unwhiten_cross_covariance_matrix(alpha): def unwhiten_cov_mat(Cw, S1_inv, S2_inv): n_dims_s1 = len(S1_inv.shape) n_dims_s2 = len(S2_inv.shape) @@ -254,8 +187,6 @@ def unwhiten_cov_mat(Cw, S1_inv, S2_inv): match n_dims_s1: case 0: C_unwhitened = Cw - case 1: - C_unwhitened = S1_inv[:, None] * Cw case 2: C_unwhitened = S1_inv @ Cw case _: @@ -264,8 +195,6 @@ def unwhiten_cov_mat(Cw, S1_inv, S2_inv): match n_dims_s2: case 0: pass - case 1: - C_unwhitened = C_unwhitened * S2_inv[None, :] case 2: C_unwhitened = C_unwhitened @ S2_inv case _: @@ -273,18 +202,18 @@ def unwhiten_cov_mat(Cw, S1_inv, S2_inv): return C_unwhitened - """Test that we can uncover the original total amount of covariance between two datasets after whitening.""" + """Test that we can uncover the original total amount of squared covariance between two datasets after whitening.""" data1 = generate_well_conditioned_data() data2 = generate_well_conditioned_data() ** 2 - whitener1 = Whitener(alpha=0.5, use_pca=True, n_modes="all") - whitener2 = Whitener(alpha=alpha, use_pca=use_pca, n_modes="all") + whitener1 = Whitener(alpha=0.5) + whitener2 = Whitener(alpha=alpha) transformed1 = whitener1.fit_transform(data1) transformed2 = whitener2.fit_transform(data2) - S1_inv = whitener1.get_Tinv(unwhiten_only=True).values - S2_inv = whitener2.get_Tinv(unwhiten_only=True).values + S1_inv = whitener1.Tinv.values + S2_inv = whitener2.Tinv.values C = data1.values.T @ data2.values Cw = transformed1.values.T @ transformed2.values @@ -296,44 +225,11 @@ def unwhiten_cov_mat(Cw, S1_inv, S2_inv): np.testing.assert_almost_equal(total_covariance, total_covariance_rec) -@pytest.mark.parametrize( - "use_pca", - [True, False], -) -def test_standardize_and_decorrelate(use_pca): - X = generate_well_conditioned_data() - n_features = X.shape[1] - - whitener = Whitener(alpha=0.0, use_pca=use_pca, n_modes="all") - transformed = whitener.fit_transform(X) - - # Check that transformed data is standardized - assert np.allclose(transformed.mean("sample").values, np.zeros(n_features)) - assert np.allclose(transformed.std("sample").values, np.ones(n_features), rtol=1e-2) - - # Check that transformed data is decorrelated - C = np.corrcoef(transformed.values, rowvar=False) - target = 
np.identity(n_features) - np.testing.assert_allclose(C, target, atol=1e-6) - - -@pytest.mark.parametrize( - "alpha,use_pca", - [ - (0.0, True), - (0.5, True), - (1.0, True), - (1.5, True), - (0.0, False), - (0.5, False), - (1.0, False), - (1.5, False), - ], -) -def test_transform_keep_coordinates(alpha, use_pca): +@pytest.mark.parametrize("alpha", [0.0, 0.5, 1.0, 1.5]) +def test_transform_keep_coordinates(alpha): X = generate_well_conditioned_data() - whitener = Whitener(alpha=alpha, use_pca=use_pca, n_modes="all") + whitener = Whitener(alpha=alpha) transformed = whitener.fit_transform(X) assert len(transformed.coords) == len(X.coords) diff --git a/xeofs/cross/base_model_cross_set.py b/xeofs/cross/base_model_cross_set.py index 352106e..f86b63c 100644 --- a/xeofs/cross/base_model_cross_set.py +++ b/xeofs/cross/base_model_cross_set.py @@ -6,8 +6,7 @@ from ..base_model import BaseModel from ..data_container import DataContainer -from ..preprocessing.preprocessor import Preprocessor -from ..preprocessing.whitener import Whitener +from ..preprocessing import PCA, Preprocessor, Whitener from ..utils.data_types import DataArray, DataObject, GenericType from ..utils.sanity_checks import validate_input_type from ..utils.xarray_utils import convert_to_dim_type @@ -163,19 +162,29 @@ def __init__( compute=compute, ) - self.whitener1 = Whitener( - alpha=alpha[0], - use_pca=use_pca[0], + self.pca1 = PCA( n_modes=n_pca_modes[0], init_rank_reduction=pca_init_rank_reduction[0], + use_pca=use_pca[0], sample_name=sample_name, feature_name=feature_name[0], ) - self.whitener2 = Whitener( - alpha=alpha[1], - use_pca=use_pca[1], + + self.pca2 = PCA( n_modes=n_pca_modes[1], init_rank_reduction=pca_init_rank_reduction[1], + use_pca=use_pca[1], + sample_name=sample_name, + feature_name=feature_name[1], + ) + + self.whitener1 = Whitener( + alpha=alpha[0], + sample_name=sample_name, + feature_name=feature_name[0], + ) + self.whitener2 = Whitener( + alpha=alpha[1], sample_name=sample_name, feature_name=feature_name[1], ) @@ -294,11 +303,14 @@ def fit( # Preprocess data X = self.preprocessor1.fit_transform(X, self.sample_dims, weights_X) Y = self.preprocessor2.fit_transform(Y, self.sample_dims, weights_Y) + # Perform PCA + X = self.pca1.fit_transform(X) + Y = self.pca2.fit_transform(Y) + # Augment data + X, Y = self._augment_data(X, Y) # Whiten data X = self.whitener1.fit_transform(X) Y = self.whitener2.fit_transform(Y) - # Augment data - X, Y = self._augment_data(X, Y) # Fit the model self._fit_algorithm(X, Y) @@ -334,21 +346,25 @@ def transform( validate_input_type(X) # Preprocess X X = self.preprocessor1.transform(X) + X = self.pca1.transform(X) X = self.whitener1.transform(X) if Y is not None: validate_input_type(Y) # Preprocess Y Y = self.preprocessor2.transform(Y) + Y = self.pca2.transform(Y) Y = self.whitener2.transform(Y) data = self._transform_algorithm(X, Y, normalized=normalized) data_list = [] if X is not None: X = self.whitener1.inverse_transform_scores_unseen(data["X"]) + X = self.pca1.inverse_transform_scores_unseen(X) X = self.preprocessor1.inverse_transform_scores_unseen(X) data_list.append(X) if Y is not None: Y = self.whitener2.inverse_transform_scores_unseen(data["Y"]) + Y = self.pca2.inverse_transform_scores_unseen(Y) Y = self.preprocessor2.inverse_transform_scores_unseen(Y) data_list.append(Y) @@ -397,11 +413,13 @@ def inverse_transform( if x_is_given: X = inv_transformed["X"] X = self.whitener1.inverse_transform_data(X) + X = self.pca1.inverse_transform_data(X) Xrec = 
self.preprocessor1.inverse_transform_data(X) results.append(Xrec) if y_is_given: Y = inv_transformed["Y"] Y = self.whitener2.inverse_transform_data(Y) + Y = self.pca2.inverse_transform_data(Y) Yrec = self.preprocessor2.inverse_transform_data(Y) results.append(Yrec) @@ -429,6 +447,7 @@ def predict(self, X: DataObject) -> DataArray: # Preprocess X X = self.preprocessor1.transform(X) + X = self.pca1.transform(X) # Whiten X X = self.whitener1.transform(X) @@ -438,6 +457,7 @@ def predict(self, X: DataObject) -> DataArray: # Inverse transform Y Y = self.whitener2.inverse_transform_scores_unseen(Y) + Y = self.pca2.inverse_transform_scores_unseen(Y) Y = self.preprocessor2.inverse_transform_scores_unseen(Y) return Y @@ -465,6 +485,9 @@ def components(self, normalized=True) -> tuple[DataObject, DataObject]: Px = self.whitener1.inverse_transform_components(Px) Py = self.whitener2.inverse_transform_components(Py) + Px = self.pca1.inverse_transform_components(Px) + Py = self.pca2.inverse_transform_components(Py) + Px: DataObject = self.preprocessor1.inverse_transform_components(Px) Py: DataObject = self.preprocessor2.inverse_transform_components(Py) return Px, Py @@ -492,6 +515,9 @@ def scores(self, normalized=False) -> tuple[DataArray, DataArray]: Rx = self.whitener1.inverse_transform_scores(Rx) Ry = self.whitener2.inverse_transform_scores(Ry) + Rx = self.pca1.inverse_transform_scores(Rx) + Ry = self.pca2.inverse_transform_scores(Ry) + Rx: DataArray = self.preprocessor1.inverse_transform_scores(Rx) Ry: DataArray = self.preprocessor2.inverse_transform_scores(Ry) return Rx, Ry @@ -509,6 +535,8 @@ def get_serialization_attrs(self) -> dict: data=self.data, preprocessor1=self.preprocessor1, preprocessor2=self.preprocessor2, + pca1=self.pca1, + pca2=self.pca2, whitener1=self.whitener1, whitener2=self.whitener2, sample_name=self.sample_name, diff --git a/xeofs/cross/cpcca.py b/xeofs/cross/cpcca.py index ad88808..d6cf60f 100644 --- a/xeofs/cross/cpcca.py +++ b/xeofs/cross/cpcca.py @@ -453,8 +453,8 @@ def _compute_residual_variance_numpy(X, Y, Xrec, Yrec): X2 = self.data["input_data2"] # Unwhiten the data - X1 = self.whitener1.inverse_transform_data(X1, unwhiten_only=True) - X2 = self.whitener2.inverse_transform_data(X2, unwhiten_only=True) + X1 = self.whitener1.inverse_transform_data(X1) + X2 = self.whitener2.inverse_transform_data(X2) # Rename the sample dimension to avoid conflicts for # different coordinates with same length @@ -477,8 +477,8 @@ def _compute_residual_variance_numpy(X, Y, Xrec, Yrec): ) # Unwhitend the reconstructed data - X1r = self.whitener1.inverse_transform_data(X1r, unwhiten_only=True) - X2r = self.whitener2.inverse_transform_data(X2r, unwhiten_only=True) + X1r = self.whitener1.inverse_transform_data(X1r) + X2r = self.whitener2.inverse_transform_data(X2r) # Compute fraction variance explained X1r = X1r.rename({self.sample_name: sample_name_x}) @@ -536,7 +536,7 @@ def fraction_variance_X_explained_by_X(self): X = self.data["input_data1"] # Unwhiten the data - X = self.whitener1.inverse_transform_data(X, unwhiten_only=True) + X = self.whitener1.inverse_transform_data(X) # Compute the total variance total_variance: DataArray = self._compute_total_variance(X, self.sample_name) @@ -551,7 +551,7 @@ def fraction_variance_X_explained_by_X(self): Xr = xr.dot(Rx.sel(mode=[mode]), Qx.sel(mode=[mode]).conj().T, dims="mode") # Unwhitend the reconstructed data - Xr = self.whitener1.inverse_transform_data(Xr, unwhiten_only=True) + Xr = self.whitener1.inverse_transform_data(Xr) # Compute fraction 
variance explained residual_variance = self._compute_total_variance(X - Xr, self.sample_name) @@ -588,7 +588,7 @@ def fraction_variance_Y_explained_by_Y(self): Y = self.data["input_data2"] # Unwhiten the data - Y = self.whitener2.inverse_transform_data(Y, unwhiten_only=True) + Y = self.whitener2.inverse_transform_data(Y) # Compute the total variance total_variance: DataArray = self._compute_total_variance(Y, self.sample_name) @@ -603,7 +603,7 @@ def fraction_variance_Y_explained_by_Y(self): Yr = xr.dot(Ry.sel(mode=[mode]), Qy.sel(mode=[mode]).conj().T, dims="mode") # Unwhitend the reconstructed data - Yr = self.whitener2.inverse_transform_data(Yr, unwhiten_only=True) + Yr = self.whitener2.inverse_transform_data(Yr) # Compute fraction variance explained residual_variance = self._compute_total_variance(Y - Yr, self.sample_name) @@ -660,8 +660,8 @@ def _compute_residual_variance_numpy(X, Y, Xrec, Yrec): X2 = self.data["input_data2"] # Unwhiten the data - X1 = self.whitener1.inverse_transform_data(X1, unwhiten_only=True) - X2 = self.whitener2.inverse_transform_data(X2, unwhiten_only=True) + X1 = self.whitener1.inverse_transform_data(X1) + X2 = self.whitener2.inverse_transform_data(X2) # Compute the total variance X1 = X1.rename({self.sample_name: sample_name_x}) @@ -694,8 +694,8 @@ def _compute_residual_variance_numpy(X, Y, Xrec, Yrec): ) # Unwhitend the reconstructed data - X1r = self.whitener1.inverse_transform_data(X1r, unwhiten_only=True) - X2r = self.whitener2.inverse_transform_data(X2r, unwhiten_only=True) + X1r = self.whitener1.inverse_transform_data(X1r) + X2r = self.whitener2.inverse_transform_data(X2r) # Compute fraction variance explained X1r = X1r.rename({self.sample_name: sample_name_x}) @@ -770,6 +770,9 @@ def homogeneous_patterns(self, correction=None, alpha=0.05): input_data1 = self.whitener1.inverse_transform_data(input_data1) input_data2 = self.whitener2.inverse_transform_data(input_data2) + input_data1 = self.pca1.inverse_transform_data(input_data1) + input_data2 = self.pca2.inverse_transform_data(input_data2) + scores1 = self.data["scores1"] scores2 = self.data["scores2"] @@ -851,6 +854,9 @@ def heterogeneous_patterns(self, correction=None, alpha=0.05): input_data1 = self.whitener1.inverse_transform_data(input_data1) input_data2 = self.whitener2.inverse_transform_data(input_data2) + input_data1 = self.pca1.inverse_transform_data(input_data1) + input_data2 = self.pca2.inverse_transform_data(input_data2) + scores1 = self.data["scores1"] scores2 = self.data["scores2"] @@ -982,8 +988,8 @@ def _compute_total_squared_covariance(self, C: DataArray) -> DataArray: Requires the unwhitened covariance matrix which we can obtain by multiplying the whitened covariance matrix with the inverse of the whitening transformation matrix. 
""" - C = self.whitener2.inverse_transform_data(C, unwhiten_only=True) - C = self.whitener1.inverse_transform_data(C.conj().T, unwhiten_only=True) + C = self.whitener2.inverse_transform_data(C) + C = self.whitener1.inverse_transform_data(C.conj().T) # Not necessary to conjugate transpose for total squared covariance # C = C.conj().T return (abs(C) ** 2).sum() @@ -1183,6 +1189,9 @@ def components_amplitude(self, normalized=True) -> tuple[DataObject, DataObject] Px = self.whitener1.inverse_transform_components(Px) Py = self.whitener2.inverse_transform_components(Py) + Px = self.pca1.inverse_transform_components(Px) + Py = self.pca2.inverse_transform_components(Py) + Px = abs(Px) Py = abs(Py) @@ -1218,6 +1227,9 @@ def components_phase(self, normalized=True) -> tuple[DataObject, DataObject]: Px = self.whitener1.inverse_transform_components(Px) Py = self.whitener2.inverse_transform_components(Py) + Px = self.pca1.inverse_transform_components(Px) + Py = self.pca2.inverse_transform_components(Py) + Px = xr.apply_ufunc(np.angle, Px, keep_attrs=True, dask="allowed") Py = xr.apply_ufunc(np.angle, Py, keep_attrs=True, dask="allowed") @@ -1253,6 +1265,9 @@ def scores_amplitude(self, normalized=False) -> tuple[DataArray, DataArray]: Rx = self.whitener1.inverse_transform_scores(Rx) Ry = self.whitener2.inverse_transform_scores(Ry) + Rx = self.pca1.inverse_transform_scores(Rx) + Ry = self.pca2.inverse_transform_scores(Ry) + Rx = abs(Rx) Ry = abs(Ry) @@ -1288,6 +1303,9 @@ def scores_phase(self, normalized=False) -> tuple[DataArray, DataArray]: Rx = self.whitener1.inverse_transform_scores(Rx) Ry = self.whitener2.inverse_transform_scores(Ry) + Rx = self.pca1.inverse_transform_scores(Rx) + Ry = self.pca2.inverse_transform_scores(Ry) + Rx = xr.apply_ufunc(np.angle, Rx, keep_attrs=True, dask="allowed") Ry = xr.apply_ufunc(np.angle, Ry, keep_attrs=True, dask="allowed") diff --git a/xeofs/cross/cpcca_rotator.py b/xeofs/cross/cpcca_rotator.py index 218759c..b83b915 100644 --- a/xeofs/cross/cpcca_rotator.py +++ b/xeofs/cross/cpcca_rotator.py @@ -7,7 +7,7 @@ from ..base_model import BaseModel from ..data_container import DataContainer from ..linalg.rotation import promax -from ..preprocessing import Preprocessor, Whitener +from ..preprocessing import PCA, Preprocessor, Whitener from ..utils.data_types import DataArray, DataObject from ..utils.xarray_utils import argsort_dask, get_deterministic_sign_multiplier from .cpcca import CPCCA, ComplexCPCCA, HilbertCPCCA @@ -95,6 +95,8 @@ def __init__( # Attach empty objects self.preprocessor1 = Preprocessor() self.preprocessor2 = Preprocessor() + self.pca1 = PCA() + self.pca2 = PCA() self.whitener1 = Whitener() self.whitener2 = Whitener() self.data = DataContainer() @@ -108,6 +110,8 @@ def get_serialization_attrs(self) -> dict: model_data=self.model_data, preprocessor1=self.preprocessor1, preprocessor2=self.preprocessor2, + pca1=self.pca1, + pca2=self.pca2, whitener1=self.whitener1, whitener2=self.whitener2, sorted=self.sorted, @@ -118,6 +122,8 @@ def get_serialization_attrs(self) -> dict: def _fit_algorithm(self, model) -> Self: self.preprocessor1 = model.preprocessor1 self.preprocessor2 = model.preprocessor2 + self.pca1 = model.pca1 + self.pca2 = model.pca2 self.whitener1 = model.whitener1 self.whitener2 = model.whitener2 self.sample_name = model.sample_name @@ -154,6 +160,8 @@ def _fit_algorithm(self, model) -> Self: # Unwhiten and back-transform into physical space Qx = self.whitener1.inverse_transform_components(Qx) Qy = self.whitener2.inverse_transform_components(Qy) + 
Qx = self.pca1.inverse_transform_components(Qx)
+        Qy = self.pca2.inverse_transform_components(Qy)
 
         # Rename the feature dimension to a common name so that the combined vectors can be concatenated
         Qx = Qx.rename({feature_name[0]: common_feature_dim})
@@ -194,6 +202,8 @@ def _fit_algorithm(self, model) -> Self:
 
         # For consistency with the unrotated model classes, we transform the pattern vectors
         # into the whitened PC space
+        Qx_rot = self.pca1.transform_components(Qx_rot)
+        Qy_rot = self.pca2.transform_components(Qy_rot)
         Qx_rot = self.whitener1.transform_components(Qx_rot)
         Qy_rot = self.whitener2.transform_components(Qy_rot)
 
@@ -353,6 +363,7 @@ def transform(
 
         # Preprocess the data
         comps1 = self.whitener1.inverse_transform_components(comps1)
+        comps1 = self.pca1.inverse_transform_components(comps1)
         X = self.preprocessor1.transform(X)
 
         # Compute non-rotated scores by projecting the data onto non-rotated components
@@ -383,6 +394,7 @@ def transform(
 
         # Preprocess the data
         comps2 = self.whitener2.inverse_transform_components(comps2)
+        comps2 = self.pca2.inverse_transform_components(comps2)
         Y = self.preprocessor2.transform(Y)
 
         # Compute non-rotated scores by project the data onto non-rotated components
diff --git a/xeofs/preprocessing/__init__.py b/xeofs/preprocessing/__init__.py
index 826949b..abec6c9 100644
--- a/xeofs/preprocessing/__init__.py
+++ b/xeofs/preprocessing/__init__.py
@@ -1,6 +1,7 @@
 from .concatenator import Concatenator
 from .dimension_renamer import DimensionRenamer
 from .multi_index_converter import MultiIndexConverter
+from .pca import PCA
 from .preprocessor import Preprocessor
 from .sanitizer import Sanitizer
 from .scaler import Scaler
@@ -16,4 +17,5 @@
     "Scaler",
     "Stacker",
     "Whitener",
+    "PCA",
 ]
diff --git a/xeofs/preprocessing/pca.py b/xeofs/preprocessing/pca.py
new file mode 100644
index 0000000..9de9e3d
--- /dev/null
+++ b/xeofs/preprocessing/pca.py
@@ -0,0 +1,181 @@
+import numpy as np
+import xarray as xr
+from typing_extensions import Self
+
+from ..linalg.svd import SVD
+from ..utils.data_types import (
+    DataArray,
+    Dims,
+    DimsList,
+)
+from ..utils.sanity_checks import assert_single_dataarray
+from .transformer import Transformer
+
+
+class PCA(Transformer):
+    """Transform data into reduced PC space.
+
+    For a large number of features, use PCA to reduce the dimensionality of the data before whitening.
+
+    Note that for ``use_pca=False``, it just becomes the identity transformation.
+
+    Parameters
+    ----------
+    n_modes: int | float | str, default="all"
+        If int, number of components to keep. If float, fraction of variance to keep. If `n_modes="all"`, keep all components.
+    use_pca: bool, default=True
+        Whether or not to use PCA to reduce the dimensionality of the data. If False, perform identity transformation.
+    init_rank_reduction: float, default=0.3
+        Used only when `n_modes` is given as a float. Specify the initial PCA rank reduction before truncating the solution to the desired fraction of explained variance. Must be in the half open interval ]0, 1]. Lower values will speed up the computation.
+    compute_eagerly: bool, default=False
+        Whether to perform eager or lazy computation.
+    sample_name: str, default="sample"
+        Name of the sample dimension.
+    feature_name: str, default="feature"
+        Name of the feature dimension.
+    random_state: np.random.Generator | int | None, default=None
+        Random seed for reproducibility.
+    solver_kwargs: dict
+        Additional keyword arguments for the SVD solver. 
+
+    """
+
+    def __init__(
+        self,
+        n_modes: int | float | str = "all",
+        use_pca: bool = True,
+        init_rank_reduction: float = 0.3,
+        compute_eagerly: bool = False,
+        sample_name: str = "sample",
+        feature_name: str = "feature",
+        random_state: np.random.Generator | int | None = None,
+        solver_kwargs: dict = {},
+    ):
+        super().__init__(sample_name, feature_name)
+
+        self.use_pca = use_pca
+        self.n_modes = n_modes
+        self.init_rank_reduction = init_rank_reduction
+        self.compute_eagerly = compute_eagerly
+        self.random_state = random_state
+        self.solver_kwargs = solver_kwargs
+
+        # Check whether PCA is the identity transformation
+        self.is_identity = not use_pca
+
+    def _sanity_check_input(self, X) -> None:
+        assert_single_dataarray(X)
+
+        if len(X.dims) != 2:
+            raise ValueError("Input DataArray must have 2 dimensions")
+
+        if X.dims != (self.sample_name, self.feature_name):
+            raise ValueError(
+                "Input DataArray must have dimensions ({:}, {:})".format(
+                    self.sample_name, self.feature_name
+                )
+            )
+
+    def _get_n_modes(self, X: DataArray) -> int | float:
+        if isinstance(self.n_modes, str):
+            if self.n_modes == "all":
+                return min(X.shape)
+            else:
+                raise ValueError("`n_modes` must be an integer, float or 'all'")
+        else:
+            return self.n_modes
+
+    def get_serialization_attrs(self) -> dict:
+        return dict(
+            V=self.V,
+            use_pca=self.use_pca,
+        )
+
+    def fit(
+        self,
+        X: DataArray,
+        sample_dims: Dims | None = None,
+        feature_dims: DimsList | None = None,
+    ) -> Self:
+        self._sanity_check_input(X)
+
+        if self.use_pca:
+            # In case of "all" modes, use the rank of the input data
+            self.n_modes = self._get_n_modes(X)
+
+            svd = SVD(
+                n_modes=self.n_modes,
+                init_rank_reduction=self.init_rank_reduction,
+                compute=self.compute_eagerly,
+                random_state=self.random_state,
+                sample_name=self.sample_name,
+                feature_name=self.feature_name,
+                **self.solver_kwargs,
+            )
+            _, _, self.V = svd.fit_transform(X)
+
+        else:
+            self.V = xr.DataArray(1, name="identity")
+
+        return self
+
+    def transform(self, X: DataArray) -> DataArray:
+        """Transform new data into the PC space."""
+
+        self._sanity_check_input(X)
+        if self.use_pca:
+            transformed = xr.dot(X, self.V, dims=self.feature_name)
+            transformed.name = X.name
+            return transformed.rename({"mode": self.feature_name})
+        else:
+            return X
+
+    def fit_transform(
+        self,
+        X: DataArray,
+        sample_dims: Dims | None = None,
+        feature_dims: DimsList | None = None,
+    ) -> DataArray:
+        return self.fit(X, sample_dims, feature_dims).transform(X)
+
+    def inverse_transform_data(self, X: DataArray) -> DataArray:
+        """Transform 2D data (sample x feature) from PC space back into original space."""
+        if self.use_pca:
+            X = X.rename({self.feature_name: "mode"})
+            return xr.dot(X, self.V.conj().T, dims="mode")
+        else:
+            return X
+
+    def transform_components(self, X: DataArray) -> DataArray:
+        """Transform 2D components (feature x mode) into PC space."""
+
+        if self.use_pca:
+            dummy_dim = "dummy_dim"
+            Tinv = self.V.conj().T
+            Tinv = Tinv.rename({"mode": dummy_dim})
+            transformed = xr.dot(Tinv, X, dims=self.feature_name)
+            return transformed.rename({dummy_dim: self.feature_name})
+        else:
+            return X
+
+    def inverse_transform_components(self, X: DataArray) -> DataArray:
+        """Transform 2D components (feature x mode) from PC space back into original space."""
+
+        if self.use_pca:
+            dummy_dim = "dummy_dim"
+            comps_pc_space = X.rename({self.feature_name: dummy_dim})
+            V = self.V
+            V = V.rename({"mode": dummy_dim})
+            return xr.dot(V, comps_pc_space, dims=dummy_dim)
+        else:
+            return X
+
+    def 
inverse_transform_scores(self, X: DataArray) -> DataArray:
+        """Transform 2D scores (sample x mode) from PC space back into original space."""
+
+        return X
+
+    def inverse_transform_scores_unseen(self, X: DataArray) -> DataArray:
+        """Transform unseen 2D scores (sample x mode) from PC space back into original space."""
+
+        return X
diff --git a/xeofs/preprocessing/whitener.py b/xeofs/preprocessing/whitener.py
index 4bf7660..2e03398 100644
--- a/xeofs/preprocessing/whitener.py
+++ b/xeofs/preprocessing/whitener.py
@@ -5,7 +5,6 @@
 from typing_extensions import Self
 
 from ..linalg._numpy import _fractional_matrix_power
-from ..linalg.svd import SVD
 from ..utils.data_types import (
     DataArray,
     Dims,
@@ -18,44 +17,26 @@
 class Whitener(Transformer):
     """Fractional whitening of 2D DataArray.
 
-    For large number of features, use PCA to reduce the dimensionality of the data before whitening.
-
-    Note that for ``alpha=1`` (no whiteneing) and ``use_pca=False``, it just becomes the identity transformation.
+    For ``alpha=1`` (no whitening), it just becomes the identity transformation.
 
     Parameters
     ----------
     alpha: float, default=0
         Power parameter to perform fractional whitening, where 0 corresponds to full whitening (standardized and decorrelated) and 1 to no whitening.
-    use_pca: bool, default=False
-        If True, perform PCA before whitening to speed up the computation. This is the recommended setting for large number of features. Specify the number of components to keep in `n_modes`.
-    n_modes: int | float | str, default=None
-        If int, number of components to keep. If float, fraction of variance to keep. If `n_modes="all"`, keep all components.
-    init_rank_reduction: float, default=0.3
-        Used only when `n_modes` is given as a float. Specifiy the initial PCA rank reduction before truncating the solution to the desired fraction of explained variance. Must be in the half open interval ]0, 1]. Lower values will speed up the computation.
-    compute_svd: bool, default=False
-        Whether to perform eager or lazy computation.
     sample_name: str, default="sample"
         Name of the sample dimension.
     feature_name: str, default="feature"
         Name of the feature dimension.
-    random_state: np.random.Generator | int | None, default=None
+    random_state: int | None, default=None
        Random seed for reproducibility.
-    solver_kwargs: dict
-        Additional keyword arguments for the SVD solver. 
- """ def __init__( self, - alpha: float = 0.0, - use_pca: bool = False, - n_modes: int | float | str = "all", - init_rank_reduction: float = 0.3, - compute_svd: bool = False, + alpha: float = 0, sample_name: str = "sample", feature_name: str = "feature", - random_state: np.random.Generator | int | None = None, - solver_kwargs: dict = {}, + random_state: int | None = None, ): super().__init__(sample_name, feature_name) @@ -66,12 +47,7 @@ def __init__( alpha = float(alpha) self.alpha = alpha - self.use_pca = use_pca - self.n_modes = n_modes - self.init_rank_reduction = init_rank_reduction - self.compute_svd = compute_svd self.random_state = random_state - self.solver_kwargs = solver_kwargs # Check whether Whitener is identity transformation self.is_identity = self._check_identity_transform() @@ -79,7 +55,7 @@ def __init__( def _check_identity_transform(self) -> bool: eps = np.finfo(self.alpha).eps alpha_is_one = (1.0 - self.alpha) < eps - if not self.use_pca and alpha_is_one: + if alpha_is_one: return True else: return False @@ -97,22 +73,10 @@ def _sanity_check_input(self, X) -> None: ) ) - def _get_n_modes(self, X: DataArray) -> int | float: - if isinstance(self.n_modes, str): - if self.n_modes == "all": - return min(X.shape) - else: - raise ValueError("`n_modes` must be an integer, float or 'all'") - else: - return self.n_modes - def get_serialization_attrs(self) -> dict: return dict( alpha=self.alpha, - n_modes=self.n_modes, - use_pca=self.use_pca, is_identity=self.is_identity, - s=self.s, T=self.T, Tinv=self.Tinv, feature_name=self.feature_name, @@ -132,38 +96,15 @@ def fit( if self.is_identity: self.T = xr.DataArray(1, name="identity") self.Tinv = xr.DataArray(1, name="identity") - self.s = xr.DataArray(1, name="identity") + # Compute the fractional whitening transformation directly based on covariance matrix else: - if self.use_pca: - # In case of "all" modes to the rank of the input data - self.n_modes = self._get_n_modes(X) - - svd = SVD( - n_modes=self.n_modes, - init_rank_reduction=self.init_rank_reduction, - compute=self.compute_svd, - random_state=self.random_state, - sample_name=self.sample_name, - feature_name=self.feature_name, - **self.solver_kwargs, + if n_samples < n_features: + warnings.warn( + f"The number of samples ({n_samples}) is smaller than the number of features ({n_features}), leading to an ill-conditioned problem. This may cause unstable results. Consider using PCA to reduce dimensionality and stabilize the problem by setting `use_pca=True`." ) - _, s, V = svd.fit_transform(X) - - n_c: float = np.sqrt(n_samples - 1) - self.T: DataArray = V * (s / n_c) ** (self.alpha - 1) - self.Tinv = (s / n_c) ** (1 - self.alpha) * V.conj().T - self.s = s - - # Without PCA compute the fractional whitening transformation directly based on covariance matrix - else: - if n_samples < n_features: - warnings.warn( - f"The number of samples ({n_samples}) is smaller than the number of features ({n_features}), leading to an ill-conditioned problem. This may cause unstable results. Consider using PCA to reduce dimensionality and stabilize the problem by setting `use_pca=True`." 
- ) - self.T, self.Tinv = self._compute_whitener_transform(X) - self.s = xr.DataArray(1, name="identity") + self.T, self.Tinv = self._compute_whitener_transform(X) return self @@ -180,36 +121,16 @@ def _compute_whitener_transform(self, X: DataArray) -> tuple[DataArray, DataArra return T, Tinv def _compute_whitener_transform_numpy(self, X): - nc = X.shape[0] - 1 + nc = X.shape[0] C = X.conj().T @ X / nc power = (self.alpha - 1) / 2 - svd_kwargs = {"random_state": self.random_state} + svd_kwargs = {"random_state": self.random_state, "solver": "full"} T = _fractional_matrix_power(C, power, **svd_kwargs) Tinv = np.linalg.inv(T) return T, Tinv - def get_Tinv(self, unwhiten_only=False) -> DataArray: - """Get the inverse transformation to unwhiten the data without PC transform. - - In contrast to `inverse_transform()`, this method returns the inverse transformation matrix without the PC transformation. That is, for PC transormed data this transformation only unwhitens the data without transforming back into the input space. For non-PC transformed data, this transformation is equivalent to the inverse transformation. - """ - if self.use_pca and unwhiten_only: - n_c = np.sqrt(self.n_samples - 1) - Tinv = (self.s / n_c) ** (1 - self.alpha) - Tinv = xr.apply_ufunc( - np.diag, - Tinv, - input_core_dims=[["mode"]], - output_core_dims=[["mode", self.feature_name]], - dask="allowed", - ) - Tinv = Tinv.assign_coords({self.feature_name: self.s.coords["mode"].data}) - return Tinv - else: - return self.Tinv - def transform(self, X: DataArray) -> DataArray: - """Transform new data into the fractional whitened PC space.""" + """Transform new data into the fractional whitened space.""" self._sanity_check_input(X) if self.is_identity: @@ -227,25 +148,22 @@ def fit_transform( ) -> DataArray: return self.fit(X, sample_dims, feature_dims).transform(X) - def inverse_transform_data(self, X: DataArray, unwhiten_only=False) -> DataArray: - """Transform 2D data (sample x feature) from whitened PC space back into original space. + def inverse_transform_data(self, X: DataArray) -> DataArray: + """Transform 2D data (sample x feature) from whitened space back into original space. Parameters ---------- X: DataArray Data to transform back into original space. - unwhiten_only: bool, default=False - If True, only unwhiten the data without transforming back into the input space. This is useful when the data was transformed with PCA before whitening and you need the unwhitened data in the PC space. 
""" - T_inv = self.get_Tinv(unwhiten_only=unwhiten_only) if self.is_identity: return X else: X = X.rename({self.feature_name: "mode"}) - return xr.dot(X, T_inv, dims="mode") + return xr.dot(X, self.Tinv, dims="mode") def transform_components(self, X: DataArray) -> DataArray: - """Transform 2D components (feature x mode) into whitened PC space.""" + """Transform 2D components (feature x mode) into whitened space.""" if self.is_identity: return X @@ -257,7 +175,7 @@ def transform_components(self, X: DataArray) -> DataArray: return transformed.rename({dummy_dim: self.feature_name}) def inverse_transform_components(self, X: DataArray) -> DataArray: - """Transform 2D components (feature x mode) from whitened PC space back into original space.""" + """Transform 2D components (feature x mode) from whitened space back into original space.""" if self.is_identity: return X @@ -269,11 +187,11 @@ def inverse_transform_components(self, X: DataArray) -> DataArray: return xr.dot(VS, comps_pc_space, dims=dummy_dim) def inverse_transform_scores(self, X: DataArray) -> DataArray: - """Transform 2D scores (sample x mode) from whitened PC space back into original space.""" + """Transform 2D scores (sample x mode) from whitened space back into original space.""" return X def inverse_transform_scores_unseen(self, X: DataArray) -> DataArray: - """Transform unseen 2D scores (sample x mode) from whitened PC space back into original space.""" + """Transform unseen 2D scores (sample x mode) from whitened space back into original space.""" return X diff --git a/xeofs/single/pop.py b/xeofs/single/pop.py index c783a96..6173c72 100644 --- a/xeofs/single/pop.py +++ b/xeofs/single/pop.py @@ -5,7 +5,7 @@ from typing_extensions import Self from ..linalg import total_variance -from ..preprocessing import Whitener +from ..preprocessing import PCA from ..utils.data_types import DataArray, DataObject from ..utils.xarray_utils import argsort_dask from .base_model_single_set import BaseModelSingleSet @@ -133,14 +133,13 @@ def __init__( ) self.attrs.update({"model": "Principal Oscillation Pattern analysis"}) - self.whitener = Whitener( - alpha=1.0, + self.pca = PCA( use_pca=use_pca, n_modes=n_pca_modes, init_rank_reduction=pca_init_rank_reduction, sample_name=sample_name, feature_name=feature_name, - compute_svd=compute, + compute_eagerly=compute, random_state=random_state, solver_kwargs=solver_kwargs, ) @@ -151,7 +150,7 @@ def get_serialization_attrs(self) -> dict: return dict( data=self.data, preprocessor=self.preprocessor, - whitener=self.whitener, + pca=self.pca, sorted=self.sorted, ) @@ -201,7 +200,7 @@ def _fit_algorithm(self, X: DataArray) -> Self: feature_name = self.feature_name # Transform in PC space - X = self.whitener.fit_transform(X) + X = self.pca.fit_transform(X) P, Z, lbda, T, tau = xr.apply_ufunc( self._np_solve_pop_system, @@ -235,7 +234,7 @@ def _fit_algorithm(self, X: DataArray) -> Self: idx_modes_sorted = argsort_dask(norms, "mode")[::-1] # type: ignore idx_modes_sorted.coords.update(norms.coords) - P = self.whitener.inverse_transform_components(P) + P = self.pca.inverse_transform_components(P) # Store the results self.data.add(X, "input_data", allow_compute=False) @@ -274,8 +273,8 @@ def _transform_algorithm(self, X: DataArray) -> DataArray: P = self.data["components"] # Transform into PC spcae - P = self.whitener.transform_components(P) - X = self.whitener.transform(X) + P = self.pca.transform_components(P) + X = self.pca.transform(X) # Project the data Z = xr.apply_ufunc( @@ -288,7 +287,7 @@ def 
_transform_algorithm(self, X: DataArray) -> DataArray: ) Z.name = "scores" - Z = self.whitener.inverse_transform_scores(Z) + Z = self.pca.inverse_transform_scores(Z) return Z @@ -312,13 +311,13 @@ def _inverse_transform_algorithm(self, scores: DataArray) -> DataArray: P = self.data["components"].sel(mode=scores.mode) # Transform in PC space - P = self.whitener.transform_components(P) + P = self.pca.transform_components(P) reconstructed_data = xr.dot(scores, P, dims="mode") reconstructed_data.name = "reconstructed_data" # Inverse transform the data into physical space - reconstructed_data = self.whitener.inverse_transform_data(reconstructed_data) + reconstructed_data = self.pca.inverse_transform_data(reconstructed_data) return reconstructed_data From 36455477ff7b8c77c0de670e753c3b4da4366f15 Mon Sep 17 00:00:00 2001 From: Niclas Rieger Date: Sat, 14 Sep 2024 15:04:40 +0200 Subject: [PATCH 2/2] fix: inverse whitening transform for singular matrix --- xeofs/preprocessing/whitener.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/xeofs/preprocessing/whitener.py b/xeofs/preprocessing/whitener.py index 2e03398..a10e000 100644 --- a/xeofs/preprocessing/whitener.py +++ b/xeofs/preprocessing/whitener.py @@ -126,7 +126,10 @@ def _compute_whitener_transform_numpy(self, X): power = (self.alpha - 1) / 2 svd_kwargs = {"random_state": self.random_state, "solver": "full"} T = _fractional_matrix_power(C, power, **svd_kwargs) - Tinv = np.linalg.inv(T) + try: + Tinv = np.linalg.inv(T) + except np.linalg.LinAlgError: + Tinv = np.linalg.pinv(T) return T, Tinv def transform(self, X: DataArray) -> DataArray:
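
---

For context on the second commit: the whitener builds its transform as the fractional matrix power T = C ** ((alpha - 1) / 2) of the feature covariance matrix C (see `_compute_whitener_transform_numpy` above). For alpha > 1, a rank-deficient C propagates its zero eigenvalues into T, so an exact inverse does not exist and `np.linalg.inv` raises `LinAlgError`. Below is a minimal self-contained sketch of that behavior; `fractional_matrix_power_psd` and `safe_inverse` are illustrative stand-ins, not xeofs API (the library's internal `_fractional_matrix_power`, which takes SVD solver kwargs, is not part of this patch):

```python
import numpy as np


def fractional_matrix_power_psd(C: np.ndarray, power: float) -> np.ndarray:
    # Fractional power of a symmetric PSD matrix via eigendecomposition.
    # For negative powers this assumes strictly positive eigenvalues.
    w, V = np.linalg.eigh(C)
    w = np.clip(w, 0.0, None)  # clear tiny negative eigenvalues from round-off
    return (V * w**power) @ V.T


def safe_inverse(T: np.ndarray) -> np.ndarray:
    # The guard from the second commit: exact inverse when T is regular,
    # Moore-Penrose pseudo-inverse when T is exactly singular.
    try:
        return np.linalg.inv(T)
    except np.linalg.LinAlgError:
        return np.linalg.pinv(T)


# Full whitening (alpha = 0): T = C**(-1/2), and the whitened data X @ T has
# identity covariance -- the property that test_transform now checks.
rng = np.random.default_rng(0)
mixing = np.array([[2.0, 0.5, 0.0], [0.0, 1.0, 0.3], [0.0, 0.0, 0.7]])
X = rng.standard_normal((500, 3)) @ mixing
X -= X.mean(axis=0)
C = X.T @ X / X.shape[0]

alpha = 0.0
T = fractional_matrix_power_psd(C, (alpha - 1) / 2)
Cw = (X @ T).T @ (X @ T) / X.shape[0]
assert np.allclose(Cw, np.eye(3))

# A zero eigenvalue in T makes np.linalg.inv raise LinAlgError, so
# safe_inverse falls back to the pseudo-inverse instead of crashing.
T_singular = np.diag([1.0, 0.5, 0.0])
assert np.allclose(safe_inverse(T_singular), np.diag([1.0, 2.0, 0.0]))
```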